In [100]:
import pandas as pd
import requests
import praw
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from datetime import datetime, timedelta
import pytz

In [101]:
data = pd.read_csv("completeNewsSet2019.csv")
data2 = pd.read_csv("completeNewsSet2020.csv")
data3 = pd.concat([data, data2], ignore_index=True, sort=False)
data3.head(3)

headlines_train = []
for ind, row in data3.iterrows():
    headlines_train.append(" ".join(str(x) for x in row[2:]))

In [102]:
vectorizer = TfidfVectorizer(min_df=0.03, max_features=100000, ngram_range=(2,2))
X_train = vectorizer.fit_transform(headlines_train)
y_train = data3['Result']

In [103]:
def get_reddit_news_month():
    # New York timezone
    ny = pytz.timezone("America/New_York")
    
    # Establish Reddit API connection
    reddit = praw.Reddit(client_id='J32hsxMDopeDEPrSFZz5zQ', \
                     client_secret='r8heiWleQYzL7JyKWU7QbwlXFCLmDg', \
                     user_agent='aleksanm', \
                     username='aleksanm', \
                     password='red26!red')
    
    # Get top daily headlines for last month
    # Convert dates to NY time    
    subreddit = reddit.subreddit("worldnews")
    headlines = []
    headline_dates = []
    for submission in subreddit.top("month", limit=500):
        headlines.append(submission.title)
        headline_dates.append(datetime.fromtimestamp(int(submission.created_utc)).astimezone(ny).date())
    
    # Store in dataframe
    month_news = pd.DataFrame(columns={"Date", "Headline"})
    month_news["Date"] = headline_dates
    month_news["Headline"] = headlines
    month_news.sort_values("Date", inplace=True)
    
    # Merge daily headlines into single string for each date
    # Limit to 25 headlines per date
    headlines = [month_news.iloc[0]["Headline"]]
    count = 0
    for i in range(1, len(month_news)):
        if month_news.iloc[i]["Date"] == month_news.iloc[i-1]["Date"]:
            count += 1
            if count <= 25:
                headlines[-1] = headlines[-1] + " " + month_news.iloc[i]["Headline"]
        else:
            headlines.append(month_news.iloc[i]["Headline"])
            count = 0
            
    # Get unique headline dates
    unique_dates = []
    for date in sorted(headline_dates):
        if date not in unique_dates:
            unique_dates.append(date)
            
    return headlines, unique_dates, month_news

headlines_test, unique_dates, news_test = get_reddit_news_month()
news_test

Unnamed: 0,Headline,Date
494,Macron says Europe must assert independence fr...,2021-09-28
409,Thousands of women took to the streets across ...,2021-09-28
135,Canada grants asylum to four people who hid Ed...,2021-09-28
440,British army to start driving tankers as queue...,2021-09-29
282,Insulting Chinese national flag on the interne...,2021-09-29
...,...,...
244,World is failing to make changes needed to avo...,2021-10-28
372,Germany: Inflation reaches 28-year high amid s...,2021-10-28
36,‘Starve to death’: Kim Jong Un orders North Ko...,2021-10-28
81,2% of Elon Musk's wealth could solve world hun...,2021-10-28


In [104]:
#ticker = "^GSPC"

ticker = "^DJI"
# Create time filter
today = datetime.today().strftime("%d/%m/%Y")
today = datetime.strptime(today + " +0000", "%d/%m/%Y %z")
thirty_days_ago = today-timedelta(days=30)
to = int(today.timestamp())
fro = int(thirty_days_ago.timestamp())
# Put stock price data in dataframe
url = "https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={fro}&period2={to}&interval=1d&events=history".format(ticker=ticker, fro=fro, to=to)
dj = pd.read_csv(url)
# Convert date to timestamp and make index
dj.index = dj["Date"].apply(lambda x: pd.Timestamp(x))
dj.drop("Date", axis=1, inplace=True)

dj["Returns"] = dj["Adj Close"]/dj["Adj Close"].shift(1) - 1
dj.dropna(inplace=True)

In [105]:
# Get X_test and y_test
# Positive/zero return: y = 1, negative return: y = 0
X_test = []        
y_test = []
for j in range(len(unique_dates)):
    for i in range(1, len(dj)):
        if dj.index[i-1] == unique_dates[j]:
            ret = dj.iloc[i]["Returns"]
            if ret >= 0:
                y_test.append(1)
            else:
                y_test.append(0)
            X_test.append(headlines_test[j])
            
X_test = vectorizer.transform(X_test)

In [110]:
# Fit logistic regression model
model = LogisticRegression(solver="saga", penalty="l1", max_iter = 500)
model = model.fit(X_train, y_train)
# Get test predictions
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print('Logic Regression accuracy: ', acc)
confusion_matrix(y_test, y_pred)

Logic Regression accuracy:  0.45


array([[ 8,  0],
       [11,  1]])