In [None]:
import pandas as pd
import numpy as np
import datetime
import re
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pytz
from nltk.probability import FreqDist
from custom_scripts import *
%matplotlib inline

# Trading on Sentiment

### Data Cleaning Notebook

In this notebook we will address the data cleaning steps needed in order to have a dataset suitable for modeling and analysis. We will import all of the yearly articles, convert the datetimes to Eastern Standard Time, tag the sentiment of each article, and combine each article with its historical stock price information for the day it was published. Finally we will tokenize, remove stop words, then aggregate all of the values so that we have one row of information per day. 

##### Import yearly data.

In [None]:
# Load one CSV of articles per year (newest first) and stack them into one frame.
article_years = (2020, 2019, 2018, 2017, 2016, 2015)
csv_template = '/Volumes/pimyllifeupshare/training_data/Goldman Sachs{}.csv'
yearly_frames = [pd.read_csv(csv_template.format(year), index_col=0) for year in article_years]
# The per-year frames stay available under their individual names.
df1, df2, df3, df4, df5, df6 = yearly_frames
df = pd.concat(yearly_frames)
# Discard rows that have no article body.
df = df.dropna(subset=['fulltext'])
df.shape

##### Clean newlines and special characters

In [None]:
%%time
# Run the shared text cleaner over the article bodies and the author field.
for raw_col, cleaned_col in (('fulltext', 'cleaned_text'), ('author', 'cleaned_authors')):
    df[cleaned_col] = df[raw_col].apply(clean_text)

##### Changing the UTC time to EST.

In [None]:
# Parse the publication timestamps as timezone-aware UTC values and index by them.
# The full timestamp is kept on purpose: truncating to midnight at this point,
# before the Eastern-time conversion, would move every article onto the
# previous calendar day (00:00 UTC is the evening before in US/Eastern).
# utc=True also guarantees a tz-aware index, which tz_convert requires.
df["date"] = pd.to_datetime(df["date"], utc=True)
df = df.set_index('date')

In [None]:
%%time
# Shift the index to US Eastern wall-clock time, then discard the tz info.
eastern = pytz.timezone('US/Eastern')
eastern_index = df.index.tz_convert(eastern)
naive_index = eastern_index.tz_localize(None)
# Keep only the calendar date, formatted as year-month-day strings.
df.index = naive_index.strftime('%Y-%m-%d')

### Using [tldextract](https://pypi.org/project/tldextract/) to extract company names from URLs

In [None]:
%%time
# Derive the outlet name from each article's `source` value.
df['news_outlet'] = df['source'].apply(get_outlet)
article_count = df.shape[0]
outlet_count = df.news_outlet.nunique()
print('The dataset contains {} different articles from {} news outlets \n'.format(article_count, outlet_count))

# Getting historical Stock Prices

In [None]:
# Collect the date ranges for every year covered by the articles.
years_we_need = list(range(2015, 2021))

full_date_list = []
for yr in years_we_need:
    full_date_list.extend(get_month_day_range(yr))

The `get_past_prices` custom function uses a list of dates and a ticker symbol to call the twelvedata.com API for all of the dates in the provided list. In our case we want historical prices for 2015 through 2020, because that is the period our articles cover.

In [None]:
# Fetch historical prices for ticker 'GS' over every date in full_date_list.
historical_prices = get_past_prices(full_date_list, 'GS')

The code below walks through the historical prices and calculates the change in the stock's opening price from one trading day to the next. Each day is tagged with a 0 if the price decreased, and with a 1 if it increased or did not change. This is an initial tagging step; the threshold for targets can be adjusted later using the `day_change` column.

In [None]:
# Date strings (YYYY-MM-DD) for every price row, in chronological order.
# Sorted explicitly so that consecutive entries are consecutive trading days,
# regardless of the order in which the price rows were returned.
prices_index = historical_prices.sort_index().index.strftime('%Y-%m-%d').to_list()

In [None]:
# Order the price rows chronologically by their index.
historical_prices = historical_prices.sort_index()

In [None]:
# Day-over-day change in the opening price, computed for every trading day
# that has a following trading day (the final day has no "tomorrow" and is
# left out).
#   day_change : next open minus this open
#   increase   : 0 when the next open is lower, otherwise 1
#   date       : the earlier of the two days, as a YYYY-MM-DD string
# Working from a sorted copy keeps the pairing chronological even if the rows
# arrived in another order, and building the frame in one step avoids
# DataFrame.append, which pandas 2.0 removed.
prices_sorted = historical_prices.sort_index()
# Keep one row per date so each comparison spans two distinct days.
prices_sorted = prices_sorted[~prices_sorted.index.duplicated(keep='first')]
open_prices = prices_sorted['open']
day_change = (open_prices.shift(-1) - open_prices).iloc[:-1]
df_res = pd.DataFrame({
    'day_change': day_change.to_numpy(),
    # "not lower" rather than "higher": an unchanged open counts as 1.
    'increase': (~(day_change < 0)).astype(int).to_numpy(),
    'date': day_change.index.strftime('%Y-%m-%d'),
}, columns=['day_change', 'increase', 'date'])

In [None]:
# Peek at the last five computed rows as a sanity check.
df_res.tail(5)

In [None]:
# Turn the date strings into timestamps and use them as the row index.
df_res = df_res.assign(date=pd.to_datetime(df_res["date"])).set_index('date')

In [None]:
# Chronological order; `targets` is the same frame under a second name.
df_res = df_res.sort_index()
targets = df_res

In [None]:
# Outer-join the computed daily changes with the raw price rows on their date index.
targs = targets.merge(historical_prices, how='outer', left_index=True, right_index=True)

In [None]:
# Show the last four rows of the merged targets/prices table.
targs.tail(4)

In [None]:
# Save the merged targets/prices table as a CSV file in the working directory.
targs.to_csv('gs_targs.csv')

In [None]:
# Merge targets and main data on the date.
# The article index was formatted to 'YYYY-MM-DD' strings earlier, while `targs`
# is indexed by timestamps; parse it first so both sides of the join use the
# same key type instead of depending on implicit string-to-datetime casting.
df.index = pd.to_datetime(df.index)
df = pd.merge(df, targs, how='outer', left_index=True, right_index=True)

To account for weekends and holidays, when the market is closed, the previous non-NA values are forward-filled.

In [None]:
# Articles published on weekends/holidays have no price row of their own, so
# carry the most recent trading day's values forward. Only the columns that
# came from `targs` are filled: forward-filling the whole frame would also copy
# the previous article's text and author into rows that have none.
price_cols = [col for col in targs.columns if col in df.columns]
df[price_cols] = df[price_cols].ffill()

# Rows that exist only because of the outer merge (a price but no article) have
# served their purpose as fill sources and hold no text to analyse.
df.dropna(subset=['fulltext'], inplace=True)

# Drop the few late 2014 values where we have no price data.
df.dropna(subset=['increase', 'open', 'high', 'low', 'close'], inplace=True)

# Predict Sentiment for each Article with VADER

To tag the sentiment of each article, we will use the [VADER](https://github.com/cjhutto/vaderSentiment) sentiment analyzer. The `sentiment_analyzer_scores` custom function takes a string as input and outputs the result of the VADER sentiment prediction. VADER is primarily used for social media text; however, it is effective with news articles as well.

In [None]:
%%time

# Score every article body with VADER; expect this to run for a few minutes.
sentiment_labels = df['fulltext'].apply(sentiment_analyzer_scores)
df['sentiment'] = sentiment_labels

After predicting sentiment of the article, we can create dummies of the values.

In [None]:
# One indicator column per sentiment value, named with a 'sent_' prefix,
# appended to the right of the article frame.
sentiment_dummies = pd.get_dummies(data=df['sentiment'], prefix='sent')
df = pd.concat(objs=[df, sentiment_dummies], axis='columns')

In [None]:
# Show the first two rows to confirm the new sentiment columns are present.
df.head(2)

# Tokenize

In [None]:
# Tokenize each cleaned article with the custom `toke` helper.
df['tokens'] = df['cleaned_text'].apply(toke)

# Lemmatize/Stop Word Removal

Three custom functions: `remove_stopwords`, `lemmatize_text`, `unlist` will be used to process the word tokens we created.

In [None]:
# Token clean-up steps, applied to the tokens column one after another.
pre_process = [remove_stopwords, lemmatize_text, unlist]

for step in pre_process:
    df['tokens'] = df['tokens'].apply(step)
    # Report progress after each pass.
    print('Completed: {}'.format(str(step)))

In [None]:
# Keep only the first occurrence of any article whose processed tokens repeat.
df = df.drop_duplicates(subset=['tokens'])

# Filtering out irrelevant articles.

Put all of the articles into a bag of words.

In [None]:
# Plain Python list holding the raw text of every article.
bOw = df['fulltext'].tolist()

Generate a wordcount distribution of the full corpus.

In [None]:
# Build the corpus-wide word distribution from the list of article texts
# using the custom `corpus_dist` helper.
corp_dist = corpus_dist(bOw)

Make a new column called `relevancy_score`, which is the proportion of irrelevant text in the article.

In [None]:
# Score each article against the corpus distribution.
df['relevancy_score'] = df['fulltext'].apply(
    lambda article: filter_articles(article, corp_list=corp_dist)
)

Filter out the irrelevant articles, keeping only rows whose `relevancy_score` is below 0.001.

In [None]:
# Keep the rows whose relevancy_score is strictly below the 0.001 cut-off.
is_relevant = df['relevancy_score'].lt(.001)
df = df.loc[is_relevant]

# Aggregate the daily news articles

We will perform modeling on the aggregated article text per day. Our data is in a format that has each row as a new article, we want to aggregate all of the articles on a given day into a single row.

In [None]:
# Work on a copy so the per-article frame stays untouched.
agged = df.copy()
# Move the date index into a datetime `date` column and fall back to a plain
# positional index. Reading the index directly avoids relying on reset_index()
# naming the new column 'index', which only happens when the index has no name.
agged['date'] = pd.to_datetime(agged.index)
agged.reset_index(drop=True, inplace=True)

In [None]:
# Give every article row a 1 so that summing this column during the per-day
# aggregation yields the total number of articles for that day.
agged['total_articles'] = 1

In [None]:
# Per-day totals of the sentiment flags and the article counter.
# Columns are selected with a list: indexing a groupby with a bare tuple of
# names was deprecated and raises in pandas 2.0+.
sentiment = agged.groupby('date')[['sent_negative', 'sent_positive', 'total_articles']].sum()
# Concatenate each day's article tokens, separated by a space so the last word
# of one article does not fuse with the first word of the next.
text = agged.groupby('date')['tokens'].agg(' '.join)

In [None]:
# Combine the per-day text with the per-day sentiment totals (dates present in both).
daily_features = pd.merge(left=text, right=sentiment, left_index=True, right_index=True, how='inner')
# Attach the price/target columns, again keeping only dates found on both sides.
agged = pd.merge(left=daily_features, right=targs, left_index=True, right_index=True, how='inner')

In [None]:
# Show the first two aggregated rows (one row per date).
agged.head(2)

# Save the cleaned dataframe 

This dataframe will be used in the modeling process.

In [None]:
# agged.to_pickle('main_data/gs/maindf.pkl')

In [None]:
# df.to_pickle('main_data/gs/seperated_data.pkl')