In [27]:
import pandas as pd
import numpy as np
import datetime
import re
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pytz
from nltk.probability import FreqDist
from custom_scripts import *
%matplotlib inline

# Trading on Sentiment

### Data Cleaning Notebook

In this notebook we will address the data cleaning steps needed in order to have a dataset suitable for modeling and analysis. We will import all of the yearly articles, convert the datetimes to Eastern Standard Time, tag the sentiment of each article, and combine each article with its historical stock price information for the day it was published. Finally we will tokenize, remove stop words, then aggregate all of the values so that we have one row of information per day. 

##### Import yearly data.

In [28]:
df1 = pd.read_csv('yearly_articles/apple2020.csv', index_col=0)
df2 = pd.read_csv('yearly_articles/apple2019.csv', index_col=0)
df3 = pd.read_csv('yearly_articles/apple2018.csv', index_col=0)
df4 = pd.read_csv('yearly_articles/apple2017.csv', index_col=0)
df5 = pd.read_csv('yearly_articles/apple2016.csv', index_col=0)
df6 = pd.read_csv('yearly_articles/apple2015.csv', index_col=0)
df = pd.concat([df1,df2, df3, df4, df5, df6])
df.dropna(subset=['fulltext'], inplace=True)
df.shape

(39512, 7)

##### Clean newlines and special characters

In [29]:
%%time
df['cleaned_text'] = df['fulltext'].apply(clean_text)
df['cleaned_authors'] = df['author'].apply(clean_text)

CPU times: user 10.8 s, sys: 217 ms, total: 11 s
Wall time: 11.3 s


##### Changing the UTC time to EST.

In [30]:
#Change to Datetime
df["date"]= pd.to_datetime(df["date"])
df = df.set_index('date')
df.index = df.index.normalize()

In [31]:
%%time
#convert DateTime index to eastern time. 
eastern = pytz.timezone('US/Eastern')
df.index = df.index.tz_convert(eastern).tz_localize(None)
#put into year/month/day format
df.index = df.index.strftime('%Y-%m-%d')

CPU times: user 283 ms, sys: 5.28 ms, total: 288 ms
Wall time: 290 ms


### Using [tldextract](https://pypi.org/project/tldextract/) to extract company names from url's

In [32]:
%%time
df['news_outlet'] = df['source'].apply(get_outlet)
print('The dataset contains {} different articles from {} news outlets \n'.format(df.shape[0],df.news_outlet.nunique()))

The dataset contains 39512 different articles from 448 news outlets 

CPU times: user 463 ms, sys: 11 ms, total: 474 ms
Wall time: 487 ms


# Getting historical Stock Prices

In [33]:
years_we_need = [2015, 2016, 2017, 2018, 2019, 2020]

full_date_list = []

for year in years_we_need:
    res = get_month_day_range(year)
    full_date_list += res

The `get_past_prices` custom function uses a list of dates and a ticker symbol to call the twelvedata.com API for all of the dates in the provided list. In our case we want historical prices over the past 5 years, because we have 5 years worth of articles.

In [34]:
historical_prices = get_past_prices(full_date_list, 'AAPL')

1) 2015-01-01 to 2015-01-31
2) 2015-02-01 to 2015-02-28
3) 2015-03-01 to 2015-03-31
4) 2015-04-01 to 2015-04-30
5) 2015-05-01 to 2015-05-31
6) 2015-06-01 to 2015-06-30
7) 2015-07-01 to 2015-07-31
8) 2015-08-01 to 2015-08-31
9) 2015-09-01 to 2015-09-30
10) 2015-10-01 to 2015-10-31
11) 2015-11-01 to 2015-11-30
12) 2015-12-01 to 2015-12-31
13) 2016-01-01 to 2016-01-31
14) 2016-02-01 to 2016-02-29
15) 2016-03-01 to 2016-03-31
16) 2016-04-01 to 2016-04-30
17) 2016-05-01 to 2016-05-31
18) 2016-06-01 to 2016-06-30
19) 2016-07-01 to 2016-07-31
20) 2016-08-01 to 2016-08-31
21) 2016-09-01 to 2016-09-30
22) 2016-10-01 to 2016-10-31
23) 2016-11-01 to 2016-11-30
24) 2016-12-01 to 2016-12-31
25) 2017-01-01 to 2017-01-31
26) 2017-02-01 to 2017-02-28
27) 2017-03-01 to 2017-03-31
28) 2017-04-01 to 2017-04-30
29) 2017-05-01 to 2017-05-31
30) 2017-06-01 to 2017-06-30
31) 2017-07-01 to 2017-07-31
32) 2017-08-01 to 2017-08-31
33) 2017-09-01 to 2017-09-30
34) 2017-10-01 to 2017-10-31
35) 2017-11-01 to 2017-

The loop below iterates through the historical prices and calculates the change in a stock price from one open to another. Adding a 0 if the stock decreased or there was not change, and adding a 1 if the stock increased. This is an initial tagging step, the threshold for targets can be adjusted later using the 'day_change' column.

In [58]:
prices_index = historical_prices.index.strftime('%Y-%m-%d').to_list()

In [59]:
historical_prices.sort_index(inplace=True)

In [60]:
df_res = pd.DataFrame(columns = ['day_change', 'increase', 'date'])
for i,stock_price in enumerate(prices_index):
    try:
        today = historical_prices.loc[prices_index[i]].open
        tomorrow = historical_prices.loc[prices_index[i+1]].open
        direction = tomorrow - today
        if direction < 0:
            increase = 0
        else:
            increase = 1
        df_res = df_res.append({'day_change': direction, 'increase':increase, 'date':stock_price}, ignore_index=True)
    except Exception as e:
        continue

In [65]:
df_res.tail(5)

Unnamed: 0,day_change,increase,date
1450,1.55001,1,2020-12-16
1451,-0.08001,0,2020-12-17
1452,-3.86,0,2020-12-18
1453,6.59,1,2020-12-21
1454,0.4469,1,2020-12-22


In [39]:
df_res["date"]= pd.to_datetime(df_res["date"])
df_res = df_res.set_index('date')

In [40]:
df_res.sort_index(inplace=True)
targets = df_res

In [41]:
#merge the historical prices with the daily change we calculated and the targets. 
targs=pd.merge(targets,historical_prices, how='outer', left_index=True, right_index=True)

In [66]:
targs.tail(4)

Unnamed: 0,day_change,increase,open,high,low,close,volume
2020-12-18,0.08001,1,128.88,129.10001,126.12,126.65,108795507
2020-12-21,3.86,1,125.02,128.31,123.449,128.23,121251553
2020-12-22,-6.59,0,131.61,134.41,129.64999,131.88,168904800
2020-12-23,-0.4469,0,132.0569,132.42,130.78,132.04269,48470341


In [43]:
#Save the targets
targs.to_csv('targs_revision.csv')

In [44]:
#Merge targets and main data on the date
df=pd.merge(df,targs, how='outer', left_index=True, right_index=True)

To account for weekends and holidays when the market is closed. Forward filling of the previous non-NA value is used. 

In [45]:
df.fillna(method='ffill', inplace = True)

#Drop the few late 2014 values where we have not price data. 
df.dropna(subset=['increase', 'open', 'high', 'low', 'close'], inplace = True)

# Predict Sentiment for each Article with VADER

To tag the sentiment of each article, we will use the [VADER](https://github.com/cjhutto/vaderSentiment) sentiment analyzer. The `sentiment_analyzer_scores` custom function inputs a string and output the result of the VADER sentiment prediction. Vader is primarily used for social media text; however, is effective with news articles as well. 

In [46]:
%%time

#tag the sentiment for each article using VADER. This will take a few minutes.
df['sentiment'] = df['fulltext'].apply(sentiment_analyzer_scores)

CPU times: user 16min 13s, sys: 18 s, total: 16min 30s
Wall time: 20min 47s


After predicting sentiment of the article, we can create dummies of the values.

In [48]:
sentiment_dummies = pd.get_dummies(df['sentiment'], prefix='sent')
df = pd.concat([df, sentiment_dummies], axis=1)

In [49]:
df.head(2)

Unnamed: 0,update,source,author,fulltext,summary,title,cleaned_text,cleaned_authors,news_outlet,day_change,increase,open,high,low,close,volume,sentiment,sent_negative,sent_neutral,sent_positive
2015-01-02,2015-01-03 00:00:00,http://mg.co.za/article/2015-01-03-storage-war...,['Staff Reporter'],Apple on Friday faced a lawsuit accusing it of...,,Storage war: Lawsuit accuses Apple of deceivin...,apple on friday faced a lawsuit accusing it of...,staff reporter,mg,0.0,1.0,0.0,0.0,0.0,0.0,53204600.0,negative,1,0,0
2015-01-02,2015-01-03 00:00:00,http://www.independent.ie/business/technology/...,[],Apple is being sued for lack of storage space ...,,Apple sued over lack of storage space on devices,apple is being sued for lack of storage space ...,,independent,0.0,1.0,0.0,0.0,0.0,0.0,53204600.0,negative,1,0,0


# Tokenize

In [50]:
df['tokens'] = df['cleaned_text'].apply(toke)

# Lemmatize/Stop Word Removal

Three custom functions: `remove_stopwords`, `lemmatize_text`, `unlist` will be used to process the word tokens we created.

In [51]:
pre_process = [remove_stopwords, lemmatize_text, unlist]

for action in pre_process:
    df.tokens = df.tokens.apply(action)
    print('Completed: {}'.format(str(action)))

Completed: <function remove_stopwords at 0x7ff1e82f8950>
Completed: <function lemmatize_text at 0x7ff1e82f8840>
Completed: <function unlist at 0x7ff1e82f8730>


In [52]:
#Make sure there are no duplicate articles.
df.drop_duplicates(subset=['tokens'], inplace=True)

# Aggregate the daily news articles

We will perform modeling on the aggregated article text per day. Our data is in a format that has each row as a new article, we want to aggregate all of the articles on a given day into a single row.

In [53]:
agged = df.copy()
agged.reset_index(inplace=True)
agged['date'] = pd.to_datetime(agged['index'])
agged.drop('index', axis=1, inplace=True)

In [54]:
#Add a column of 1's for when we aggregate all info into one column, we can add the 1's later to get the total articles per day. 
agged['total_articles'] = 1

In [55]:
sentiment = agged.groupby('date')['sent_negative', 'sent_positive', 'total_articles'].agg(np.sum)
text = agged.groupby('date')['tokens'].agg(''.join)

In [56]:
agged = pd.merge(text, sentiment, how='inner', left_index=True, right_index=True)
#Merge targets
agged = pd.merge(agged, targs, how='inner', left_index=True, right_index=True)

In [57]:
agged.head(2)

Unnamed: 0,tokens,sent_negative,sent_positive,total_articles,day_change,increase,open,high,low,close,volume
2015-01-02,"apple, friday, faced, lawsuit, accusing, promi...",5,2,7,0.0,1,0.0,0.0,0.0,0.0,53204600
2015-01-05,"heard, much, hyped, apple, watch, would, arriv...",2,11,13,0.0,1,0.0,0.0,0.0,0.0,64285500


# Save the cleaned dataframe 

This dataframe will be used in the modeling process.

In [None]:
agged.to_pickle('main_data/maindf.pkl')

In [None]:
df.to_pickle('main_data/seperated_data.pkl')