In [1]:
import pandas as pd
import numpy as np
import datetime
import re
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pytz
from nltk.probability import FreqDist
from custom_scripts import *
%matplotlib inline

In [2]:
df1 = pd.read_csv('yearly_articles/apple2020.csv', index_col=0)
df2 = pd.read_csv('yearly_articles/apple2019.csv', index_col=0)
df3 = pd.read_csv('yearly_articles/apple2018.csv', index_col=0)
df4 = pd.read_csv('yearly_articles/apple2017.csv', index_col=0)
df5 = pd.read_csv('yearly_articles/apple2016.csv', index_col=0)
df6 = pd.read_csv('yearly_articles/apple2015.csv', index_col=0)
df = pd.concat([df1,df2, df3, df4, df5, df6])
df.dropna(subset=['fulltext'], inplace=True)
df.shape

(39512, 7)

# Clean newlines and special characters

In [3]:
%%time
df['cleaned_text'] = df['fulltext'].apply(clean_text)
df['cleaned_authors'] = df['author'].apply(clean_text)

CPU times: user 8.64 s, sys: 112 ms, total: 8.75 s
Wall time: 8.87 s


#### Changing the UTC time to EST.

In [4]:
#Change to Datetime
df["date"]= pd.to_datetime(df["date"])
df = df.set_index('date')
df.index = df.index.normalize()

In [5]:
%%time
#convert DateTime index to eastern time. 
eastern = pytz.timezone('US/Eastern')
df.index = df.index.tz_convert(eastern).tz_localize(None)
#put into year/month/day format
df.index = df.index.strftime('%Y-%m-%d')

CPU times: user 237 ms, sys: 4.33 ms, total: 242 ms
Wall time: 243 ms


# Using [tldextract](https://pypi.org/project/tldextract/) to extract company names from url's

In [6]:
%%time
df['news_outlet'] = df['source'].apply(get_outlet)
print('The dataset contains {} different articles from {} news outlets \n'.format(df.shape[0],df.news_outlet.nunique()))

The dataset contains 39512 different articles from 448 news outlets 

CPU times: user 372 ms, sys: 6.63 ms, total: 379 ms
Wall time: 382 ms


# Getting historical Stock Prices

In [7]:
monthly_ranges = get_month_day_range(2010)
monthly_ranges

[('2010-01-01', '2010-01-31'),
 ('2010-02-01', '2010-02-28'),
 ('2010-03-01', '2010-03-31'),
 ('2010-04-01', '2010-04-30'),
 ('2010-05-01', '2010-05-31'),
 ('2010-06-01', '2010-06-30'),
 ('2010-07-01', '2010-07-31'),
 ('2010-08-01', '2010-08-31'),
 ('2010-09-01', '2010-09-30'),
 ('2010-10-01', '2010-10-31'),
 ('2010-11-01', '2010-11-30'),
 ('2010-12-01', '2010-12-31')]

In [51]:
stock_prices_aapl = get_past_prices(monthly_ranges, 'AAPL')

1) 2010-01-01 to 2010-01-31
2) 2010-02-01 to 2010-02-28
3) 2010-03-01 to 2010-03-31
4) 2010-04-01 to 2010-04-30
5) 2010-05-01 to 2010-05-31
6) 2010-06-01 to 2010-06-30
7) 2010-07-01 to 2010-07-31
8) 2010-08-01 to 2010-08-31
9) 2010-09-01 to 2010-09-30
10) 2010-10-01 to 2010-10-31
11) 2010-11-01 to 2010-11-30
12) 2010-12-01 to 2010-12-31
Final shape: (252, 5)


In [12]:
# stock_prices_aapl['day_change'] = np.nan
# stock_prices_aapl['increase'] = np.nan

The loop below iterates through the historical prices and calculates the change in a stock price from one open to another. Adding a 0 if the stock decreased or there was not change, and adding a 1 if the stock increased. This is an initial tagging step, the threshold for targets can be adjusted later using the 'day_change' column.

In [52]:
prices_index = stock_prices_aapl.index.strftime('%Y-%m-%d').to_list()

In [53]:
df_res = pd.DataFrame(columns = ['day_change', 'increase', 'date'])
for i,stock_price in enumerate(prices_index):
    try:
        start = stock_prices_aapl.loc[prices_index[i]].open[0]
        stop = stock_prices_aapl.loc[prices_index[i+1]].open[0]
        direction = start - stop
        if direction < 0:
            increase = 0
        else:
            increase = 1
        df_res = df_res.append({'day_change': direction, 'increase':increase, 'date':stock_price}, ignore_index=True)
    except:
        continue

In [54]:
df_res["date"]= pd.to_datetime(df_res["date"])
df_res = df_res.set_index('date')

In [55]:
df_res.head(3)

Unnamed: 0_level_0,day_change,increase
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-29,-0.1375,0
2010-01-28,-0.06857,0
2010-01-27,0.03214,1


In [56]:
#shift all targets by one day because we want to predict one day in the future.
targets2020 = df_res.shift(periods=1, fill_value=0)
targets2020.head()

Unnamed: 0_level_0,day_change,increase
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-29,0.0,0
2010-01-28,-0.1375,0
2010-01-27,-0.06857,0
2010-01-26,0.03214,1
2010-01-25,0.12286,1


In [34]:
#merge the historical prices with the daily change we calculated and the targets. 
targs=pd.merge(targets2020,stock_prices_aapl, how='outer', left_index=True, right_index=True)
targs.drop(['day_change_y', 'increase_y'], axis=1, inplace=True)

In [36]:
targs.head(2)

Unnamed: 0,day_change_x,increase_x,open,high,low,close,volume,day_change_y,increase_y
2010-01-04,0.04179,1,7.6225,7.66071,7.585,7.64321,123432400,,
2010-01-05,-0.00786,0,7.66429,7.69964,7.61607,7.65643,150476200,,


In [None]:
#save the targets to the proper folder
targs.to_csv('yearly_targets/targets.csv')

In [None]:
#Merge targets and main data on the date
df=pd.merge(df,targs, how='outer', left_index=True, right_index=True)

# Predict Sentiment for each Article with VADER

In [None]:
%%time

#tag the sentiment for each article using VADER. This will take a few minutes.
df['sentiment'] = df['fulltext'].apply(sentiment_analyzer_scores)

In [None]:
#visualize the sentiment distributions.
sns.countplot(df.sentiment);

# Tokenize

In [None]:
df['tokens'] = df['cleaned_text'].apply(toke)

# Lemmatize/Stop Word Removal

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words=list(set(stopwords.words("english")))

In [None]:
eda_stopwords = [
    'x', 'u', "'", 'e', 'a', 'i', 'n', 'u', 'd', 'c', 'p', 's', 'i',
    'o', 'r', 't', 'journalism', 'support', 'u', 'editor', 'fair', 'informed',
    'cookie', 'miamiaccording', 'article', 'expired', 'no', 'longer', 'want',
    'search', 'google', 'every', 'term', 'newswire', 'subscribe', 'button', 'close',
    'accept', 'goal', 'achieve', 'u', 'subscribed', 'many', 'continue', 'offer',
    'hard', 'provide', 'dear', 'reader', 'standard', 'always', 'strived', 'miamiinterested',
    'adopting', 'pet', 'gazing', 'lovable', 'pup', 'adoption', 'dog', 'animal', 'shelter',
    'ziprecruiter', 'miami', 'policy', 'clicking', 'explicit', 'consent',
    'please', 'see', 'even', 'better', 'relevant', 'goal', 'le', 'u,', 'philip', 'schiller',
    'believe', 'getty', 'josh', 'edelson', 'topical', 'issue', 'relevance',
    'seen', 'man', 'forward', 'dunkin', 'late', 'wife', 'bagelsee', 'rental', 'site', 'zumper',
    'quarantinefind', 'irvine', 'using', 'yelp', 'find', 'devon', 'horse', 'show',
    'urge', 'turn', 'ad', 'blocker', 'telegraph', 'barbecue', 'stop', 'crunched',
    'porch', 'ebay', 'amazon', 'curry', 'weeknightsset', 'easy', 'dinner', 'matter', 'partner',
    'find', 'detailed', 'description', 'apartment', 'got', 'news', 'mission', 'day', 'impersonal',
    'get', 'tip', 'top', 'mirror', 'newsletter', 'sign', 'thank', 'subscribing',
    'newsletter', 'invalid', 'full', 'swing', 'keen', 'get', 'hand', 'high', 'street',
    'john', 'lewis', 'curry', 'ton', 'currently', 'available', 'actual', 'check', 'back', 'also', 'honor',
    'writer', 'try', 'put', 'apartment', 'rent', 'via', 'go', 'rounded', 'dog', 'shelter', 'pup',
    'dozen', 'donut', 'south', 'targeted', 'practise', 'floridado', 'love', 'florida', 'doggy',
    'cancer', 'hide', 'caption', 'cooky', 'browser', 'sauce', 'pandemicthe',
    'something', 'penguina', 'eagle', 'email', 'notification', 'irvinein', 'hoodline',
    'recipe', 'perfect', 'meal', 'googlethe', 'v', 'doggy', 'delightful',
    'place', 'live', 'retire', 'takeout', 'youtubethe', 'barnes', 'museum',
    'cooking', 'nonstick', 'cookware', 'pretzelslearn', 'homemade', 'soft',
    'collectionsmany', 'franklin', 'u', 'gotten', 'tour', 'familiesthis',
    'best', 'spot', 'noticed', 'adblocking', 'help', 'fund', 'award', 'winning',
    'image', 'curry', 'ton', 'miamimiami', 'new', 'jersey', 'photographer',
    'authoritative', 'apartment', 'cheapest', 'downtown', 'bedroom', 'adventure',
    'aquarium', 'artwork', 'pretzel', 'click', 'play', 'tap', 'play',
    'aught', 'newsletter', 'pear', 'david', 'nield', 'gizmodo', 'pic', 'twitter',
    'com', 'thimbleweed', 'monument', 'pas', 'afp', 'u', 'prepear' 
]


stop_words.extend(eda_stopwords)

In [None]:
def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

In [None]:
df.tokens = df.tokens.apply(unlist)
df.tokens = df.tokens.apply(remove_stopwords)
df.tokens = df.tokens.apply(lemmatize_text)

In [None]:
#Drop duplicate tokens.
df.tokens.drop_duplicates(inplace=True)

# Look for additional stop words

In [None]:
cloud = df.drop_duplicates(subset=['tokens'])
text = cloud.tokens

In [None]:
text = " ".join(word for word in text)

In [None]:
from wordcloud import WordCloud

In [None]:
fig, ax = plt.subplots(figsize=(12,17))

wordcloud = WordCloud(max_words=200,collocations=False, width=1000, height=700, background_color="black").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

plt.show()
#wordcloud.to_file('all_tweets_wordcloud.png')

# Save the cleaned dataframe 

In [None]:
df.to_csv('main_data/cleaned_data.csv')