In [None]:
import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
import re
import matplotlib.pyplot as plt
import time
import seaborn as sns
from nltk.probability import FreqDist
%matplotlib inline

In [None]:
# Yearly article dumps, newest first; each file's first column is its row index.
article_paths = [
    'yearly_articles/apple2020.csv',
    'yearly_articles/apple2019.csv',
    'yearly_articles/apple2018.csv',
    'yearly_articles/apple2017.csv',
    'yearly_articles/apple2016.csv',
    'yearly_articles/apple2015.csv',
]
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(path, index_col=0) for path in article_paths
)
# Stack all years and drop the articles that have no body text.
df = pd.concat([df1, df2, df3, df4, df5, df6]).dropna(subset=['fulltext'])
df.shape

# Clean newlines and special characters

In [None]:
def clean_text(text):
    """Normalize raw text to lowercase alphanumeric words separated by single spaces.

    Every character outside ``[0-9A-Za-z]`` (newlines, punctuation, non-ASCII
    letters, ...) is treated as a separator, and runs of separators collapse
    to one space.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. the float NaN pandas uses for a
        missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned, lower-cased text; ``""`` for missing input.
    """
    # Only 'fulltext' is null-filtered when the data is loaded; other columns
    # passed through here (such as 'author') can still hold NaN, which has no
    # string methods.
    if not isinstance(text, str):
        return ""
    # The previous version also called text.replace("\n", " ") but discarded the
    # result; the substitution below already turns newlines into spaces.
    words = re.sub(r"[^0-9A-Za-z]", " ", text).split()
    return " ".join(words).lower()

In [None]:
%%time
df['cleaned_text'] = df['fulltext'].apply(clean_text)
df['cleaned_authors'] = df['author'].apply(clean_text)

#### Converting the timestamps from UTC to US Eastern time

In [None]:
import pytz

In [None]:
# Parse the publication timestamps as timezone-aware UTC and index by them.
# utc=True guarantees an aware DatetimeIndex (naive values are read as UTC),
# which the tz_convert call in the next cell requires.
df["date"] = pd.to_datetime(df["date"], utc=True)
df = df.set_index('date')
# Deliberately NOT normalizing to midnight here: truncating in UTC before the
# Eastern conversion moves every timestamp to the previous evening in Eastern
# time, i.e. onto the previous calendar day. The '%Y-%m-%d' formatting applied
# after the conversion already drops the time of day.

In [None]:
%%time
# Re-express the timezone-aware index in US Eastern wall-clock time, strip the
# timezone, and keep only the calendar date as a 'YYYY-MM-DD' string.
eastern = pytz.timezone('US/Eastern')
eastern_naive_index = df.index.tz_convert(eastern).tz_localize(None)
df.index = eastern_naive_index.strftime('%Y-%m-%d')

# Using [tldextract](https://pypi.org/project/tldextract/) to extract company names from URLs

In [None]:
import tldextract

In [None]:
def get_outlet(link):
    """Return the domain part of a URL, used as the news outlet label.

    Parameters
    ----------
    link : str
        URL of the article.

    Returns
    -------
    str
        The ``domain`` field of ``tldextract.extract(link)``.
    """
    return tldextract.extract(link).domain

In [None]:
%%time
df['news_outlet'] = df['source'].apply(get_outlet)
print('The dataset contains {} different articles from {} news outlets \n'.format(df.shape[0],df.news_outlet.nunique()))

# Getting historical Stock Prices

In [None]:
import os

from twelvedata import TDClient

# Read the Twelve Data API key from the environment instead of committing it to
# the notebook. The key that used to be hardcoded here is exposed in the file
# history and should be revoked and replaced.
# Set it before launching Jupyter, e.g.:  export TWELVEDATA_API_KEY=...
td = TDClient(apikey=os.environ["TWELVEDATA_API_KEY"])

In [None]:
def get_month_day_range(year):
    """Return the first and last calendar day of every month in ``year``.

    Parameters
    ----------
    year : int
        Calendar year.

    Returns
    -------
    list of tuple of (str, str)
        Twelve ``(first_day, last_day)`` pairs in January-to-December order,
        each formatted as ``'%Y-%m-%d'``.
    """
    # pd.datetime was deprecated in pandas 1.0 and removed in 2.0; the standard
    # library datetime module (already imported by this notebook) does the job.
    one_day = datetime.timedelta(days=1)
    ranges = []
    for month in range(1, 13):
        first_day = datetime.date(year, month, 1)
        # Last day of the month = the day before the first day of the next month.
        if month == 12:
            next_month_start = datetime.date(year + 1, 1, 1)
        else:
            next_month_start = datetime.date(year, month + 1, 1)
        last_day = next_month_start - one_day
        ranges.append((first_day.strftime('%Y-%m-%d'), last_day.strftime('%Y-%m-%d')))
    return ranges

months_2020 = get_month_day_range(2020)
months_2020

In [None]:
# NOTE(review): the name says 2018 but the ranges are built for 2010, and the
# targets are later written to 'yearly_targets/targets2010.csv'. Presumably this
# cell is edited by hand for each year processed -- confirm the intended year.
months_2018 = get_month_day_range(2010)
months_2018

In [None]:
times = []

In [None]:
# Gather daily AAPL bars from the Twelve Data API, one request per month range.
# Start from an empty list so that re-running this cell does not append the
# same months a second time (which would duplicate rows in the later concat).
times = []
for counter, (start, end) in enumerate(months_2018, start=1):
    # Pause before every request -- presumably to respect the API rate limit.
    time.sleep(25)
    ts = td.time_series(
        symbol="AAPL",
        interval="1day",
        start_date=start,
        end_date=end,
    ).as_pandas()
    times.append(ts)
    # Progress trace: request number and the date range just fetched.
    print(counter, start, end)
    

In [None]:
stock_prices_appl = pd.concat(times)

In [None]:
#Normalize the datetime indexes
stock_prices_appl
stock_prices_appl.index = stock_prices_appl.index.normalize()

In [None]:
# Merge prices and main df's on the date.
# df is indexed by 'YYYY-MM-DD' strings at this point while the price frame has
# a DatetimeIndex; the two key types never match, so both sides are brought to
# timezone-naive timestamps before joining. Neither original frame is modified.
articles_by_day = df.set_axis(pd.to_datetime(df.index), axis=0)
prices_by_day = stock_prices_appl.set_axis(
    pd.DatetimeIndex(stock_prices_appl.index).tz_localize(None), axis=0
)
merge = pd.merge(articles_by_day, prices_by_day, how='outer', left_index=True, right_index=True)

In [None]:
#localize the price df
stock_prices_appl.index = stock_prices_appl.index.tz_localize(None)

In [None]:
stock_prices_appl['day_change'] = np.nan
stock_prices_appl['increase'] = np.nan

In [None]:
prices_index = stock_prices_appl.index.strftime('%Y-%m-%d').to_list()

In [None]:
# Change in the opening price between consecutive rows, computed in the row
# order of stock_prices_appl: each row's open minus the open of the row that
# follows it. The last row has no following row and is left out, as before.
# Built in one step: DataFrame.append was removed in pandas 2.0, growing a frame
# row by row is quadratic, and the previous bare `except: continue` silently
# dropped rows on any error.
open_prices = stock_prices_appl['open'].astype(float).to_numpy()
day_change = open_prices[:-1] - open_prices[1:]
df_res = pd.DataFrame({
    'day_change': day_change,
    # 0 when the change is negative, 1 otherwise (so 0.0 counts as an increase).
    'increase': np.where(day_change < 0, 0, 1),
    'date': prices_index[:-1],
})

In [None]:
df_res["date"]= pd.to_datetime(df_res["date"])
df_res = df_res.set_index('date')
df_res.index = df_res.index.normalize()

In [None]:
# Shift every value one row down, so each date carries the change computed for
# the row before it.
# NOTE(review): fill_value=0 gives the first row day_change=0 and increase=0,
# i.e. a fabricated "no increase" label rather than a missing value -- confirm
# this is intended before training on it.
targets2020 = df_res.shift(periods=1, fill_value=0)
targets2020.head()

In [None]:
# NOTE(review): the frame is named targets2020 but is written to targets2010.csv;
# make sure the file name matches the year that was actually fetched.
targets2020.to_csv('yearly_targets/targets2010.csv')

# Predict Sentiment for each Article with VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
analyzer = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(article):
    """Label a piece of text as positive, negative or neutral with VADER.

    Relies on the module-level ``analyzer`` (a ``SentimentIntensityAnalyzer``),
    which must be created before this function is called.

    Parameters
    ----------
    article : str
        Text to score.

    Returns
    -------
    str
        ``'positive'`` when the compound score is >= 0.05, ``'negative'`` when
        it is <= -0.05, and ``'neutral'`` otherwise.
    """
    compound = analyzer.polarity_scores(article)['compound']

    if compound >= .05:
        return 'positive'
    if compound <= -.05:
        return 'negative'
    return 'neutral'

In [None]:
%%time
df['sentiment'] = df['fulltext'].apply(sentiment_analyzer_scores)

In [None]:
# Pass the column by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x=df.sentiment);

In [None]:
# for i,art in enumerate(df.cleaned_text[:200]):
#     score = sentiment_analyzer_scores(art)
#     print(i, score)

# Tokenize

In [None]:
from nltk.tokenize import regexp_tokenize 

def toke(text):
    """Split text into tokens made of word characters and apostrophes.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens returned by ``regexp_tokenize`` for runs of word
        characters and apostrophes.
    """
    # Raw string: "\w" in a normal literal is an invalid escape sequence, which
    # newer Python versions warn about. The pattern itself is unchanged.
    return regexp_tokenize(text, r"[\w']+")

df['tokens'] = df['cleaned_text'].apply(toke)

# Lemmatize 

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
def unlist(x):
    """Join an iterable of strings into one string separated by ``", "``."""
    separator = ", "
    return separator.join(x)

In [None]:
tokenizer = nltk.tokenize.TweetTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
stop_words=list(set(stopwords.words("english")))

In [None]:
eda_stopwords = [
    'x', 'u', "'", 'e', 'a', 'i', 'n', 'u', 'd', 'c', 'p', 's', 'i',
    'o', 'r', 't', 'journalism', 'support', 'u', 'editor', 'fair', 'informed',
    'cookie', 'miamiaccording', 'article', 'expired', 'no', 'longer', 'want',
    'search', 'google', 'every', 'term', 'newswire', 'subscribe', 'button', 'close',
    'accept', 'goal', 'achieve', 'u', 'subscribed', 'many', 'continue', 'offer',
    'hard', 'provide', 'dear', 'reader', 'standard', 'always', 'strived', 'miamiinterested',
    'adopting', 'pet', 'gazing', 'lovable', 'pup', 'adoption', 'dog', 'animal', 'shelter',
    'ziprecruiter', 'miami', 'policy', 'clicking', 'explicit', 'consent',
    'please', 'see', 'even', 'better', 'relevant', 'goal', 'le', 'u,', 'philip', 'schiller',
    'believe', 'getty', 'josh', 'edelson', 'topical', 'issue', 'relevance',
    'seen', 'man', 'forward', 'dunkin', 'late', 'wife', 'bagelsee', 'rental', 'site', 'zumper',
    'quarantinefind', 'irvine', 'using', 'yelp', 'find', 'devon', 'horse', 'show',
    'urge', 'turn', 'ad', 'blocker', 'telegraph', 'barbecue', 'stop', 'crunched',
    'porch', 'ebay', 'amazon', 'curry', 'weeknightsset', 'easy', 'dinner', 'matter', 'partner',
    'find', 'detailed', 'description', 'apartment', 'got', 'news', 'mission', 'day', 'impersonal',
    'get', 'tip', 'top', 'mirror', 'newsletter', 'sign', 'thank', 'subscribing',
    'newsletter', 'invalid', 'full', 'swing', 'keen', 'get', 'hand', 'high', 'street',
    'john', 'lewis', 'curry', 'ton', 'currently', 'available', 'actual', 'check', 'back', 'also', 'honor',
    'writer', 'try', 'put', 'apartment', 'rent', 'via', 'go', 'rounded', 'dog', 'shelter', 'pup',
    'dozen', 'donut', 'south', 'targeted', 'practise', 'floridado', 'love', 'florida', 'doggy',
    'cancer', 'hide', 'caption', 'cooky', 'browser', 'sauce', 'pandemicthe',
    'something', 'penguina', 'eagle', 'email', 'notification', 'irvinein', 'hoodline',
    'recipe', 'perfect', 'meal', 'googlethe', 'v' 
]


stop_words.extend(eda_stopwords)

In [None]:
def tokenize(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the tokens."""
    tokens = nltk.word_tokenize(text)
    return tokens
def remove_stopwords(text):
    """Drop every token that appears in the module-level ``stop_words``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens not found in ``stop_words``, in their original order.
    """
    # stop_words is a list, so `word not in stop_words` scans it for every
    # token; building one set per call makes each lookup constant time.
    blocked = set(stop_words)
    return [word for word in text if word not in blocked]
def lemmatize_text(text):
    """Lemmatize each token with the module-level ``lemmatizer`` and return a list."""
    return list(map(lemmatizer.lemmatize, text))

In [None]:
df.tokens = df.tokens.apply(remove_stopwords)

In [None]:
df.tokens = df.tokens.apply(lemmatize_text)

In [None]:
df.tokens = df.tokens.apply(unlist)

In [None]:
# Positional lookup of the fifth article: df is indexed by date strings here,
# and integer keys in Series[...] on a non-integer index rely on a deprecated
# positional fallback.
df.tokens.iloc[4]

# Freqdist

In [None]:
# After unlist() each entry of df.tokens is one ", "-joined string per article,
# so FreqDist(df.tokens) would count whole articles rather than words.
# Split the strings back into tokens and count those (skipping empty pieces).
fdist = FreqDist(
    token
    for doc in df.tokens
    for token in doc.split(", ")
    if token
)
fdist.plot(15);

# Look for additional stop words

In [None]:
cloud = df.drop_duplicates(subset=['tokens'])
text = cloud.tokens

In [None]:
text = " ".join(tweet for tweet in text)

In [None]:
from wordcloud import WordCloud

In [None]:
fig, ax = plt.subplots(figsize=(12,17))

wordcloud = WordCloud(max_words=200,collocations=False, width=1000, height=700, background_color="black").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

plt.show()
#wordcloud.to_file('all_tweets_wordcloud.png')

# TFIDF

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer 

rf_feats = df.tokens.values
tfidfconverter = TfidfVectorizer(max_features=10000, ngram_range=(1,3))  

X = tfidfconverter.fit_transform(rf_feats).toarray()

tfidf_df = pd.DataFrame(X)

In [None]:
df = df.reset_index()
frames = [df, tfidf_df]
main_df = pd.concat(frames, axis=1) 

In [None]:
main_df.to_csv('main_data/maindf.csv')

In [None]:
main_df.head()

In [None]:
# NOTE(review): df was given a fresh default index by reset_index() above, so
# grouping by df.index puts each row in its own group -- presumably the intent
# was to aggregate per date; confirm and group by the date column instead.
df.groupby(df.index).sum()

In [None]:
df.head()