In [344]:
import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
import re

In [195]:
df = pd.read_csv('Applenews2019.csv', index_col=0)
df.dropna(subset=['fulltext'], inplace=True)
df.shape

(6545, 7)

# Clean newlines and special characters

In [196]:
def clean_text(text):
    text.replace("\n"," ")
    text =  ' '.join(re.sub("([^0-9A-Za-z])"," ",text).split())
    return text.lower()

In [198]:
df['cleaned_text'] = df['fulltext'].apply(clean_text)
df['cleaned_authors'] = df['author'].apply(clean_text)

#### Changing the UTC time to EST.

In [199]:
import pytz

In [200]:
#Change to Datetime
df["date"]= pd.to_datetime(df["date"])
df = df.set_index('date')
df.index = df.index.normalize()

In [201]:
#convert DateTime index to eastern time. 
eastern = pytz.timezone('US/Eastern')
df.index = df.index.tz_convert(eastern).tz_localize(None)
#put into year/month/day format
df.index = df.index.strftime('%Y-%m-%d')

# Using [tldextract](https://pypi.org/project/tldextract/) to extract company names from url's

In [202]:
import tldextract

In [203]:
def get_outlet(link):
    res = tldextract.extract(link)
    return res.domain

In [204]:
df['news_outlet'] = df['source'].apply(get_outlet)

# Getting historical Stock Prices

In [166]:
from twelvedata import TDClient
td = TDClient(apikey="ef26202dacaf412fb157a05403f81ca3") 

In [59]:
 ts = td.time_series(
    symbol="AAPL",
    interval="1day",
    start_date="2020-8-01",
    end_date="2020-8-30"
    ).as_pandas()

In [64]:
times = []

In [61]:
times.append(ts)

In [346]:
def get_month_day_range(year):
    ranges = []
    year = [(pd.datetime(year,1,1)), (pd.datetime(year,2,1)), (pd.datetime(year,3,1)),
           (pd.datetime(year,4,1)), (pd.datetime(year,5,1)), (pd.datetime(year,6,1)),
           (pd.datetime(year,7,1)), (pd.datetime(year,8,1)), (pd.datetime(year,9,1)),
           (pd.datetime(year,10,1)), (pd.datetime(year,11,1)), (pd.datetime(year,12,1))]
    for date in year:
        last_day = date + relativedelta(day=1, months=+1, days=-1)
        first_day = date + relativedelta(day=1)
        ranges.append((first_day.strftime('%Y-%m-%d'), last_day.strftime('%Y-%m-%d')))
    return ranges

months_2020 = get_month_day_range(2020)
months_2020

[('2020-01-01', '2020-01-31'),
 ('2020-02-01', '2020-02-29'),
 ('2020-03-01', '2020-03-31'),
 ('2020-04-01', '2020-04-30'),
 ('2020-05-01', '2020-05-31'),
 ('2020-06-01', '2020-06-30'),
 ('2020-07-01', '2020-07-31'),
 ('2020-08-01', '2020-08-31'),
 ('2020-09-01', '2020-09-30'),
 ('2020-10-01', '2020-10-31'),
 ('2020-11-01', '2020-11-30'),
 ('2020-12-01', '2020-12-31')]

In [None]:
dates = []

In [67]:
#Gather historical data from twelvedata API
counter =1
for start,end in months_2020:
    ts = td.time_series(
    symbol="AAPL",
    interval="1day",
    start_date=start,
    end_date=end
    ).as_pandas()
    times.append(ts)
    print(counter, start,end)
    counter +=1
    

1 2020-9-1 2020-9-30
2 2020-10-1 2020-10-31
3 2020-11-1 2020-11-30


In [73]:
stock_prices_2019_appl = pd.concat(times)

In [112]:
#Normalize the datetime indexes
stock_prices_2019_appl
stock_prices_2019_appl.index = stock_prices_2019_appl.index.normalize()

In [175]:
#Merge prices and main df's on the date
merge=pd.merge(df,stock_prices_2019_appl, how='outer', left_index=True, right_index=True)

In [123]:
#localize the price df
stock_prices_2019_appl.index = stock_prices_2019_appl.index.tz_localize(None)

In [267]:
stock_prices_2019_appl['day_change'] = np.nan
stock_prices_2019_appl['increase'] = np.nan

In [216]:
prices_index = stock_prices_2019_appl.index.strftime('%Y-%m-%d').to_list()

In [321]:
start_dt = stock_prices_2019_appl[prices_index[1]].index
start = stock_prices_2019_appl[prices_index[1]].open[0]
start_dt, start

(DatetimeIndex(['2020-01-30'], dtype='datetime64[ns]', name='datetime', freq=None),
 80.135)

In [322]:
stop_dt = stock_prices_2019_appl[prices_index[2]].index
stop = stock_prices_2019_appl[prices_index[2]].open[0]
stop_dt, stop

(DatetimeIndex(['2020-01-29'], dtype='datetime64[ns]', name='datetime', freq=None),
 81.1125)

In [323]:
start-stop

-0.977499999999992

In [327]:
df_res = pd.DataFrame(columns = ['day_change', 'increase', 'date'])
for i,stock_price in enumerate(prices_index):
    try:
        start = stock_prices_2019_appl.loc[prices_index[i]].open[0]
        stop = stock_prices_2019_appl.loc[prices_index[i+1]].open[0]
        direction = start - stop
        if direction < 0:
            increase = 0
        else:
            increase = 1
        df_res = df_res.append({'day_change': direction, 'increase':increase, 'date':stock_price}, ignore_index=True)
    except:
        continue

In [335]:
stock_prices_2019_appl['2020-2-12']

Unnamed: 0_level_0,open,high,low,close,volume,day_change,increase
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-12,80.3675,81.805,80.3675,81.8,28432600,,


In [328]:
df_res

Unnamed: 0,day_change,increase,date
0,0.0975,1,2020-01-31
1,-0.9775,0,2020-01-30
2,2.9625,1,2020-01-29
3,0.6350,1,2020-01-28
4,-2.5475,0,2020-01-27
...,...,...,...
220,2.1800,1,2020-11-09
221,0.3700,1,2020-11-06
222,3.8100,1,2020-11-05
223,4.4800,1,2020-11-04


In [329]:
df_res["date"]= pd.to_datetime(df_res["date"])
df_res = df_res.set_index('date')
df_res.index = df_res.index.normalize()

In [342]:
targets2020 = df_res.shift(periods=1, fill_value=0)
targets2020.head()

Unnamed: 0_level_0,day_change,increase
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-31,0.0,0
2020-01-30,0.0975,1
2020-01-29,-0.9775,0
2020-01-28,2.9625,1
2020-01-27,0.635,1


In [343]:
targets2020.to_csv('yearly_targets/targets2020.csv')

# Predict Sentiment for each Article with VADER

In [134]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [135]:
analyzer = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(article):
    '''
    VADER Sentiment used to tag the.
    Returns the predicted labels: positive/negative/neutral.
    Instantiate analyzer before running this function:
    analyzer = SentimentIntensityAnalyzer()
    '''
    global analyzer

    score = analyzer.polarity_scores(article)

    if score['compound'] >= .05:
        sent = 'positive'
    elif score['compound'] <= -.05:
        sent = 'negative'
    else:
        sent = 'neutral'

    return sent

In [148]:
sentiment_analyzer_scores(df.cleaned_text[47])

'positive'

In [159]:
df.cleaned_text[79]

'a texas court has ordered apple to pay more than 500 million roughly rs 3 742 crores in damages and interest for 4g patent infringements held by intellectual property company panoptis the us tech giant now worth almost 2 trillion roughly rs 1 49 70 000 crores vowed to appeal tuesday s decision we thank the jury for their time but are disappointed with the verdict and plan to appeal apple said in an email response to an afp inquiry lawsuits like this by companies who accumulate patents simply to harass the industry only serve to stifle innovation and harm consumers panoptis which specialises in licensing patents took apple to court in february last year claiming it refused to pay for the use of 4g lte technologies in its smartphones tablets and watches the complainant s have repeatedly negotiated with apple to reach an agreement for a frand license to the complainant s patent portfolios which apple is infringing the court filing read frand refers to terms that are fair reasonable and n

In [151]:
for i,art in enumerate(df.cleaned_text[:200]):
    score = sentiment_analyzer_scores(art)
    print(i, score)

0 positive
1 positive
2 negative
3 positive
4 positive
5 positive
6 positive
7 positive
8 positive
9 positive
10 positive
11 positive
12 positive
13 positive
14 negative
15 positive
16 positive
17 positive
18 positive
19 positive
20 negative
21 positive
22 negative
23 positive
24 positive
25 positive
26 positive
27 positive
28 positive
29 positive
30 positive
31 positive
32 positive
33 positive
34 positive
35 positive
36 positive
37 negative
38 negative
39 positive
40 positive
41 positive
42 positive
43 positive
44 positive
45 positive
46 positive
47 positive
48 positive
49 positive
50 positive
51 positive
52 positive
53 positive
54 positive
55 positive
56 positive
57 negative
58 negative
59 negative
60 positive
61 negative
62 positive
63 positive
64 positive
65 positive
66 positive
67 positive
68 positive
69 positive
70 positive
71 negative
72 positive
73 negative
74 negative
75 positive
76 positive
77 positive
78 positive
79 negative
80 positive
81 positive
82 positive
83 positive
84