In [9]:
import pandas as pd
import numpy as np
import re

In [195]:
df = pd.read_csv('Applenews2019.csv', index_col=0)
df.dropna(subset=['fulltext'], inplace=True)
df.shape

(6545, 7)

# Clean newlines and special characters

In [196]:
def clean_text(text):
    text.replace("\n"," ")
    text =  ' '.join(re.sub("([^0-9A-Za-z])"," ",text).split())
    return text.lower()

In [198]:
df['cleaned_text'] = df['fulltext'].apply(clean_text)
df['cleaned_authors'] = df['author'].apply(clean_text)

#### Changing the UTC time to EST.

In [199]:
import pytz

In [200]:
#Change to Datetime
df["date"]= pd.to_datetime(df["date"])
df = df.set_index('date')
df.index = df.index.normalize()

In [201]:
#convert DateTime index to eastern time. 
eastern = pytz.timezone('US/Eastern')
df.index = df.index.tz_convert(eastern).tz_localize(None)
#put into year/month/day format
df.index = df.index.strftime('%Y-%m-%d')

# Using [tldextract](https://pypi.org/project/tldextract/) to extract company names from url's

In [202]:
import tldextract

In [203]:
def get_outlet(link):
    res = tldextract.extract(link)
    return res.domain

In [204]:
df['news_outlet'] = df['source'].apply(get_outlet)

In [209]:
df.author[6]

'[\'Chris Smith\', \'Andy Meek\', \'Jacob Siegal\', \'Yoni Heisler\', \'Chris Smith Started Writing About Gadgets As A Hobby\', "Before He Knew It He Was Sharing His Views On Tech Stuff With Readers Around The World. Whenever He\'S Not Writing About Gadgets He Miserably Fails To Stay Away Them", "Although He Desperately Tries. But That\'S Not Necessarily A Bad Thing."]'

# Getting historical Stock Prices

In [166]:
from twelvedata import TDClient
td = TDClient(apikey="ef26202dacaf412fb157a05403f81ca3") 

In [59]:
 ts = td.time_series(
    symbol="AAPL",
    interval="1day",
    start_date="2020-8-01",
    end_date="2020-8-30"
    ).as_pandas()

In [64]:
times = []

In [61]:
times.append(ts)

In [66]:
# months_2020 = [('2020-1-1', '2020-1-31'), ('2020-2-1', '2020-2-29'),
#               ('2020-3-1', '2020-3-31'), ('2020-4-1', '2020-4-30'),
#               ('2020-5-1', '2020-5-31'), ('2020-6-1', '2020-6-30'),
#               ('2020-7-1', '2020-7-31'), ('2020-8-1', '2020-8-31'),
#               ('2020-9-1','2020-9-30'), ('2020-10-1', '2020-10-31'),
#               ('2020-11-1','2020-11-30')]

months_2020 = [('2020-9-1','2020-9-30'), ('2020-10-1', '2020-10-31'),
              ('2020-11-1','2020-11-30')]

In [None]:
dates = []

In [67]:
#Gather historical data from twelvedata API
counter =1
for start,end in months_2020:
    ts = td.time_series(
    symbol="AAPL",
    interval="1day",
    start_date=start,
    end_date=end
    ).as_pandas()
    times.append(ts)
    print(counter, start,end)
    counter +=1
    

1 2020-9-1 2020-9-30
2 2020-10-1 2020-10-31
3 2020-11-1 2020-11-30


In [73]:
stock_prices_2019_appl = pd.concat(times)

In [112]:
#Normalize the datetime indexes
stock_prices_2019_appl
stock_prices_2019_appl.index = stock_prices_2019_appl.index.normalize()

In [175]:
#Merge prices and main df's on the date
merge=pd.merge(df,stock_prices_2019_appl, how='outer', left_index=True, right_index=True)

In [123]:
#localize the price df
stock_prices_2019_appl.index = stock_prices_2019_appl.index.tz_localize(None)

In [124]:
stock_prices_2019_appl

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-31,80.2325,80.6700,77.0725,77.3775,49897100
2020-01-30,80.1350,81.0225,79.6875,80.9675,31685800
2020-01-29,81.1125,81.9625,80.3450,81.0850,54057300
2020-01-28,78.1500,79.6000,78.0475,79.4225,40558500
2020-01-27,77.5150,77.9425,76.2200,77.2375,40485000
...,...,...,...,...,...
2020-11-06,118.3200,119.2000,116.1300,118.6900,114283600
2020-11-05,117.9500,119.6200,116.8700,119.0300,125734400
2020-11-04,114.1400,115.5900,112.3500,114.9500,137809900
2020-11-03,109.6600,111.4900,108.7300,110.4400,107020000


In [176]:
merge.tail(150)

Unnamed: 0,update,source,author,fulltext,summary,title,cleaned_text,open,high,low,close,volume
2020-08-18,2020-08-19 09:44:35-04:00,https://www.ibtimes.com/apple-safari-how-make-...,[],KEY POINTS Private browsing mode lets Safari u...,,Apple Safari: How To Make Private Browsing Mor...,key points private browsing mode lets safari u...,114.435,115.9975,114.025,115.5625,13908263.0
2020-08-18,2020-08-19 10:54:14-04:00,https://www.theglobeandmail.com/investing/inve...,['Jack Nicas'],Open this photo in gallery Apple logo is seen ...,,"Apple is worth US$2-trillion in market value, ...",open this photo in gallery apple logo is seen ...,114.435,115.9975,114.025,115.5625,13908263.0
2020-08-18,,https://globalnews.ca/news/7285909/apple-two-t...,['The Associated Press'],Send this page to someone via email\n\nNEW YOR...,,Apple is now worth $2 trillion — doubling in j...,send this page to someone via email new york a...,114.435,115.9975,114.025,115.5625,13908263.0
2020-08-18,2020-08-19 13:18:25+00:00,https://www.dailystar.co.uk/music/apple-music-...,"['Jack Hardwick', 'Image', 'Getty Images For A...",The Daily Star's FREE newsletter is spectacula...,,Apple Music launch two new radio stations with...,the daily star s free newsletter is spectacula...,114.435,115.9975,114.025,115.5625,13908263.0
2020-08-18,2020-08-19 17:42:58+00:00,https://www.independent.co.uk/life-style/gadge...,[],Apple has become the first ever company to be ...,,Apple becomes first ever company to be worth $...,apple has become the first ever company to be ...,114.435,115.9975,114.025,115.5625,13908263.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-20,,,,,,,,118.640,118.7700,117.290,117.3400,73391400.0
2020-11-23,,,,,,,,117.180,117.6200,113.750,113.8500,127959318.0
2020-11-24,,,,,,,,113.910,115.8500,112.590,115.1700,113585600.0
2020-11-25,,,,,,,,115.550,116.7500,115.170,116.0300,76375300.0


# Predict Sentiment for each Article with VADER

In [134]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [135]:
analyzer = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(article):
    '''
    VADER Sentiment used to tag the.
    Returns the predicted labels: positive/negative/neutral.
    Instantiate analyzer before running this function:
    analyzer = SentimentIntensityAnalyzer()
    '''
    global analyzer

    score = analyzer.polarity_scores(article)

    if score['compound'] >= .05:
        sent = 'positive'
    elif score['compound'] <= -.05:
        sent = 'negative'
    else:
        sent = 'neutral'

    return sent

In [148]:
sentiment_analyzer_scores(df.cleaned_text[47])

'positive'

In [159]:
df.cleaned_text[79]

'a texas court has ordered apple to pay more than 500 million roughly rs 3 742 crores in damages and interest for 4g patent infringements held by intellectual property company panoptis the us tech giant now worth almost 2 trillion roughly rs 1 49 70 000 crores vowed to appeal tuesday s decision we thank the jury for their time but are disappointed with the verdict and plan to appeal apple said in an email response to an afp inquiry lawsuits like this by companies who accumulate patents simply to harass the industry only serve to stifle innovation and harm consumers panoptis which specialises in licensing patents took apple to court in february last year claiming it refused to pay for the use of 4g lte technologies in its smartphones tablets and watches the complainant s have repeatedly negotiated with apple to reach an agreement for a frand license to the complainant s patent portfolios which apple is infringing the court filing read frand refers to terms that are fair reasonable and n

In [151]:
for i,art in enumerate(df.cleaned_text[:200]):
    score = sentiment_analyzer_scores(art)
    print(i, score)

0 positive
1 positive
2 negative
3 positive
4 positive
5 positive
6 positive
7 positive
8 positive
9 positive
10 positive
11 positive
12 positive
13 positive
14 negative
15 positive
16 positive
17 positive
18 positive
19 positive
20 negative
21 positive
22 negative
23 positive
24 positive
25 positive
26 positive
27 positive
28 positive
29 positive
30 positive
31 positive
32 positive
33 positive
34 positive
35 positive
36 positive
37 negative
38 negative
39 positive
40 positive
41 positive
42 positive
43 positive
44 positive
45 positive
46 positive
47 positive
48 positive
49 positive
50 positive
51 positive
52 positive
53 positive
54 positive
55 positive
56 positive
57 negative
58 negative
59 negative
60 positive
61 negative
62 positive
63 positive
64 positive
65 positive
66 positive
67 positive
68 positive
69 positive
70 positive
71 negative
72 positive
73 negative
74 negative
75 positive
76 positive
77 positive
78 positive
79 negative
80 positive
81 positive
82 positive
83 positive
84