In [7]:
# Import libraries
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Fetch the VADER lexicon that SentimentIntensityAnalyzer scores headlines with
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zhangzixin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Base quote URL; the ticker symbol is appended to it for each request
finwiz_url = 'https://finviz.com/quote.ashx?t='
# Maps each ticker to the 'news-table' element scraped from its quote page
news_tables = {}
tickers = ['AMZN', 'TSLA', 'GOOG']
for ticker in tickers:
    url = finwiz_url + ticker
    # Send a browser-like User-Agent header with the request
    req = Request(url=url, headers={
                  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})
    # Close the HTTP response as soon as the page has been parsed, and name
    # the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed
    with urlopen(req) as response:
        html = BeautifulSoup(response, 'html.parser')
    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    # Add the table to our dictionary
    news_tables[ticker] = news_table

In [8]:
# Preview the first few headlines scraped for 'AMZN'
amzn = news_tables["AMZN"]
# Get all the table rows tagged in HTML with <tr> into 'amzn_tr'
amzn_tr = amzn.find_all("tr")
# Number of headline rows printed so far
shown = 0
for table_row in amzn_tr:
    # Rows without a link or a cell carry no headline; skip them instead of
    # failing with AttributeError on `None.text`
    if table_row.a is None or table_row.td is None:
        continue
    # Read the text of the element 'a' into 'a_text'
    a_text = table_row.a.text
    # Read the text of the first element 'td' into 'td_text'
    td_text = table_row.td.text
    # Print the contents of 'a_text' and 'td_text'
    print(a_text)
    print(td_text)
    # Exit after printing 4 rows of data
    shown += 1
    if shown == 4:
        break

Weekly Roundup
Nov-18-22 06:44PM
Target, Amazon, Starbucks: Stocks That Defined the Week
06:10PM
Warren Buffetts Top 10 Dividend Stock Picks
06:05PM
Target, Amazon and 4 More Retailers That Will Reward You for Turning in Your Old Stuff
03:01PM


In [15]:
# Display the scraped tables: keys are tickers, values are the 'news-table' elements.
# NOTE(review): this echoes the full HTML of every table into the cell output;
# consider showing a short summary instead.
news_tables

{'AMZN': <table border="0" cellpadding="1" cellspacing="0" class="fullview-news-outer" id="news-table" width="100%">
 <tr><td align="right" width="130">Nov-18-22 06:44PM</td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://aap.thestreet.com/story/16108979/1/weekly-roundup.html" onclick="trackAndOpenNews(event, 'TheStreet.com', 'https://aap.thestreet.com/story/16108979/1/weekly-roundup.html');" target="_blank">Weekly Roundup</a></div><div class="news-link-right"><span style="color:#aa6dc0;font-size:9px"> TheStreet.com</span></div></div></td></tr>
 <tr><td align="right" width="130">06:10PM</td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://www.wsj.com/articles/target-amazon-starbucks-stocks-that-defined-the-week-11668813059?siteid=yhoof2" onclick="trackAndOpenNews(event, 'The Wall Street Journal', 'https://www.wsj.com/articles/target-amazon-starbucks-s

In [19]:
# Rows of [ticker, date, time, headline], one per scraped headline
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
    # Extract the ticker from the file name, get the string up to the 1st '_'
    ticker = file_name.split('_')[0]
    # Rows that show only a time inherit the date of the most recent dated
    # row; reset per table so a date never leaks from one ticker to the next
    date = None
    # Iterate through all tr tags in 'news_table'
    for x in news_table.find_all('tr'):
        # Rows without a link have no headline. Skip them entirely so the
        # previous row's headline is not appended a second time.
        if x.a is None:
            continue
        # Read the headline text from the 'a' tag only
        text = x.a.get_text()
        # Split the text in the td tag into a list
        date_scrape = x.td.text.split()
        if len(date_scrape) == 1:
            # Only a time is present; keep the date seen on an earlier row.
            # Named 'news_time' so it does not clobber the 'time' module that
            # is imported later in the notebook.
            news_time = date_scrape[0]
        else:
            # Load 'date' as the 1st element and the time as the 2nd
            date = date_scrape[0]
            news_time = date_scrape[1]
        if date is None:
            # A time-only row before any dated row cannot be placed on a day
            continue
        # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, news_time, text])

parsed_news[:5] # print first 5 rows of news

[['AMZN', 'Nov-18-22', '06:44PM', 'Weekly Roundup'],
 ['AMZN',
  'Nov-18-22',
  '06:10PM',
  'Target, Amazon, Starbucks: Stocks That Defined the Week'],
 ['AMZN',
  'Nov-18-22',
  '06:05PM',
  'Warren Buffetts Top 10 Dividend Stock Picks'],
 ['AMZN',
  'Nov-18-22',
  '03:01PM',
  'Target, Amazon and 4 More Retailers That Will Reward You for Turning in Your Old Stuff'],
 ['AMZN',
  'Nov-18-22',
  '02:42PM',
  "Is Kroger Stock A Buy Amid Albertsons Merger Drama? Here's What Analysis, Charts Show"]]

In [20]:
# VADER analyzer used to score every headline
vader = SentimentIntensityAnalyzer()
# Column labels for the rows collected in 'parsed_news'
columns = ['ticker', 'date', 'time', 'headline']
# One row per headline, not yet scored
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
# Polarity scores for each headline, one dict per row and in row order
scores = [vader.polarity_scores(headline)
          for headline in parsed_and_scored_news['headline']]
# Turn the list of score dicts into a frame with one column per dict key
scores_df = pd.DataFrame(scores)
# Attach the score columns to the headlines by row index
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Replace the scraped date strings with datetime.date objects
parsed_dates = pd.to_datetime(parsed_and_scored_news.date)
parsed_and_scored_news['date'] = parsed_dates.dt.date
parsed_and_scored_news.head()

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AMZN,2022-11-18,06:44PM,Weekly Roundup,0.0,1.0,0.0,0.0
1,AMZN,2022-11-18,06:10PM,"Target, Amazon, Starbucks: Stocks That Defined...",0.0,0.805,0.195,0.1779
2,AMZN,2022-11-18,06:05PM,Warren Buffetts Top 10 Dividend Stock Picks,0.0,0.769,0.231,0.2023
3,AMZN,2022-11-18,03:01PM,"Target, Amazon and 4 More Retailers That Will ...",0.0,0.707,0.293,0.6597
4,AMZN,2022-11-18,02:42PM,Is Kroger Stock A Buy Amid Albertsons Merger D...,0.0,1.0,0.0,0.0


In [28]:
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta

In [62]:
# Load each ticker's price file and tag every row with its ticker symbol
AMZN = pd.read_csv("amzn_data.csv").assign(Ticker="AMZN")
GOOG = pd.read_csv("goog_data.csv").assign(Ticker="GOOG")
TSLA = pd.read_csv("tsla_data.csv").assign(Ticker="TSLA")
# Stack the three frames; each keeps its own original row index
total_data = pd.concat([AMZN, GOOG, TSLA])
total_data

Unnamed: 0,Date,Price,Ticker
0,11/11/2022 09:30,97.915,AMZN
1,11/11/2022 09:31,98.480,AMZN
2,11/11/2022 09:32,97.230,AMZN
3,11/11/2022 09:33,97.490,AMZN
4,11/11/2022 09:34,97.280,AMZN
...,...,...,...
2208,11/18/2022 13:48,178.350,TSLA
2209,11/18/2022 13:49,178.250,TSLA
2210,11/18/2022 13:50,178.200,TSLA
2211,11/18/2022 13:51,178.120,TSLA


In [47]:
# Render each headline's date as an MM/DD/YYYY string
parsed_and_scored_news["date_str"] = [
    day.strftime("%m/%d/%Y") for day in parsed_and_scored_news["date"]
]
parsed_and_scored_news["date_str"]

0      11/18/2022
1      11/18/2022
2      11/18/2022
3      11/18/2022
4      11/18/2022
          ...    
304    11/14/2022
305    11/14/2022
306    11/14/2022
307    11/14/2022
308    11/14/2022
Name: date_str, Length: 309, dtype: object

In [51]:
# Glue the date string and the scraped time together with a single space
parsed_and_scored_news["time_str"] = (
    parsed_and_scored_news["date_str"] + " " + parsed_and_scored_news["time"]
)
parsed_and_scored_news["time_str"]

0      11/18/2022 06:44PM
1      11/18/2022 06:10PM
2      11/18/2022 06:05PM
3      11/18/2022 03:01PM
4      11/18/2022 02:42PM
              ...        
304    11/14/2022 02:56PM
305    11/14/2022 01:48PM
306    11/14/2022 11:45AM
307    11/14/2022 11:35AM
308    11/14/2022 11:35AM
Name: time_str, Length: 309, dtype: object

In [52]:
# Parse the combined "MM/DD/YYYY hh:mmAM/PM" strings into datetime values
parsed_and_scored_news["time_dt"] = list(map(
    lambda stamp: datetime.strptime(stamp, "%m/%d/%Y %I:%M%p"),
    parsed_and_scored_news["time_str"],
))
parsed_and_scored_news

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound,time_lagged,date_str,time_str,time_dt
0,AMZN,2022-11-18,06:44PM,Weekly Roundup,0.000,1.000,0.000,0.0000,1900-01-01 18:44:00,11/18/2022,11/18/2022 06:44PM,2022-11-18 18:44:00
1,AMZN,2022-11-18,06:10PM,"Target, Amazon, Starbucks: Stocks That Defined...",0.000,0.805,0.195,0.1779,1900-01-01 18:10:00,11/18/2022,11/18/2022 06:10PM,2022-11-18 18:10:00
2,AMZN,2022-11-18,06:05PM,Warren Buffetts Top 10 Dividend Stock Picks,0.000,0.769,0.231,0.2023,1900-01-01 18:05:00,11/18/2022,11/18/2022 06:05PM,2022-11-18 18:05:00
3,AMZN,2022-11-18,03:01PM,"Target, Amazon and 4 More Retailers That Will ...",0.000,0.707,0.293,0.6597,1900-01-01 15:01:00,11/18/2022,11/18/2022 03:01PM,2022-11-18 15:01:00
4,AMZN,2022-11-18,02:42PM,Is Kroger Stock A Buy Amid Albertsons Merger D...,0.000,1.000,0.000,0.0000,1900-01-01 14:42:00,11/18/2022,11/18/2022 02:42PM,2022-11-18 14:42:00
...,...,...,...,...,...,...,...,...,...,...,...,...
304,GOOG,2022-11-14,02:56PM,Pennsylvania to receive $19.7M payout from Goo...,0.000,1.000,0.000,0.0000,1900-01-01 14:56:00,11/14/2022,11/14/2022 02:56PM,2022-11-14 14:56:00
305,GOOG,2022-11-14,01:48PM,Google to Pay $391.5 Million Over Crafty Locat...,0.149,0.851,0.000,-0.1027,1900-01-01 13:48:00,11/14/2022,11/14/2022 01:48PM,2022-11-14 13:48:00
306,GOOG,2022-11-14,11:45AM,"Amazon reportedly plans to lay off about 10,00...",0.000,0.825,0.175,0.1779,1900-01-01 11:45:00,11/14/2022,11/14/2022 11:45AM,2022-11-14 11:45:00
307,GOOG,2022-11-14,11:35AM,UPDATE 3-Google to pay nearly $400 million to ...,0.113,0.887,0.000,-0.1027,1900-01-01 11:35:00,11/14/2022,11/14/2022 11:35AM,2022-11-14 11:35:00


In [65]:
# Shift every headline timestamp forward by one minute; the result is the key
# matched against the price timestamps in the merge
parsed_and_scored_news["time_lagged"] = (
    parsed_and_scored_news["time_dt"] + timedelta(minutes=1)
)

In [63]:
# Parse the "MM/DD/YYYY HH:MM" price timestamps into datetime values
total_data["date_dt"] = list(map(
    lambda stamp: datetime.strptime(stamp, "%m/%d/%Y %H:%M"),
    total_data.Date,
))
total_data

Unnamed: 0,Date,Price,Ticker,date_dt
0,11/11/2022 09:30,97.915,AMZN,2022-11-11 09:30:00
1,11/11/2022 09:31,98.480,AMZN,2022-11-11 09:31:00
2,11/11/2022 09:32,97.230,AMZN,2022-11-11 09:32:00
3,11/11/2022 09:33,97.490,AMZN,2022-11-11 09:33:00
4,11/11/2022 09:34,97.280,AMZN,2022-11-11 09:34:00
...,...,...,...,...
2208,11/18/2022 13:48,178.350,TSLA,2022-11-18 13:48:00
2209,11/18/2022 13:49,178.250,TSLA,2022-11-18 13:49:00
2210,11/18/2022 13:50,178.200,TSLA,2022-11-18 13:50:00
2211,11/18/2022 13:51,178.120,TSLA,2022-11-18 13:51:00


In [67]:
# Pair each price row with the headlines of the same ticker whose lagged
# timestamp equals the price timestamp (default inner join)
merged_data = pd.merge(
    total_data,
    parsed_and_scored_news,
    left_on=["date_dt", "Ticker"],
    right_on=["time_lagged", "ticker"],
)
merged_data

Unnamed: 0,Date,Price,Ticker,date_dt,ticker,date,time,headline,neg,neu,pos,compound,time_lagged,date_str,time_str,time_dt
0,11/15/2022 11:52,100.775,AMZN,2022-11-15 11:52:00,AMZN,2022-11-15,11:51AM,"Buy Amazon, Analyst Says. Its Still the Best P...",0.000,0.491,0.509,0.8074,2022-11-15 11:52:00,11/15/2022,11/15/2022 11:51AM,2022-11-15 11:51:00
1,11/15/2022 12:54,99.403,AMZN,2022-11-15 12:54:00,AMZN,2022-11-15,12:53PM,Amazons New Telehealth Service to Offer Care f...,0.089,0.650,0.260,0.4767,2022-11-15 12:54:00,11/15/2022,11/15/2022 12:53PM,2022-11-15 12:53:00
2,11/15/2022 13:00,99.680,AMZN,2022-11-15 13:00:00,AMZN,2022-11-15,12:59PM,Amazon Unveils New Healthcare Service: Is the ...,0.000,0.825,0.175,0.1779,2022-11-15 13:00:00,11/15/2022,11/15/2022 12:59PM,2022-11-15 12:59:00
3,11/15/2022 13:24,98.680,AMZN,2022-11-15 13:24:00,AMZN,2022-11-15,01:23PM,Amazon heads for 3rd-straight day with trillio...,0.000,0.805,0.195,0.1779,2022-11-15 13:24:00,11/15/2022,11/15/2022 01:23PM,2022-11-15 13:23:00
4,11/15/2022 13:38,97.619,AMZN,2022-11-15 13:38:00,AMZN,2022-11-15,01:37PM,Amazon Launches Affordable Virtual Care Servic...,0.000,0.431,0.569,0.6808,2022-11-15 13:38:00,11/15/2022,11/15/2022 01:37PM,2022-11-15 13:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,11/18/2022 10:01,178.940,TSLA,2022-11-18 10:01:00,TSLA,2022-11-18,10:00AM,3 Reasons to Get Out of Tesla,0.000,1.000,0.000,0.0000,2022-11-18 10:01:00,11/18/2022,11/18/2022 10:00AM,2022-11-18 10:00:00
123,11/18/2022 10:24,178.277,TSLA,2022-11-18 10:24:00,TSLA,2022-11-18,10:23AM,"Bahamas regulators seize FTX wallets, Tesla re...",0.118,0.882,0.000,-0.1531,2022-11-18 10:24:00,11/18/2022,11/18/2022 10:23AM,2022-11-18 10:23:00
124,11/18/2022 10:24,178.277,TSLA,2022-11-18 10:24:00,TSLA,2022-11-18,10:23AM,"Tesla Recalls Nearly 30,000 Vehicles Due to Ai...",0.252,0.748,0.000,-0.4019,2022-11-18 10:24:00,11/18/2022,11/18/2022 10:23AM,2022-11-18 10:23:00
125,11/18/2022 10:36,178.906,TSLA,2022-11-18 10:36:00,TSLA,2022-11-18,10:35AM,"AQR Capitals Performance, AUM and Top Stock Picks",0.000,0.795,0.205,0.2023,2022-11-18 10:36:00,11/18/2022,11/18/2022 10:35AM,2022-11-18 10:35:00


In [75]:
# Keep only the columns the regressions use
ols_columns = ["Ticker", "Price", "neg", "neu", "pos"]
olsdata = merged_data[ols_columns]

In [77]:
import statsmodels.api as sm

# Pooled regression across all tickers: Price on the 'neg' and 'pos' scores
y = olsdata[["Price"]]
# Regressors plus an intercept column
X = sm.add_constant(olsdata[["neg", "pos"]])
ols_spec = sm.OLS(y, X)
static_model = ols_spec.fit()
static_model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Price,R-squared:,0.01
Model:,OLS,Adj. R-squared:,-0.006
Method:,Least Squares,F-statistic:,0.6376
Date:,"Fri, 18 Nov 2022",Prob (F-statistic):,0.53
Time:,19:52:08,Log-Likelihood:,-643.15
No. Observations:,127,AIC:,1292.0
Df Residuals:,124,BIC:,1301.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,125.3619,5.925,21.160,0.000,113.636,137.088
neg,-20.6960,29.723,-0.696,0.488,-79.526,38.134
pos,-27.4395,25.411,-1.080,0.282,-77.735,22.856

0,1,2,3
Omnibus:,25.838,Durbin-Watson:,0.065
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.065
Skew:,1.09,Prob(JB):,8.05e-07
Kurtosis:,2.256,Cond. No.,9.68


In [76]:

def myreg(ticker, data=None):
    """Fit and print an OLS regression of Price on 'neg' and 'pos' for one ticker.

    Parameters
    ----------
    ticker : str
        Value matched against the "Ticker" column.
    data : pandas.DataFrame, optional
        Frame holding the "Ticker", "Price", "neg" and "pos" columns.
        Defaults to the notebook-level ``olsdata``.

    Returns
    -------
    None
        The regression summary is printed rather than returned.

    Raises
    ------
    ValueError
        If no row of the frame has the requested ticker.
    """
    frame = olsdata if data is None else data
    # Filter once and reuse the subset for both sides of the regression
    subset = frame[frame["Ticker"] == ticker]
    if subset.empty:
        # Fail with a clear message instead of an opaque error from the fit
        raise ValueError(f"no rows found for ticker {ticker!r}")
    X = sm.add_constant(subset[["neg", "pos"]])
    y = subset[["Price"]]
    # Local name only: the notebook-level 'static_model' is left untouched
    fitted = sm.OLS(y, X).fit()
    print(fitted.summary())


# Print the per-ticker regression summary for AMZN
myreg("AMZN")


                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.125
Model:                            OLS   Adj. R-squared:                  0.087
Method:                 Least Squares   F-statistic:                     3.273
Date:                Fri, 18 Nov 2022   Prob (F-statistic):             0.0469
Time:                        19:51:15   Log-Likelihood:                -93.984
No. Observations:                  49   AIC:                             194.0
Df Residuals:                      46   BIC:                             199.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         96.1447      0.538    178.644      0.0

  x = pd.concat(x[::order], 1)


In [78]:
# Print the per-ticker regression summary for GOOG
myreg("GOOG")


                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                 -0.043
Method:                 Least Squares   F-statistic:                   0.08757
Date:                Fri, 18 Nov 2022   Prob (F-statistic):              0.916
Time:                        19:52:18   Log-Likelihood:                -64.923
No. Observations:                  45   AIC:                             135.8
Df Residuals:                      42   BIC:                             141.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         98.1982      0.255    385.085      0.0

In [79]:
# Print the per-ticker regression summary for TSLA
myreg("TSLA")


                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                 -0.048
Method:                 Least Squares   F-statistic:                    0.2701
Date:                Fri, 18 Nov 2022   Prob (F-statistic):              0.765
Time:                        19:52:37   Log-Likelihood:                -86.913
No. Observations:                  33   AIC:                             179.8
Df Residuals:                      30   BIC:                             184.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        184.5049      0.923    199.994      0.0

In [80]:
import pickle
filename = "finalized_model.sav"
# 'static_model' is the notebook-level pooled fit; the models fitted inside
# myreg() are local to that function and are not what gets saved here.
# The context manager flushes and closes the file handle even if dumping fails.
with open(filename, "wb") as model_file:
    pickle.dump(static_model, model_file)