# SENTIMENT INDEX

In [164]:
import numpy as np
import pandas as pd
import pickle

In [165]:
# load pickles
# import pickles 
stocks_pkl = open("stocks.pkl", "rb")
stocks = pickle.load(stocks_pkl)

tweet_sent_pkl = open("stock_tweet_sentiment.pkl", "rb")
tweets = pickle.load(tweet_sent_pkl)

reviews_pkl = open("reviews.pkl", "rb")
ratings = pickle.load(reviews_pkl)

news_sent_pkl = open("news_sent.pkl", "rb")
news = pickle.load(news_sent_pkl)

## Make final dataframe

In [166]:
# check on the dates 
stocks.head()

Unnamed: 0,Date,Open,High,Low,Volume,Adj Close
0,2010-06-29,19.0,25.0,17.540001,18766300,23.889999
1,2010-06-30,25.790001,30.42,23.299999,17187100,23.83
2,2010-07-01,25.0,25.92,20.27,8218800,21.959999
3,2010-07-02,23.0,23.1,18.709999,5139800,19.200001
4,2010-07-06,20.0,20.0,15.83,6866900,16.110001


In [167]:
tweets.sort_values(by="Date",inplace=True)
tweets.rename(columns={"Value":"Stock_Price","Sentiment":"Tweet_Sentiment"},inplace=True)
tweets.head()

Unnamed: 0,Date,Tweet_Sentiment,Stock_Price
4810,2012-11-16,0.4199,
4808,2012-11-19,0.6989,32.519999
4809,2012-11-19,0.6989,32.52
4807,2012-11-20,-0.1027,32.7275
4805,2012-11-20,0.0,32.7275


In [168]:
ratings.rename(columns={"date":"Date", "rating":"Employee_Sentiment"},inplace=True)
ratings.head()

Unnamed: 0,Date,Employee_Sentiment
0,2012-04-02,4.0
1,2012-06-28,3.0
2,2012-07-24,3.0
3,2012-08-08,5.0
4,2012-08-26,5.0


In [169]:
news["Date"] = pd.to_datetime(news["Date"])
news.head()

Unnamed: 0,Date,News_Sentiment
0,2010-06-22,-0.296
1,2012-05-31,0.4939
2,2012-06-25,0.0
3,2012-08-03,0.0
4,2012-10-23,0.3818


In [170]:
# make one dataframe, join on date
sent_idx = tweets.merge(ratings, how="outer")
sent_idx = news.merge(sent_idx, how="outer")

In [171]:
# check for missing values 
sent_idx.isna().sum()

Date                     0
News_Sentiment        7550
Tweet_Sentiment       2819
Stock_Price           2675
Employee_Sentiment    4365
dtype: int64

In [172]:
# fill nans with previous values 
sent_idx.fillna(method="backfill", inplace=True)
sent_idx.head()

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment
0,2010-06-22,-0.296,0.6249,34.005,5.0
1,2012-05-31,0.4939,0.6249,34.005,5.0
2,2012-06-25,0.0,0.6249,34.005,5.0
3,2012-08-03,0.0,0.6249,34.005,5.0
4,2012-10-23,0.3818,0.6249,34.005,5.0


In [175]:
# fill nans with previous values 
sent_idx.fillna(method="ffill", inplace=True)
sent_idx.head()

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment
0,2010-06-22,-0.296,0.6249,34.005,5.0
1,2012-05-31,0.4939,0.6249,34.005,5.0
2,2012-06-25,0.0,0.6249,34.005,5.0
3,2012-08-03,0.0,0.6249,34.005,5.0
4,2012-10-23,0.3818,0.6249,34.005,5.0


In [176]:
sent_idx.tail(50)

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment
8033,2019-08-02,0.2023,0.8271,341.54025,2.0
8034,2019-08-03,0.2023,0.8271,341.54025,2.0
8035,2019-08-03,0.2023,0.8271,341.54025,3.0
8036,2019-08-04,0.2023,0.8271,341.54025,1.0
8037,2019-08-04,0.2023,0.8271,341.54025,5.0
8038,2019-08-04,0.2023,0.8271,341.54025,2.0
8039,2019-08-04,0.2023,0.8271,341.54025,3.0
8040,2019-08-04,0.2023,0.8271,341.54025,3.0
8041,2019-08-04,0.2023,0.8271,341.54025,1.0
8042,2019-08-05,0.2023,0.8271,341.54025,4.0


In [177]:
# check for missing values 
sent_idx.isna().sum()

Date                  0
News_Sentiment        0
Tweet_Sentiment       0
Stock_Price           0
Employee_Sentiment    0
dtype: int64

In [178]:
# drop all 2019
sent_idx = sent_idx[sent_idx["Date"].dt.year < 2018]

In [179]:
# check for missing values 
sent_idx.isna().sum()

Date                  0
News_Sentiment        0
Tweet_Sentiment       0
Stock_Price           0
Employee_Sentiment    0
dtype: int64

In [180]:
# change employee sentiment to string for scaling 
sent_idx["Employee_Sentiment"] = sent_idx["Employee_Sentiment"].astype(str)
sent_idx.head()

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment
0,2010-06-22,-0.296,0.6249,34.005,5.0
1,2012-05-31,0.4939,0.6249,34.005,5.0
2,2012-06-25,0.0,0.6249,34.005,5.0
3,2012-08-03,0.0,0.6249,34.005,5.0
4,2012-10-23,0.3818,0.6249,34.005,5.0


In [181]:
# rescale the employee sentiment
scaled_sent = {"1.0":"-1", "2.0":"-.5", "3.0":"0", "4.0":".5", "5.0":"1"}
sent_idx.replace({"Employee_Sentiment":scaled_sent},inplace=True)
sent_idx["Employee_Sentiment"] = (sent_idx["Employee_Sentiment"]).astype(float)
sent_idx.head()

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment
0,2010-06-22,-0.296,0.6249,34.005,1.0
1,2012-05-31,0.4939,0.6249,34.005,1.0
2,2012-06-25,0.0,0.6249,34.005,1.0
3,2012-08-03,0.0,0.6249,34.005,1.0
4,2012-10-23,0.3818,0.6249,34.005,1.0


In [182]:
# make total sentiment column with average
sent_idx["Average_Sentiment"] = sent_idx["News_Sentiment"] * sent_idx["Tweet_Sentiment"] * sent_idx["Employee_Sentiment"] / 3
sent_idx.head()

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment,Average_Sentiment
0,2010-06-22,-0.296,0.6249,34.005,1.0,-0.061657
1,2012-05-31,0.4939,0.6249,34.005,1.0,0.102879
2,2012-06-25,0.0,0.6249,34.005,1.0,0.0
3,2012-08-03,0.0,0.6249,34.005,1.0,0.0
4,2012-10-23,0.3818,0.6249,34.005,1.0,0.079529


In [183]:
# reset index
sent_idx.reset_index(inplace=True)
sent_idx.drop(columns=["index"],inplace=True)
sent_idx.head()

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment,Average_Sentiment
0,2010-06-22,-0.296,0.6249,34.005,1.0,-0.061657
1,2012-05-31,0.4939,0.6249,34.005,1.0,0.102879
2,2012-06-25,0.0,0.6249,34.005,1.0,0.0
3,2012-08-03,0.0,0.6249,34.005,1.0,0.0
4,2012-10-23,0.3818,0.6249,34.005,1.0,0.079529


In [184]:
sent_idx.tail(100)

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment,Average_Sentiment
6563,2017-10-15,0.2023,0.8271,341.54025,-0.5,-0.027887
6564,2017-10-16,0.2023,0.8271,341.54025,0.0,0.000000
6565,2017-10-16,0.2023,0.8271,341.54025,-1.0,-0.055774
6566,2017-10-16,0.2023,0.8271,341.54025,0.5,0.027887
6567,2017-10-16,0.2023,0.8271,341.54025,0.5,0.027887
6568,2017-10-17,0.2023,0.8271,341.54025,-0.5,-0.027887
6569,2017-10-18,0.2023,0.8271,341.54025,1.0,0.055774
6570,2017-10-18,0.2023,0.8271,341.54025,-1.0,-0.055774
6571,2017-10-18,0.2023,0.8271,341.54025,0.0,0.000000
6572,2017-10-19,0.2023,0.8271,341.54025,0.0,0.000000


In [186]:
sent_idx.head()

Unnamed: 0,Date,News_Sentiment,Tweet_Sentiment,Stock_Price,Employee_Sentiment,Average_Sentiment
0,2010-06-22,-0.296,0.6249,34.005,1.0,-0.061657
1,2012-05-31,0.4939,0.6249,34.005,1.0,0.102879
2,2012-06-25,0.0,0.6249,34.005,1.0,0.0
3,2012-08-03,0.0,0.6249,34.005,1.0,0.0
4,2012-10-23,0.3818,0.6249,34.005,1.0,0.079529
