In [1]:
# Import all libraries 
import pandas as pd
import numpy as np 
import datetime
import matplotlib.pyplot as plt

In [2]:
# Import csv as pandas dataframe
df_sentiment = pd.read_csv('sentiment_analysis_output.csv')
df_topic = pd.read_csv('topic_modelling_output.csv')
df_stock = pd.read_csv('tweets_stocks_combined_final.csv')

In [3]:
# Create dummy variables for topic of tweets
dummy = pd.get_dummies(df_topic['topic'])
dummy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0


In [4]:
# Combine relevant columns from imported csv into df_regression
df_regression = pd.DataFrame(df_stock,columns=['cleaned_text','cleaned_text_2','favorites','retweets','price_60mins_after', '60mins_price_diff_abs', '60mins_price_diff_perc'])
compound_score = df_sentiment['compound']
df_regression = pd.concat([df_regression, compound_score], axis=1)
df_regression = pd.concat([df_regression, dummy], axis=1)
df_regression = df_regression.dropna()
df_regression

Unnamed: 0,cleaned_text,cleaned_text_2,favorites,retweets,price_60mins_after,60mins_price_diff_abs,60mins_price_diff_perc,compound,0,1,2,3,4,5,6,7,8,9
0,Thank you Rand!,thank you rand,42793.0,9125.0,261.485000,0.385000,0.001475,0.4199,0,1,0,0,0,0,0,0,0,0
1,"Join me live from Fort Myer in Arlington, Virg...",join me live from fort myer in arlington virgi...,36009.0,4891.0,244.260000,0.590000,0.002421,0.2960,0,0,1,0,0,0,0,0,0,0
2,Thank you Nicole!,thank you nicole,43367.0,8275.0,239.940000,0.065000,0.000271,0.4199,0,0,0,0,0,0,1,0,0,0
3,Thank you to Shawn Steel for the nice words on...,thank you to shawn steel for the nice words on,50956.0,7465.0,237.022857,0.142857,0.000603,0.6486,0,0,0,1,0,0,0,0,0,0
4,MAKE AMERICA GREAT AGAIN!,make america great again,134210.0,36346.0,243.326476,0.005714,0.000023,0.8622,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2214,If the person placed very early into my campai...,if the person placed very early into my campai...,78529.0,20098.0,271.930000,0.890000,0.003284,-0.6537,0,0,0,0,0,0,0,1,0,0
2215,So General Michael Flynn’s life can be totally...,so general michael flynn’s life can be totally...,93569.0,25259.0,267.025000,0.205000,0.000768,0.8221,1,0,0,0,0,0,0,0,0,0
2216,"My thoughts, prayers and condolences are with ...",my thoughts prayers and condolences are with t...,62645.0,16081.0,269.070000,0.450000,0.001675,0.6476,0,0,1,0,0,0,0,0,0,0
2217,Today’s Court decision means that Congress mus...,today’s court decision means that congress mus...,56749.0,12426.0,270.695000,0.095000,0.000351,-0.3365,0,0,1,0,0,0,0,0,0,0


In [5]:
# Define X and Y for linear regression
x = df_regression[['favorites', 'retweets', 'compound', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
y = df_regression['60mins_price_diff_abs'].values

In [6]:
# Print Pearson Correlation Coefficient of X variables
print('Pearson Correlation Coefficient of variables in X\n')
print(x.corr())

Pearson Correlation Coefficient of variables in X

           favorites  retweets  compound         0         1         2  \
favorites   1.000000  0.892556 -0.040830 -0.033172 -0.030713  0.009656   
retweets    0.892556  1.000000 -0.018324 -0.016586 -0.026447  0.000676   
compound   -0.040830 -0.018324  1.000000  0.210746 -0.032772 -0.006489   
0          -0.033172 -0.016586  0.210746  1.000000 -0.160192 -0.170120   
1          -0.030713 -0.026447 -0.032772 -0.160192  1.000000 -0.122802   
2           0.009656  0.000676 -0.006489 -0.170120 -0.122802  1.000000   
3           0.024583  0.009192 -0.220925 -0.170120 -0.122802 -0.130413   
4          -0.004808  0.006008  0.015526 -0.088107 -0.063601 -0.067543   
5           0.034808  0.030901  0.054842 -0.155884 -0.112526 -0.119500   
6          -0.007741 -0.002611  0.021454 -0.147450 -0.106438 -0.113034   
7           0.013519 -0.004824 -0.009190 -0.120370 -0.086890 -0.092275   
8          -0.013643 -0.000750  0.148933 -0.171618 -0.123884 

In [7]:
# Import linear regression models
from sklearn import linear_model, model_selection, metrics, preprocessing

In [8]:
# Split data into training and test sets
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.2, random_state = 2020)

# Normalize all values
x_train = preprocessing.scale(x_train)
y_train = preprocessing.scale(y_train)
x_test = preprocessing.scale(x_test)
y_test = preprocessing.scale(y_test)

# Linear regression
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)

print('Linear metrics for stock price 60 mins after tweet')
print('Linear score:', regr.score(x_test, y_test))
print(regr.coef_, '\n')

y_pred = regr.predict(x_test)
print('-'*64)

num_C = 5
C = [1.0] * num_C
for i in range(num_C):
    C[i] = pow(10, i-5)
print('Ridge and Lasso metrics for stock price 60 mins after tweet')
for i in range(5):
    
    # Lasso Regression
    lasso = linear_model.Lasso(alpha = C[i])
    lasso.fit(x_train, y_train)
    print('Alpha = ', C[i])
    print('Lasso score:', lasso.score(x_test, y_test))
    print('Lasso coefs:', lasso.coef_, '\n')
    
    # Ridge regression
    ridge = linear_model.Ridge(alpha = C[i])
    ridge.fit(x_train, y_train)
    print('Ridge score:', ridge.score(x_test, y_test))
    print('Ridge coefs:', ridge.coef_, '\n')
    print('-'*64)


Linear metrics for stock price 60 mins after tweet
Linear score: -0.005088507398341768
[-7.22489739e-03  1.39771296e-02  1.85412432e-02 -5.12660444e-05
 -8.62324443e-03  4.47878056e-02 -2.31753783e-02  1.23409955e-03
  3.39899070e-02 -4.22674289e-02  6.43438788e-03 -1.85238283e-02
  8.77602119e-03] 

----------------------------------------------------------------
Ridge and Lasso metrics for stock price 60 mins after tweet
Alpha =  1e-05
Lasso score: -0.005084290583208251
Lasso coefs: [-0.00713974  0.01389269  0.01852917 -0.         -0.00857814  0.04481515
 -0.02313124  0.00124809  0.03401651 -0.04222455  0.00645156 -0.01847414
  0.00879543] 

Ridge score: -0.005088507358560257
Ridge coefs: [-7.22489688e-03  1.39771291e-02  1.85412431e-02 -5.12634368e-05
 -8.62324236e-03  4.47878075e-02 -2.31753761e-02  1.23410079e-03
  3.39899087e-02 -4.22674268e-02  6.43438941e-03 -1.85238260e-02
  8.77602291e-03] 

----------------------------------------------------------------
Alpha =  0.0001
Lass