In [1]:
%matplotlib inline
# Dependencies and Setup
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.tree import export_graphviz
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

Read all files as indicated in subsequent cells containing data for labels and NLP formulated features of our project.

In [2]:
L_cap = pd.read_csv("./data/stock_spy.csv")
L_cap = L_cap.loc[L_cap['date']>'2016-01-01']
L_cap = L_cap.loc[:, ["date", "change"]]
L_cap.head(1)

Unnamed: 0,date,change
4068,2016-01-04,0.002644


In [3]:
M_cap = pd.read_csv("./data/stock_mdy.csv")
M_cap = M_cap.loc[M_cap['date']>'2016-01-01']
M_cap = M_cap.loc[:, ["date", "change"]]
M_cap.head(1)

Unnamed: 0,date,change
4068,2016-01-04,0.003601


In [4]:
S_cap = pd.read_csv("./data/stock_ijr.csv")
S_cap = S_cap.loc[S_cap['date']>'2016-01-01']
S_cap = S_cap.loc[:, ["date", "change"]]
S_cap.head(1)

Unnamed: 0,date,change
3924,2016-01-04,-0.006651


In [5]:
powell_econ = pd.read_csv("./data/powell_daydf_count.csv")
powell_econ.head(1)

Unnamed: 0,Date,Count
0,2021-02-10,11


In [6]:
powell_sentbyday = pd.read_csv("./data/powell_sentbydaydf_count.csv")
powell_sentbyday.head(1)

Unnamed: 0,Date,Sent_Count
0,2021-03-18,10


In [7]:
powell_sentbydaypos = pd.read_csv("./data/powell_sentbydayposdf_count.csv")
powell_sentbydaypos.head(1)

Unnamed: 0,Date,Pos_Count
0,2021-03-18,18


In [8]:
powell_sentbydayneg = pd.read_csv("./data/powell_sentbydaynegdf_count.csv")
powell_sentbydayneg.head(1)

Unnamed: 0,Date,Neg_Count
0,2021-03-18,8


Join data frames and create classification labels.

In [10]:
powell_mkt = L_cap.merge(powell_sentbydayneg, how='inner', left_on="date" ,right_on="Date")
powell_mkt = powell_mkt.merge(powell_sentbydaypos, how='inner' ,on="Date")
powell_mkt =  powell_mkt.merge(powell_sentbyday, how='inner' ,on="Date")
powell_mkt =  powell_mkt.merge(powell_econ, how='inner' ,on="Date")
powell_mkt['pos_neg'] = np.where (powell_mkt['change'] > 0,1,0)
powell_mkt['pos'] = np.where (powell_mkt['change'] > 0.002,1,0)
powell_mkt['neg'] = np.where (powell_mkt['change'] < -0.002,1,0)
powell_mkt['flat'] = np.where ((powell_mkt['change'] <= 0.002) & (powell_mkt['change'] >= -0.002) ,1,0)

powell_mkt.head()

Unnamed: 0,date,change,Date,Neg_Count,Pos_Count,Sent_Count,Count,pos_neg,pos,neg,flat
0,2016-02-26,-0.007529,2016-02-26,20,15,-5,3,0,0,1,0
1,2016-05-26,-0.000477,2016-05-26,59,61,2,20,0,0,0,1
2,2016-06-21,0.000672,2016-06-21,15,5,-10,8,1,0,0,1
3,2016-06-28,0.008537,2016-06-28,67,72,5,40,1,1,0,0
4,2016-09-29,-0.007948,2016-09-29,57,51,-6,4,0,0,1,0


Add % changes of fundamental factors as features.  Since they are monthly released data, the values of subsequent daily rows after each releasing date would be equal to their prior row until they reach the new releasing date.

In [18]:
powell_rgdp = pd.read_csv("./Data/RGDP.csv")
powell_rgdp.head()
powell_rgdp['Date']=pd.to_datetime(powell_rgdp['Date'])

In [19]:
powell_mkt['growthgap']=np.nan
for x in range(len(powell_rgdp.index)-1):
   powell_mkt.loc[((powell_mkt['date']>= powell_rgdp.loc[x][0]) & (powell_mkt['date']< powell_rgdp.loc[x+1][0])),['growthgap']]=powell_rgdp.loc[x][3]
powell_mkt.head()

Unnamed: 0,date,change,Date,Neg_Count,Pos_Count,Sent_Count,Count,pos_neg,pos,neg,flat,growthgap,job_change
0,2016-02-26,-0.007529,2016-02-26,20,15,-5,3,0,0,1,0,-0.4,108.0
1,2016-05-26,-0.000477,2016-05-26,59,61,2,20,0,0,0,1,1.8,197.0
2,2016-06-21,0.000672,2016-06-21,15,5,-10,8,1,0,0,1,1.5,41.0
3,2016-06-28,0.008537,2016-06-28,67,72,5,40,1,1,0,0,1.2,41.0
4,2016-09-29,-0.007948,2016-09-29,57,51,-6,4,0,0,1,0,-0.1,143.0


In [16]:
powell_empsit = pd.read_csv("./Data/empsit.csv")
powell_empsit.head()
powell_mkt['date'] = pd.to_datetime(powell_mkt['date'])

In [17]:
powell_mkt['job_change']=np.nan
for x in range(len(powell_empsit.index)-1):
   powell_mkt.loc[((powell_mkt['date']>= powell_empsit.loc[x][0]) & (powell_mkt['date']< powell_empsit.loc[x+1][0])),['job_change']]=powell_empsit.loc[x][1]
#    print(wsb_mkt.loc[(wsb_mkt['date']>= wsb_empsit.loc[x][0]) & (wsb_mkt['date']< wsb_empsit.loc[x+1][0])]['job_change'])
powell_mkt.head()

Unnamed: 0,date,change,Date,Neg_Count,Pos_Count,Sent_Count,Count,pos_neg,pos,neg,flat,growthgap,job_change
0,2016-02-26,-0.007529,2016-02-26,20,15,-5,3,0,0,1,0,,108.0
1,2016-05-26,-0.000477,2016-05-26,59,61,2,20,0,0,0,1,,197.0
2,2016-06-21,0.000672,2016-06-21,15,5,-10,8,1,0,0,1,,41.0
3,2016-06-28,0.008537,2016-06-28,67,72,5,40,1,1,0,0,,41.0
4,2016-09-29,-0.007948,2016-09-29,57,51,-6,4,0,0,1,0,,143.0


Create lag values to add to features given that signals may adjust in lagged time.

In [23]:
powell_mkt['negcount1day'] = powell_mkt['Neg_Count'].shift(1)
powell_mkt['poscount1day'] = powell_mkt['Pos_Count'].shift(1)
#powell_mkt['sentcount1day'] = powell_mkt['Sent_Count'].shift(1)
powell_mkt['econcount1day'] = powell_mkt['Count'].shift(1)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(1)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(1)
powell_mkt['change1day'] = powell_mkt['change'].shift(1)
powell_mkt['negcount2day'] = powell_mkt['Neg_Count'].shift(2)
powell_mkt['poscount2day'] = powell_mkt['Pos_Count'].shift(2)
#powell_mkt['sentcount2day'] = powell_mkt['Sent_Count'].shift(2)
powell_mkt['econcount2day'] = powell_mkt['Count'].shift(2)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(2)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(2)
powell_mkt['change2day'] = powell_mkt['change'].shift(2)
powell_mkt['negcount3day'] = powell_mkt['Neg_Count'].shift(3)
powell_mkt['poscount3day'] = powell_mkt['Pos_Count'].shift(3)
#powell_mkt['sentcount3day'] = powell_mkt['Sent_Count'].shift(3)
powell_mkt['econcount3day'] = powell_mkt['Count'].shift(3)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(3)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(3)
powell_mkt['change3day'] = powell_mkt['change'].shift(3)
powell_mkt['negcount4day'] = powell_mkt['Neg_Count'].shift(4)
powell_mkt['poscount4day'] = powell_mkt['Pos_Count'].shift(4)
#powell_mkt['sentcount4day'] = powell_mkt['Sent_Count'].shift(4)
powell_mkt['econcount4day'] = powell_mkt['Count'].shift(4)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(4)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(4)
powell_mkt['change4day'] = powell_mkt['change'].shift(4)
powell_mkt['negcount5day'] = powell_mkt['Neg_Count'].shift(5)
powell_mkt['poscount5day'] = powell_mkt['Pos_Count'].shift(5)
#powell_mkt['sentcount5day'] = powell_mkt['Sent_Count'].shift(5)
powell_mkt['econcount5day'] = powell_mkt['Count'].shift(5)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(5)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(5)
powell_mkt['change5day'] = powell_mkt['change'].shift(5)
powell_mkt['negcount6day'] = powell_mkt['Neg_Count'].shift(6)
powell_mkt['poscount6day'] = powell_mkt['Pos_Count'].shift(6)
#powell_mkt['sentcount6day'] = powell_mkt['Sent_Count'].shift(6)
powell_mkt['econcount6day'] = powell_mkt['Count'].shift(6)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(6)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(6)
powell_mkt['change6day'] = powell_mkt['change'].shift(6)
powell_mkt['negcount7day'] = powell_mkt['Neg_Count'].shift(7)
powell_mkt['poscount7day'] = powell_mkt['Pos_Count'].shift(7)
#powell_mkt['sentcount7day'] = powell_mkt['Sent_Count'].shift(7)
powell_mkt['econcount7day'] = powell_mkt['Count'].shift(7)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(7)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(7)
powell_mkt['change7day'] = powell_mkt['change'].shift(7)
powell_mkt['negcount8day'] = powell_mkt['Neg_Count'].shift(8)
powell_mkt['poscount8day'] = powell_mkt['Pos_Count'].shift(8)
#powell_mkt['sentcount8day'] = powell_mkt['Sent_Count'].shift(8)
powell_mkt['econcount8day'] = powell_mkt['Count'].shift(8)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(8)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(8)
powell_mkt['change8day'] = powell_mkt['change'].shift(8)
powell_mkt['negcount9day'] = powell_mkt['Neg_Count'].shift(9)
powell_mkt['poscount9day'] = powell_mkt['Pos_Count'].shift(9)
#powell_mkt['sentcount9day'] = powell_mkt['Sent_Count'].shift(9)
powell_mkt['econcount9day'] = powell_mkt['Count'].shift(9)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(9)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(9)
powell_mkt['change9day'] = powell_mkt['change'].shift(9)
powell_mkt['negcount10day'] = powell_mkt['Neg_Count'].shift(10)
powell_mkt['poscount10day'] = powell_mkt['Pos_Count'].shift(10)
#powell_mkt['sentcount10day'] = powell_mkt['Sent_Count'].shift(10)
powell_mkt['econcount10day'] = powell_mkt['Count'].shift(10)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(10)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(10)
powell_mkt['change10day'] = powell_mkt['change'].shift(10)
powell_mkt['negcount11day'] = powell_mkt['Neg_Count'].shift(11)
powell_mkt['poscount11day'] = powell_mkt['Pos_Count'].shift(11)
#powell_mkt['sentcount11day'] = powell_mkt['Sent_Count'].shift(11)
powell_mkt['econcount11day'] = powell_mkt['Count'].shift(11)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(11)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(11)
powell_mkt['change11day'] = powell_mkt['change'].shift(11)
powell_mkt['negcount12day'] = powell_mkt['Neg_Count'].shift(12)
powell_mkt['poscount12day'] = powell_mkt['Pos_Count'].shift(12)
#powell_mkt['sentcount12day'] = powell_mkt['Sent_Count'].shift(12)
powell_mkt['econcount12day'] = powell_mkt['Count'].shift(12)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(12)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(12)
powell_mkt['change12day'] = powell_mkt['change'].shift(12)
powell_mkt['negcount13day'] = powell_mkt['Neg_Count'].shift(13)
powell_mkt['poscount13day'] = powell_mkt['Pos_Count'].shift(13)
#powell_mkt['sentcount13day'] = powell_mkt['Sent_Count'].shift(13)
powell_mkt['econcount13day'] = powell_mkt['Count'].shift(13)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(13)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(13)
powell_mkt['change13day'] = powell_mkt['change'].shift(13)
powell_mkt['negcount14day'] = powell_mkt['Neg_Count'].shift(14)
powell_mkt['poscount14day'] = powell_mkt['Pos_Count'].shift(14)
#powell_mkt['sentcount14day'] = powell_mkt['Sent_Count'].shift(14)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(14)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(14)
powell_mkt['econcount14day'] = powell_mkt['change'].shift(14)
powell_mkt['negcount15day'] = powell_mkt['Neg_Count'].shift(15)
powell_mkt['poscount15day'] = powell_mkt['Pos_Count'].shift(15)
#powell_mkt['sentcount15day'] = powell_mkt['Sent_Count'].shift(15)
powell_mkt['econcount15day'] = powell_mkt['Count'].shift(15)
powell_mkt['job_change1day'] = powell_mkt['job_change'].shift(15)
powell_mkt['growthgap1day'] = powell_mkt['growthgap'].shift(15)
powell_mkt['change15day'] = powell_mkt['change'].shift(15)
powell_mkt

Unnamed: 0,date,change,Date,Neg_Count,Pos_Count,Sent_Count,Count,pos_neg,pos,neg,...,poscount13day,econcount13day,change13day,negcount14day,poscount14day,econcount14day,negcount15day,poscount15day,econcount15day,change15day
0,2016-02-26,-0.007529,2016-02-26,20,15,-5,3,0,0,1,...,,,,,,,,,,
1,2016-05-26,-0.000477,2016-05-26,59,61,2,20,0,0,0,...,,,,,,,,,,
2,2016-06-21,0.000672,2016-06-21,15,5,-10,8,1,0,0,...,,,,,,,,,,
3,2016-06-28,0.008537,2016-06-28,67,72,5,40,1,1,0,...,,,,,,,,,,
4,2016-09-29,-0.007948,2016-09-29,57,51,-6,4,0,0,1,...,,,,,,,,,,
5,2016-10-24,-0.000512,2016-10-24,8,2,-6,4,0,0,0,...,,,,,,,,,,
6,2016-11-18,-0.002602,2016-11-18,33,36,3,20,0,0,1,...,,,,,,,,,,
7,2016-11-29,0.001769,2016-11-29,49,47,-2,23,1,0,0,...,,,,,,,,,,
8,2016-11-30,-0.00564,2016-11-30,15,23,8,4,0,0,1,...,,,,,,,,,,
9,2017-02-22,0.001102,2017-02-22,12,13,1,3,1,0,0,...,,,,,,,,,,


In [24]:
powell_mkt.to_csv('./Data/powell_lmkt.csv', index=False)

In [25]:
for x in powell_mkt.columns:
    print(x)

date
change
Date
Neg_Count
Pos_Count
Sent_Count
Count
pos_neg
pos
neg
flat
growthgap
job_change
negcount1day
poscount1day
econcount1day
job_change1day
growthgap1day
change1day
negcount2day
poscount2day
econcount2day
change2day
negcount3day
poscount3day
econcount3day
change3day
negcount4day
poscount4day
econcount4day
change4day
negcount5day
poscount5day
econcount5day
change5day
negcount6day
poscount6day
econcount6day
change6day
negcount7day
poscount7day
econcount7day
change7day
negcount8day
poscount8day
econcount8day
change8day
negcount9day
poscount9day
econcount9day
change9day
negcount10day
poscount10day
econcount10day
change10day
negcount11day
poscount11day
econcount11day
change11day
negcount12day
poscount12day
econcount12day
change12day
negcount13day
poscount13day
econcount13day
change13day
negcount14day
poscount14day
econcount14day
negcount15day
poscount15day
econcount15day
change15day


Machine Learning Models

In [69]:
powell_mkt.dropna(inplace=True)
powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 11:13]] #1 %Change
#powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 11:13]] #2 Pos_Count, Neg_Count, Count, job_change, growthgap
#powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7]] #3 Neg_Count, Pos_Count, Count

powell_lmkt_x.set_index("date", inplace=True)
powell_lmkt_y1 = powell_mkt['pos_neg']
powell_lmkt_y2 = powell_mkt[['pos','neg','flat']]
powell_lmkt_y3 = powell_mkt[['change']]
powell_lmkt_x.head()
#powell_lmkt_x.to_csv('./Data/powell_lmkt_x.csv', index=True)
#powell_lmkt_y1.to_csv('./Data/powell_lmkt_y1.csv', index=False)

Unnamed: 0_level_0,Neg_Count,Pos_Count,Count,growthgap,job_change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-06-26,26,33,2,1.1,141.0
2017-07-06,58,39,6,0.9,141.0
2017-10-05,14,18,12,-1.4,190.0
2017-10-12,31,22,5,-1.4,42.0
2017-10-18,22,47,2,-1.4,42.0


In [70]:
X_train, X_test,y_train,y_test = train_test_split(powell_lmkt_x, powell_lmkt_y3, test_size = 0.33,random_state=42)
regr = RandomForestRegressor()
regr.fit(X_train,y_train)
print(regr.score(X_test, y_test))

#1 %Change

ridge = Ridge()
ridge.fit(X_train,y_train)
print(ridge.score(X_test, y_test))
# regr.predict_proba(X_test)

-0.5913638757571293
-0.1093561976849664


  This is separate from the ipykernel package so we can avoid doing imports until


In [71]:
powell_mkt.dropna(inplace=True)
#powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 11:13]] #1 %Change
powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 11:13]] #2 Pos_Count, Neg_Count, Count, job_change, growthgap
#powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7]] #3 Pos_Count, Neg_Count, Count

powell_lmkt_x.set_index("date", inplace=True)
powell_lmkt_y1 = powell_mkt['pos_neg']
powell_lmkt_y2 = powell_mkt[['pos','neg','flat']]
powell_lmkt_y3 = powell_mkt[['change']]
powell_lmkt_x.head()
#powell_lmkt_x.to_csv('./Data/powell_lmkt_x.csv', index=True)
#powell_lmkt_y1.to_csv('./Data/powell_lmkt_y1.csv', index=False)

Unnamed: 0_level_0,Neg_Count,Pos_Count,Count,growthgap,job_change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-06-26,26,33,2,1.1,141.0
2017-07-06,58,39,6,0.9,141.0
2017-10-05,14,18,12,-1.4,190.0
2017-10-12,31,22,5,-1.4,42.0
2017-10-18,22,47,2,-1.4,42.0


In [72]:
X_train, X_test,y_train,y_test = train_test_split(powell_lmkt_x, powell_lmkt_y1, test_size = 0.33,random_state=42)
regr = RandomForestRegressor()
regr.fit(X_train,y_train)
print(regr.score(X_test, y_test))

#2 Pos_Count, Neg_Count, Count, job_change, growthgap

ridge = Ridge()
ridge.fit(X_train,y_train)
print(ridge.score(X_test, y_test))
# regr.predict_proba(X_test)

-0.18666666666666676
-0.14468731713005023




In [73]:
powell_mkt.dropna(inplace=True)
#powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 11:13]] #1 %Change
#powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 11:13]] #2 Pos_Count, Neg_Count, Count, job_change, growthgap
powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7]] #3 Pos_Count, Neg_Count, Count

powell_lmkt_x.set_index("date", inplace=True)
powell_lmkt_y1 = powell_mkt['pos_neg']
powell_lmkt_y2 = powell_mkt[['pos','neg','flat']]
powell_lmkt_y3 = powell_mkt[['change']]
powell_lmkt_x.head()
#powell_lmkt_x.to_csv('./Data/powell_lmkt_x.csv', index=True)
#powell_lmkt_y1.to_csv('./Data/powell_lmkt_y1.csv', index=False)

Unnamed: 0_level_0,Neg_Count,Pos_Count,Count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-26,26,33,2
2017-07-06,58,39,6
2017-10-05,14,18,12
2017-10-12,31,22,5
2017-10-18,22,47,2


In [74]:
X_train, X_test,y_train,y_test = train_test_split(powell_lmkt_x, powell_lmkt_y1, test_size = 0.33,random_state=42)
regr = RandomForestRegressor()
regr.fit(X_train,y_train)
print(regr.score(X_test, y_test))

#3 Pos_Count, Neg_Count, Count

ridge = Ridge()
ridge.fit(X_train,y_train)
print(ridge.score(X_test, y_test))
# regr.predict_proba(X_test)

-0.08000000000000007
-0.08793750936453315




In [100]:
powell_mkt.dropna(inplace=True)
powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 13:16, 19:74]] #4 with 15 lags
#powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:4, 6:7, 11:12, 13:14, 22:23, 27:30,]] #5 with feature important lags
powell_lmkt_x = powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 13:16, 18:19]] #6 with 1 lag

powell_lmkt_x.set_index("date", inplace=True)
powell_lmkt_y1 = powell_mkt['pos_neg']
powell_lmkt_y2 = powell_mkt[['pos','neg','flat']]
powell_lmkt_y3 = powell_mkt[['change']]
powell_lmkt_x.head()
#powell_lmkt_x.to_csv('./Data/powell_lmkt_x.csv', index=True)
#powell_lmkt_y1.to_csv('./Data/powell_lmkt_y1.csv', index=False)

Unnamed: 0_level_0,Neg_Count,Pos_Count,Count,negcount1day,poscount1day,econcount1day,change1day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-06-26,26,33,2,65.0,41.0,10.0,0.000906
2017-07-06,58,39,6,26.0,33.0,2.0,-0.002501
2017-10-05,14,18,12,58.0,39.0,6.0,-0.00554
2017-10-12,31,22,5,14.0,18.0,12.0,0.004417
2017-10-18,22,47,2,31.0,22.0,5.0,-7.9e-05


In [101]:
X_train, X_test,y_train,y_test = train_test_split(powell_lmkt_x, powell_lmkt_y1, test_size = 0.33,random_state=42)
regr = RandomForestRegressor()
regr.fit(X_train,y_train)
print(regr.score(X_test, y_test))

#5 with 1 lag

ridge = Ridge()
ridge.fit(X_train,y_train)
print(ridge.score(X_test, y_test))
# regr.predict_proba(X_test)

sorted(zip(regr.feature_importances_, powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 13:16, 18:19]]), reverse=True)

0.1133333333333334
0.05270388450256058




[(0.4472818863723842, 'negcount1day'),
 (0.25882113010322805, 'date'),
 (0.1081058260624824, 'econcount1day'),
 (0.0956418557351815, 'Pos_Count'),
 (0.03929320928974908, 'poscount1day'),
 (0.02781045751633987, 'Neg_Count'),
 (0.023045634920634918, 'Count')]

In [99]:
X_train, X_test,y_train,y_test = train_test_split(powell_lmkt_x, powell_lmkt_y1, test_size = 0.33,random_state=42)
regr = RandomForestRegressor()
regr.fit(X_train,y_train)
print(regr.score(X_test, y_test))

#4 with 15 lags

ridge = Ridge()
ridge.fit(X_train,y_train)
print(ridge.score(X_test, y_test))
# regr.predict_proba(X_test)

sorted(zip(regr.feature_importances_, powell_mkt.iloc[:, np.r_[0, 3:5, 6:7, 13:16, 19:74]]), reverse=True)

0.17666666666666675
0.20474098366424975




[(0.4800925301605624, 'negcount1day'),
 (0.1, 'change2day'),
 (0.09038699690402476, 'poscount7day'),
 (0.08203125000000001, 'change10day'),
 (0.08174019607843137, 'change5day'),
 (0.052083333333333336, 'date'),
 (0.029487179487179483, 'poscount12day'),
 (0.01796875, 'econcount4day'),
 (0.01796875, 'econcount12day'),
 (0.016428571428571424, 'econcount15day'),
 (0.01597222222222222, 'negcount3day'),
 (0.015840220385674945, 'negcount11day'),
 (0.0, 'poscount9day'),
 (0.0, 'poscount8day'),
 (0.0, 'poscount6day'),
 (0.0, 'poscount5day'),
 (0.0, 'poscount4day'),
 (0.0, 'poscount3day'),
 (0.0, 'poscount2day'),
 (0.0, 'poscount1day'),
 (0.0, 'poscount15day'),
 (0.0, 'poscount14day'),
 (0.0, 'poscount13day'),
 (0.0, 'poscount11day'),
 (0.0, 'poscount10day'),
 (0.0, 'negcount9day'),
 (0.0, 'negcount8day'),
 (0.0, 'negcount7day'),
 (0.0, 'negcount6day'),
 (0.0, 'negcount5day'),
 (0.0, 'negcount4day'),
 (0.0, 'negcount2day'),
 (0.0, 'negcount15day'),
 (0.0, 'negcount14day'),
 (0.0, 'negcount13day'