In [None]:
%matplotlib inline
# Dependencies and Setup
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.tree import export_graphviz
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.widgets import Slider
import seaborn as sns; sns.set()
import ppscore as pps
import seaborn as seabornInstance



In [None]:
import warnings

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas / scikit-learn deprecation notices;
# consider narrowing the filter once the notebook is stable.
warnings.simplefilter('ignore')

Read the input files in the following cells: the price data used to build the labels, and the NLP-derived features for this project.

In [None]:
# SPY price file: keep rows dated after 2016-01-01 and only the date/change columns.
# The cut-off is a plain comparison against the raw `date` column (no date parsing
# is requested from read_csv) — presumably ISO "YYYY-MM-DD" strings; TODO confirm.
L_cap = (
    pd.read_csv("./data/stock_spy.csv")
    .loc[lambda prices: prices['date'] > '2016-01-01', ["date", "change"]]
)
L_cap.head(1)

In [None]:
# MDY price file: keep rows dated after 2016-01-01 and only the date/change columns.
# The cut-off is a plain comparison against the raw `date` column (no date parsing
# is requested from read_csv) — presumably ISO "YYYY-MM-DD" strings; TODO confirm.
M_cap = (
    pd.read_csv("./data/stock_mdy.csv")
    .loc[lambda prices: prices['date'] > '2016-01-01', ["date", "change"]]
)
M_cap.head(1)

In [None]:
# IJR price file: keep rows dated after 2016-01-01 and only the date/change columns.
# The cut-off is a plain comparison against the raw `date` column (no date parsing
# is requested from read_csv) — presumably ISO "YYYY-MM-DD" strings; TODO confirm.
S_cap = (
    pd.read_csv("./data/stock_ijr.csv")
    .loc[lambda prices: prices['date'] > '2016-01-01', ["date", "change"]]
)
S_cap.head(1)

In [None]:
# Per-day count file; later cells join it on a `Date` column.
# Presumably the source of the `Count` feature — verify against the CSV header.
powell_econ = pd.read_csv(filepath_or_buffer="./data/powell_daydf_count.csv")
powell_econ.iloc[:1]  # preview the first row

In [None]:
# Per-day sentiment count file; later cells join it on a `Date` column.
# Presumably the source of the `Sent_Count` feature — verify against the CSV header.
powell_sentbyday = pd.read_csv(filepath_or_buffer="./data/powell_sentbydaydf_count.csv")
powell_sentbyday.iloc[:1]  # preview the first row

In [None]:
# Per-day positive-sentiment count file; later cells join it on a `Date` column.
# Presumably the source of the `Pos_Count` feature — verify against the CSV header.
powell_sentbydaypos = pd.read_csv(filepath_or_buffer="./data/powell_sentbydayposdf_count.csv")
powell_sentbydaypos.iloc[:1]  # preview the first row

In [None]:
# Per-day negative-sentiment count file; later cells join it on a `Date` column.
# Presumably the source of the `Neg_Count` feature — verify against the CSV header.
powell_sentbydayneg = pd.read_csv(filepath_or_buffer="./data/powell_sentbydaynegdf_count.csv")
powell_sentbydayneg.iloc[:1]  # preview the first row

Join data frames and create classification labels.

In [None]:
# Inner-join the price frame with each Powell count frame on the date key, so only
# dates present in every input survive.
# NOTE(review): the base frame is M_cap (stock_mdy.csv) although the artefacts
# written later are named "lmkt" / "LCap" — confirm which index is intended here.
powell_mkt = M_cap.merge(powell_sentbydayneg, how='inner', left_on="date", right_on="Date")
for count_frame in (powell_sentbydaypos, powell_sentbyday, powell_econ):
    powell_mkt = powell_mkt.merge(count_frame, how='inner', on="Date")

# Direction labels derived from the daily change:
#   pos_neg - 1 when the change is above zero, else 0
#   pos / neg / flat - 1 when the change is above, below, or within +/-FLAT_BAND
FLAT_BAND = 0.002
daily_change = powell_mkt['change']
powell_mkt['pos_neg'] = np.where(daily_change > 0, 1, 0)
powell_mkt['pos'] = np.where(daily_change > FLAT_BAND, 1, 0)
powell_mkt['neg'] = np.where(daily_change < -FLAT_BAND, 1, 0)
powell_mkt['flat'] = np.where(daily_change.between(-FLAT_BAND, FLAT_BAND), 1, 0)

powell_mkt.head()


Add the % changes of fundamental factors as features. Because these figures are released periodically rather than daily, each daily row carries the value from the most recent release date, and the value only changes when the next release date is reached.

In [None]:
# Load the RGDP release table and convert the market frame's `date` column to
# datetimes for the date-window comparisons in the next cell.
# NOTE(review): this path uses "./Data" while the price/count files are read from
# "./data" — one of the two will fail on a case-sensitive filesystem; confirm.
powell_rgdp = pd.read_csv("./Data/RGDP.csv")
powell_rgdp.head()  # not the last expression of the cell, so nothing is displayed
powell_mkt = powell_mkt.assign(date=pd.to_datetime(powell_mkt['date']))

In [None]:
# Stamp every market row with the RGDP value in force on that date: rows dated in
# [release x, release x+1) receive the value published at release x.
# Columns are addressed by position, as in the original code: column 0 is used as
# the release date and column 3 as the value (presumably the growth gap — verify
# against the CSV header). Explicit .iloc replaces the deprecated integer
# fallback of `row[0]` on a label-indexed Series.
powell_mkt['growthgap'] = np.nan
release_dates = powell_rgdp.iloc[:, 0]
release_values = powell_rgdp.iloc[:, 3]
for x in range(len(powell_rgdp.index) - 1):
    in_window = (powell_mkt['date'] >= release_dates.iloc[x]) & (powell_mkt['date'] < release_dates.iloc[x + 1])
    powell_mkt.loc[in_window, 'growthgap'] = release_values.iloc[x]
# NOTE(review): rows dated on/after the final release match no window and stay NaN,
# so the later dropna() removes them — confirm this truncation is intended.
powell_mkt.head(1)

In [None]:
# Load the employment-situation release table and make sure the market frame's
# `date` column is datetime-typed for the date-window comparisons in the next cell.
# NOTE(review): this path uses "./Data" while the price/count files are read from
# "./data" — one of the two will fail on a case-sensitive filesystem; confirm.
powell_empsit = pd.read_csv("./Data/empsit.csv")
powell_empsit.head()  # not the last expression of the cell, so nothing is displayed
powell_mkt = powell_mkt.assign(date=pd.to_datetime(powell_mkt['date']))

In [None]:
# Stamp every market row with the employment figure in force on that date: rows
# dated in [release x, release x+1) receive the value published at release x.
# Columns are addressed by position, as in the original code: column 0 is used as
# the release date and column 1 as the value (presumably the job change — verify
# against the CSV header). Explicit .iloc replaces the deprecated integer
# fallback of `row[0]` on a label-indexed Series.
powell_mkt['job_change'] = np.nan
release_dates = powell_empsit.iloc[:, 0]
release_values = powell_empsit.iloc[:, 1]
for x in range(len(powell_empsit.index) - 1):
    in_window = (powell_mkt['date'] >= release_dates.iloc[x]) & (powell_mkt['date'] < release_dates.iloc[x + 1])
    powell_mkt.loc[in_window, 'job_change'] = release_values.iloc[x]
# NOTE(review): rows dated on/after the final release match no window and stay NaN,
# so the later dropna() removes them — confirm this truncation is intended.
powell_mkt.head(1)

In [None]:
# One stacked subplot per series (label, counts, fundamentals), indexed by date.
timeseries_columns = [
    "date",
    "change",
    "Neg_Count",
    "Pos_Count",
    "Sent_Count",
    "Count",
    "job_change",
    "growthgap",
]
powell_mkt_hm1 = powell_mkt.loc[:, timeseries_columns].set_index('date')

axes = powell_mkt_hm1.plot(subplots=True, figsize=(16, 16))
# Adjust the margin on the figure pandas just created. Calling
# plt.subplots_adjust() before plotting only touched an implicit empty figure.
axes[0].figure.subplots_adjust(bottom=0.25)
plt.legend(loc='best')

In [None]:
# Predictive Power Score heatmap for the label and the same-day features.
# NOTE(review): passing pps.matrix() straight to sns.heatmap assumes it returns a
# square score frame; newer ppscore releases reportedly return a long-format
# table that must be pivoted first — confirm against the installed version.
pps_columns = [
    "pos_neg",
    "Neg_Count",
    "Pos_Count",
    "Sent_Count",
    "Count",
    "job_change",
    "growthgap",
]
powell_mkt_hm1 = powell_mkt.loc[:, pps_columns]

ax = sns.heatmap(pps.matrix(powell_mkt_hm1), vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True, fmt=".3f")
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
# Extend the y-limits by half a cell on each side (presumably a workaround for
# heatmap rows being clipped by some matplotlib versions — confirm still needed).
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

# Save Figure. savefig() has no `figsize` argument (current matplotlib rejects
# unknown keywords), and the stray ";" line that was here is a SyntaxError.
plt.savefig('./Images/powell_LCap_PPS_Matrix.png')

# Show plot
plt.show()

Create lagged copies of the features and of the price change, since the market may react to these signals with a delay.

In [None]:
# Lagged copies of every signal, named "<prefix><k>day" for k = 1..MAX_LAG_DAYS.
# shift(k) moves values down by k ROWS of the merged frame, so a "day" here is one
# retained row, not necessarily one calendar day.
#
# Generating the columns in a loop fixes two copy-paste defects of the hand-written
# version: `econcount14day` was built from `change` instead of `Count`, and the
# lag-14 columns were created in a different order from every other lag.
LAG_SOURCES = [
    # (new-column prefix, source column)
    ("negcount", "Neg_Count"),
    ("poscount", "Pos_Count"),
    ("sentcount", "Sent_Count"),
    ("econcount", "Count"),
    ("job_change", "job_change"),
    ("growthgap", "growthgap"),
    ("change", "change"),
]
MAX_LAG_DAYS = 15

for lag in range(1, MAX_LAG_DAYS + 1):
    for prefix, source_column in LAG_SOURCES:
        powell_mkt[f"{prefix}{lag}day"] = powell_mkt[source_column].shift(lag)
powell_mkt.head(1)

In [None]:
# Persist the merged frame (labels, features, and lag columns) without the row index.
lagged_csv_path = './Data/powell_lmkt.csv'
powell_mkt.to_csv(lagged_csv_path, index=False)

In [None]:
# List every column name, one per line. The positional iloc selections in the
# modelling cells depend on this order.
print(*powell_mkt.columns, sep="\n")

Machine Learning Models: RandomForestRegressor and Ridge for the models that use the % change of prices for SPY, MDY and IJR as labels; RandomForestClassifier and a logistic regressor for the models whose labels classify the direction of the price change (up or down). (Note: the cells below currently fit Ridge, not a logistic regression, alongside the classifier.) The random forest models are also used to observe feature importance and decision trees.

In [None]:
# Experiment #1 (% change, y_3): drop incomplete rows, then pick features by
# column position. Positions 3, 4, 6, 11, 12 are, per the notes in this notebook,
# Neg_Count, Pos_Count, Count, growthgap and job_change — verify with the column
# listing above. Position 0 (`date`) becomes the index.
powell_mkt.dropna(inplace=True)
feature_positions = [0, 3, 4, 6, 11, 12]
powell_lmkt_x = powell_mkt.iloc[:, feature_positions].set_index("date")

# Targets: binary direction, three-way one-hot direction, and raw % change.
powell_lmkt_y1 = powell_mkt.loc[:, 'pos_neg']
powell_lmkt_y2 = powell_mkt.loc[:, ['pos', 'neg', 'flat']]
powell_lmkt_y3 = powell_mkt.loc[:, ['change']]
powell_lmkt_x.head(1)

In [None]:
#1 %Change: regress the daily % change on the selected features.
X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y3, test_size=0.33, random_state=42
)
# powell_lmkt_y3 is a one-column DataFrame; estimators expect a 1-D target.
y_train_1d = y_train.values.ravel()

# Seed the forest so the printed score is reproducible across runs.
regr = RandomForestRegressor(random_state=42)
regr.fit(X_train, y_train_1d)
print(regr.score(X_test, y_test))  # R^2 on the held-out rows

ridge = Ridge()
ridge.fit(X_train, y_train_1d)
print(ridge.score(X_test, y_test))  # R^2 on the held-out rows

In [None]:
# Experiment #2 (classification, y_1): drop incomplete rows, then pick features by
# column position. Positions 3, 4, 6, 11, 12 are, per the notes in this notebook,
# Neg_Count, Pos_Count, Count, growthgap and job_change — verify with the column
# listing above. Position 0 (`date`) becomes the index.
powell_mkt.dropna(inplace=True)
feature_positions = [0, 3, 4, 6, 11, 12]
powell_lmkt_x = powell_mkt.iloc[:, feature_positions].set_index("date")

# Targets: binary direction, three-way one-hot direction, and raw % change.
powell_lmkt_y1 = powell_mkt.loc[:, 'pos_neg']
powell_lmkt_y2 = powell_mkt.loc[:, ['pos', 'neg', 'flat']]
powell_lmkt_y3 = powell_mkt.loc[:, ['change']]

In [None]:
from sklearn.ensemble import RandomForestClassifier

#2 Pos_Count, Neg_Count, Count, job_change, growthgap -> up/down label
X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# Seed the forest so the printed accuracy is reproducible across runs.
regr = RandomForestClassifier(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))  # mean accuracy

# NOTE(review): Ridge is a regressor, so this score is R^2 on the 0/1 label and is
# not comparable with the accuracy printed above.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

In [None]:
# Experiment #3: drop incomplete rows, then pick features by column position.
# Positions 3, 4, 6 are, per the notes in this notebook, Neg_Count, Pos_Count and
# Count — verify with the column listing above. Position 0 (`date`) becomes the index.
powell_mkt.dropna(inplace=True)
feature_positions = [0, 3, 4, 6]
powell_lmkt_x = powell_mkt.iloc[:, feature_positions].set_index("date")

# Targets: binary direction, three-way one-hot direction, and raw % change.
powell_lmkt_y1 = powell_mkt.loc[:, 'pos_neg']
powell_lmkt_y2 = powell_mkt.loc[:, ['pos', 'neg', 'flat']]
powell_lmkt_y3 = powell_mkt.loc[:, ['change']]
powell_lmkt_x.head(1)

In [None]:
#3 Pos_Count, Neg_Count, Count -> up/down label
X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# Seed the forest so the printed accuracy is reproducible across runs.
regr = RandomForestClassifier(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))  # mean accuracy

# NOTE(review): Ridge is a regressor, so this score is R^2 on the 0/1 label and is
# not comparable with the accuracy printed above.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

In [None]:
# Experiment #4.1: drop incomplete rows, then keep a single feature by position.
# Position 5 is, per the notes in this notebook, Sent_Count — verify with the
# column listing above. Position 0 (`date`) becomes the index.
# (Variant #4 adds position 6, Count, to this selection.)
powell_mkt.dropna(inplace=True)
feature_positions = [0, 5]
powell_lmkt_x = powell_mkt.iloc[:, feature_positions].set_index("date")

# Targets: binary direction, three-way one-hot direction, and raw % change.
powell_lmkt_y1 = powell_mkt.loc[:, 'pos_neg']
powell_lmkt_y2 = powell_mkt.loc[:, ['pos', 'neg', 'flat']]
powell_lmkt_y3 = powell_mkt.loc[:, ['change']]
powell_lmkt_x.head(1)

In [None]:
#4.1 Sent_Count -> up/down label
X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# Seed the forest so the printed accuracy is reproducible across runs.
regr = RandomForestClassifier(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))  # mean accuracy

# NOTE(review): Ridge is a regressor, so this score is R^2 on the 0/1 label and is
# not comparable with the accuracy printed above.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

In [None]:
#4 Sent_Count, Count
# Select the features this experiment is labelled with. Previously the cell reused
# whatever `powell_lmkt_x` the preceding cell left behind (Sent_Count only), so
# the printed scores did not belong to experiment #4.
# Positions 5 and 6 are, per the notes in this notebook, Sent_Count and Count.
powell_lmkt_x = powell_mkt.iloc[:, [0, 5, 6]].set_index("date")

X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# NOTE(review): a regressor is fitted on the binary pos_neg label, so both scores
# below are R^2, not accuracy — confirm a classifier was not intended.
# Seeded so the printed score is reproducible across runs.
regr = RandomForestRegressor(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))

ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

In [None]:
# Experiment #5 (one lag): drop incomplete rows, then pick features by position.
# Positions 3, 4, 6 are the same-day Neg_Count, Pos_Count, Count; 13, 14, 16, 19
# are presumably negcount1day, poscount1day, econcount1day, change1day — verify
# with the column listing above. Position 0 (`date`) becomes the index.
# Variant #5.1 omits position 16; variants #6 / #7 extend the same four lag
# columns to 7 and 15 lags respectively.
powell_mkt.dropna(inplace=True)
feature_positions = [0, 3, 4, 6, 13, 14, 16, 19]
powell_lmkt_x = powell_mkt.iloc[:, feature_positions].set_index("date")

# Targets: binary direction, three-way one-hot direction, and raw % change.
powell_lmkt_y1 = powell_mkt.loc[:, 'pos_neg']
powell_lmkt_y2 = powell_mkt.loc[:, ['pos', 'neg', 'flat']]
powell_lmkt_y3 = powell_mkt.loc[:, ['change']]
powell_lmkt_x.head(1)

In [None]:
#5 with 1 lag -> up/down label
# NOTE(review): train_test_split shuffles rows by default; with lagged features
# from a time-ordered frame, test rows can neighbour training rows in time.
# Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# Seed the forest so the accuracy and importances are reproducible across runs.
regr = RandomForestClassifier(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))  # mean accuracy

# NOTE(review): Ridge is a regressor, so this score is R^2 on the 0/1 label and is
# not comparable with the accuracy printed above.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

# Pair each importance with the column it was fitted on, taken from the training
# frame itself so names and importances cannot drift apart.
sorted(zip(regr.feature_importances_, powell_lmkt_x.columns), reverse=True)

In [None]:
#6 with 7 lags -> up/down label
# Build the 7-lag feature set here. Previously this cell trained on whatever
# `powell_lmkt_x` the earlier cell left behind (the 1-lag set), and zip() silently
# truncated the longer name list, so the output was mislabelled.
# Columns are chosen by name: same-day counts, then for each lag the negative
# count, positive count, Count and price change.
N_LAGS = 7
lag_feature_names = ["Neg_Count", "Pos_Count", "Count"]
for lag in range(1, N_LAGS + 1):
    lag_feature_names += [
        f"negcount{lag}day",
        f"poscount{lag}day",
        f"econcount{lag}day",
        f"change{lag}day",
    ]
powell_lmkt_x = powell_mkt.set_index("date")[lag_feature_names]

X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# Seed the forest so the accuracy and importances are reproducible across runs.
regr = RandomForestClassifier(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))  # mean accuracy

# NOTE(review): Ridge is a regressor, so this score is R^2 on the 0/1 label and is
# not comparable with the accuracy printed above.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

sorted(zip(regr.feature_importances_, powell_lmkt_x.columns), reverse=True)

In [None]:
#7 with 15 lags -> up/down label
# Build the 15-lag feature set here. Previously this cell trained on whatever
# `powell_lmkt_x` an earlier cell left behind, and the positional name list
# contained `110-113` (the integer -3) where the slice `110:113` was meant.
# Columns are chosen by name: same-day counts, then for each lag the negative
# count, positive count, Count and price change.
N_LAGS = 15
lag_feature_names = ["Neg_Count", "Pos_Count", "Count"]
for lag in range(1, N_LAGS + 1):
    lag_feature_names += [
        f"negcount{lag}day",
        f"poscount{lag}day",
        f"econcount{lag}day",
        f"change{lag}day",
    ]
powell_lmkt_x = powell_mkt.set_index("date")[lag_feature_names]

X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# Seed the forest so the accuracy and importances are reproducible across runs.
regr = RandomForestClassifier(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))  # mean accuracy

# NOTE(review): Ridge is a regressor, so this score is R^2 on the 0/1 label and is
# not comparable with the accuracy printed above.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

sorted(zip(regr.feature_importances_, powell_lmkt_x.columns), reverse=True)

In [None]:
# Experiment #5.1 (one lag, no lagged Count): drop incomplete rows, then pick
# features by position. Positions 3, 4, 6 are the same-day Neg_Count, Pos_Count,
# Count; 13, 14, 19 are presumably negcount1day, poscount1day, change1day —
# verify with the column listing above. Position 0 (`date`) becomes the index.
powell_mkt.dropna(inplace=True)
feature_positions = [0, 3, 4, 6, 13, 14, 19]
powell_lmkt_x = powell_mkt.iloc[:, feature_positions].set_index("date")

# Targets: binary direction, three-way one-hot direction, and raw % change.
powell_lmkt_y1 = powell_mkt.loc[:, 'pos_neg']
powell_lmkt_y2 = powell_mkt.loc[:, ['pos', 'neg', 'flat']]
powell_lmkt_y3 = powell_mkt.loc[:, ['change']]
powell_lmkt_x.head(1)  # not the last expression of the cell, so nothing is displayed

# Persist the final feature matrix (with its date index) and the binary target.
powell_lmkt_x.to_csv('./Data/powell_lmkt_x.csv', index=True)
powell_lmkt_y1.to_csv('./Data/powell_lmkt_y1.csv', index=False)

In [None]:
#5.1 with 1 lag (with no lag for Count) -> up/down label
# `regr`, `X_test` and `y_test` from this cell feed the evaluation cells below.
X_train, X_test, y_train, y_test = train_test_split(
    powell_lmkt_x, powell_lmkt_y1, test_size=0.33, random_state=42
)

# Seed the forest so the accuracy, importances, and every downstream metric are
# reproducible across runs.
regr = RandomForestClassifier(random_state=42)
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))  # mean accuracy

# NOTE(review): Ridge is a regressor, so this score is R^2 on the 0/1 label and is
# not comparable with the accuracy printed above.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))

# Pair each importance with the column it was fitted on, taken from the training
# frame itself so names and importances cannot drift apart.
sorted(zip(regr.feature_importances_, powell_lmkt_x.columns), reverse=True)

In [None]:
# Per-class precision / recall / F1 for the most recently fitted `regr` on the
# held-out split. `y_pred` is reused by the evaluation cells that follow.
from sklearn.metrics import classification_report

y_pred = regr.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import (
    mean_squared_error,
    accuracy_score,
    confusion_matrix,
    classification_report,
)

# Raw confusion counts: rows are true labels, columns are predicted labels.
raw_confusion = confusion_matrix(y_test, y_pred)
raw_confusion

In [None]:
# Get the confusion matrix and normalise each row so cells show the share of
# each true class assigned to each predicted class.
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot
plt.figure(figsize=(10,5))
sns.set(font_scale=1.4)  # note: changes the seaborn style for all later plots too
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot. Heatmap cells span [i, i+1], so both axes use the
# cell-centre positions (i + 0.5). The x ticks previously sat on the cell edges.
class_names = ['pos_neg=down', 'pos_neg=up']
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')

# savefig() has no `figsize` argument (current matplotlib rejects unknown
# keywords); the size is set by plt.figure() above.
plt.savefig('./Images/powell_LCap_rf_y_pred_confusion_matrix.png')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Rank by the predicted probability of the positive class instead of hard 0/1
# predictions, which collapse the ROC curve to a single operating point.
# Column 1 of predict_proba corresponds to regr.classes_[1], i.e. label 1 for
# the 0/1 pos_neg target.
y_score = regr.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_score)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Compute fpr, tpr, thresholds and roc auc from class-1 probabilities. Hard 0/1
# predictions yield only one threshold, so the "curve" was a single bend.
# Column 1 of predict_proba corresponds to regr.classes_[1], i.e. label 1 for
# the 0/1 pos_neg target.
y_score = regr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('PowellEffect Characteristic/RandomForest Model')
plt.legend(loc="lower right")
# savefig() has no `figsize` argument (current matplotlib rejects unknown keywords).
plt.savefig('./Images/powell_LCap_rf_y_pred.png')

In [None]:
# Predictive Power Score heatmap for the label, the same-day counts and the
# one-row-lagged features.
# NOTE(review): passing pps.matrix() straight to sns.heatmap assumes it returns a
# square score frame; newer ppscore releases reportedly return a long-format
# table that must be pivoted first — confirm against the installed version.
pps_columns = [
    "pos_neg",
    "Neg_Count",
    "Pos_Count",
    "Count",
    "negcount1day",
    "poscount1day",
    "change1day",
]
powell_mkt_hm1 = powell_mkt.loc[:, pps_columns]

ax = sns.heatmap(pps.matrix(powell_mkt_hm1), vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True, fmt=".3f")
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
# Extend the y-limits by half a cell on each side (presumably a workaround for
# heatmap rows being clipped by some matplotlib versions — confirm still needed).
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

# Save Figure. savefig() has no `figsize` argument (current matplotlib rejects
# unknown keywords), and the stray ";" line that was here is a SyntaxError.
# NOTE(review): this path is the one the earlier PPS heatmap cell writes to, so
# that image is overwritten — use a distinct file name if both are needed.
plt.savefig('./Images/powell_LCap_PPS_Matrix.png')

# Show plot
plt.show()