In [1]:
import pandas as pd
import datetime

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics

import matplotlib.pyplot as plt

In [2]:
SPX = pd.read_csv('SPX.txt', sep="\t", header=None)
SPX.columns = ["dt", "SPX"]
SPX['dt'] = pd.to_datetime(SPX['dt'])
SPX['SPX'] = SPX['SPX'].str.replace(',', '.').astype(float)

VIX = pd.read_csv('VIX.txt', sep="\t", header=None)
VIX.columns = ["dt", "VIX"]
VIX['dt'] = pd.to_datetime(VIX['dt'])
VIX['VIX'] = VIX['VIX'].str.replace(',', '.').astype(float)

dt = pd.merge(SPX, VIX,  how='left', on=["dt"]).sort_values(by=['dt'])

dt['dt'] = dt['dt'].dt.to_period('W').apply(lambda r: r.start_time)
dt = dt.groupby(['dt']).agg({'SPX':['mean', 'min', 'max'], 'VIX':['mean']})
dt.columns = dt.columns.droplevel(0)
dt = dt.reset_index()
dt.columns = ["dt", "SPX", "SPX_min", "SPX_max", "VIX"]
dt.head()

  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])
  SPX['dt'] = pd.to_datetime(SPX['dt'])


Unnamed: 0,dt,SPX,SPX_min,SPX_max,VIX
0,1995-01-09,545.596667,465.97,606.98,11.166667
1,1995-01-16,464.78,464.78,464.78,12.15
2,1995-01-23,470.39,470.39,470.39,11.25
3,1995-02-06,532.51,532.51,532.51,12.98
4,1995-02-13,481.97,481.97,481.97,11.71


In [4]:
train = dt.loc[(dt['dt'] < '2012-01-01')].copy()
test =  dt.loc[(dt['dt'] >= '2012-01-01')].copy()

X_train = train[["SPX_min", "SPX_max", "VIX"]]
y_train = train[["SPX"]]
X_test = test[["SPX_min", "SPX_max", "VIX"]]
y_test = test[["SPX"]]

In [5]:
regr = LinearRegression() 
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)
regr_mse = mean_squared_error(y_test, y_pred)
print("MSE: ", regr_mse)
print("RMSE: ", regr_mse*(1/2.0)) 

regr_r2 = r2_score(y_test, y_pred)
print('r2 score for Random Forest model is', regr_r2)
test['pred_regr'] = y_pred

MSE:  1.235518902050835
RMSE:  0.6177594510254175
r2 score for Random Forest model is 0.9999947096363703


In [6]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train.values.ravel())

y_pred = rfr.predict(X_test)
rfr_mse = mean_squared_error(y_test, y_pred)
print("MSE: ", rfr_mse)
print("RMSE: ", rfr_mse*(1/2.0)) 

rfr_r2 = r2_score(y_test, y_pred)
print('r2 score for Random Forest model is', rfr_r2)
test['pred_rfr'] = y_pred

MSE:  556424.1504449105
RMSE:  278212.07522245526
r2 score for Random Forest model is -1.3825504274692073


In [7]:
gb = GradientBoostingRegressor( )
gb.fit(X_train, y_train.values.ravel())

y_pred = gb.predict(X_test)
gb_mse = mean_squared_error(y_test, y_pred)
print("MSE: ", gb_mse)
print("RMSE: ", gb_mse*(1/2.0)) 

gb_r2 = r2_score(y_test, y_pred)
print('r2 score for GardientBoosting model is', gb_r2)
test['pred_gb'] = y_pred

MSE:  547975.0854792814
RMSE:  273987.5427396407
r2 score for GardientBoosting model is -1.3463724087231141


In [8]:
test["SPX_diff"] = test["SPX"].diff()
test["SPX_pos"] = test['SPX_diff'].apply(lambda x : 1 if x > 0 else 0)
test["SPX_neg"] = test['SPX_diff'].apply(lambda x : 1 if x < 0 else 0)

test["pred_regr_diff"] = test["pred_regr"].diff()
test["regr_pos"] = test['pred_regr_diff'].apply(lambda x : 1 if x > 0 else 0)
test["regr_neg"] = test['pred_regr_diff'].apply(lambda x : 1 if x < 0 else 0)

test["pred_rfr_diff"] = test["pred_rfr"].diff()
test["rfr_pos"] = test['pred_rfr_diff'].apply(lambda x : 1 if x > 0 else 0)
test["rfr_neg"] = test['pred_rfr_diff'].apply(lambda x : 1 if x < 0 else 0)

test["pred_gb_diff"] = test["pred_gb"].diff()
test["gb_pos"] = test['pred_gb_diff'].apply(lambda x : 1 if x > 0 else 0)
test["gb_neg"] = test['pred_gb_diff'].apply(lambda x : 1 if x < 0 else 0)

test = test.iloc[1: , :]
test.head(100)

Unnamed: 0,dt,SPX,SPX_min,SPX_max,VIX,pred_regr,pred_rfr,pred_gb,SPX_diff,SPX_pos,SPX_neg,pred_regr_diff,regr_pos,regr_neg,pred_rfr_diff,rfr_pos,rfr_neg,pred_gb_diff,gb_pos,gb_neg
774,2012-01-09,1289.09,1289.09,1289.09,20.91,1289.065470,1289.18295,1291.200308,11.04,1,0,11.029146,1,0,11.12845,1,0,12.668614,1,0
775,2012-01-16,1315.38,1315.38,1315.38,18.28,1315.347243,1315.34315,1319.539769,26.29,1,0,26.281773,1,0,26.16020,1,0,28.339461,1,0
776,2012-01-23,1316.33,1316.33,1316.33,18.53,1316.297495,1315.84255,1319.539769,0.95,1,0,0.950252,1,0,0.49940,1,0,0.000000,0,0
777,2012-01-30,1369.63,1369.63,1369.63,17.29,1369.587334,1374.37420,1380.225876,53.30,1,0,53.289839,1,0,58.53165,1,0,60.686107,1,0
778,2012-02-06,1414.20,1414.20,1414.20,17.59,1414.150966,1414.44720,1418.193603,44.57,1,0,44.563632,1,0,40.07300,1,0,37.967727,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,2014-01-13,1838.70,1838.70,1838.70,12.44,1838.577564,1548.77730,1556.554044,-86.45,0,1,-86.444033,0,1,0.06350,1,0,0.596864,1,0
870,2014-01-20,1790.29,1790.29,1790.29,18.14,1790.184078,1548.61790,1555.957179,-48.41,0,1,-48.393486,0,1,-0.15940,0,1,-0.596864,0,1
871,2014-01-27,1782.59,1782.59,1782.59,18.41,1782.485691,1548.64150,1555.957179,-7.70,0,1,-7.698387,0,1,0.02360,1,0,0.000000,0,0
872,2014-02-03,1881.14,1881.14,1881.14,12.91,1881.011795,1548.77730,1555.957179,98.55,1,0,98.526104,1,0,0.13580,1,0,0.000000,0,0


In [9]:
confusion_matrix = metrics.confusion_matrix(test["SPX_pos"], test["regr_pos"])
regr_pos_tn, fp, fn, regr_pos_tp = confusion_matrix.ravel()
confusion_matrix = metrics.confusion_matrix(test["SPX_neg"], test["regr_neg"])
regr_neg_tn, fp, fn, regr_neg_tp = confusion_matrix.ravel()

confusion_matrix = metrics.confusion_matrix(test["SPX_pos"], test["rfr_pos"])
rfr_pos_tn, fp, fn, rfr_pos_tp = confusion_matrix.ravel()
confusion_matrix = metrics.confusion_matrix(test["SPX_neg"], test["rfr_neg"])
rfr_neg_tn, fp, fn, rfr_neg_tp = confusion_matrix.ravel()

confusion_matrix = metrics.confusion_matrix(test["SPX_pos"], test["gb_pos"])
gb_pos_tn, fp, fn, gb_pos_tp = confusion_matrix.ravel()
confusion_matrix = metrics.confusion_matrix(test["SPX_neg"], test["gb_neg"])
gb_neg_tn, fp, fn, gb_neg_tp = confusion_matrix.ravel()

In [10]:
res = pd.DataFrame(columns=['Model_ID', 'MSE', 'RMSE', 'R2', 'pos_TP', 'pos_TN', 'neg_TP', 'neg_TN'])
res["Model_ID"] = ["REG", "RF", "GB"]
res["MSE"] = [regr_mse, rfr_mse, gb_mse]
res["RMSE"] = [regr_mse*(1/2.0), rfr_mse*(1/2.0), gb_mse*(1/2.0)]
res["R2"] = [regr_r2, rfr_r2, gb_r2]
res["pos_TP"] = [regr_pos_tp, rfr_pos_tp, gb_pos_tp]
res["pos_TN"] = [regr_pos_tn, rfr_pos_tn, gb_pos_tn]
res["neg_TP"] = [regr_neg_tp, rfr_neg_tp, gb_neg_tp]
res["neg_TN"] = [regr_neg_tn, rfr_neg_tn, gb_neg_tn]
res

Unnamed: 0,Model_ID,MSE,RMSE,R2,pos_TP,pos_TN,neg_TP,neg_TN
0,REG,1.235519,0.617759,0.999995,196,161,161,196
1,RF,556424.150445,278212.075222,-1.38255,113,142,91,178
2,GB,547975.085479,273987.54274,-1.346372,83,125,63,151
