In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score

In [30]:
raw = pd.read_csv('tr_eikon_eod_data.csv', index_col = 0, parse_dates = True)
data = raw[['AAPL.O', 'MSFT.O', 'INTC.O', 'AMZN.O', 'GS.N']].dropna()
data

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.572827,30.950,20.88,133.90,173.08
2010-01-05,30.625684,30.960,20.87,134.69,176.14
2010-01-06,30.138541,30.770,20.80,132.25,174.26
2010-01-07,30.082827,30.452,20.60,130.00,177.67
2010-01-08,30.282827,30.660,20.83,133.52,174.31
...,...,...,...,...,...
2018-06-25,182.170000,98.390,50.71,1663.15,221.54
2018-06-26,184.430000,99.080,49.67,1691.09,221.58
2018-06-27,184.160000,97.540,48.76,1660.51,220.18
2018-06-28,185.500000,98.630,49.25,1701.45,223.42


# Calculate returns for benchmark case

In [32]:
data['Returns'] = data['AAPL.O'] + data['MSFT.O'] + data['INTC.O'] + data['AMZN.O'] + data['GS.N']
log_ret = (np.log(data / data.shift(1))).dropna()
log_ret

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-05,0.001727,0.000323,-0.000479,0.005883,0.017525,0.009973
2010-01-06,-0.016034,-0.006156,-0.003360,-0.018282,-0.010731,-0.012968
2010-01-07,-0.001850,-0.010389,-0.009662,-0.017160,0.019379,0.001509
2010-01-08,0.006626,0.006807,0.011103,0.026717,-0.019093,0.002050
2010-01-11,-0.008861,-0.012802,0.005744,-0.024350,-0.015902,-0.016822
...,...,...,...,...,...,...
2018-06-25,-0.014983,-0.020323,-0.034690,-0.031090,-0.020020,-0.028279
2018-06-26,0.012330,0.006988,-0.020722,0.016660,0.000181,0.013398
2018-06-27,-0.001465,-0.015665,-0.018491,-0.018249,-0.006338,-0.015571
2018-06-28,0.007250,0.011113,0.009999,0.024356,0.014608,0.021077


In [8]:
# split the dataset into 50 (training) / 50 (testing)
from sklearn.model_selection import train_test_split
log_ret_train, log_ret_test = train_test_split(log_ret,
                                         test_size = 0.5,
                                         shuffle = True,
                                         random_state = 0) 

In [34]:
log_ret_train

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-05-14,-0.002336,0.003372,0.004198,-0.000855,0.004067,0.004379
2013-10-03,-0.012642,-0.001770,-0.012532,-0.018103,-0.011537,-0.045047
2014-03-31,-0.000224,0.016977,0.007544,-0.005707,0.009505,0.018590
2016-05-24,0.015129,0.030705,0.027086,0.010636,0.013673,0.083556
2014-09-26,0.029002,0.008004,0.003509,0.003968,0.005579,0.044483
...,...,...,...,...,...,...
2014-02-12,-0.000075,0.007904,0.003264,-0.035276,-0.005429,-0.024182
2016-11-17,-0.000364,0.016461,0.005153,0.013188,0.016207,0.034438
2013-01-16,0.040671,-0.006267,0.010457,-0.010983,0.039762,0.033877
2013-05-01,-0.007913,-0.011547,0.001669,-0.022230,-0.023972,-0.040021


# AAPL.O stepwise regression

In [35]:
AAPLO_train = pd.DataFrame(log_ret_train['AAPL.O'])
AAPLO_test = pd.DataFrame(log_ret_test['AAPL.O'])

In [36]:
# Prepare train data set 

for i in range(1,6):
    s = 'lag'+ str(i)
    AAPLO_train[s] = AAPLO_train['AAPL.O'].shift(i)
AAPLO_train = AAPLO_train.dropna()
AAPLO_train

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-03-17,-0.001471,0.029002,0.015129,-0.000224,-0.012642,-0.002336
2012-02-27,0.006392,-0.001471,0.029002,0.015129,-0.000224,-0.012642
2010-06-14,0.003033,0.006392,-0.001471,0.029002,0.015129,-0.000224
2012-01-12,-0.002749,0.003033,0.006392,-0.001471,0.029002,0.015129
2010-03-10,0.008128,-0.002749,0.003033,0.006392,-0.001471,0.029002
...,...,...,...,...,...,...
2014-02-12,-0.000075,0.008299,0.000082,-0.001187,-0.036705,0.007403
2016-11-17,-0.000364,-0.000075,0.008299,0.000082,-0.001187,-0.036705
2013-01-16,0.040671,-0.000364,-0.000075,0.008299,0.000082,-0.001187
2013-05-01,-0.007913,0.040671,-0.000364,-0.000075,0.008299,0.000082


In [37]:
# Stepwise regression select features

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

LR_model = LinearRegression()
SFS_model = SequentialFeatureSelector(LR_model,
                                      scoring = 'r2')

X_train = AAPLO_train.iloc[:,2:]
y_train = AAPLO_train.iloc[:,1]

sel_X_bool = SFS_model.fit(X_train, y_train).get_support()
sel_X_train = X_train.loc[:, sel_X_bool]
sel_X_train

Unnamed: 0_level_0,lag3,lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-03-17,-0.000224,-0.002336
2012-02-27,0.015129,-0.012642
2010-06-14,0.029002,-0.000224
2012-01-12,-0.001471,0.015129
2010-03-10,0.006392,0.029002
...,...,...
2014-02-12,-0.001187,0.007403
2016-11-17,0.000082,-0.036705
2013-01-16,0.008299,-0.001187
2013-05-01,-0.000075,0.000082


#### Prediciton on training data

In [38]:
# perform linear regression on the selected variables to predict training data
AAPLO_train['Predict_Step'] = LR_model.fit(sel_X_train, y_train).predict(sel_X_train)
AAPLO_train

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-03-17,-0.001471,0.029002,0.015129,-0.000224,-0.012642,-0.002336,0.000900
2012-02-27,0.006392,-0.001471,0.029002,0.015129,-0.000224,-0.012642,0.000953
2010-06-14,0.003033,0.006392,-0.001471,0.029002,0.015129,-0.000224,0.001564
2012-01-12,-0.002749,0.003033,0.006392,-0.001471,0.029002,0.015129,0.001327
2010-03-10,0.008128,-0.002749,0.003033,0.006392,-0.001471,0.029002,0.001851
...,...,...,...,...,...,...,...
2014-02-12,-0.000075,0.008299,0.000082,-0.001187,-0.036705,0.007403,0.001133
2016-11-17,-0.000364,-0.000075,0.008299,0.000082,-0.001187,-0.036705,0.000015
2013-01-16,0.040671,-0.000364,-0.000075,0.008299,0.000082,-0.001187,0.001108
2013-05-01,-0.007913,0.040671,-0.000364,-0.000075,0.008299,0.000082,0.000966


In [39]:
AAPLO_train['dir'] = np.sign(AAPLO_train['Predict_Step'])
AAPLO_train['Predict_dir'] = np.sign(AAPLO_train['Predict_Step'])
AAPLO_train

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step,dir,Predict_dir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-03-17,-0.001471,0.029002,0.015129,-0.000224,-0.012642,-0.002336,0.000900,1.0,1.0
2012-02-27,0.006392,-0.001471,0.029002,0.015129,-0.000224,-0.012642,0.000953,1.0,1.0
2010-06-14,0.003033,0.006392,-0.001471,0.029002,0.015129,-0.000224,0.001564,1.0,1.0
2012-01-12,-0.002749,0.003033,0.006392,-0.001471,0.029002,0.015129,0.001327,1.0,1.0
2010-03-10,0.008128,-0.002749,0.003033,0.006392,-0.001471,0.029002,0.001851,1.0,1.0
...,...,...,...,...,...,...,...,...,...
2014-02-12,-0.000075,0.008299,0.000082,-0.001187,-0.036705,0.007403,0.001133,1.0,1.0
2016-11-17,-0.000364,-0.000075,0.008299,0.000082,-0.001187,-0.036705,0.000015,1.0,1.0
2013-01-16,0.040671,-0.000364,-0.000075,0.008299,0.000082,-0.001187,0.001108,1.0,1.0
2013-05-01,-0.007913,0.040671,-0.000364,-0.000075,0.008299,0.000082,0.000966,1.0,1.0


In [40]:
# number of trades in total
(AAPLO_train['Predict_dir'].diff().dropna() != 0).sum()

62

In [41]:
# number of false prediction
(AAPLO_train['Predict_dir'] != AAPLO_train['dir']).sum()

0

In [42]:
# Calculate return from strategy
AAPLO_train['Returns_Step'] = AAPLO_train['Predict_dir'] * AAPLO_train['AAPL.O']

np.exp(np.sum(AAPLO_train[['AAPL.O', 'Returns_Step']]))

AAPL.O          2.843319
Returns_Step    2.314426
dtype: float64

#### Testings

In [43]:
# Prepare test data

for i in range(1,6):
    s = 'lag'+ str(i)
    AAPLO_test[s] = AAPLO_test['AAPL.O'].shift(i)
AAPLO_test = AAPLO_test.dropna()

X_test = AAPLO_test.iloc[:,2:]
sel_X_test = X_test.loc[:, sel_X_bool]
sel_X_test

Unnamed: 0_level_0,lag3,lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-17,0.007060,-0.005286
2013-08-21,0.004795,0.009472
2018-03-16,0.007684,0.007060
2011-11-18,0.011581,0.004795
2012-09-24,0.002571,0.007684
...,...,...
2014-05-21,0.019878,0.012242
2014-05-23,-0.007339,0.011811
2017-12-15,0.028206,0.019878
2012-09-12,0.002642,-0.007339


In [44]:
# perform linear regression on the selected variables to predict test data
AAPLO_test['Predict_Step'] = LR_model.fit(sel_X_train, y_train).predict(sel_X_test)
AAPLO_test

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-01-17,0.011581,0.007684,0.004795,0.007060,0.009472,-0.005286,0.000975
2013-08-21,0.002571,0.011581,0.007684,0.004795,0.007060,0.009472,0.001311
2018-03-16,-0.003533,0.002571,0.011581,0.007684,0.004795,0.007060,0.001309
2011-11-18,-0.006566,-0.003533,0.002571,0.011581,0.007684,0.004795,0.001331
2012-09-24,-0.013380,-0.006566,-0.003533,0.002571,0.011581,0.007684,0.001218
...,...,...,...,...,...,...,...
2014-05-21,0.002642,0.028206,-0.007339,0.019878,0.011811,0.012242,0.001697
2014-05-23,0.011233,0.002642,0.028206,-0.007339,0.019878,0.011811,0.001119
2017-12-15,0.010110,0.011233,0.002642,0.028206,-0.007339,0.019878,0.002069
2012-09-12,0.013831,0.010110,0.011233,0.002642,0.028206,-0.007339,0.000830


In [45]:
AAPLO_test['dir'] = np.sign(AAPLO_test['Predict_Step'])
AAPLO_test

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step,dir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-17,0.011581,0.007684,0.004795,0.007060,0.009472,-0.005286,0.000975,1.0
2013-08-21,0.002571,0.011581,0.007684,0.004795,0.007060,0.009472,0.001311,1.0
2018-03-16,-0.003533,0.002571,0.011581,0.007684,0.004795,0.007060,0.001309,1.0
2011-11-18,-0.006566,-0.003533,0.002571,0.011581,0.007684,0.004795,0.001331,1.0
2012-09-24,-0.013380,-0.006566,-0.003533,0.002571,0.011581,0.007684,0.001218,1.0
...,...,...,...,...,...,...,...,...
2014-05-21,0.002642,0.028206,-0.007339,0.019878,0.011811,0.012242,0.001697,1.0
2014-05-23,0.011233,0.002642,0.028206,-0.007339,0.019878,0.011811,0.001119,1.0
2017-12-15,0.010110,0.011233,0.002642,0.028206,-0.007339,0.019878,0.002069,1.0
2012-09-12,0.013831,0.010110,0.011233,0.002642,0.028206,-0.007339,0.000830,1.0


In [46]:
AAPLO_test['Predict_dir'] = np.sign(AAPLO_test['Predict_Step'])
AAPLO_test

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step,dir,Predict_dir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-17,0.011581,0.007684,0.004795,0.007060,0.009472,-0.005286,0.000975,1.0,1.0
2013-08-21,0.002571,0.011581,0.007684,0.004795,0.007060,0.009472,0.001311,1.0,1.0
2018-03-16,-0.003533,0.002571,0.011581,0.007684,0.004795,0.007060,0.001309,1.0,1.0
2011-11-18,-0.006566,-0.003533,0.002571,0.011581,0.007684,0.004795,0.001331,1.0,1.0
2012-09-24,-0.013380,-0.006566,-0.003533,0.002571,0.011581,0.007684,0.001218,1.0,1.0
...,...,...,...,...,...,...,...,...,...
2014-05-21,0.002642,0.028206,-0.007339,0.019878,0.011811,0.012242,0.001697,1.0,1.0
2014-05-23,0.011233,0.002642,0.028206,-0.007339,0.019878,0.011811,0.001119,1.0,1.0
2017-12-15,0.010110,0.011233,0.002642,0.028206,-0.007339,0.019878,0.002069,1.0,1.0
2012-09-12,0.013831,0.010110,0.011233,0.002642,0.028206,-0.007339,0.000830,1.0,1.0


In [47]:
# number of trades in total
(AAPLO_test['Predict_dir'].diff().dropna() != 0).sum()

68

In [48]:
# number of false prediction
(AAPLO_test['Predict_dir'] != AAPLO_test['dir']).sum()

0

In [49]:
# Calculate return from strategy
AAPLO_test['Returns_Step'] = AAPLO_test['Predict_dir'] * AAPLO_test['AAPL.O']

np.exp(np.sum(AAPLO_test[['AAPL.O', 'Returns_Step']]))

AAPL.O          2.020230
Returns_Step    2.487104
dtype: float64