In [147]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score

In [148]:
raw = pd.read_csv('tr_eikon_eod_data.csv', index_col = 0, parse_dates = True)
data = raw[['AAPL.O', 'MSFT.O', 'INTC.O', 'AMZN.O', 'GS.N']].dropna()
data

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.572827,30.950,20.88,133.90,173.08
2010-01-05,30.625684,30.960,20.87,134.69,176.14
2010-01-06,30.138541,30.770,20.80,132.25,174.26
2010-01-07,30.082827,30.452,20.60,130.00,177.67
2010-01-08,30.282827,30.660,20.83,133.52,174.31
...,...,...,...,...,...
2018-06-25,182.170000,98.390,50.71,1663.15,221.54
2018-06-26,184.430000,99.080,49.67,1691.09,221.58
2018-06-27,184.160000,97.540,48.76,1660.51,220.18
2018-06-28,185.500000,98.630,49.25,1701.45,223.42


# Calculate returns for benchmark case

In [149]:
data['Returns'] = data['AAPL.O'] + data['MSFT.O'] + data['INTC.O'] + data['AMZN.O'] + data['GS.N']
log_ret = (np.log(data / data.shift(1))).dropna()
log_ret

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-05,0.001727,0.000323,-0.000479,0.005883,0.017525,0.009973
2010-01-06,-0.016034,-0.006156,-0.003360,-0.018282,-0.010731,-0.012968
2010-01-07,-0.001850,-0.010389,-0.009662,-0.017160,0.019379,0.001509
2010-01-08,0.006626,0.006807,0.011103,0.026717,-0.019093,0.002050
2010-01-11,-0.008861,-0.012802,0.005744,-0.024350,-0.015902,-0.016822
...,...,...,...,...,...,...
2018-06-25,-0.014983,-0.020323,-0.034690,-0.031090,-0.020020,-0.028279
2018-06-26,0.012330,0.006988,-0.020722,0.016660,0.000181,0.013398
2018-06-27,-0.001465,-0.015665,-0.018491,-0.018249,-0.006338,-0.015571
2018-06-28,0.007250,0.011113,0.009999,0.024356,0.014608,0.021077


In [150]:
# split the dataset into 50 (training) / 50 (testing)
from sklearn.model_selection import train_test_split
log_ret_train, log_ret_test = train_test_split(log_ret,
                                         test_size = 0.5,
                                         shuffle = True,
                                         random_state = 0) 

In [151]:
log_ret_train

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-05-14,-0.002336,0.003372,0.004198,-0.000855,0.004067,-0.000119
2013-10-03,-0.012642,-0.001770,-0.012532,-0.018103,-0.011537,-0.014619
2014-03-31,-0.000224,0.016977,0.007544,-0.005707,0.009505,0.000764
2016-05-24,0.015129,0.030705,0.027086,0.010636,0.013673,0.012988
2014-09-26,0.029002,0.008004,0.003509,0.003968,0.005579,0.008269
...,...,...,...,...,...,...
2014-02-12,-0.000075,0.007904,0.003264,-0.035276,-0.005429,-0.019854
2016-11-17,-0.000364,0.016461,0.005153,0.013188,0.016207,0.012375
2013-01-16,0.040671,-0.006267,0.010457,-0.010983,0.039762,0.010348
2013-05-01,-0.007913,-0.011547,0.001669,-0.022230,-0.023972,-0.019173


# AAPL.O stepwise regression

In [152]:
Apple = pd.DataFrame(log_ret['AAPL.O'])
for i in range(1,6):
    s = 'lag'+ str(i)
    Apple[s] = Apple['AAPL.O'].shift(i)
Apple = Apple.dropna()

In [153]:
# Prepare train data 
Apple_train = Apple.merge(log_ret_train, left_index=True, right_index=True).loc[:, 'AAPL.O_x' : 'lag5'].dropna()
Apple_train

Unnamed: 0_level_0,AAPL.O_x,lag1,lag2,lag3,lag4,lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-14,-0.005808,0.014007,-0.011440,-0.008861,0.006626,-0.001850
2010-01-22,-0.050881,-0.017404,-0.015536,0.043288,-0.016853,-0.005808
2010-02-03,0.017060,0.005786,0.013791,-0.036938,-0.042219,0.009395
2010-02-04,-0.036704,0.017060,0.005786,0.013791,-0.036938,-0.042219
2010-02-09,0.010607,-0.006879,0.017600,-0.036704,0.017060,0.005786
...,...,...,...,...,...,...
2018-06-12,0.005476,-0.002455,-0.009139,-0.002684,0.003460,0.007686
2018-06-14,0.000524,-0.008251,0.005476,-0.002455,-0.009139,-0.002684
2018-06-21,-0.005592,0.004353,-0.016292,-0.000530,-0.010326,0.000524
2018-06-26,0.012330,-0.014983,-0.002916,-0.005592,0.004353,-0.016292


In [154]:
Apple_train = Apple_train.rename(columns = {'AAPL.O_x' : 'AAPL.O'})
Apple_train

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-14,-0.005808,0.014007,-0.011440,-0.008861,0.006626,-0.001850
2010-01-22,-0.050881,-0.017404,-0.015536,0.043288,-0.016853,-0.005808
2010-02-03,0.017060,0.005786,0.013791,-0.036938,-0.042219,0.009395
2010-02-04,-0.036704,0.017060,0.005786,0.013791,-0.036938,-0.042219
2010-02-09,0.010607,-0.006879,0.017600,-0.036704,0.017060,0.005786
...,...,...,...,...,...,...
2018-06-12,0.005476,-0.002455,-0.009139,-0.002684,0.003460,0.007686
2018-06-14,0.000524,-0.008251,0.005476,-0.002455,-0.009139,-0.002684
2018-06-21,-0.005592,0.004353,-0.016292,-0.000530,-0.010326,0.000524
2018-06-26,0.012330,-0.014983,-0.002916,-0.005592,0.004353,-0.016292


In [155]:
# Stepwise regression select features

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

LR_model = LinearRegression()
SFS_model = SequentialFeatureSelector(LR_model,
                                      scoring = 'r2')

X_train = Apple_train.iloc[:,2:]
y_train = Apple_train.iloc[:,1]

sel_X_bool = SFS_model.fit(X_train, y_train).get_support()
sel_X_train = X_train.loc[:, sel_X_bool]
sel_X_train

Unnamed: 0_level_0,lag2,lag3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-14,-0.011440,-0.008861
2010-01-22,-0.015536,0.043288
2010-02-03,0.013791,-0.036938
2010-02-04,0.005786,0.013791
2010-02-09,0.017600,-0.036704
...,...,...
2018-06-12,-0.009139,-0.002684
2018-06-14,0.005476,-0.002455
2018-06-21,-0.016292,-0.000530
2018-06-26,-0.002916,-0.005592


#### Prediciton on training data

In [156]:
# perform linear regression on the selected variables to predict training data
Apple_train['Predict_Step'] = LR_model.fit(sel_X_train, y_train).predict(sel_X_train)
Apple_train

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-14,-0.005808,0.014007,-0.011440,-0.008861,0.006626,-0.001850,-0.000453
2010-01-22,-0.050881,-0.017404,-0.015536,0.043288,-0.016853,-0.005808,0.000107
2010-02-03,0.017060,0.005786,0.013791,-0.036938,-0.042219,0.009395,0.001484
2010-02-04,-0.036704,0.017060,0.005786,0.013791,-0.036938,-0.042219,0.001638
2010-02-09,0.010607,-0.006879,0.017600,-0.036704,0.017060,0.005786,0.001858
...,...,...,...,...,...,...,...
2018-06-12,0.005476,-0.002455,-0.009139,-0.002684,0.003460,0.007686,-0.000116
2018-06-14,0.000524,-0.008251,0.005476,-0.002455,-0.009139,-0.002684,0.001309
2018-06-21,-0.005592,0.004353,-0.016292,-0.000530,-0.010326,0.000524,-0.000771
2018-06-26,0.012330,-0.014983,-0.002916,-0.005592,0.004353,-0.016292,0.000436


In [157]:
Apple_train['dir'] = np.sign(Apple_train['Predict_Step'])
Apple_train['Predict_dir'] = np.sign(Apple_train['Predict_Step'])
Apple_train

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step,dir,Predict_dir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-14,-0.005808,0.014007,-0.011440,-0.008861,0.006626,-0.001850,-0.000453,-1.0,-1.0
2010-01-22,-0.050881,-0.017404,-0.015536,0.043288,-0.016853,-0.005808,0.000107,1.0,1.0
2010-02-03,0.017060,0.005786,0.013791,-0.036938,-0.042219,0.009395,0.001484,1.0,1.0
2010-02-04,-0.036704,0.017060,0.005786,0.013791,-0.036938,-0.042219,0.001638,1.0,1.0
2010-02-09,0.010607,-0.006879,0.017600,-0.036704,0.017060,0.005786,0.001858,1.0,1.0
...,...,...,...,...,...,...,...,...,...
2018-06-12,0.005476,-0.002455,-0.009139,-0.002684,0.003460,0.007686,-0.000116,-1.0,-1.0
2018-06-14,0.000524,-0.008251,0.005476,-0.002455,-0.009139,-0.002684,0.001309,1.0,1.0
2018-06-21,-0.005592,0.004353,-0.016292,-0.000530,-0.010326,0.000524,-0.000771,-1.0,-1.0
2018-06-26,0.012330,-0.014983,-0.002916,-0.005592,0.004353,-0.016292,0.000436,1.0,1.0


In [158]:
# number of trades in total
(Apple_train['Predict_dir'].diff().dropna() != 0).sum()

312

In [159]:
# number of false prediction
(Apple_train['Predict_dir'] != Apple_train['dir']).sum()

0

In [160]:
# Calculate return from strategy
Apple_train['Returns_Step'] = Apple_train['Predict_dir'] * Apple_train['AAPL.O']

np.exp(np.sum(Apple_train[['AAPL.O', 'Returns_Step']]))

AAPL.O          2.902430
Returns_Step    1.386552
dtype: float64

#### Testings

In [165]:
# Prepare test data
Apple_test = Apple.merge(log_ret_test, left_index=True, right_index=True).loc[:, 'AAPL.O_x' : 'lag5'].dropna()
Apple_test = Apple_test.rename(columns={'AAPL.O_x' : 'AAPL.O'})
X_test = Apple_test.iloc[:,2:]
sel_X_test = X_test.loc[:, sel_X_bool]
sel_X_test

Unnamed: 0_level_0,lag2,lag3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-12,0.006626,-0.001850
2010-01-13,-0.008861,0.006626
2010-01-15,0.014007,-0.011440
2010-01-19,-0.005808,0.014007
2010-01-20,-0.016853,-0.005808
...,...,...
2018-06-20,-0.000530,-0.010326
2018-06-22,0.004353,-0.016292
2018-06-25,-0.005592,0.004353
2018-06-28,0.012330,-0.014983


In [166]:
# perform linear regression on the selected variables to predict test data
Apple_test['Predict_Step'] = LR_model.fit(sel_X_train, y_train).predict(sel_X_test)
Apple_test

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-12,-0.011440,-0.008861,0.006626,-0.001850,-0.016034,0.001727,0.001432
2010-01-13,0.014007,-0.011440,-0.008861,0.006626,-0.001850,-0.016034,0.000083
2010-01-15,-0.016853,-0.005808,0.014007,-0.011440,-0.008861,0.006626,0.001973
2010-01-19,0.043288,-0.016853,-0.005808,0.014007,-0.011440,-0.008861,0.000515
2010-01-20,-0.015536,0.043288,-0.016853,-0.005808,0.014007,-0.011440,-0.000923
...,...,...,...,...,...,...,...
2018-06-20,0.004353,-0.016292,-0.000530,-0.010326,0.000524,-0.008251,0.000581
2018-06-22,-0.002916,-0.005592,0.004353,-0.016292,-0.000530,-0.010326,0.000946
2018-06-25,-0.014983,-0.002916,-0.005592,0.004353,-0.016292,-0.000530,0.000358
2018-06-28,0.007250,-0.001465,0.012330,-0.014983,-0.002916,-0.005592,0.001745


In [167]:
Apple_test['dir'] = np.sign(Apple_test['Predict_Step'])
Apple_test

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step,dir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-12,-0.011440,-0.008861,0.006626,-0.001850,-0.016034,0.001727,0.001432,1.0
2010-01-13,0.014007,-0.011440,-0.008861,0.006626,-0.001850,-0.016034,0.000083,1.0
2010-01-15,-0.016853,-0.005808,0.014007,-0.011440,-0.008861,0.006626,0.001973,1.0
2010-01-19,0.043288,-0.016853,-0.005808,0.014007,-0.011440,-0.008861,0.000515,1.0
2010-01-20,-0.015536,0.043288,-0.016853,-0.005808,0.014007,-0.011440,-0.000923,-1.0
...,...,...,...,...,...,...,...,...
2018-06-20,0.004353,-0.016292,-0.000530,-0.010326,0.000524,-0.008251,0.000581,1.0
2018-06-22,-0.002916,-0.005592,0.004353,-0.016292,-0.000530,-0.010326,0.000946,1.0
2018-06-25,-0.014983,-0.002916,-0.005592,0.004353,-0.016292,-0.000530,0.000358,1.0
2018-06-28,0.007250,-0.001465,0.012330,-0.014983,-0.002916,-0.005592,0.001745,1.0


In [168]:
Apple_test['Predict_dir'] = np.sign(Apple_test['Predict_Step'])
Apple_test

Unnamed: 0_level_0,AAPL.O,lag1,lag2,lag3,lag4,lag5,Predict_Step,dir,Predict_dir
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-12,-0.011440,-0.008861,0.006626,-0.001850,-0.016034,0.001727,0.001432,1.0,1.0
2010-01-13,0.014007,-0.011440,-0.008861,0.006626,-0.001850,-0.016034,0.000083,1.0,1.0
2010-01-15,-0.016853,-0.005808,0.014007,-0.011440,-0.008861,0.006626,0.001973,1.0,1.0
2010-01-19,0.043288,-0.016853,-0.005808,0.014007,-0.011440,-0.008861,0.000515,1.0,1.0
2010-01-20,-0.015536,0.043288,-0.016853,-0.005808,0.014007,-0.011440,-0.000923,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...
2018-06-20,0.004353,-0.016292,-0.000530,-0.010326,0.000524,-0.008251,0.000581,1.0,1.0
2018-06-22,-0.002916,-0.005592,0.004353,-0.016292,-0.000530,-0.010326,0.000946,1.0,1.0
2018-06-25,-0.014983,-0.002916,-0.005592,0.004353,-0.016292,-0.000530,0.000358,1.0,1.0
2018-06-28,0.007250,-0.001465,0.012330,-0.014983,-0.002916,-0.005592,0.001745,1.0,1.0


In [169]:
# number of trades in total
(Apple_test['Predict_dir'].diff().dropna() != 0).sum()

346

In [171]:
# number of false prediction
(Apple_test['Predict_dir'] != Apple_test['dir']).sum()

0

In [172]:
# Calculate return from strategy
Apple_test['Returns_Step'] = Apple_test['Predict_dir'] * Apple_test['AAPL.O']

np.exp(np.sum(Apple_test[['AAPL.O', 'Returns_Step']]))

AAPL.O          2.124809
Returns_Step    1.158832
dtype: float64