# Imports 

In [2]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from backtesting import Backtest, Strategy

import multiprocessing as mp

# Data

In [3]:
df = pd.read_excel('data/MSFT_linkedin_processed.xlsx', parse_dates=['Date'], index_col=0)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow,change_tomorrow_direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-12-08,61.299999,61.580002,60.84,61.009998,21220800,1.549141,UP
2016-12-09,61.18,61.990002,61.130001,61.970001,27349400,0.321694,UP
2016-12-12,61.82,62.299999,61.720001,62.169998,20198100,1.286125,UP
2016-12-13,62.5,63.419998,62.240002,62.98,35718900,-0.47862,DOWN
2016-12-14,63.0,63.450001,62.529999,62.68,30352700,-0.159793,DOWN


In [4]:
df = df.drop(['change_tomorrow_direction'], axis=1)

## Walk Forward Validation

In [5]:
ts = TimeSeriesSplit(test_size=200)

splits = ts.split(X=df)

In [6]:
split1= next(splits)
split1

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [7]:
list_df_train = []
list_df_test = []

for index_train, index_test in ts.split(df):
    list_df_train.append(df.iloc[index_train])
    list_df_test.append(df.iloc[index_test])

In [8]:
list_df_train[0]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-08,61.299999,61.580002,60.840000,61.009998,21220800,1.549141
2016-12-09,61.180000,61.990002,61.130001,61.970001,27349400,0.321694
2016-12-12,61.820000,62.299999,61.720001,62.169998,20198100,1.286125
2016-12-13,62.500000,63.419998,62.240002,62.980000,35718900,-0.478620
2016-12-14,63.000000,63.450001,62.529999,62.680000,30352700,-0.159793
...,...,...,...,...,...,...
2019-12-18,154.300003,155.479996,154.179993,154.369995,24129200,0.860582
2019-12-19,154.000000,155.770004,153.750000,155.710007,24958900,1.079980
2019-12-20,157.350006,158.490005,156.289993,157.410004,53477500,0.000000
2019-12-23,158.119995,158.119995,157.270004,157.410004,17718200,-0.019061


In [9]:
list_df_test[0]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-26,157.559998,158.729996,157.399994,158.669998,14520600,0.182441
2019-12-27,159.449997,159.550003,158.220001,158.960007,18412800,-0.869351
2019-12-30,158.990005,159.020004,156.729996,157.589996,16348400,0.069753
2019-12-31,156.770004,157.770004,156.449997,157.699997,18369400,1.817954
2020-01-02,158.779999,160.729996,158.330002,160.619995,22622100,-1.260875
...,...,...,...,...,...,...
2020-10-05,207.220001,210.410004,206.979996,210.380005,21331600,-2.170852
2020-10-06,208.820007,210.179993,204.820007,205.910004,28554300,1.868178
2020-10-07,207.059998,210.110001,206.720001,209.830002,25681100,0.356159
2020-10-08,210.509995,211.190002,208.320007,210.580002,19925800,2.423426


## ML Model

Define feature and target

In [10]:
y = df.change_tomorrow
X = df[['Open','High','Low','Close','Volume']]

In [11]:
# Train test split

list_df_train = []
list_df_test = []

for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]

In [12]:
# Call and fit model

model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)
model_dt.fit(X_train, y_train)

# Evaluation
y_pred = model_dt.predict(X_test)
error_mse = mean_squared_error(y_test, y_pred)
error_mse

3.256057680177794

In [13]:
# Add the procedure inside the for loop
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)

error_mse_list = []

for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]
    
    model_dt.fit(X_train, y_train)
    
    y_pred = model_dt.predict(X_test)
    error_mse = mean_squared_error(y_test, y_pred)
    
    error_mse_list.append(error_mse)

error_mse_list

[10.969597948232485,
 32.215004985812634,
 6.918122049560407,
 5.572134400039568,
 3.256057680177794]

In [14]:
np.mean(error_mse_list)

11.786183412764577

### Backtesting

In [15]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    n_train = 600
    coef_retrain = 200
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        X_train = self.data.df.iloc[:self.n_train, :-1]
        y_train = self.data.df.iloc[:self.n_train, -1]
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

In [16]:
class WalkForwardAnchored(Regression):
    def next(self):
        
        # We don't take any action and move on to the following day
        if len(self.data) < self.n_train:
            return
        
        # We retrain the model each 200 days
        if len(self.data) % self.coef_retrain == 0:
            X_train = self.data.df.iloc[:, :-1]
            y_train = self.data.df.iloc[:, -1]

            self.model.fit(X_train, y_train)

            super().next()
            
        else:
            
            super().next()

In [17]:
bt = Backtest(df, WalkForwardAnchored, cash=10000, commission=.002, exclusive_orders=True)

In [18]:
mp.set_start_method('fork')

In [21]:
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [22]:
# We need to use nupy version 19.0.0
# pip install numpy==1.19


## Unanchored Walk Forward

In [23]:
%load_ext autoreload
%autoreload 2

In [24]:
import strategies

In [25]:
strategies.WalkForwardUnanchored

strategies.WalkForwardUnanchored

In [26]:
bt_unanchored = Backtest(df, strategies.WalkForwardUnanchored, cash=10000, commission=.002, exclusive_orders=True)

stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

Plot each model

In [None]:
bt.plot(filename='reports_backtesting/walk_forward_anchored.html')

In [None]:
bt_unanchored.plot(filename='reports_backtesting/walk_forward_unanchored.html')