# Imports

In [17]:
import pandas as pd 
import numpy as np 

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from backtesting import Backtest, Strategy

# Data

In [2]:
# Read the pre-processed MSFT workbook: the first column becomes the index
# and the 'Date' column is parsed as datetimes.
df = pd.read_excel(
    'data/MSFT_linkedin_processed.xlsx',
    index_col=0,
    parse_dates=['Date'],
)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow,change_tomorrow_direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-12-08,61.299999,61.580002,60.84,61.009998,21220800,1.549141,UP
2016-12-09,61.18,61.990002,61.130001,61.970001,27349400,0.321694,UP
2016-12-12,61.82,62.299999,61.720001,62.169998,20198100,1.286125,UP
2016-12-13,62.5,63.419998,62.240002,62.98,35718900,-0.47862,DOWN
2016-12-14,63.0,63.450001,62.529999,62.68,30352700,-0.159793,DOWN


Define target and features

In [3]:
# Explanatory variables: the price and volume columns.
features = df[
    ['Open', 'High', 'Low', 'Close', 'Volume']
]
# Value to predict: the `change_tomorrow` column.
target = df['change_tomorrow']

In [4]:
# Number of rows in the dataset.
n_days = df.shape[0]
n_days

1766

In [5]:
# Row position where the split happens: 70% of the rows, truncated to an int.
n_days_split = int(0.70 * n_days)
n_days_split

1236

Define train, test split

In [6]:
# Positional split with no shuffling: rows before n_days_split are the
# train set, rows from n_days_split onwards are the test set.
X_train = features.iloc[:n_days_split]
X_test = features.iloc[n_days_split:]

y_train = target.iloc[:n_days_split]
y_test = target.iloc[n_days_split:]

In [10]:
# Sanity-check the split sizes. The original f-strings had no placeholders,
# so the `f` prefix did nothing; the shape is now interpolated directly.
# The printed text is unchanged.
print(f'Shape of train set is {X_train.shape}')
print(f'Shape of test set is {X_test.shape}')
print(f'Shape of train target is {y_train.shape}')
print(f'Shape of test target is {y_test.shape}')

Shape of train set is (1236, 5)
Shape of test set is (530, 5)
Shape of train target is (1236,)
Shape of test target is (530,)


Call and fit the model 

In [12]:
# Regression tree capped at depth 15, with random_state pinned to 42.
model_dt_split = DecisionTreeRegressor(
    max_depth=15,
    random_state=42,
)
model_dt_split.fit(X=X_train, y=y_train)

In [16]:
# Predict on the test rows, which were not used for fitting.
y_pred_test = model_dt_split.predict(X_test)

# Mean squared error between the actual and predicted test targets.
mean_squared_error(y_test, y_pred_test)

6.650579349442743

In [15]:
# Predict on the same rows the model was fitted on.
y_pred_train = model_dt_split.predict(X_train)

# Mean squared error between the actual and predicted train targets.
mean_squared_error(y_train, y_pred_train)

0.9558860856576346

Because the model has already seen the training data, its error on the train set is much lower than on the test set; this gap is a sign of overfitting.

## Backtest

In [26]:
class Regression(Strategy):
    """Trading strategy driven by a decision-tree forecast.

    On every bar the fitted tree predicts a value from the most recent row
    of the backtest data. A buy order is placed when the forecast is above
    ``limit_buy`` and nothing is currently flagged as bought; a sell order
    is placed when the forecast is below ``limit_sell`` and the bought flag
    is set.
    """

    # Forecast thresholds. They are class attributes so that they can be
    # passed as keyword arguments to ``Backtest.run(...)``.
    limit_buy = 1
    limit_sell = -5

    def init(self):
        """Reset the bought flag and fit the tree on ``X_train`` / ``y_train``."""
        self.already_bought = False

        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        """Forecast from the latest row and place an order if a threshold is crossed."""
        # Double brackets keep the last row as a one-row DataFrame, which is
        # the 2-D input that ``predict`` is given here.
        latest_row = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(latest_row)[0]

        if not self.already_bought:
            # Not flagged as bought: only a buy signal is acted upon.
            if forecast_tomorrow > self.limit_buy:
                self.buy()
                self.already_bought = True
        elif forecast_tomorrow < self.limit_sell:
            # NOTE(review): this places a sell order rather than explicitly
            # closing the position -- confirm that this is the intent.
            self.sell()
            self.already_bought = False

Run backtest on **test** data

In [27]:
# Backtest the strategy on the test rows only.
bt_test = Backtest(
    X_test,
    Regression,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)

In [30]:
results = bt_test.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]' and label the
# single value column as the out-of-sample result.
df_results_test = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'Out of Sample (Test)'})
)

df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2021-11-05 00:00:00
End,2023-12-14 00:00:00
Duration,769 days 00:00:00
Exposure Time [%],96.037736
Equity Final [$],9633.907789
Equity Peak [$],10307.101416
Return [%],-3.660922


Try with train dataset

In [31]:
# Backtest the same strategy on the rows the model was trained on.
bt_train = Backtest(
    X_train,
    Regression,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)

results = bt_train.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]'. The column was
# previously labelled 'Out of Sample (Test)', copied over from the test
# cell, which mislabelled these in-sample results.
df_results_train = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'In Sample (Train)'})
)

df_results_train

Unnamed: 0,Out of Sample (Test)
Start,2016-12-08 00:00:00
End,2021-11-04 00:00:00
Duration,1792 days 00:00:00
Exposure Time [%],99.433657
Equity Final [$],62631.222154
Equity Peak [$],62839.8994
Return [%],526.312222


We can see that backtesting on the train data gives a return of more than 500%, but on the test dataset the strategy is not even profitable.

In [33]:
# Plot the test backtest first, then the train backtest.
bt_test.plot()
bt_train.plot()

  formatter=DatetimeTickFormatter(days=['%d %b', '%a %d'],
  formatter=DatetimeTickFormatter(days=['%d %b', '%a %d'],
  fig = gridplot(
  fig = gridplot(
  formatter=DatetimeTickFormatter(days=['%d %b', '%a %d'],
  formatter=DatetimeTickFormatter(days=['%d %b', '%a %d'],
  fig = gridplot(
  fig = gridplot(
