# The overfitting problem

![Table illustrating the overfitting problem](<src/09_Table_The Overfitting.png>)

## Load the data

In [2]:
import pandas as pd

# Read the processed price history from the Excel workbook: the first
# column becomes the index and the 'Date' column is parsed as datetimes.
df = pd.read_excel(
    'data/Microsoft_LinkedIn_Processed.xlsx',
    index_col=0,
    parse_dates=['Date'],
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,change_tomorrow,change_tomorrow_direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-12-08,61.299999,61.580002,60.840000,61.009998,55.705235,21220800,1.549141,UP
2016-12-09,61.180000,61.990002,61.130001,61.970001,56.581772,27349400,0.321694,UP
2016-12-12,61.820000,62.299999,61.720001,62.169998,56.764374,20198100,1.286125,UP
2016-12-13,62.500000,63.419998,62.240002,62.980000,57.503944,35718900,-0.478620,DOWN
2016-12-14,63.000000,63.450001,62.529999,62.680000,57.230022,30352700,-0.159793,DOWN
...,...,...,...,...,...,...,...,...
2023-12-18,369.450012,373.000000,368.679993,372.649994,372.649994,21802900,0.163429,UP
2023-12-19,371.489990,373.260010,369.839996,373.260010,373.260010,20603700,-0.712324,DOWN
2023-12-20,375.000000,376.029999,370.529999,370.619995,370.619995,26316700,0.781714,UP
2023-12-21,372.559998,374.410004,370.040009,373.540009,373.540009,17708000,0.277638,UP


## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [3]:
# Target: the column the model is asked to predict.
target = df['change_tomorrow']
# Explanatory: the columns the model uses to compute that prediction.
explanatory = df.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']]

## Train test split

### Split the dataset

In [4]:
# Number of rows (one per index entry) available in the dataset.
n_days = df.shape[0]
n_days

1772

In [5]:
# Row position of the train/test cut: the first 70% of the rows
# (truncated to an integer) go to the training set.
n_days_split = int(0.7 * n_days)
n_days_split

1240

In [6]:
# Positional split without shuffling: rows before the cut-off train the
# model, the remaining rows are held out for testing.
X_train = explanatory.iloc[:n_days_split]
X_test = explanatory.iloc[n_days_split:]
y_train = target.iloc[:n_days_split]
y_test = target.iloc[n_days_split:]

### Fit the model on train set

In [7]:
from sklearn.tree import DecisionTreeRegressor

In [8]:
# Decision tree limited to depth 15; the fixed random_state keeps the
# fitted tree reproducible between runs.
model_dt_split = DecisionTreeRegressor(
    random_state=42,
    max_depth=15,
)

In [9]:
# Fit on the training rows only; the test rows stay unseen by the model.
model_dt_split.fit(X_train, y_train)

### Evaluate model

#### On test set

In [10]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out (test) rows.
y_pred_test = model_dt_split.predict(X_test)
mean_squared_error(y_test, y_pred_test)

6.256383247671782

#### On train set

In [11]:
# Mean squared error on the same rows the model was fitted on.
y_pred_train = model_dt_split.predict(X_train)
mean_squared_error(y_train, y_pred_train)

0.7362062867421468

## Backtesting

In [18]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [19]:
class Regression(Strategy):
    """Trading strategy driven by a decision-tree forecast.

    On every bar the fitted model predicts a value from the latest row of
    data; the prediction is compared against ``limit_buy`` and
    ``limit_sell`` to decide whether to place a buy or a sell order.
    """

    # Forecast must exceed this value to trigger a buy.
    # NOTE(review): presumably expressed in the same units as
    # `change_tomorrow` -- confirm against the dataset.
    limit_buy = 1
    # Forecast must fall below this value to trigger a sell.
    limit_sell = -5

    def init(self):
        """Create the model, fit it, and reset the bought/not-bought flag.

        The model is always fitted on the notebook-level ``X_train`` /
        ``y_train``, regardless of which data the backtest is run on.
        """
        self.already_bought = False
        self.model = DecisionTreeRegressor(random_state=42, max_depth=15)
        self.model.fit(X_train, y_train)

    def next(self):
        """Forecast from the most recent row and place an order if warranted."""
        # Double brackets keep the last row as a one-row DataFrame, which
        # is the 2-D input that `predict` is given.
        latest_row = self.data.df.iloc[[-1], :]
        predicted_change = self.model.predict(latest_row)[0]

        if not self.already_bought:
            # Only buy when the flag says nothing has been bought yet.
            if predicted_change > self.limit_buy:
                self.buy()
                self.already_bought = True
        elif predicted_change < self.limit_sell:
            # Flag is set: a sufficiently negative forecast triggers a sell.
            self.sell()
            self.already_bought = False

### Run the backtest on `test` data

In [20]:
# Backtest the strategy on the held-out (test) rows.
bt = Backtest(
    X_test,
    Regression,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)

In [21]:
results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the statistics up to and including 'Return [%]' and label the
# single column as the out-of-sample run.
df_results_test = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'Out of Sample (Test)'})
)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2021-11-11 00:00:00
End,2023-12-22 00:00:00
Duration,771 days 00:00:00
Exposure Time [%],97.744361
Equity Final [$],10614.803101
Equity Peak [$],10867.363648
Return [%],6.148031


### Run the backtest on `train` data

In [22]:
# Backtest the same strategy on the rows the model was fitted on.
bt = Backtest(
    X_train,
    Regression,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)

results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the statistics up to and including 'Return [%]' and label the
# single column as the in-sample run.
df_results_train = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'In Sample (Train)'})
)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2016-12-08 00:00:00
End,2021-11-10 00:00:00
Duration,1798 days 00:00:00
Exposure Time [%],99.758065
Equity Final [$],71613.352638
Equity Peak [$],72131.228981
Return [%],616.133526


### Compare both backtests

In [23]:
# Place the out-of-sample and in-sample summaries side by side, one
# column per run, aligned on the statistic names.
df_results = pd.concat(
    [df_results_test, df_results_train],
    axis='columns',
)
df_results

Unnamed: 0,Out of Sample (Test),In Sample (Train)
Start,2021-11-11 00:00:00,2016-12-08 00:00:00
End,2023-12-22 00:00:00,2021-11-10 00:00:00
Duration,771 days 00:00:00,1798 days 00:00:00
Exposure Time [%],97.744361,99.758065
Equity Final [$],10614.803101,71613.352638
Equity Peak [$],10867.363648,72131.228981
Return [%],6.148031,616.133526


## Practice to master the knowledge

Work on the challenge with another dataset:

1. Learn the <a>mental models</a> to solve the challenge faster.
2. Complete the <a href="09D_The Overfitting Problem.ipynb">notebook</a>.