# Challenge: Overfitting on Other Datasets

## Download data from `yfinance`

In [1]:
import yfinance as yf

ticker = 'ITC.NS'
df = yf.download(ticker)
df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1996-01-01,5.550000,5.600000,5.533333,5.583333,3.303553,985500
1996-01-02,5.466666,5.566666,5.288888,5.372222,3.178642,7470000
1996-01-03,5.133333,5.254444,5.101111,5.200000,3.076741,15160500
1996-01-04,5.200000,5.332222,5.144444,5.297777,3.134595,12397500
1996-01-05,5.297777,5.277777,5.188888,5.202222,3.078056,5008500
...,...,...,...,...,...,...
2023-12-22,453.000000,455.899994,450.549988,455.200012,455.200012,11026317
2023-12-26,456.049988,458.899994,454.700012,456.450012,456.450012,5871301
2023-12-27,457.000000,458.500000,453.750000,457.100006,457.100006,7543383
2023-12-28,457.200012,464.700012,456.100006,464.100006,464.100006,20696101


## Preprocess the data

### Filter the date range

- Since 1 year ago at least

In [2]:
df = df.loc['2020-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [3]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [4]:
df = df.dropna().copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01,238.600006,238.600006,237.100006,238.100006,201.114258,4208837,0.729617
2020-01-02,238.199997,240.949997,238.100006,239.850006,202.592407,8402979,-0.566043
2020-01-03,241.000000,241.000000,238.000000,238.500000,201.452103,9284478,-1.446187
2020-01-06,237.500000,238.300003,235.000000,235.100006,198.580261,7636617,0.106225
2020-01-07,236.050003,237.899994,234.600006,235.350006,198.791428,8416741,-0.491032
...,...,...,...,...,...,...,...
2023-12-21,449.000000,453.750000,444.450012,451.450012,451.450012,11154625,0.823814
2023-12-22,453.000000,455.899994,450.549988,455.200012,455.200012,11026317,0.273853
2023-12-26,456.049988,458.899994,454.700012,456.450012,456.450012,5871301,0.142199
2023-12-27,457.000000,458.500000,453.750000,457.100006,457.100006,7543383,1.508296


## Machine Learning modelling

### Feature selection

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [5]:
y = df.change_tomorrow
X = df.drop(columns='change_tomorrow')

### Train test split

In [6]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

### Fit the model on train set

In [12]:
from sklearn.tree import DecisionTreeRegressor
model_dt_split = DecisionTreeRegressor(max_depth=15, random_state=42)
model_dt_split.fit(X=X_train, y=y_train)

### Evaluate model

#### On test set

In [13]:
from sklearn.metrics import mean_squared_error

y_pred_test = model_dt_split.predict(X=X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred_test)

4.715552929174557

#### On train set

In [14]:
y_pred_train = model_dt_split.predict(X=X_train)
mean_squared_error(y_true=y_train, y_pred=y_pred_train)

1.1655102255990133

## Backtesting

In [16]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [17]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

### Run the backtest on `test` data

In [22]:
bt_test = Backtest(X_test, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

  bt_test = Backtest(X_test, Regression,


In [23]:
results = bt_test.run(limit_buy=1, limit_sell=-5)

df_results_test = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'Out of Sample (Test)'}, axis=1)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2020-01-01 00:00:00
End,2023-12-21 00:00:00
Duration,1450 days 00:00:00
Exposure Time [%],99.085366
Equity Final [$],15305.888565
Equity Peak [$],16065.788772
Return [%],53.058886


### Run the backtest on `train` data

In [24]:
bt_train = Backtest(X_train, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

results = bt_train.run(limit_buy=1, limit_sell=-5)

df_results_train = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'In Sample (Train)'}, axis=1)
df_results_train

  bt_train = Backtest(X_train, Regression,


Unnamed: 0,In Sample (Train)
Start,2020-01-02 00:00:00
End,2023-12-28 00:00:00
Duration,1456 days 00:00:00
Exposure Time [%],99.396682
Equity Final [$],46369.667959
Equity Peak [$],49899.61611
Return [%],363.69668


### Compare both backtests

- HINT: Concatenate the previous `DataFrames`

In [25]:
import pandas as pd
df_results = pd.concat([df_results_test, df_results_train], axis=1)
df_results

Unnamed: 0,Out of Sample (Test),In Sample (Train)
Start,2020-01-01 00:00:00,2020-01-02 00:00:00
End,2023-12-21 00:00:00,2023-12-28 00:00:00
Duration,1450 days 00:00:00,1456 days 00:00:00
Exposure Time [%],99.085366,99.396682
Equity Final [$],15305.888565,46369.667959
Equity Peak [$],16065.788772,49899.61611
Return [%],53.058886,363.69668


#### Plot both backtest reports

In [26]:
bt_test.plot(filename='reports_backtesting/regression_test_set.html')
bt_train.plot(filename='reports_backtesting/regression_train_set.html')

## How to solve the overfitting problem?

> Walk Forward Validation as a realistic approach to backtesting.

Next tutorial → [Walk Forward Validation]()

![](<src/10_Table_Validation Methods.png>)