# Challenge: Optimization on Other Datasets

## Download data from `yfinance`

In [1]:
import yfinance as yf

ticker = 'HAL.NS'
df = yf.download(ticker)
df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-04-02,563.849976,577.500000,559.000000,574.625000,405.803162,352730
2018-04-03,570.500000,573.500000,563.174988,564.299988,398.511566,115130
2018-04-04,565.000000,570.000000,557.500000,557.950012,394.027161,163364
2018-04-05,561.000000,565.625000,554.750000,556.974976,393.338562,149988
2018-04-06,551.750000,559.424988,550.025024,551.424988,389.419189,146026
...,...,...,...,...,...,...
2023-12-21,2639.000000,2737.949951,2585.000000,2706.500000,2706.500000,2206814
2023-12-22,2723.000000,2727.000000,2677.600098,2703.699951,2703.699951,1000294
2023-12-26,2710.000000,2829.949951,2701.250000,2820.649902,2820.649902,2160870
2023-12-27,2832.949951,2836.949951,2788.000000,2808.199951,2808.199951,1156573


## Preprocess the data

### Filter the date range

- Since 1 year ago at least

In [2]:
df = df.loc['2020-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [3]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [4]:
df = df.dropna().copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01,367.500000,367.575012,365.000000,365.100006,272.420624,6386,0.686829
2020-01-02,367.475006,374.625000,364.674988,367.625000,274.304626,26118,0.223920
2020-01-03,368.899994,378.450012,361.750000,368.450012,274.920227,277418,-1.014418
2020-01-06,365.549988,369.725006,360.850006,364.750000,272.159393,23622,0.734795
2020-01-07,362.600006,369.725006,362.500000,367.450012,274.174011,15998,0.088389
...,...,...,...,...,...,...,...
2023-12-20,2797.949951,2819.949951,2651.000000,2665.899902,2665.899902,1947471,1.500096
2023-12-21,2639.000000,2737.949951,2585.000000,2706.500000,2706.500000,2206814,-0.103564
2023-12-22,2723.000000,2727.000000,2677.600098,2703.699951,2703.699951,1000294,4.146206
2023-12-26,2710.000000,2829.949951,2701.250000,2820.649902,2820.649902,2160870,-0.443343


## Compute Machine Learning model

Proposal: Random Forest within `ensemble` module of `sklearn` library

In [5]:
X = df.drop(columns=['Adj Close', 'change_tomorrow'])
y = df.change_tomorrow

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
model_rf = RandomForestRegressor(max_depth=15)

In [8]:
model_rf.fit(X=X, y=y)

## Parametrize & optimize the investment strategy

### Create Strategy class

In [14]:
from backtesting import Strategy

class Regression(Strategy):
  limit_buy = 1
  limit_sell = -5
  
  def init(self):
    self.model = model_rf
    self.already_bought = False

  def next(self):
      explanatory_today = self.data.df.iloc[[-1],:]
      forecast_tomorrow = model_rf.predict(explanatory_today)[0]
      
      # conditions to sell or buy
      if forecast_tomorrow > 5 and self.already_bought==False:
          self.buy()
          self.already_bought = True

      elif forecast_tomorrow < -5 and self.already_bought:
          self.sell()
          self.already_bought = False

      else:
          pass

### Create `Backtest` class

In [15]:
from backtesting import Backtest

In [16]:
bt = Backtest(
    X, Regression, cash=10000,
    commission=.002, exclusive_orders=True
)

### Optimize backtesting with multiple combinations

In [None]:
import multiprocessing as mp
mp.set_start_method('fork')

In [None]:
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy=[0, 10],
    limit_sell=[-10, 0],
    maximize='Return [%]',
    method='skopt',
    max_tries=500,
    random_state=0,
    return_heatmap=True,
    return_optimization=True
    )

### Which are the best values for the parameters?

In [None]:
dff = heatmap.reset_index()
dff = dff.pivot(index='limit_buy', columns='limit_sell', values='Return [%]')
dff

In [None]:
dff.style.highlight_null(props='background-color: transparent; color: transparent')

In [None]:
import numpy as np
dff.sort_index(axis=1, ascending=False)\
  .style.format(precision=0)\
  .background_gradient(vmin=np.nanmin(dff), vmax=np.nanmax(dff))\
  .highlight_null(props='background-color: transparent; color: transparent')

### How to interpret smart optimization?

#### Number of evaluations

https://kernc.github.io/backtesting.py/doc/examples/Parameter%20Heatmap%20&%20Optimization.html

In [None]:
from skopt.plots import plot_evaluations

_ = plot_evaluations(optimize_result, bins=10)

#### Density

In [None]:
from skopt.plots import plot_objective

_ = plot_objective(optimize_result, n_points=10)

## How to solve the overfitting problem?

> Backtest the investment strategy on a validation set.

Next chapter → [The Overfitting Problem in Backtesting]()

![](<src/09_Table_The Overfitting.png>)