# Forecasting Returns with LightGBM for [M6 Competition](https://m6competition.com/)

<br>__[Aleksei Mashlakov](https://aleksei-mashlakov.github.io/)__ 
<br>
<br>
<a href='https://www.buymeacoffee.com/amashlakov' target='_blank'><img height='50' style='border:0px;height:50px;' src='https://www.buymeacoffee.com/assets/img/guidelines/download-assets-2.svg' border='0' alt='Buy Me a Coffee' /></a>

> :warning: **NO INVESTMENT ADVICE** :warning:​ This notebook is for educational/informational purposes only. The author is not responsible for any losses incurred as a result of using this notebook. 

In [None]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'

import os
import numpy as np
import pandas as pd
import random
import yfinance as yf
import warnings
import datetime
from pathlib import Path
import plotly.express as px
import plotly.graph_objs as go
import chart_studio.plotly as py
import plotly.io as pio
# pio.renderers.default = "notebook"
# Use plotly's "simple_white" template as the default for figures created below.
pio.templates.default = "simple_white"

# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings("ignore")
# pd.options.display.float_format = '{:.4%}'.format

# Reproducibility: seed Python's and NumPy's global random number generators.

random.seed(0)
np.random.seed(0)

In [None]:
# Switch the working directory to the parent of the current one and keep a
# handle on its "data" folder for all later file access.
wd_path = Path(os.getcwd()).parent
os.chdir(wd_path)
data_path = wd_path.joinpath("data")

In [None]:
# Load the M6 asset universe; the first CSV column becomes the index.
df_m6 = pd.read_csv(data_path / "template/M6_Universe.csv", index_col=0)
# Rename only the exact ticker "FB". `Series.replace` matches whole values,
# whereas `.str.replace` rewrites every symbol that merely contains "FB".
df_m6["symbol"] = df_m6["symbol"].replace("FB", "META")
df_m6.head(5)

In [None]:
# Symbols of the universe split by asset class.
stocks = df_m6.loc[df_m6["class"].eq("Stock"), "symbol"].tolist()
etfs = df_m6.loc[df_m6["class"].eq("ETF"), "symbol"].tolist()

In [None]:
# Notebook-wide settings.
# NOTE(review): SAMPLE_SIZE and FORECAST_HORIZON are not read in the visible cells — confirm they are still needed.
SAMPLE_SIZE: int = 100 # number of stocks to sample
FORECAST_HORIZON: int = 20 # days ahead to forecast
PERIODS: int = 20 # number of periods used for the return features and for the feature shift

In [None]:
from tqdm.notebook import tqdm
from src.io import get_ticker_historical_data, get_dre_ticker_data, get_today_date
import pandas_datareader as pdr

SAVE_DIRECTORY = data_path / "results/tickers"
SAVE_TICKERS = False  # set to True to also write each ticker's history to CSV

# Create the output directory together with any missing parents; unlike a
# check-then-create sequence this cannot fail when the directory already exists.
SAVE_DIRECTORY.mkdir(parents=True, exist_ok=True)

tickers = df_m6["symbol"].to_list()
from_date = pd.to_datetime("2018-01-01")
to_date = pd.to_datetime("2023-02-03")
interval = "1d"
tickers_data = {}

for ticker in tqdm(tickers):
    try:
        if ticker == "DRE":
            # DRE comes from its dedicated helper and is cut to the same window.
            data = get_dre_ticker_data().loc[from_date:to_date, :]
        else:
            data = get_ticker_historical_data(
                ticker=ticker, from_date=from_date, to_date=to_date, interval=interval
            )
        tickers_data[ticker] = data
        if SAVE_TICKERS:
            data.reset_index().to_csv(SAVE_DIRECTORY / f"{ticker}_{interval}.csv")
    except Exception as e:
        # Best effort: report the failing ticker and carry on with the rest.
        # A ticker that fails here is simply absent from `tickers_data`.
        print(f"Exception for {ticker=}: {e}")


> Note that the DRE stock has been removed from the Yahoo Finance (yfinance) API, so we will use its historical data  

In [None]:
# Asset prices as provided by the M6 submission platform, with parsed dates
# and the exact symbol "FB" renamed to "META".

path = data_path.joinpath("raw", "assets_m6.csv")
m6_price_data = pd.read_csv(path).assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    symbol=lambda frame: frame["symbol"].replace("FB", "META"),
)
# m6_price_data = m6_price_data.pivot(index="date", columns="symbol", values="price")
m6_price_data.head()

In [None]:
from src.time_features import reindex_weekdays

# Pass every downloaded frame through `reindex_weekdays` over the full
# download window (presumably aligning all tickers on one calendar — see
# src.time_features for the exact behaviour).
for symbol in list(tickers_data):
    tickers_data[symbol] = reindex_weekdays(
        tickers_data[symbol], start_index=from_date, end_index=to_date
    )

### Convert returns to ranking

:warning: Note that this is not the best way to convert returns to ranking because it does not properly handle tied values. We will use this approach for simplicity :warning: 

In [None]:
from src.ticker_features import calculate_pct_returns

# Tiny Gaussian jitter, one value per asset, added to every row of returns so
# that exactly equal returns do not collapse into duplicate quantile edges.
# It is sized from the tickers actually downloaded rather than a hard-coded
# 100, so a failed download no longer causes a shape mismatch.
n_assets = len(tickers_data)
random_noise = np.random.normal(0, 1e-12, size=n_assets)
df = (
    pd.DataFrame.from_dict({k: v["Adj Close"] for k, v in tickers_data.items()})
    .apply(calculate_pct_returns, periods=PERIODS, axis=0)
    .dropna()
    .add(random_noise, axis="columns")
)

# Per date, bucket the assets into return quintiles. With labels=False,
# `pd.qcut` returns the integer bin index: 0 = lowest returns, 4 = highest.
quintile_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
target_data = df.apply(
    lambda row: pd.qcut(row, q=quintile_edges, labels=False), axis=1
)

# Sanity check: with equally sized quintiles the codes of one row sum to
# (n_assets / 5) * (0 + 1 + 2 + 3 + 4) = 2 * n_assets, i.e. 200 for 100 assets.
# Rows that deviate are reported. This assumes n_assets is divisible by 5.
expected_code_sum = 2 * n_assets
row_code_sums = target_data.sum(axis=1)
for idx, code_sum in row_code_sums[row_code_sums != expected_code_sum].items():
    print(idx, code_sum)

target_data = target_data.astype(int)

The other way to get ranking would be the following:
```python
from src.ticker_features import calculate_pct_returns

df = pd.DataFrame.from_dict({k: v["Adj Close"] for k, v in tickers_data.items()})

target_data2 = (
    (
        df.copy()
        .apply(calculate_pct_returns, periods=PERIODS, axis=0)
        .apply(lambda x: x + random_noise, axis=1)
        .dropna()
        .rank(1, ascending=True, method="min")
        // (20.0 + 1e-12)
        + 1
    )
    .clip(upper=5)
    .astype(int)
)
target_data2 -= 1
```

### Main pipeline

* Uses custom trading strategy with several indexes (SMA, EMA, RSI, MACD, etc.)
* Features based on the transformation of stock prices, volumes
* Categorical features based on the ticker properties (sector, industry, etc.)

In [None]:

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from src.reduce_memory import ReduceMemoryTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from src.strategy import CustomStrategy1, CustomStrategy
from src.ticker_features import upper_shadow, lower_shadow, upper_shadow_percent, lower_shadow_percent
from src.transformers import DateTimeTransformer
from src.ticker_features import calculate_pct_returns, calculate_log_returns, calculate_cum_log_returns, calculate_cum_pct_returns

tickers_data_enriched = {}

date_time_transforms = make_pipeline(DateTimeTransformer())
memory_transforms = make_pipeline(ReduceMemoryTransformer())

# Return-style features derived from the adjusted close. The dict order fixes
# the order in which the columns are appended.
return_features = {
    "cum_log_returns": calculate_cum_log_returns,
    "log_returns": calculate_log_returns,
    "cum_prc_returns": calculate_cum_pct_returns,
    "prc_returns": calculate_pct_returns,
}

for k, v in tickers_data.items():
    # Work on a copy so the raw download stays untouched.
    df = reindex_weekdays(v.copy(), drop_weekends=True, start_index=from_date)
    # Run the project's indicator strategy on the frame (presumably appends
    # the SMA/EMA/RSI/MACD columns in place — see src.strategy).
    df.ta.strategy(CustomStrategy)

    for feature_name, feature_func in return_features.items():
        df[f"{feature_name}_{PERIODS}"] = (
            df[["Adj Close"]].apply(feature_func, periods=PERIODS, axis=0).values
        )

    df["high2low"] = df["High"] / df["Low"]
    df["high_low"] = df["High"] - df["Low"]
    # The window follows PERIODS so the column name and the statistic agree.
    df[f"var_{PERIODS}"] = df["Adj Close"].rolling(PERIODS).var()
    df["upper_shadow"] = upper_shadow(df)
    df["lower_shadow"] = lower_shadow(df)
    df["upper_shadow_percent"] = upper_shadow_percent(df)
    df["lower_shadow_percent"] = lower_shadow_percent(df)
    # The small offset keeps the logarithm finite when a value is zero.
    df["log_volume"] = np.log(df["Volume"] + 1e-8)
    df["log_high"] = np.log(df["High"] + 1e-8)
    df["log_low"] = np.log(df["Low"] + 1e-8)

    # `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")`.
    df = df.ffill()
    df = reindex_weekdays(df, drop_weekends=True, start_index=from_date)
    df = memory_transforms.fit_transform(df)

    # Static, per-asset categorical features; the universe row is looked up once.
    asset_info = df_m6.loc[df_m6["symbol"] == k].iloc[0]
    df["GICS_sector/ETF_type"] = asset_info["GICS_sector/ETF_type"]
    df["GICS_industry/ETF_subtype"] = asset_info["GICS_industry/ETF_subtype"]
    df["group"] = k
    df["ticker"] = "stock" if k in stocks else "etf"

    tickers_data_enriched[k] = df

### Add datetime features 

* One hot encodings of holidays
* Spline of day of the week
* Kernels for holidays

In [None]:
from src.time_features import get_datetime_covariates

# Calendar covariates for the download window, passed through
# `reindex_weekdays` with weekends dropped.
covariates = reindex_weekdays(
    get_datetime_covariates(
        from_date,
        to_date,
        memory_transforms=memory_transforms,
        date_time_transforms=date_time_transforms,
    ),
    drop_weekends=True,
)

### Concatenate all features

In [None]:
# Build one frame per asset and stack them. The target stays at its own date,
# while the previous target, the asset features and the calendar covariates
# are all shifted by PERIODS rows.
per_asset_frames = []
for symbol in tickers_data_enriched:
    asset_target = target_data[[symbol]].astype(int)
    asset_frame = pd.concat(
        [
            asset_target.rename(columns={symbol: "target"}),
            asset_target.rename(columns={symbol: "last_target"}).shift(PERIODS),
            tickers_data_enriched[symbol].shift(PERIODS),
            covariates.shift(PERIODS),
        ],
        axis=1,
    )
    per_asset_frames.append(asset_frame.dropna(how="all", axis=0))

# Keep only rows that are complete across all columns.
data = pd.concat(per_asset_frames).dropna()
data["target"] = data["target"].astype(int)
data.columns

In [None]:
# Static (per-asset) categorical columns; every other column except the target
# is treated as a time-varying real feature.
# (The "month" / "day_of_week" categoricals are currently disabled.)
static_columns = [
    "group",
    "ticker",
    "GICS_sector/ETF_type",
    "GICS_industry/ETF_subtype",
]
time_var_reals = [
    column for column in data.columns if column not in static_columns + ["target"]
]

# data = memory_transforms.fit_transform(data)
data["target"] = data["target"].astype(int)

In [None]:
from sklearn import preprocessing
# Integer-encode the static categorical columns. The fitted encoders are kept
# in `les`, in the same order as `static_columns` (so `les[0]` belongs to
# "group"), because the submission cell calls `les[0].inverse_transform` to map
# the codes back to symbols.
les = []
for column in static_columns:
    le = preprocessing.LabelEncoder()
    data[column] = le.fit_transform(data[column].values)
    les.append(le)

> The data is fully ready now, let's do the backtesting and evaluation

#### First, let's test the LightGBM model with calibration

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, cross_val_score, TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit
from src.metrics import portfolio_rps

# Monthly walk-forward backtest: for each 28-day competition month, fit on all
# rows up to the start of the month and score the single day that closes it.
train_start = data.index[0]
scalers = []
classifiers = []
train_rpss = []
test_rpss = []

# The per-asset grouping is the same for every fold, so build it once.
groupby_full = data.groupby("group")

for i, test_start in enumerate(
    pd.date_range(start="2022-03-04", end="2023-02-03", freq="28D", inclusive="left")
):
    test_end = test_start + pd.Timedelta(days=28)
    print(f"Competition Month {i+1:2d}")
    # use the last three months (3 x 28 days) of training data for calibration
    valid_cutoff = test_start - pd.Timedelta(days=28 * 3)
    train_end = valid_cutoff - pd.Timedelta(days=1)
    # calib_cutoff = test_start - pd.Timedelta(days=1)
    print(f"Train data: {train_start.date()} -- {valid_cutoff.date()- pd.Timedelta(days=1)}")
    print(f"Valid data: {valid_cutoff.date()} -- {test_start.date()}")
    print(f"Test data: {test_end.date()} -- {test_end.date()}")

    # split the data per group into train, calibration and test
    # (the test slice holds only the rows dated `test_end`)
    groupby_train = groupby_full.apply(lambda group: group.loc[:train_end])
    groupby_valid = groupby_full.apply(lambda group: group.loc[valid_cutoff:test_start])
    groupby_test = groupby_full.apply(lambda group: group.loc[test_end:test_end])

    # Fit the scaler on the training rows only, then reuse it for the other splits.
    X_scaler = MinMaxScaler()
    groupby_train.loc[:, time_var_reals] = X_scaler.fit_transform(
        groupby_train.loc[:, time_var_reals]
    )
    for split_frame in (groupby_valid, groupby_test):
        split_frame.loc[:, time_var_reals] = X_scaler.transform(
            split_frame.loc[:, time_var_reals]
        )
    scalers.append(X_scaler)

    y_train_df = groupby_train.loc[:, "target"]
    X_train_df = groupby_train.drop(["target"], axis=1)

    y_calib_df = groupby_valid.loc[:, "target"]
    X_calib_df = groupby_valid.drop(["target"], axis=1)

    y_test_df = groupby_test.loc[:, "target"]
    X_test_df = groupby_test.drop(["target"], axis=1)

    # groupby_submit.loc[:, time_var_reals] = X_scaler.transform(groupby_submit.loc[:, time_var_reals])

    X_train = X_train_df.values
    y_train = y_train_df.values.astype(int)

    X_calib = X_calib_df.values
    y_calib = y_calib_df.values.astype(int)

    X_test = X_test_df.values
    y_test = y_test_df.values.astype(int)

    # X_train, X_calib, y_train, y_calib = train_test_split(X, y, random_state=42, shuffle=False)
    # concatenate train and calibration data for classifier without calibration
    x_tr = np.concatenate((X_train, X_calib), axis=0)
    y_tr = np.concatenate((y_train, y_calib), axis=0)

    clf = LGBMClassifier(
        boosting_type="gbdt",
        learning_rate=0.02,
        n_estimators=100,
        random_state=10,
        max_depth=15,
        importance_type="gain",
        # lambda_l2=1.0,
    )  # min_data_in_leaf=500, num_leaves=50, max_depth=10,

    ### Model 1 - No calibration

    clf.fit(x_tr, y_tr)
    cal_clf_cv_probs_train = clf.predict_proba(x_tr)
    cal_clf_cv_probs_test = clf.predict_proba(X_test)

    # One-hot encode the outcomes with exactly as many columns as the model
    # predicts. Deriving the width from `np.max(y) + 1` breaks whenever a split
    # happens not to contain the highest class. Labels are assumed to be the
    # integer codes 0..n_values-1.
    n_values = cal_clf_cv_probs_test.shape[1]
    one_hot = np.eye(n_values)
    targets_train = one_hot[y_tr]
    targets_test = one_hot[y_test]

    ### Model 2 - Isotonic calibration

    # clf.fit(X_train, y_train)
    # cal_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit", ensemble=True)
    # cal_clf.fit(X_calib, y_calib)
    # cal_clf_cv_probs_train = cal_clf.predict_proba(X_train)
    # cal_clf_cv_probs_test = cal_clf.predict_proba(X_test)

    # targets_train = np.eye(cal_clf_cv_probs_train.shape[1])[y_train]
    # targets_test = np.eye(cal_clf_cv_probs_test.shape[1])[y_test]

    ### Model 3 - Isotonic calibration with cross validation

    # cal_clf_cv = CalibratedClassifierCV(
    #     clf, method="isotonic", cv=TimeSeriesSplit(n_splits=5), ensemble=True
    # )

    # cal_clf_cv.fit(x_tr, y_tr)
    # cal_clf_cv_probs_train = cal_clf_cv.predict_proba(x_tr)
    # cal_clf_cv_probs_test = cal_clf_cv.predict_proba(X_test)

    # targets_train = np.eye(cal_clf_cv_probs_train.shape[1])[y_tr]
    # targets_test = np.eye(cal_clf_cv_probs_test.shape[1])[y_test]

    train_rps = portfolio_rps(probs=cal_clf_cv_probs_train, outcome=targets_train)
    test_rps = portfolio_rps(probs=cal_clf_cv_probs_test, outcome=targets_test)

    train_rpss.append(train_rps)
    test_rpss.append(test_rps)

    print(f"Month {i+1:2d}: {test_start.date()} -- {test_end.date()}")
    # print(f"Classifier: {clf}")
    print(f"Month {i+1:2d}: {train_rps=}")
    print(f"Month {i+1:2d}: {test_rps=}")
    print("  ----------------------------------------  ")

In [None]:
def plot_feature_importance(model, X_train_df, importance_type):
    """Plot, save and publish a horizontal bar chart of feature importances.

    Based on https://www.kaggle.com/code/skylord/lgb-model-feature-importance/notebook

    Every label and file name is derived from the `model` and
    `importance_type` arguments. The previous version read the notebook-global
    `clf` instead, which is only correct when `model` happens to be `clf`.

    Args:
        model: Fitted estimator exposing `importance_type` and
            `feature_importances_`. Its `importance_type` attribute is
            overwritten with the requested value.
        X_train_df: Frame whose column names label the features, in the order
            the model was trained on.
        importance_type: Importance flavour to plot; the notebook passes
            'split' and 'gain'.

    Side effects:
        Writes an HTML file under `data_path / "results"`, sends the figure to
        chart_studio via `py.plot`, and displays it inline.
    """
    model.importance_type = importance_type
    value_column = f"Value_{importance_type.capitalize()}"

    # Ascending sort so the most important feature ends up at the top of the
    # horizontal bar chart.
    feature_imp = pd.DataFrame(
        list(zip(model.feature_importances_, list(X_train_df.columns))),
        columns=[value_column, "Feature"],
    ).sort_values(by=[value_column], ascending=True)

    trace2 = go.Bar(
        y=feature_imp["Feature"],
        x=feature_imp[value_column],
        name=f"feature_importance_{importance_type}",
        marker=dict(
            color="rgba(174, 255, 255, 0.5)", line=dict(color="rgb(0,0,0)", width=1.5)
        ),
        orientation="h",
        text=feature_imp["Feature"],
    )

    layout = go.Layout(
        barmode="group",
        title=f"Feature Importance by {importance_type.capitalize()}",
    )
    fig = go.Figure(data=[trace2], layout=layout)
    fig.write_html(data_path / f"results/lgbm_feature_importance_by_{importance_type}.html")
    py.plot(fig, filename=f"lgbm_feature_importance_by_{importance_type}", auto_open=True)
    fig.show()


# One chart per importance flavour, for the classifier and training frame left
# over from the last backtest fold.
feature_frame = X_train_df.rename(columns={"ticket": "ticker"})
for importance_kind in ("split", "gain"):
    plot_feature_importance(clf, feature_frame, importance_type=importance_kind)

In [None]:
# One row per backtested month, numbered from 1. The index is sized from the
# collected results instead of a hard-coded 12, so a different backtest window
# does not cause a length mismatch.
rps_backtest = pd.DataFrame(
    data={"Train RPS": train_rpss, "Test RPS": test_rpss},
    index=list(range(1, len(test_rpss) + 1)),
)
rps_backtest.mean()

In [None]:
# Average the monthly scores in consecutive blocks of three months (quarters),
# looking at the first twelve months at most.
quarter_rps = {}
for rps_column in ("Train RPS", "Test RPS"):
    returns = rps_backtest[rps_column].values
    quarter_rps[rps_column] = [
        np.mean(returns[start : start + 3])
        for start in range(0, 12, 3)
        if start < len(returns)
    ]
train_quarter_rps = quarter_rps["Train RPS"]
test_quarter_rps = quarter_rps["Test RPS"]

In [None]:
import plotly.express as px
# Quarterly train/test RPS as grouped bars. The index is sized from the
# computed quarters rather than a hard-coded 4.
rps_backtest_quarter = pd.DataFrame(
    data={"Train RPS": train_quarter_rps, "Test RPS": test_quarter_rps},
    index=list(range(1, len(test_quarter_rps) + 1)),
)
fig = px.bar(
    rps_backtest_quarter,
    x=rps_backtest_quarter.index,
    y=["Test RPS", "Train RPS"],
    barmode="group",
    color="variable",
    title="RPS quarterly backtest",  # was misspelled "quaterly"
)
# Dashed reference lines: mean monthly test RPS, mean monthly train RPS, and a
# fixed 0.16 line labelled "Benchmark".
fig.add_hline(y=rps_backtest["Test RPS"].mean(), line_dash="dash", line_width=3, line_color="blue")
fig.add_hline(y=rps_backtest["Train RPS"].mean(), line_dash="dash", line_width=3, line_color="orange")
fig.add_hline(y=0.16, line_dash="dash", line_width=3, line_color="red", name="Benchmark")
fig.update_layout(
    height=400,
    width=600,
    xaxis_title="Quarter",
    yaxis_title="RPS",
)
fig.write_html(data_path / "results/rps_quarterly_backtest.html")
py.plot(fig, filename="rps_quarterly_backtest", auto_open=True)
fig.show()

In [None]:
import plotly.express as px
# Monthly train/test RPS as grouped bars. The index is sized from the
# collected results rather than a hard-coded 12 months.
rps_backtest = pd.DataFrame(
    data={"Train RPS": train_rpss, "Test RPS": test_rpss},
    index=list(range(1, len(test_rpss) + 1)),
)
fig = px.bar(
    rps_backtest,
    x=rps_backtest.index,
    y=["Test RPS", "Train RPS"],
    barmode="group",
    color="variable",
    title="RPS monthly backtest",
)
# Dashed reference lines: mean test RPS, mean train RPS, and a fixed 0.16 line
# labelled "Benchmark".
fig.add_hline(y=rps_backtest["Test RPS"].mean(), line_dash="dash", line_width=3, line_color="blue")
fig.add_hline(y=rps_backtest["Train RPS"].mean(), line_dash="dash", line_width=3, line_color="orange")
fig.add_hline(y=0.16, line_dash="dash", line_width=3, line_color="red", name="Benchmark")
fig.update_layout(
    height=400,
    width=700,
    xaxis_title="Month",
    yaxis_title="RPS",
)
fig.write_html(data_path / "results/rps_monthly_backtest.html")
py.plot(fig, filename="rps_monthly_backtest", auto_open=True)
fig.show()

In [None]:
# Placeholders for tracking the best score and the number of training rounds
# that produced it.
# NOTE(review): neither variable is read again in the visible cells — confirm whether they are still needed.
best_score = 999
training_rounds = 10000
from src.darts_hop import logging_callback
import optuna

# Declare how we evaluate how good a set of hyperparameters are, i.e.
# declare an objective function.
def objective(trial):
    """Optuna objective: fit a calibrated LightGBM model and return its test RPS.

    Reads the notebook globals `x_tr`, `y_tr`, `X_test` and `y_test` (left by
    the last backtest fold) and scores with `portfolio_rps`.

    NOTE(review): the returned metric is computed on the test split, so the
    selected hyperparameters are tuned on the same data that is used to report
    performance.
    NOTE(review): `x_tr` appears to be ordered by asset and then by date, so
    the TimeSeriesSplit folds are presumably not strictly chronological —
    confirm this is intended.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RPS on the test split (lower is better).
    """
    # Specify a search space using distributions across plausible values of hyperparameters.
    param = {
        "verbosity": -1,
        "boosting_type": "gbdt",
        "seed": 42,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # Run LightGBM for the hyperparameter values
    clf = LGBMClassifier(**param)
    cal_clf_cv = CalibratedClassifierCV(
        clf, method="isotonic", cv=TimeSeriesSplit(n_splits=5), ensemble=True
    )

    cal_clf_cv.fit(x_tr, y_tr)
    cal_clf_cv_probs_train = cal_clf_cv.predict_proba(x_tr)
    cal_clf_cv_probs_test = cal_clf_cv.predict_proba(X_test)

    # One-hot encode the outcomes with exactly as many columns as the model
    # predicts. Using `np.max(y) + 1` gives the wrong width whenever a split
    # does not contain the highest class. Labels are assumed to be the integer
    # codes 0..n_values-1.
    n_values = cal_clf_cv_probs_test.shape[1]
    one_hot = np.eye(n_values)
    targets_train = one_hot[y_tr]
    targets_test = one_hot[y_test]

    print(f"Classifier: {clf}")
    train_rps = portfolio_rps(probs=cal_clf_cv_probs_train, outcome=targets_train)
    test_rps = portfolio_rps(probs=cal_clf_cv_probs_test, outcome=targets_test)
    print(f"train RPS = {train_rps}")
    print(f"test RPS = {test_rps}")

    # Return metric of interest
    return test_rps

# Suppress information-only output - otherwise optuna is
# quite verbose, which can be nice, but takes up a lot of space
optuna.logging.set_verbosity(optuna.logging.WARNING) 

# Both a trial budget (n_trials=100) and a time budget of 4 hours
# (4 * 3600 s = 14400 s) are passed to the search.
# We could instead pass only n_trials (e.g. 1000), or neither n_trials nor
# timeout so that the search keeps going until the user interrupts
# ("Cancel run").
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(direction='minimize',  sampler=optuna.samplers.TPESampler(seed=42))  
study.optimize(objective, n_trials=100, timeout=14400, callbacks=[logging_callback])

print(f"Best trial: \n{study.best_trial}\n")
print(f"Best value: {study.best_value}\n")
print(f"Best params: {study.best_params}\n")
# print(study.trials)

In [None]:
# for submission: refit on the train + calibration + test rows left by the last
# backtest fold, with 5-fold isotonic calibration.
clf = LGBMClassifier(boosting_type='gbdt', learning_rate=0.02, n_estimators=100, random_state=10, max_depth=15)
X_tr = np.concatenate((X_train, X_calib, X_test), axis=0)
y_tr = np.concatenate((y_train, y_calib, y_test), axis=0)
cal_clf = CalibratedClassifierCV(clf, method="isotonic", cv=5, ensemble=True)
cal_clf.fit(X_tr, y_tr)
# NOTE(review): `groupby_submit` only appears in a commented-out line of the backtest cell;
# it has to be built (presumably the latest scaled feature rows per asset, without "target") before this cell can run.
preds = cal_clf.predict_proba(groupby_submit.values)
# NOTE(review): `les[0]` is expected to be the fitted LabelEncoder of the "group" column —
# confirm the encoders are kept when the static columns are encoded.
preds = pd.DataFrame(data=preds, index=les[0].inverse_transform(groupby_submit.group.values))


In [None]:
def submit_forecasts(month: int, preds: pd.DataFrame, decimals: int = 5) -> None:
    """Fill the submission template with `preds` and write it to CSV.

    The template is read from `data_path / "template/template.csv"` and the
    result is written to `data_path / f"results/lgbm_submission_{month}.csv"`.

    Args:
        month: Identifier used in the output file name.
        preds: Probabilities indexed by asset ID, one column per template
            column except the last.
        decimals: Number of decimals the probabilities are rounded to.

    Raises:
        ValueError: If `preds` has no row for one or more template IDs.
            Previously such rows were silently written as NaN.
    """
    df_submission = pd.read_csv(data_path / "template/template.csv")
    df_submission["ID"] = df_submission["ID"].replace("FB", "META")

    missing_ids = df_submission["ID"][~df_submission["ID"].isin(preds.index)]
    if not missing_ids.empty:
        raise ValueError(f"Missing forecasts for: {sorted(missing_ids)}")

    # Put the forecasts in the template's row order.
    preds = preds.reindex(index=df_submission["ID"].values)
    df_submission.set_index("ID", inplace=True)

    # Every column but the last receives the probabilities (the last one is
    # presumably the investment-decision column — confirm against the template).
    df_submission.iloc[:, :-1] = preds.values
    df_submission.iloc[:, :-1] = df_submission.iloc[:, :-1].round(decimals)
    # Push the rounding residual into the first column so each row sums to 1.
    df_submission.iloc[:, 0] += 1 - df_submission.iloc[:, :-1].sum(axis=1)
    df_submission.round(decimals).to_csv(data_path / f"results/lgbm_submission_{month}.csv")

### Try hyperparameter tuning

In [None]:
# Placeholders for tracking the best score and the number of training rounds
# that produced it.
# NOTE(review): neither variable is read again in the visible cells — confirm whether they are still needed.
best_score = 999
training_rounds = 10000
from src.darts_hop import logging_callback
import optuna

# Declare how we evaluate how good a set of hyperparameters are, i.e.
# declare an objective function.
def objective(trial):
    """Optuna objective: fit a calibrated LightGBM model and return its test RPS.

    Reads the notebook globals `x_tr`, `y_tr`, `X_test` and `y_test` (left by
    the last backtest fold) and scores with `portfolio_rps`.

    NOTE(review): the returned metric is computed on the test split, so the
    selected hyperparameters are tuned on the same data that is used to report
    performance.
    NOTE(review): `x_tr` appears to be ordered by asset and then by date, so
    the TimeSeriesSplit folds are presumably not strictly chronological —
    confirm this is intended.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RPS on the test split (lower is better).
    """
    # Specify a search space using distributions across plausible values of hyperparameters.
    param = {
        "verbosity": -1,
        "boosting_type": "gbdt",
        "seed": 42,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # Run LightGBM for the hyperparameter values
    clf = LGBMClassifier(**param)
    cal_clf_cv = CalibratedClassifierCV(
        clf, method="isotonic", cv=TimeSeriesSplit(n_splits=5), ensemble=True
    )

    cal_clf_cv.fit(x_tr, y_tr)
    cal_clf_cv_probs_train = cal_clf_cv.predict_proba(x_tr)
    cal_clf_cv_probs_test = cal_clf_cv.predict_proba(X_test)

    # One-hot encode the outcomes with exactly as many columns as the model
    # predicts. Using `np.max(y) + 1` gives the wrong width whenever a split
    # does not contain the highest class. Labels are assumed to be the integer
    # codes 0..n_values-1.
    n_values = cal_clf_cv_probs_test.shape[1]
    one_hot = np.eye(n_values)
    targets_train = one_hot[y_tr]
    targets_test = one_hot[y_test]

    print(f"Classifier: {clf}")
    train_rps = portfolio_rps(probs=cal_clf_cv_probs_train, outcome=targets_train)
    test_rps = portfolio_rps(probs=cal_clf_cv_probs_test, outcome=targets_test)
    print(f"train RPS = {train_rps}")
    print(f"test RPS = {test_rps}")

    # Return metric of interest
    return test_rps

# Suppress information-only output - otherwise optuna is
# quite verbose, which can be nice, but takes up a lot of space
optuna.logging.set_verbosity(optuna.logging.WARNING) 

# Both a trial budget (n_trials=100) and a time budget of 4 hours
# (4 * 3600 s = 14400 s) are passed to the search.
# We could instead pass only n_trials (e.g. 1000), or neither n_trials nor
# timeout so that the search keeps going until the user interrupts
# ("Cancel run").
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(direction='minimize',  sampler=optuna.samplers.TPESampler(seed=42))  
study.optimize(objective, n_trials=100, timeout=14400, callbacks=[logging_callback])

print(f"Best trial: \n{study.best_trial}\n")
print(f"Best value: {study.best_value}\n")
print(f"Best params: {study.best_params}\n")
# print(study.trials)

### Try the other classifiers (without tuning they performed worse than LightGBM)

In [None]:

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sktime.classification.kernel_based import RocketClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sktime.transformations.panel import rocket
from sktime.classification.hybrid import HIVECOTEV2

from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC

# Alternative classifiers to compare against LightGBM.
classifiers = [
    KNeighborsClassifier(10),
    RocketClassifier(
        num_kernels=100,
    ),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    # NOTE(review): an RBF kernel with two length scales only accepts inputs
    # with exactly two features — confirm before fitting on the full feature set.
    GaussianProcessClassifier(1.0 * RBF([1.0, 1.0])),
    LinearSVC(random_state=10, max_iter=500),
    RidgeClassifier(),
    GaussianNB(),
    # `loss` is left at its default: the explicit "categorical_crossentropy"
    # name was removed from recent scikit-learn releases, and the default
    # selects the same multiclass cross-entropy for this 5-class target.
    HistGradientBoostingClassifier(
        learning_rate=0.1,
        max_iter=1000,
        l2_regularization=0.0,
        max_depth=50,
        random_state=10,
    ),
    # "sqrt" is what the removed "auto" option meant for classifiers.
    RandomForestClassifier(
        n_estimators=200, max_depth=5, max_features="sqrt", random_state=10
    ),  # max_features=1, min_samples_leaf=1,
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=10),
    LogisticRegression(C=10.0, max_iter=100),
]
