In [None]:
import utils
import models.categorical
import math
import pickle
import numpy as np
import pandas as pd
from datetime import date

import xgboost as xgb # xgboost model
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

# Hyperparameter Tuning

**Parameters**

In [None]:
# Hyperparameter search space: a one-element list holding a single grid that
# maps each XGBoost parameter name to the candidate values to try.
xgbParams = [
    dict(
        booster=['gbtree', 'dart'],    # default is gbtree
        learning_rate=[0.1, 0.3],      # default 0.3
        gamma=[0, 1],                  # higher means more regularization
        max_depth=[6, 9],              # default 6
        # min_child_weight=[1, 3],     # larger means possibly better generalization
        # n_estimators=[100],
    )
]

# Date bounds for the experiment. start_date is handed to the walk-forward
# validation as its start index; end_date is not referenced in the cells
# visible in this notebook section.
start_date, end_date = date(2010, 1, 1), date(2021, 1, 1)

In [None]:
# Expand the search space into a flat list of parameter combinations.
parameter_grid = list(ParameterGrid(xgbParams))
y_var = "LONG_SHORT"
file_dir = "xgb/perc_linear/"

for future in tqdm(utils.futuresList):
    print(future)

    # load data - generates df with PCT and DIFF of close, tech indicators, and macro indicators
    df = utils.prepare_data(future)

    # feature names for this future: linearised, no technical or macro indicators
    X_vars = utils.generate_X_vars(
        future,
        linearise=True,
        tech_indicators=[],
        macro_indicators=False,
    )
    print(X_vars)

    # Guard: only futures whose feature list contains the linearised close
    # are tuned in this run; the rest are reported and skipped.
    if 'CLOSE_LINEAR_PCT' not in X_vars:
        print("no linearisation required")
        continue

    # features / target, plus the series passed to walk_forward as cost_weight
    X_df, y_df = utils.generate_X_y(df, X_vars=X_vars, y_var=y_var)
    cost_df = df["CLOSE_PCT"]

    # one row of summary metrics per parameter combination
    agg_results_collated = pd.DataFrame(
        index=list(range(len(parameter_grid))),
        columns=["accuracy_SMA", "opp_cost_SMA"],
    )
    win_results_collated = []

    # walk-forward validation for every combination in the grid
    for grid_pos, param_set in enumerate(parameter_grid):
        model = XGBClassifier(**param_set)
        win_results, agg_results = models.categorical.walk_forward(
            model=model,
            X=X_df,
            y=y_df,
            cost_weight=cost_df,
            rolling=True,
            max_windows=100,
            start_index=start_date,
        )
        win_results_collated.append(win_results)
        agg_results_collated.loc[grid_pos, "accuracy_SMA"] = agg_results.loc["SMA", "accuracy"]
        agg_results_collated.loc[grid_pos, "opp_cost_SMA"] = agg_results.loc["SMA", "opp_cost"]

    # parameters side by side with their metrics, lowest opp cost first
    parameter_df = pd.DataFrame.from_records(parameter_grid)
    combined_df = pd.concat([parameter_df, agg_results_collated], axis=1).sort_values(
        by=["opp_cost_SMA"], ascending=True
    )
    combined_df.to_csv(f"model_metrics/categorical/{file_dir}{future}.csv", index=False)


# Results of Tuning

## Best X_vars Summary

**Evaluated based on: Lowest Opp Cost SMA**

`[CLOSE_PCT, VOL_PCT]` vs `[CLOSE_LINEAR_PCT, VOL_LINEAR_PCT]`
- 2 out of 10 futures models performed better when linearized
- `F_LC` and `F_TY`

`[CLOSE_PCT, VOL_PCT]` vs `[CLOSE_PCT, VOL_PCT, +techIndicators, +macroIndicators]`
- all futures except `F_ZQ` perform better with added technical indicators and macro indicators

`[CLOSE_PCT, VOL_PCT, +tech, +macro]` vs `[CLOSE_LINEAR_PCT, VOL_LINEAR_PCT, +tech, +macro]`
- 0 out of 10 futures models performed better when linearized

In [None]:
# For each future, compare the extreme opp_cost_SMA of the non-linearised grid
# search against the linearised one (both with technical + macro indicators).
# A leading '*' marks futures where the linearised value is <= the
# non-linearised value; the printed number is the absolute gap.
#
# NOTE(review): the reference value is the *maximum* opp_cost_SMA, while the
# tuning cell sorts its results ascending ("lowest opp cost"). Confirm which
# direction is intended before relying on these comparisons.
for future in tqdm(utils.futuresList):
    perc = pd.read_csv(f"model_metrics/categorical/xgb/perc_tech_macro/{future}.csv")
    # Series.max() skips NaN, unlike the builtin max() on a Series
    perc_best = perc['opp_cost_SMA'].max()

    try:
        # Read the XGBoost linearised results. This previously pointed at the
        # rf/ folder, which compared XGBoost against a different model family.
        perc_linear = pd.read_csv(
            f"model_metrics/categorical/xgb/perc_linear_tech_macro/{future}.csv"
        )
    except FileNotFoundError:
        # No linearised results were written for this future; nothing to compare.
        continue

    perc_linear_best = perc_linear['opp_cost_SMA'].max()
    if perc_linear_best <= perc_best:
        print('*', future, perc_best - perc_linear_best)
    else:
        print(future, perc_linear_best - perc_best)

## Best Tuning Params Summary

- `booster`: `gbtree` (77), `dart` (11)
- `learning_rate`: 0.1 works better, default=0.3
- `gamma`: 1 works best for all, more regularisation
- `max_depth`: 6 (default) works best for all, higher depth=9 probably led to overfitting

In [None]:
# Collect, for every future, the grid-search row with the extreme opp_cost_SMA.
# The linearised results are preferred; when that file does not exist the
# non-linearised results are used instead.
#
# NOTE(review): the row is selected by the *maximum* opp_cost_SMA, while the
# tuning cell sorts ascending ("lowest opp cost"). Confirm the intended direction.
best_rows = []

for future in tqdm(utils.futuresList):
    try:
        perc = pd.read_csv(f"model_metrics/categorical/xgb/perc_linear_tech_macro/{future}.csv")
    except FileNotFoundError:
        # Only a missing linearised file triggers the fallback; other errors surface.
        perc = pd.read_csv(f"model_metrics/categorical/xgb/perc_tech_macro/{future}.csv")

    # idxmax returns the first row holding the maximum, matching the previous
    # "filter on max, take first" selection.
    best_rows.append(perc.loc[perc['opp_cost_SMA'].idxmax()])

# Build the frame once: DataFrame.append was removed in pandas 2.0, and growing
# a frame row by row is quadratic.
best_params = pd.DataFrame(best_rows).reset_index(drop=True)

In [None]:
# How often each booster appears among the per-future best rows.
best_params.loc[:, 'booster'].value_counts()

In [None]:
# How often each learning rate appears among the per-future best rows.
best_params.loc[:, 'learning_rate'].value_counts()

In [None]:
# How often each gamma value appears among the per-future best rows.
best_params.loc[:, 'gamma'].value_counts()

In [None]:
# How often each max_depth appears among the per-future best rows.
best_params.loc[:, 'max_depth'].value_counts()

# Save Models

In [None]:
# Training window passed to save_model for every future.
train_start = date(2018, 9, 1)
train_end = date(2020, 9, 30)

for future in utils.futuresList:
    # prepare X variables
    X_vars = utils.generate_X_vars(
        future,
        linearise=True,
        tech_indicators=["MACD", "RSI14", "VPT"],
        macro_indicators=True,
    )

    # Arguments shared by the preferred and the fallback save; only `path` differs.
    save_kwargs = dict(
        metric="opp_cost_SMA",
        model_fn=XGBClassifier,
        model_wrapper=models.categorical.XGBWrapper,
        future=future,
        X_vars=X_vars,
        y_var="LONG_SHORT",
        ext_path="csv",
        train_start=train_start,
        train_end=train_end,
    )

    # save model: try the linearised tuning results first, then fall back to
    # the non-linearised ones.
    try:
        models.categorical.save_model(path='xgb/perc_linear_tech_macro', **save_kwargs)
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of starting a second training run.
        # The reason is printed so an unexpected failure is not hidden by the
        # fallback.
        print(f'{future}: linearised save failed ({exc!r}); falling back to xgb/perc_tech_macro')
        models.categorical.save_model(path='xgb/perc_tech_macro', **save_kwargs)
    print(f'{future} done')