# XGBoost Model

In [None]:
import utils
import models.categorical
import math
import pickle
import numpy as np
import pandas as pd
from datetime import date

import xgboost as xgb # xgboost model
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

# Hyperparameter Tuning

**Parameters**
- `learning_rate`
- `gamma`
- `max_depth`

In [None]:
# Search space for the XGBoost grid search. The list-of-dicts layout is the
# form later expanded with sklearn's ParameterGrid.
# Not searched at the moment: 'min_child_weight' ([1, 3]; larger means possibly
# better generalization) and 'n_estimators' ([100]).
xgbParams = [
    dict(
        booster=['gbtree'],              # gbtree is the default booster
        learning_rate=[0.01, 0.1, 0.3],  # default 0.3
        gamma=[0, 1],                    # higher means more regularization
        max_depth=[4, 6, 8],             # default 6
    ),
]

# Date bounds for the experiments. start_date is handed to the walk-forward
# validation as its start index; end_date is not referenced by the cells
# visible in this notebook.
start_date, end_date = date(2010, 1, 1), date(2021, 1, 1)

In [None]:
# Grid search: for every future, evaluate each parameter combination with
# walk-forward validation and write one CSV of results per future.
import os

# retrieve parameter grid (flat list of parameter dicts)
parameter_grid = list(ParameterGrid(xgbParams))
y_var = "LONG_SHORT"
file_dir = "xgb/pct_tech_macro/"

# Create the output folder up front so that a missing directory cannot abort
# the run at the `to_csv` call, after the grid search has already been computed.
output_dir = f"model_metrics/categorical/{file_dir}"
os.makedirs(output_dir, exist_ok=True)

for future in tqdm(utils.futuresList):
    print(future)

    # load data - generates df with PCT and DIFF of close, tech indicators, and macro indicators
    df = utils.prepare_data(future)

    # generate X vars: technical indicators MACD / RSI14 / VPT plus macro indicators
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=['MACD', 'RSI14', 'VPT'],
        macro_indicators=True,
    )
    print(X_vars)

    # load X and y; CLOSE_PCT is passed to the validation as the cost weight
    X_df, y_df = utils.generate_X_y(df, X_vars=X_vars, y_var=y_var)
    cost_df = df["CLOSE_PCT"]

    # prepare collated results: one row per parameter combination
    agg_results_collated = pd.DataFrame(
        index=list(range(len(parameter_grid))),
        columns=["accuracy_SMA", "opp_cost_SMA"],
    )
    win_results_collated = []

    # run walk forward validation
    for i, param_set in enumerate(parameter_grid):
        model = XGBClassifier(**param_set)
        win_results, agg_results = models.categorical.walk_forward(
            model=model, X=X_df, y=y_df, cost_weight=cost_df, rolling=True,
            max_windows=100, start_index=start_date,
        )
        win_results_collated.append(win_results)
        # keep only the "SMA" row of the aggregated metrics
        agg_results_collated.loc[i, "accuracy_SMA"] = agg_results.loc["SMA", "accuracy"]
        agg_results_collated.loc[i, "opp_cost_SMA"] = agg_results.loc["SMA", "opp_cost"]

    # save parameters next to their metrics; both frames share the 0..n-1
    # index, so the column-wise concat lines them up row by row
    parameter_df = pd.DataFrame.from_records(parameter_grid)
    combined_df = pd.concat([parameter_df, agg_results_collated], axis=1)
    # sort by lowest opp cost
    combined_df = combined_df.sort_values(by=["opp_cost_SMA"], ascending=True)
    combined_df.to_csv(f"{output_dir}{future}.csv", index=False)

# Results of Tuning

## Opp Cost Summary

In [None]:
# Summarise the tuning results per feature set: average of the lowest
# opp_cost_SMA over all futures, and how often each parameter value was chosen.
filepaths = ['pct', 'pct_tech', 'pct_macro', 'pct_tech_macro']

for filepath in filepaths:
    print(filepath)

    # get validation statistics
    # Reset for every feature set: the figures printed below are labelled with
    # `filepath`, so they must not include futures from earlier feature sets.
    lowest_cost_lst = []
    param_rows = []

    for future in tqdm(utils.futuresList):
        future_metric = pd.read_csv(f'model_metrics/categorical/xgb/{filepath}/{future}.csv')

        # get lowest opp cost (first row attaining the minimum)
        best_row = future_metric.loc[future_metric['opp_cost_SMA'].idxmin()]
        lowest_cost_lst.append(best_row['opp_cost_SMA'])

        # keep only the parameter columns, i.e. drop the metric columns
        row = {
            col: best_row[col]
            for col in future_metric.columns
            if not col.startswith(('opp_cost', 'accuracy'))
        }
        row['future'] = future
        param_rows.append(row)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    param_df = pd.DataFrame(param_rows)

    print("Average Opp Cost: ", sum(lowest_cost_lst)/len(lowest_cost_lst))
    print(param_df['learning_rate'].value_counts())
    print(param_df['gamma'].value_counts())
    print(param_df['max_depth'].value_counts())
    print("=============================================")

## Best X_vars Summary

**Evaluated based on: Lowest Opp Cost SMA**

`[CLOSE_PCT, VOL_PCT]` vs `[CLOSE_PCT, VOL_PCT, MACD, RSI14, VPT]`
- 47 out of 88 futures performed better with added tech indicators

`[CLOSE_PCT, VOL_PCT]` vs `[CLOSE_PCT, VOL_PCT, +macroIndicators]`
- 60 out of 88 futures performed better with added macro indicators

`[CLOSE_PCT, VOL_PCT]` vs `[CLOSE_PCT, VOL_PCT, MACD, RSI14, VPT, +macroIndicators]`
- 46 out of 88 futures performed better with added tech and macro indicators

`[CLOSE_PCT, VOL_PCT, MACD, RSI14, VPT]` vs `[CLOSE_PCT, VOL_PCT, +macroIndicators]`
- 37 out of 88 futures performed better with added macro indicators compared to tech indicators

`[CLOSE_PCT, VOL_PCT, MACD, RSI14, VPT]` vs `[CLOSE_PCT, VOL_PCT, MACD, RSI14, VPT, +macroIndicators]`
- 62 out of 88 futures performed better in the second case

`[CLOSE_PCT, VOL_PCT, +macroIndicators]` vs `[CLOSE_PCT, VOL_PCT, MACD, RSI14, VPT, +macroIndicators]`
- 45 out of 88 futures performed better in the second case

In [None]:
# Count the futures for which the pct_tech_macro feature set is at least as
# good as the pct_macro feature set, judged by the lowest opp_cost_SMA reached
# over the parameter grid.
count = 0

for future in tqdm(utils.futuresList):
    perc = pd.read_csv(f"model_metrics/categorical/xgb/pct_macro/{future}.csv")
    # lower opp cost is better, so the best configuration is the minimum
    perc_best = perc['opp_cost_SMA'].min()

    perc_2 = pd.read_csv(f"model_metrics/categorical/xgb/pct_tech_macro/{future}.csv")
    perc_2_best = perc_2['opp_cost_SMA'].min()

    # ties count in favour of the second model
    if perc_2_best <= perc_best:
        count += 1

print(count)

## Best Tuning Params Summary

In [None]:
# Collect, for every future, the grid rows that reach the lowest opp_cost_SMA
# with the pct_tech_macro feature set (all tied rows are kept).
best_frames = []

for future in tqdm(utils.futuresList):
    perc = pd.read_csv(f"model_metrics/categorical/xgb/pct_tech_macro/{future}.csv")
    # lower opp cost is better, so the best configuration is the minimum
    perc_best = perc['opp_cost_SMA'].min()
    best_frames.append(perc.loc[perc['opp_cost_SMA'] == perc_best])

# Single concat instead of repeated DataFrame.append (removed in pandas 2.0);
# pd.concat rejects an empty list, hence the fallback.
best_params = pd.concat(best_frames, ignore_index=True) if best_frames else pd.DataFrame()

In [None]:
# Tally how often each learning_rate value occurs among the rows of best_params.
best_params['learning_rate'].value_counts()

In [None]:
# Tally how often each gamma value occurs among the rows of best_params.
best_params['gamma'].value_counts()

In [None]:
# Tally how often each max_depth value occurs among the rows of best_params.
best_params['max_depth'].value_counts()

# Save Meta Predictions

In [None]:
# Meta predictions for the "pct" feature set: no technical indicators and no
# macro indicators are requested from utils.generate_X_vars.
y_var = "LONG_SHORT"
file_dir = "xgb/pct"

# arguments that are identical for every future
meta_settings = dict(
    path=file_dir,
    metric="opp_cost_SMA",
    model_fn=XGBClassifier,
    y_var=y_var,
    model_name="xgb",
)

for future in tqdm(utils.futuresList):
    print(future)

    # feature list for this future, as returned by utils.generate_X_vars
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=[],
        macro_indicators=False,
    )

    models.categorical.save_meta_predictions(
        future=future, X_vars=X_vars, **meta_settings
    )

In [None]:
# Meta predictions for the "pct_tech" feature set: technical indicators
# MACD / RSI14 / VPT are requested, macro indicators are not.
y_var = "LONG_SHORT"
file_dir = "xgb/pct_tech"

# arguments that are identical for every future
meta_settings = dict(
    path=file_dir,
    metric="opp_cost_SMA",
    model_fn=XGBClassifier,
    y_var=y_var,
    model_name="xgb",
)

for future in tqdm(utils.futuresList):
    print(future)

    # feature list for this future, as returned by utils.generate_X_vars
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=['MACD', 'RSI14', 'VPT'],
        macro_indicators=False,
    )

    models.categorical.save_meta_predictions(
        future=future, X_vars=X_vars, **meta_settings
    )

In [None]:
# Meta predictions for the "pct_macro" feature set: macro indicators are
# requested, technical indicators are not.
y_var = "LONG_SHORT"
file_dir = "xgb/pct_macro"

# arguments that are identical for every future
meta_settings = dict(
    path=file_dir,
    metric="opp_cost_SMA",
    model_fn=XGBClassifier,
    y_var=y_var,
    model_name="xgb",
)

for future in tqdm(utils.futuresList):
    print(future)

    # feature list for this future, as returned by utils.generate_X_vars
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=[],
        macro_indicators=True,
    )

    models.categorical.save_meta_predictions(
        future=future, X_vars=X_vars, **meta_settings
    )

In [None]:
# Meta predictions for the "pct_tech_macro" feature set: technical indicators
# MACD / RSI14 / VPT and macro indicators are both requested.
y_var = "LONG_SHORT"
file_dir = "xgb/pct_tech_macro"

# arguments that are identical for every future
meta_settings = dict(
    path=file_dir,
    metric="opp_cost_SMA",
    model_fn=XGBClassifier,
    y_var=y_var,
    model_name="xgb",
)

for future in tqdm(utils.futuresList):
    print(future)

    # feature list for this future, as returned by utils.generate_X_vars
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=['MACD', 'RSI14', 'VPT'],
        macro_indicators=True,
    )

    models.categorical.save_meta_predictions(
        future=future, X_vars=X_vars, **meta_settings
    )

# Save Models

In [None]:
# Training window passed to save_model here and reused by the save-model
# cells that follow.
# Alternative window kept for reference: date(2018, 9, 1) to date(2020, 9, 30).
train_start, train_end = date(2019, 1, 1), date(2020, 12, 31)

# Save one model per future for the "pct" feature set
# (no technical indicators, no macro indicators).
for future in utils.futuresList:
    # prepare X variables
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=[],
        macro_indicators=False,
    )

    models.categorical.save_model(
        path='xgb/pct',
        metric="opp_cost_SMA",
        model_fn=XGBClassifier,
        model_wrapper=models.categorical.XGBWrapper,
        future=future,
        X_vars=X_vars,
        y_var="LONG_SHORT",
        ext_path="csv",
        train_start=train_start,
        train_end=train_end,
    )

In [None]:
# Save one model per future for the "pct_tech" feature set
# (technical indicators MACD / RSI14 / VPT, no macro indicators).
# train_start / train_end must already be defined by an earlier cell.
for future in utils.futuresList:
    # prepare X variables
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=["MACD", "RSI14", "VPT"],
        macro_indicators=False,
    )

    models.categorical.save_model(
        path='xgb/pct_tech',
        metric="opp_cost_SMA",
        model_fn=XGBClassifier,
        model_wrapper=models.categorical.XGBWrapper,
        future=future,
        X_vars=X_vars,
        y_var="LONG_SHORT",
        ext_path="csv",
        train_start=train_start,
        train_end=train_end,
    )

In [None]:
# Save one model per future for the "pct_macro" feature set
# (macro indicators, no technical indicators).
# train_start / train_end must already be defined by an earlier cell.
for future in utils.futuresList:
    # prepare X variables
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=[],
        macro_indicators=True,
    )

    models.categorical.save_model(
        path='xgb/pct_macro',
        metric="opp_cost_SMA",
        model_fn=XGBClassifier,
        model_wrapper=models.categorical.XGBWrapper,
        future=future,
        X_vars=X_vars,
        y_var="LONG_SHORT",
        ext_path="csv",
        train_start=train_start,
        train_end=train_end,
    )

In [None]:
# Save one model per future for the "pct_tech_macro" feature set
# (technical indicators MACD / RSI14 / VPT plus macro indicators).
# train_start / train_end must already be defined by an earlier cell.
for future in utils.futuresList:
    # prepare X variables
    X_vars = utils.generate_X_vars(
        future,
        linearise=False,
        tech_indicators=["MACD", "RSI14", "VPT"],
        macro_indicators=True,
    )

    models.categorical.save_model(
        path='xgb/pct_tech_macro',
        metric="opp_cost_SMA",
        model_fn=XGBClassifier,
        model_wrapper=models.categorical.XGBWrapper,
        future=future,
        X_vars=X_vars,
        y_var="LONG_SHORT",
        ext_path="csv",
        train_start=train_start,
        train_end=train_end,
    )