In [None]:
import math
import pickle
import warnings
from datetime import date
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

import models.categorical
import utils

warnings.filterwarnings("ignore")

# Hyperparameter Tuning

**Parameters**
- `n_estimators` (default=100): Number of trees in the forest.
- `criterion` (default='gini'): Function to measure the quality of a split. Either Gini impurity ('gini') or information gain ('entropy').
- `max_depth` (default=None): Maximum depth of each tree. If None, nodes are expanded until all leaves are pure or until all leaves contain fewer than `min_samples_split` samples.
- `min_samples_split` (default=2): Minimum number of samples required to split an internal node.
- `min_samples_leaf` (default=1): Minimum number of samples required at a leaf node.

**Inputs / X_vars combinations**
- `perc`: `['CLOSE_PCT', 'VOL_PCT']`
- `perc_linear`: `['CLOSE_LINEAR_PCT', 'VOL_LINEAR_PCT']`
    - Note only futures that follow an exponential trend are linearized
- `perc_tech_macro`: `['CLOSE_PCT', 'VOL_PCT', 'MACD', 'RSI14', 'VPT', +macroIndicators]`
- `perc_linear_tech_macro`: `['CLOSE_LINEAR_PCT', 'VOL_LINEAR_PCT', 'MACD', 'RSI14', 'VPT', +macroIndicators]`
    - Note only futures that follow an exponential trend are linearized
    - Note that macroeconomic indicators are added depending on the industry of the future, as well as the country of origin of the future (only US futures have +macroIndicators, as the indicators are USA-based)

In [None]:
# Random forest search space. Each list holds the candidate values for one
# hyperparameter; the commented-out entries are options that are currently
# excluded from the search.
rfParams = [
    dict(
        n_estimators=[100],
        criterion=['gini'],
        # max_depth=[5, 10, 15],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 5, 10],
        # max_features=['sqrt', 'log2'],
    )
]

# Date range for the experiment. start_date is passed to the walk-forward
# validation as its start index.
start_date, end_date = date(2010, 1, 1), date(2021, 1, 1)

In [None]:
# Grid search over rfParams with walk-forward validation.
# For every future, each parameter set is scored and the results are written
# to model_metrics/categorical/<file_dir><future>.csv, best (lowest opp cost) first.
parameter_grid = list(ParameterGrid(rfParams))
y_var = "LONG_SHORT" # DO NOT CHANGE THIS
file_dir = "rf/perc_linear/" # or perc/ perc_tech_macro/ perc_linear_tech_macro/

# Fixed seed so that re-running the notebook reproduces the same forests
# (and therefore the same ranking of parameter sets).
RANDOM_STATE = 42

# Create the output folder up front; otherwise to_csv would only fail after
# the whole walk-forward run for the first future has already been computed.
Path(f"model_metrics/categorical/{file_dir}").mkdir(parents=True, exist_ok=True)

# NOTE(review): file_dir and the generate_X_vars arguments below are set
# independently. As written, technical indicators are requested (without macro
# indicators) while results are saved under perc_linear/ -- confirm both
# describe the same feature set before running.
for future in tqdm(utils.futuresList):
    # load data - generates df with PCT and DIFF of close, tech indicators, and macro indicators
    df = utils.prepare_data(future)

    # generate X vars
    # (original author's note: only 10 futures models require linearisation)
    X_vars = utils.generate_X_vars(future, linearise=True,
                                   tech_indicators=["MACD", "RSI14", "VPT"],
                                   macro_indicators=False)

    # load X and y
    X_df, y_df = utils.generate_X_y(df, X_vars=X_vars, y_var=y_var)
    cost_df = df["CLOSE_PCT"]

    # per-parameter-set results, in the same order as parameter_grid
    agg_records = []
    win_results_collated = []

    # run walk forward validation
    for param_set in parameter_grid:
        model = RandomForestClassifier(random_state=RANDOM_STATE, **param_set)
        win_results, agg_results = models.categorical.walk_forward(
            model=model, X=X_df, y=y_df, cost_weight=cost_df, rolling=True,
            max_windows=100, start_index=start_date
        )
        win_results_collated.append(win_results)
        agg_records.append({
            "accuracy_SMA": agg_results.loc["SMA", "accuracy"],
            "opp_cost_SMA": agg_results.loc["SMA", "opp_cost"],
        })

    # Build the results frame once; its default 0..n-1 index lines up with
    # parameter_df so the column-wise concat pairs each row with its parameters.
    agg_results_collated = pd.DataFrame(agg_records, columns=["accuracy_SMA", "opp_cost_SMA"])

    # save parameters
    parameter_df = pd.DataFrame.from_records(parameter_grid)
    combined_df = pd.concat([parameter_df, agg_results_collated], axis=1)
    # sort by lowest opp cost
    combined_df = combined_df.sort_values(by=["opp_cost_SMA"], ascending=True)
    combined_df.to_csv(f"model_metrics/categorical/{file_dir}{future}.csv", index=False)

# Results of Tuning

## Best X_vars Summary

**Evaluated based on: Lowest Opp Cost SMA**

`[CLOSE_PCT, VOL_PCT]` vs `[CLOSE_LINEAR_PCT, VOL_LINEAR_PCT]`
- 6 out of 10 futures models performed better when linearized
- `F_FC, F_GC, F_LC, F_SF, F_TY, F_DL`

`[CLOSE_PCT, VOL_PCT]` vs `[CLOSE_PCT, VOL_PCT, +techIndicators, +macroIndicators]`
- all futures perform better with added technical indicators and macro indicators

`[CLOSE_PCT, VOL_PCT, +tech, +macro]` vs `[CLOSE_LINEAR_PCT, VOL_LINEAR_PCT, +tech, +macro]`
- 3 out of 10 futures models performed better when linearized
- `F_FV, F_TY, F_VX`

In [None]:
# Compare the best tuning result with and without linearised inputs.
# "Best" means the lowest opp_cost_SMA in each results file, matching the
# ascending sort used when the files are written and the <= test below.
# Futures that do better (or equal) when linearised are printed with a '*'.
for future in tqdm(utils.futuresList):
    perc = pd.read_csv(f"model_metrics/categorical/rf/perc_tech_macro/{future}.csv")
    perc_best = perc['opp_cost_SMA'].min()

    try:
        perc_linear = pd.read_csv(f"model_metrics/categorical/rf/perc_linear_tech_macro/{future}.csv")
    except FileNotFoundError:
        # No linearised results exist for this future, so there is nothing to compare.
        continue

    perc_linear_best = perc_linear['opp_cost_SMA'].min()
    if perc_linear_best <= perc_best:
        # linearise = better
        print('*', future, perc_best - perc_linear_best)
    else:
        print(future, perc_linear_best - perc_best)

## Best Tuning Params Summary

- `min_samples_split` (tried 2, 5, 10; default=2) is worth tuning: the best value differs from model to model.
- `min_samples_leaf` (tried 1, 5, 10; default=1): all models perform best with `min_samples_leaf=10`, which suggests that larger leaves generalise better.

In [None]:
# Collect, for every future, the row(s) of the tuning results with the best
# (lowest) opp_cost_SMA. Linearised results are preferred when they exist.
best_rows = []

for future in tqdm(utils.futuresList):
    try:
        perc = pd.read_csv(f"model_metrics/categorical/rf/perc_linear_tech_macro/{future}.csv")
    except FileNotFoundError:
        # This future has no linearised results; use the non-linearised ones.
        perc = pd.read_csv(f"model_metrics/categorical/rf/perc_tech_macro/{future}.csv")
    perc_best = perc['opp_cost_SMA'].min()
    # Ties are kept: every row that reaches the best score is included.
    best_rows.append(perc.loc[perc['opp_cost_SMA'] == perc_best])

# DataFrame.append no longer exists in pandas 2.x; concatenate once instead of
# growing the frame inside the loop. pd.concat rejects an empty list, hence the guard.
best_params = pd.concat(best_rows, ignore_index=True) if best_rows else pd.DataFrame()

In [None]:
# How often each min_samples_split value appears among the rows in best_params
best_params.loc[:, 'min_samples_split'].value_counts()

In [None]:
# How often each min_samples_leaf value appears among the rows in best_params
best_params.loc[:, 'min_samples_leaf'].value_counts()

# Save Models
- Train: (2018, 9, 1) to (2020, 9, 30)
- Val: (2020, 10, 1) to (2020, 12, 31)

In [None]:
# Fit and save one random forest per future over the training window.
# The linearised results folder is tried first; if that attempt fails, the
# non-linearised folder is used instead.
train_start = date(2018, 9, 1)
train_end = date(2020, 9, 30)

for future in utils.futuresList:
    # prepare X variables
    X_vars = utils.generate_X_vars(future, linearise=True,
                                   tech_indicators=["MACD", "RSI14", "VPT"],
                                   macro_indicators=True)

    # Arguments shared by both save attempts; only `path` differs between them.
    save_kwargs = dict(
        metric="opp_cost_SMA",
        model_fn=RandomForestClassifier,
        model_wrapper=models.categorical.RFWrapper,
        future=future,
        X_vars=X_vars,
        y_var="LONG_SHORT",
        ext_path="csv",
        train_start=train_start,
        train_end=train_end,
    )

    # save model
    try:
        models.categorical.save_model(path='rf/perc_linear_tech_macro', **save_kwargs)
    except Exception as err:
        # `except Exception` (rather than a bare except) lets Ctrl-C interrupt
        # the loop; the reason for the fallback is reported instead of hidden.
        # NOTE(review): presumably the expected failure is a missing linearised
        # results file -- confirm and narrow the exception type if so.
        print(f'{future}: rf/perc_linear_tech_macro failed ({err!r}), falling back to rf/perc_tech_macro')
        models.categorical.save_model(path='rf/perc_tech_macro', **save_kwargs)
    print(f'{future} done')