In [6]:
import utils
import models.categorical
import pandas as pd
import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and runtime warnings raised by
# any library used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [7]:
# Identifiers of the futures to process, as defined by the project-local utils
# module; each one names a file under tickerData/ in the loading cells.
futuresList = utils.futuresList

# Hyper-parameter search space for the random forest. The tuning cells expand
# it into individual combinations with sklearn's ParameterGrid.
rfParams = [
    dict(
        n_estimators=[20, 50, 100],
        max_depth=[5, 10, 15],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 5],
        # max_features=['sqrt', 'log2'],  # currently left out of the search
    )
]

Testing Tuning Code for One Future

In [8]:
# Work on the first future only, as a quick check of the tuning pipeline.
future = futuresList[0]

# Read the raw ticker file and overwrite the header with fixed column names.
df = pd.read_csv(f"tickerData/{future}.txt", parse_dates=["DATE"])
df.columns = ['DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL', 'OI', 'P', 'R', 'RINFO']
df = df.set_index("DATE")

# Keep rows whose volume and close are both non-zero, then discard any row
# that still contains a missing value.
df = df.loc[df["VOL"].ne(0) & df["CLOSE"].ne(0)].dropna(axis=0)

# Features, labels and cost weights are produced by project helpers in utils.
# NOTE(review): "perc" presumably selects a percentage-change transform — confirm in utils.
X_df = utils.generate_X_df([df["CLOSE"], df["VOL"]], ["perc", "perc"])
y_df = utils.generate_y_cat(df["CLOSE"])
cost_df = utils.perc_change(df["CLOSE"], shift=0)

In [10]:
# Expand the search space into a list of concrete parameter dictionaries.
parameter_grid = list(ParameterGrid(rfParams))

# Fixed seed for the forest: without it every run (and every parameter set)
# draws different bootstrap samples, so the metrics below cannot be reproduced
# and differences between parameter sets are partly random noise.
RF_RANDOM_STATE = 42

# One metrics record per parameter set; the results frame is built once after
# the loop so its columns are numeric instead of object dtype.
metric_columns = ["accuracy_SMA", "opp_cost_SMA", "accuracy_EMA", "opp_cost_EMA"]
metric_rows = []
win_results_collated = []

# run walk forward validation
for i, param_set in enumerate(parameter_grid):
    # The seed is a default only: a "random_state" entry in the grid would win.
    model = RandomForestClassifier(**{"random_state": RF_RANDOM_STATE, **param_set})
    win_results, agg_results = models.categorical.walk_forward(
        model = model, X = X_df, y = y_df, cost_weight = cost_df, rolling = True,
        max_windows = 36, start_index = datetime.datetime(2010, 1, 1)
    )
    win_results_collated.append(win_results)
    # agg_results is indexed by averaging scheme ("SMA"/"EMA") with one column
    # per metric; flatten it into a single row for this parameter set.
    metric_rows.append({
        "accuracy_SMA": agg_results.loc["SMA", "accuracy"],
        "opp_cost_SMA": agg_results.loc["SMA", "opp_cost"],
        "accuracy_EMA": agg_results.loc["EMA", "accuracy"],
        "opp_cost_EMA": agg_results.loc["EMA", "opp_cost"],
    })

# Row i of the metrics frame corresponds to parameter_grid[i].
agg_results_collated = pd.DataFrame(metric_rows, columns=metric_columns)

# save parameters: put each parameter set next to its metrics
parameter_df = pd.DataFrame.from_records(parameter_grid)
combined_df = pd.concat([parameter_df, agg_results_collated], axis=1)



In [11]:
# Display the tuning table: one row per parameter set, with the parameter
# values followed by the aggregated accuracy / opportunity-cost metrics.
combined_df

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,n_estimators,accuracy_SMA,opp_cost_SMA,accuracy_EMA,opp_cost_EMA
0,5,1,2,20,0.566796,15.485189,0.56276,17.924073
1,5,1,2,50,0.569317,15.361419,0.566319,17.546847
2,5,1,2,100,0.568948,15.354864,0.56004,17.859469
3,5,1,5,20,0.565616,15.492619,0.557744,17.844464
4,5,1,5,50,0.567922,15.428227,0.559595,17.984994
5,5,1,5,100,0.569191,15.337026,0.562265,17.732742
6,5,1,10,20,0.5642,15.608291,0.558656,17.977083
7,5,1,10,50,0.568137,15.41549,0.565627,17.548326
8,5,1,10,100,0.567232,15.441462,0.557809,17.982325
9,5,5,2,20,0.565408,15.590442,0.556974,18.048797


Tuning Code for All Futures

In [None]:
# retrieve parameter grid
parameter_grid = list(ParameterGrid(rfParams))

# Fixed seed for the forest: without it every run (and every parameter set)
# draws different bootstrap samples, so the saved metrics cannot be reproduced
# and differences between parameter sets are partly random noise.
RF_RANDOM_STATE = 42

# The parameter table is identical for every future, so build it once.
parameter_df = pd.DataFrame.from_records(parameter_grid)
metric_columns = ["accuracy_SMA", "opp_cost_SMA", "accuracy_EMA", "opp_cost_EMA"]

for future in futuresList:
    # load data and overwrite the header with fixed column names
    df = pd.read_csv(f"tickerData/{future}.txt", parse_dates = ["DATE"])
    df.columns = ['DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL', 'OI', 'P', 'R', 'RINFO']
    df = df.set_index("DATE")
    # drop rows with zero volume or close, then rows with any missing value
    df = df[(df.VOL != 0) & (df.CLOSE != 0)]
    df = df.dropna(axis=0)

    # load X and y (features, labels and cost weights come from utils helpers)
    X_df = utils.generate_X_df([df.CLOSE, df.VOL], ["perc", "perc"])
    y_df = utils.generate_y_cat(df.CLOSE)
    cost_df = utils.perc_change(df.CLOSE, shift=0)

    # prepare collated results: one metrics record per parameter set; the
    # frame is built once after the loop so its columns are numeric instead
    # of object dtype
    metric_rows = []
    win_results_collated = []

    # run walk forward validation
    for i, param_set in enumerate(parameter_grid):
        # The seed is a default only: a "random_state" entry in the grid would win.
        model = RandomForestClassifier(**{"random_state": RF_RANDOM_STATE, **param_set})
        win_results, agg_results = models.categorical.walk_forward(
            model = model, X = X_df, y = y_df, cost_weight = cost_df, rolling = True, max_windows = 114
        )
        win_results_collated.append(win_results)
        # flatten the SMA/EMA x metric table into a single row
        metric_rows.append({
            "accuracy_SMA": agg_results.loc["SMA", "accuracy"],
            "opp_cost_SMA": agg_results.loc["SMA", "opp_cost"],
            "accuracy_EMA": agg_results.loc["EMA", "accuracy"],
            "opp_cost_EMA": agg_results.loc["EMA", "opp_cost"],
        })

    # Row i of the metrics frame corresponds to parameter_grid[i].
    agg_results_collated = pd.DataFrame(metric_rows, columns=metric_columns)

    # save parameters together with their metrics, one file per future
    combined_df = pd.concat([parameter_df, agg_results_collated], axis=1)
    combined_df.to_csv(f"model_metrics/categorical/rf/perc/{future}.txt", index=False)

    print(f'{future} done')

In [None]:
# combined_df is reassigned on every iteration of the loop above, so this
# shows the tuning table of the last future processed only.
combined_df

Save Model

In [None]:
import utils
import pickle
import numpy as np
import pandas as pd
import models.categorical
from sklearn.ensemble import RandomForestClassifier

# Date bounds of the training window; the save-model cell compares them
# against the DATE column with >= and <=, so both ends are inclusive.
train_start, train_end = '2019-01-01', '2020-12-31'

In [None]:
futuresList = utils.futuresList

for future in futuresList:
    # Path fragment handed to save_model for this future.
    txt_path = f'rf/perc/{future}'

    # Read the raw ticker file and overwrite the header with fixed column names.
    df = pd.read_csv(f"tickerData/{future}.txt", parse_dates=["DATE"])
    df.columns = ['DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL', 'OI', 'P', 'R', 'RINFO']

    # Restrict to the training window (both bounds inclusive) and index by date.
    in_window = df['DATE'].between(train_start, train_end)
    train_df = df.loc[in_window].set_index("DATE")

    # Drop rows with zero volume or close, then rows containing any missing value.
    is_nonzero = train_df['VOL'].ne(0) & train_df['CLOSE'].ne(0)
    train_df = train_df.loc[is_nonzero].dropna(axis=0)

    # Build features and labels with the utils helpers; each is cleaned of
    # missing rows independently.
    X_train = utils.generate_X_df([train_df['CLOSE'], train_df['VOL']], ["perc", "perc"]).dropna(axis=0)
    y_train = utils.generate_y_cat(train_df['CLOSE']).dropna(axis=0)

    # Keep only the dates present in both, then convert to plain arrays
    # (labels are cast to integers).
    common_index = X_train.index.intersection(y_train.index)
    X_train = X_train.loc[X_train.index.isin(common_index)].to_numpy()
    y_train = y_train.loc[y_train.index.isin(common_index)].to_numpy(np.dtype(int))

    # NOTE(review): the metric name presumably selects the best parameter set
    # from the tuning results — confirm in models.categorical.save_model.
    metric = 'accuracy_SMA'
    # The estimator class itself (not an instance) is passed on.
    model = RandomForestClassifier

    _ = models.categorical.save_model(txt_path, metric, model, X_train, y_train)

    # NOTE(review): this unconditional break stops after the first future, so
    # only one model is saved. It looks like a debugging leftover — confirm
    # before removing it.
    break