# Build Random Forest (RF) and XGBoost Models

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import os
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pickle
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

## RANDOM FOREST

In [5]:
def prepare(data):
    """Split a frame into model inputs and the ``Target`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``Target`` column alongside the feature columns.

    Returns
    -------
    tuple
        ``(features, target)``: every column except ``Target`` (original
        column order kept), and the ``Target`` column itself.
    """
    target = data["Target"]
    # Boolean mask over the column index: True for every non-target column.
    feature_mask = data.columns != 'Target'
    features = data.loc[:, feature_mask]
    return features, target

In [3]:
# Hyper-parameter grid searched by fit_rf (3 x 3 x 3 = 27 candidates).
# Per the scikit-learn docs, float values for min_samples_leaf / min_samples_split
# are read as fractions of the sample count, and integer max_features values as
# absolute feature counts.
# NOTE(review): 11 and 18 presumably relate to the number of feature columns in
# the training CSVs - confirm against the data.
param_grid = dict(
    min_samples_leaf=[0.005, 0.01, 0.02],
    min_samples_split=[0.01, 0.02, 0.03],
    max_features=["sqrt", 11, 18],
)

In [4]:
def fit_rf(x_train, y_train):
    """Grid-search a random forest regressor and return the best estimator.

    Runs a 5-fold cross-validated search over the notebook-level
    ``param_grid``, ranking candidates by negative mean squared error.

    Parameters
    ----------
    x_train
        Feature matrix handed to ``GridSearchCV.fit``.
    y_train
        Target values handed to ``GridSearchCV.fit``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search.
    """
    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=1),  # fixed seed for repeatable forests
        param_grid=param_grid,
        scoring="neg_mean_squared_error",
        cv=5,
        verbose=2,  # prints one log line per individual fold fit
    )
    search.fit(x_train, y_train)
    return search.best_estimator_

In [7]:
# Tune and persist one random forest model per stock cluster (clusters 0-4).
clusters = [*range(5)]
for t in tqdm(clusters):
    # Each cluster has its own training CSV; split it into features and target.
    data = pd.read_csv(f"TrainingData/StockType{t}.csv")
    x_train, y_train = prepare(data)
    rf_model = fit_rf(x_train, y_train)
    # Pickle the tuned estimator so later cells can reload it without refitting.
    with open(f'model_rf_cluster{t}', 'wb') as model_file:
        pickle.dump(rf_model, model_file)

  0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   3.0s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   2.9s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   3.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   3.2s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   3.3s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   2.8s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   2.8s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   2.8s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   3.0s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total t

 20%|██        | 1/5 [16:07<1:04:30, 967.66s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.0s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.3s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   5.8s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.6s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.7s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.5s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.8s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total t

 40%|████      | 2/5 [44:31<1:10:02, 1400.98s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.0s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   5.9s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   5.6s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.6s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.7s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.6s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.7s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   5.3s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total t

 60%|██████    | 3/5 [1:13:45<52:03, 1561.85s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   1.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   1.3s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   1.3s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   1.4s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   1.3s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   1.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   1.2s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   1.2s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   1.2s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total t

 80%|████████  | 4/5 [1:20:28<18:24, 1104.54s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.4s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.6s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.5s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.9s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.01; total time=   6.8s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   6.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   6.3s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   6.1s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total time=   6.7s
[CV] END max_features=sqrt, min_samples_leaf=0.005, min_samples_split=0.02; total t

100%|██████████| 5/5 [1:55:14<00:00, 1382.88s/it]


### Best RF Model Performance (training-set MSE)

In [9]:
# Reload each cluster's pickled random forest and print its mean squared error.
# The error is computed on the same TrainingData CSV the model was fitted on,
# so it is an in-sample figure, not a held-out estimate.
for i in range(5):
    data = pd.read_csv(f"TrainingData/StockType{i}.csv")
    x_train, y_train = prepare(data)
    # The context manager closes the file handle deterministically; a bare
    # open() inside pickle.load() would leave it open until garbage collection.
    # NOTE: pickle.load can execute arbitrary code - only load model files
    # produced by this notebook.
    with open(f'model_rf_cluster{i}', 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    y_pred = loaded_model.predict(x_train)
    print(f"Mean squared error (Cluster {i}): {mean_squared_error(y_train, y_pred)}")

Mean squared error (Cluster 0): 0.0036111842217845507
Mean squared error (Cluster 1): 0.004642742833976078
Mean squared error (Cluster 2): 0.0038358937981820486
Mean squared error (Cluster 3): 0.005529169228058321
Mean squared error (Cluster 4): 0.0020202005760990623


## XGBOOST

In [10]:
# Hyper-parameter grid searched by fit_xgb (3 x 3 x 3 = 27 candidates).
params = dict(
    gamma=[0.005, 0.01, 0.1],
    learning_rate=[0.005, 0.01, 0.1],
    max_depth=[6, 10, 20],
)

In [11]:
def fit_xgb(x_train, y_train):
    """Grid-search an XGBoost regressor and return the best estimator.

    Runs a 5-fold cross-validated search over the notebook-level ``params``
    grid, ranking candidates by negative mean squared error.

    Parameters
    ----------
    x_train
        Feature matrix handed to ``GridSearchCV.fit``.
    y_train
        Target values handed to ``GridSearchCV.fit``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search.
    """
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(random_state=1),  # fixed seed for repeatable boosting
        param_grid=params,
        scoring="neg_mean_squared_error",
        cv=5,
        verbose=2,  # prints one log line per individual fold fit
    )
    search.fit(x_train, y_train)
    return search.best_estimator_

In [12]:
# Tune and persist one XGBoost model per stock cluster (clusters 0-4).
clusters = [*range(5)]
for t in tqdm(clusters):
    # Each cluster has its own training CSV; split it into features and target.
    data = pd.read_csv(f"TrainingData/StockType{t}.csv")
    x_train, y_train = prepare(data)
    xgb_model = fit_xgb(x_train, y_train)
    # Pickle the tuned estimator so later cells can reload it without refitting.
    with open(f'model_xgb_cluster{t}', 'wb') as model_file:
        pickle.dump(xgb_model, model_file)

  0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.8s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.7s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.7s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.7s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.7s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   1.1s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   1.0s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   1.1s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   1.4s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   1.4s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=20; total time=   1.3s
[CV] END .....gamma=0.005, learning_rate=0.005,

 20%|██        | 1/5 [03:56<15:46, 236.67s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.3s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.4s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.5s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.0s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.1s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.2s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.1s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=20; total time=   2.9s
[CV] END .....gamma=0.005, learning_rate=0.005,

 40%|████      | 2/5 [10:09<15:50, 316.74s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.3s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.3s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.4s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.4s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.4s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.2s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.2s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.3s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=20; total time=   2.7s
[CV] END .....gamma=0.005, learning_rate=0.005,

 60%|██████    | 3/5 [16:40<11:41, 350.57s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.4s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   0.4s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   0.6s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   0.6s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   0.7s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   0.7s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   0.6s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=20; total time=   0.8s
[CV] END .....gamma=0.005, learning_rate=0.005,

 80%|████████  | 4/5 [18:51<04:23, 263.80s/it]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.8s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.8s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.8s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.8s
[CV] END ......gamma=0.005, learning_rate=0.005, max_depth=6; total time=   1.8s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.5s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.6s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.3s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.5s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=10; total time=   2.5s
[CV] END .....gamma=0.005, learning_rate=0.005, max_depth=20; total time=   2.7s
[CV] END .....gamma=0.005, learning_rate=0.005,

100%|██████████| 5/5 [26:20<00:00, 316.00s/it]


In [14]:
# Reload each cluster's pickled XGBoost model and print its mean squared error.
# The error is computed on the same TrainingData CSV the model was fitted on,
# so it is an in-sample figure, not a held-out estimate.
for i in range(5):
    data = pd.read_csv(f"TrainingData/StockType{i}.csv")
    x_train, y_train = prepare(data)
    # The context manager closes the file handle deterministically; a bare
    # open() inside pickle.load() would leave it open until garbage collection.
    # NOTE: pickle.load can execute arbitrary code - only load model files
    # produced by this notebook.
    with open(f'model_xgb_cluster{i}', 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    y_pred = loaded_model.predict(x_train)
    print(f"Mean squared error (Cluster {i}): {mean_squared_error(y_train, y_pred)}")

Mean squared error (Cluster 0): 0.0034277820095908063
Mean squared error (Cluster 1): 0.003395629607421614
Mean squared error (Cluster 2): 0.0033708327171582996
Mean squared error (Cluster 3): 0.005020138525977068
Mean squared error (Cluster 4): 0.0019485820408943009
