# Prediction: XGBoost Classifier

> **Warning!** Please run `01_cleaning.ipynb` first if you haven't already

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from functions.constants import BM_NAME, STARTDATE, ENDDATE, N_THRESHOLD_BPS,DATA_DIR
from functions.helper_fns import * 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns

In [2]:
active_returns_path = DATA_DIR + BM_NAME + "_active_returns.csv"
active_returns = pd.read_csv(active_returns_path, index_col=0, parse_dates=True)
print("Loaded active returns from", active_returns_path)
active_returns_thresholded_path = DATA_DIR + BM_NAME + "_active_returns_thresholded_" + str(N_THRESHOLD_BPS) + "bps.csv"
active_returns_thresholded = pd.read_csv(active_returns_thresholded_path, index_col=0, parse_dates=True)
print("Loaded active returns thresholded from", active_returns_thresholded_path)

Loaded active returns from ./../data/SP500_active_returns.csv
Loaded active returns thresholded from ./../data/SP500_active_returns_thresholded_100bps.csv


In [3]:
NUM_FEATURES = 12
PREDICTION_PERIOD = "1w"
MODEL_USING_DF = active_returns
# MODEL_USING_DF = test_ticker_df #turn this on to run for one company
df = featurize_time_series(MODEL_USING_DF, PREDICTION_PERIOD, NUM_FEATURES,set_threshold_for_target_var_bps=N_THRESHOLD_BPS)
target_var = "ar_" + PREDICTION_PERIOD + "_t"
X = df.drop(columns=[target_var, "Date", "Ticker"])
y = df[[target_var]]
print("X shape:", X.shape)
print(X.head())
print("y shape:", y.shape)
print(y.head())
X = X.to_numpy()
y = y.to_numpy()

X shape: (249991, 12)
    ar_1w_t_minus_1  ar_1w_t_minus_2  ar_1w_t_minus_3  ar_1w_t_minus_4  \
12         0.008647        -0.008387         0.008395         0.030798   
13        -0.020086         0.008647        -0.008387         0.008395   
14         0.022483        -0.020086         0.008647        -0.008387   
15         0.017339         0.022483        -0.020086         0.008647   
16         0.005448         0.017339         0.022483        -0.020086   

    ar_1w_t_minus_5  ar_1w_t_minus_6  ar_1w_t_minus_7  ar_1w_t_minus_8  \
12         0.006078        -0.007792         0.012361         0.018158   
13         0.030798         0.006078        -0.007792         0.012361   
14         0.008395         0.030798         0.006078        -0.007792   
15        -0.008387         0.008395         0.030798         0.006078   
16         0.008647        -0.008387         0.008395         0.030798   

    ar_1w_t_minus_9  ar_1w_t_minus_10  ar_1w_t_minus_11  ar_1w_t_minus_12  
12        -0

In [4]:
TEST_FRACTION = 0.2
EVAL_FRACTION = 0.2
X_train_and_eval, X_test, y_train_and_eval, y_test = train_test_split(X, y, test_size=TEST_FRACTION, shuffle=False)
X_train, X_eval, y_train, y_eval = train_test_split(X_train_and_eval, y_train_and_eval, test_size=EVAL_FRACTION/(1-TEST_FRACTION), shuffle=False)
print("X_train shape:", X_train.shape)
print("X_eval shape:", X_eval.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_eval shape:", y_eval.shape)
print("y_test shape:", y_test.shape)

X_train shape: (149994, 12)
X_eval shape: (49998, 12)
X_test shape: (49999, 12)
y_train shape: (149994, 1)
y_eval shape: (49998, 1)
y_test shape: (49999, 1)


## Train a XGBoost

In [5]:

# Import XGBoost
from xgboost import XGBClassifier

# Initialize XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the XGBoost model
xgb_model.fit(X_train, y_train)

# Evaluate the model
train_score = xgb_model.score(X_train, y_train)
eval_score = xgb_model.score(X_eval, y_eval)
test_score = xgb_model.score(X_test, y_test)

print("Training accuracy:", train_score)
print("Evaluation accuracy:", eval_score)
print("Test accuracy:", test_score)


Parameters: { "use_label_encoder" } are not used.



Training accuracy: 0.6979012493833087
Evaluation accuracy: 0.6453658146325854
Test accuracy: 0.6579131582631652


In [6]:

# Import XGBoost
from xgboost import XGBClassifier

# Initialize XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the XGBoost model
xgb_model.fit(X_train, y_train)

# Evaluate the model
train_score = xgb_model.score(X_train, y_train)
eval_score = xgb_model.score(X_eval, y_eval)
test_score = xgb_model.score(X_test, y_test)

print("Training accuracy:", train_score)
print("Evaluation accuracy:", eval_score)
print("Test accuracy:", test_score)


Parameters: { "use_label_encoder" } are not used.



Training accuracy: 0.6979012493833087
Evaluation accuracy: 0.6453658146325854
Test accuracy: 0.6579131582631652


In [None]:
y_pred = xgb_model.predict(X_eval)\
y_pred_proba = xgb_model.predict_proba(X_eval)
y_train_pred = xgb_model.predict(X_train)
y_train_pred_proba = xgb_model.predict_proba(X_train)
print("===TRAINING SET===")
evaluate_model_performance(y_train, y_train_pred, y_train_pred_proba,PREDICTION_PERIOD,NUM_FEATURES,plot_confusion_matrix=False)
print("===EVALUATION SET===")
evaluate_model_performance(y_eval, y_pred, y_pred_proba,PREDICTION_PERIOD,NUM_FEATURES)

===TRAINING SET===
Accuracy: 0.6979012493833087
Precision: 0.8559823488115536
Recall: 0.16284438678165306
F1: 0.27363223955244215
ROC AUC: 0.7265909909776842
===EVALUATION SET===


NameError: name 'y_pred_proba' is not defined

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the hyperparameter grid for XGBoost
param_grid = {
    'max_depth': [3, 6, 9],  # Tree depth
    'learning_rate': [0.01, 0.1, 0.2],  # Step size
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples for training each tree
    'colsample_bytree': [0.6, 0.8, 1.0]  # Fraction of features for each tree
}

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Optimize for accuracy
    verbose=1
)
33
# Fit the grid search on the training data
grid_search.fit(X_train, y_train)

# Extract the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

best_params


Fitting 3 folds for each of 243 candidates, totalling 729 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

KeyboardInterrupt: 

In [None]:
# Train the XGBoost model with the best parameters
optimized_model = XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=50,
    subsample=0.6,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Fit the model on the training data
optimized_model.fit(X_train, y_train)

# Evaluate the model
train_accuracy = optimized_model.score(X_train, y_train)
eval_accuracy = optimized_model.score(X_eval, y_eval)
test_accuracy = optimized_model.score(X_test, y_test)

# Print the results
print("Training Accuracy:", train_accuracy)
print("Evaluation Accuracy:", eval_accuracy)
print("Test Accuracy:", test_accuracy)
