# XGBoost

## Setup

In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform, loguniform
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import RandomizedSearchCV, HalvingGridSearchCV
from sklearn.metrics import (accuracy_score, f1_score, 
                            confusion_matrix, roc_auc_score,
                            RocCurveDisplay)

train_df = pd.read_csv("./datasets/train.csv")
val_df = pd.read_csv("./datasets/val.csv")
test_df = pd.read_csv("./datasets/test.csv")

# Define features and binary classification target
features = ['Open', 'High', 'Low', 'Close', 'Volume']
train_df['target'] = (train_df['log_return'] > 0).astype(int)
val_df['target'] = (val_df['log_return'] > 0).astype(int)
test_df['target'] = (test_df['log_return'] > 0).astype(int)

# Extract features and target
X_train = train_df[features]
y_train = train_df['target']

X_val = val_df[features]
y_val = val_df['target']

X_test = test_df[features]
y_test = test_df['target']

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## HalvingRandomSearchCV

In [15]:
base_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    verbosity=0,
    random_state=42,
    device='cuda',
    early_stopping_rounds=50,
    n_jobs=-1
)
param_dist = {
    #  20 choices from 50 → 2 000
    'n_estimators':       list(np.linspace(50, 2000, 10, dtype=int)),

    # 20 log‑spaced values from 1e‑3 → 1.0
    'learning_rate':      list(np.logspace(-3, 0, 5)),

    # all integers from 1 → 15
    'max_depth':          list(range(1, 16)),

    # all integers from 1 → 30
    'min_child_weight':   list(range(1, 31)),

    # 11 choices from 0 → 10
    'gamma':              list(np.linspace(0, 10, 11)),

    # 8 choices from 0.3 → 1.0
    'subsample':          list(np.linspace(0.3, 1.0, 8)),
    'colsample_bytree':   list(np.linspace(0.3, 1.0, 8)),
    'colsample_bylevel':  list(np.linspace(0.3, 1.0, 8)),
    'colsample_bynode':   list(np.linspace(0.3, 1.0, 8)),

    # 10 log‑spaced values from 1e‑3 → 1e2
    'reg_alpha':          list(np.logspace(-3, 2, 10)),
    'reg_lambda':         list(np.logspace(-3, 2, 10)),

    # all integers from 0 → 10
    'max_delta_step':     list(range(0, 11)),

    # 10 choices from 1 → 10
    'scale_pos_weight':   list(np.linspace(1, 10, 10)),

    # integers from 64 → 512 stepping by 32 (15 choices)
    'max_bin':            list(range(64, 513, 32))
}

rand_search = RandomizedSearchCV(
    estimator=base_clf,
    param_distributions=param_dist,
    scoring='accuracy',
    # n_iter=100, 
    cv=5,
    verbose=3,
    random_state=42,
    n_jobs=1
)
rand_search.fit(X_train_scaled, 
    y_train,
    eval_set=[(X_val_scaled, y_val)],
)

print("RandomSearch best params:", rand_search.best_params_)
print("RandomSearch best CV acc: {:.4f}".format(rand_search.best_score_))

ValueError: high is out of bounds for int32

## HalvingGridSearchCV

In [2]:
# GridSearchCV fine‑tuning around Randomized best
best = rand_search.best_params_
grid_params = {
    'max_depth':         [max(1, best['max_depth']-2), best['max_depth'], best['max_depth']+2],
    'min_child_weight':  [max(1, best['min_child_weight']-2), best['min_child_weight'], best['min_child_weight']+2],
    'gamma':             [max(0.0, best['gamma']-1.0), best['gamma'], best['gamma']+1.0],
    'subsample':         [max(0.1, best['subsample']-0.2), best['subsample'], min(1.0, best['subsample']+0.2)],
    'colsample_bytree':  [max(0.1, best['colsample_bytree']-0.2), best['colsample_bytree'], min(1.0, best['colsample_bytree']+0.2)],
    'colsample_bylevel': [max(0.1, best['colsample_bylevel']-0.2), best['colsample_bylevel'], min(1.0, best['colsample_bylevel']+0.2)],
    'colsample_bynode':  [max(0.1, best['colsample_bynode']-0.2), best['colsample_bynode'], min(1.0, best['colsample_bynode']+0.2)],
    'learning_rate':     [max(1e-4, best['learning_rate']/2), best['learning_rate'], min(1.0, best['learning_rate']*2)],
    'reg_alpha':         [best['reg_alpha']/2, best['reg_alpha'], best['reg_alpha']*2],
    'reg_lambda':        [best['reg_lambda']/2, best['reg_lambda'], best['reg_lambda']*2],
    'scale_pos_weight':  [max(1, best['scale_pos_weight']-5), best['scale_pos_weight'], best['scale_pos_weight']+5],
    'max_delta_step':    [max(0, best['max_delta_step']-2), best['max_delta_step'], best['max_delta_step']+2],
    'max_bin':           [max(32, best['max_bin']-128), best['max_bin'], min(1024, best['max_bin']+128)]
}

grid_search = HalvingGridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        tree_method='hist',
        random_state=42,
        device='cuda',
        gpu_id=0,
        n_jobs=1,
        # fix the other hyps from random search
        n_estimators=best['n_estimators'],
    ),
    n_candidates=200,
    param_grid=grid_params,
    factor=3,
    resource='n_estimators',
    max_resources=best['n_estimators'],
    min_resources=max(10, best['n_estimators']//5),
    scoring='accuracy',
    cv=3,
    verbose=3,
    n_jobs=1
)
grid_search.fit(X_train_scaled, y_train)

print("GridSearch best params:", grid_search.best_params_)
print("GridSearch best CV acc: {:.4f}".format(grid_search.best_score_))

NameError: name 'rand_search' is not defined

## Final Fit with Early Stopping

In [None]:
# Final fit with early stopping
# Extend n_estimators so early stopping can trim it back
final_params = {
    **grid_search.best_params_,
    'n_estimators': 10000,
    'learning_rate': best['learning_rate'],
    'reg_alpha':     best['reg_alpha'],
    'reg_lambda':    best['reg_lambda'],
    'tree_method':   'gpu_hist',
    'predictor':     'gpu_predictor',
    'gpu_id':        0
}

final_clf = XGBClassifier(
    **final_params,
    objective='binary:logistic',
    eval_metric='logloss',
    early_stopping_rounds=30,
    random_state=42
)
final_clf.fit(
    X_train_scaled, y_train,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True
)

## Evaluation

In [None]:
# Evaluate on test
y_test_pred = final_clf.predict(X_test_scaled)
y_pred_binary = (y_test_pred > 0.5).astype(int)

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)
auc_roc = roc_auc_score(y_test, y_test_pred)


# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_binary)

# Display results
print("No Window")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC Score: {auc_roc:.4f}")
print("\nConfusion Matrix:")
print(pd.DataFrame(conf_matrix, 
                  index=['Actual Down', 'Actual Up'],
                  columns=['Predicted Down', 'Predicted Up']))

# Plot ROC curve
RocCurveDisplay.from_predictions(y_test, y_test_pred)
plt.title('ROC Curve')
plt.show()