# XGBoost

## Setup

In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (accuracy_score, f1_score, 
                            confusion_matrix, roc_auc_score,
                            RocCurveDisplay)

train_df = pd.read_csv("./datasets/train.csv")
val_df = pd.read_csv("./datasets/val.csv")
test_df = pd.read_csv("./datasets/test.csv")

# Define features and binary classification target
features = ['Open', 'High', 'Low', 'Close', 'Volume']
train_df['target'] = (train_df['log_return'] > 0).astype(int)
val_df['target'] = (val_df['log_return'] > 0).astype(int)
test_df['target'] = (test_df['log_return'] > 0).astype(int)

# Extract features and target
X_train = train_df[features]
y_train = train_df['target']

X_val = val_df[features]
y_val = val_df['target']

X_test = test_df[features]
y_test = test_df['target']

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## Model Implementation

In [None]:
# ─── 2. Base GPU‑powered XGBClassifier ───────────────────────────────────────
base_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    # use_label_encoder=False,
    verbosity=0,
    random_state=42,
    tree_method='gpu_hist',       # GPU training
    predictor='gpu_predictor',    # GPU prediction
    gpu_id=0                      # which GPU to use
)

# ─── 3. RandomizedSearchCV over broad ranges ────────────────────────────────
param_dist = {
    'n_estimators':     [100, 300, 500, 800, 1200],
    'learning_rate':    [0.01, 0.03, 0.05, 0.1, 0.2],
    'max_depth':        [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5, 7],
    'gamma':            [0, 0.1, 0.5, 1, 2],
    'subsample':        [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha':        [0, 1, 5, 10],
    'reg_lambda':       [1, 5, 10, 20]
}

rand_search = RandomizedSearchCV(
    estimator=base_clf,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
rand_search.fit(X_train_scaled, y_train)

print("RandomSearch best params:", rand_search.best_params_)
print("RandomSearch best CV acc: {:.4f}".format(rand_search.best_score_))

# ─── 4. GridSearchCV fine‑tuning around Randomized best ─────────────────────
best = rand_search.best_params_
grid_params = {
    'max_depth':        [max(1, best['max_depth']-1), best['max_depth'], best['max_depth']+1],
    'min_child_weight': [max(1, best['min_child_weight']-1), best['min_child_weight'], best['min_child_weight']+1],
    'gamma':            [max(0, best['gamma']-0.5), best['gamma'], best['gamma']+0.5],
    'subsample':        [best['subsample']-0.1, best['subsample'], best['subsample']+0.1],
    'colsample_bytree': [best['colsample_bytree']-0.1, best['colsample_bytree'], best['colsample_bytree']+0.1]
}

grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(
        **{k: best[k] for k in ['n_estimators','learning_rate','reg_alpha','reg_lambda']},
        objective='binary:logistic',
        eval_metric='logloss',
        # use_label_encoder=False,
        random_state=42,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        gpu_id=0
    ),
    param_grid=grid_params,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)

print("GridSearch best params:", grid_search.best_params_)
print("GridSearch best CV acc: {:.4f}".format(grid_search.best_score_))

# ─── 5. Final fit with early stopping (GPU) ─────────────────────────────────
# Extend n_estimators so early stopping can trim it back
final_params = {
    **grid_search.best_params_,
    'n_estimators': 10000,
    'learning_rate': best['learning_rate'],
    'reg_alpha':     best['reg_alpha'],
    'reg_lambda':    best['reg_lambda'],
    'tree_method':   'gpu_hist',
    'predictor':     'gpu_predictor',
    'gpu_id':        0
}

final_clf = xgb.XGBClassifier(
    **final_params,
    objective='binary:logistic',
    eval_metric='logloss',
    # use_label_encoder=False,
    random_state=42
)
final_clf.fit(
    X_train_scaled, y_train,
    eval_set=[(X_val_scaled, y_val)],
    # early_stopping_rounds=30,
    verbose=True
)

# ─── 6. Evaluate on test ────────────────────────────────────────────────────
y_test_pred = final_clf.predict(X_test_scaled)
test_acc = accuracy_score(y_test, y_test_pred)
print("\nFinal Test Accuracy (GPU): {:.4f}".format(test_acc))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomSearch best params: {'subsample': 1.0, 'reg_lambda': 20, 'reg_alpha': 0, 'n_estimators': 1200, 'min_child_weight': 7, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}
RandomSearch best CV acc: 0.7008
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
