# Advanced XGBoost Tuning (Bayesian + SMOTE)
Implementing Optuna for hyperparameter optimization and SMOTE to handle minority class imbalances (Ghost, Ice, Fairy, etc).

In [1]:
import sys
from pathlib import Path

project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import joblib
import optuna
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE

from pokemon_predictor import config
from pokemon_predictor.data_utils import load_data


In [2]:
# Load Hybrid + Bio-Ratio Dataset
X_train, X_test, y_train, y_test, classes = load_data('hybrid', split_data=True)

X_train_np = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
y_train_np = y_train.values if isinstance(y_train, pd.DataFrame) else y_train
X_test_np = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
y_test_np = y_test.values if isinstance(y_test, pd.DataFrame) else y_test


In [3]:
class SMOTEMultiOutputClassifier(BaseEstimator, ClassifierMixin):
    """
    Custom wrapper that applies SMOTE independently to each label before fitting its respective XGBoost classifier.
    """
    def __init__(self, estimator, random_state=None):
        self.estimator = estimator
        self.random_state = random_state
        self.estimators_ = []
        
    def fit(self, X, y):
        self.estimators_ = []
        for i in range(y.shape[1]):
            y_i = y[:, i]
            n_minority = np.sum(y_i)
            
            if 0 < n_minority < len(y_i):
                # Adjust k_neighbors bounded by minority size
                k_neighbors = min(5, int(n_minority) - 1)
                smote = SMOTE(k_neighbors=max(1, k_neighbors), random_state=self.random_state)
                try:
                    X_res, y_res = smote.fit_resample(X, y_i)
                except ValueError:
                    X_res, y_res = X, y_i
            else:
                X_res, y_res = X, y_i
                
            est = clone(self.estimator)
            est.fit(X_res, y_res)
            self.estimators_.append(est)
        return self
        
    def predict(self, X):
        preds = [est.predict(X) for est in self.estimators_]
        return np.column_stack(preds)
        
    def predict_proba(self, X):
        return [est.predict_proba(X) for est in self.estimators_]


In [4]:
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 3, 9)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.3, log=True)
    gamma = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 7)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 1.0)
    
    kf = KFold(n_splits=3, shuffle=True, random_state=config.RANDOM_SEED)
    scores = []
    
    for train_idx, val_idx in kf.split(X_train_np):
        X_tr, X_val = X_train_np[train_idx], X_train_np[val_idx]
        y_tr, y_val = y_train_np[train_idx], y_train_np[val_idx]
        
        base_est = XGBClassifier(
            n_estimators=100,
            max_depth=max_depth,
            learning_rate=learning_rate,
            gamma=gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            n_jobs=-1,
            random_state=config.RANDOM_SEED
        )
        
        clf = SMOTEMultiOutputClassifier(base_est, random_state=config.RANDOM_SEED)
        clf.fit(X_tr, y_tr)
        
        preds = clf.predict(X_val)
        score = f1_score(y_val, preds, average='micro')
        scores.append(score)
        
    return np.mean(scores)

study = optuna.create_study(direction='maximize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=30)  # 30 trials for demonstration speed

print("=== Optuna Best Parameters ===")
print(study.best_params)
print(f"Best Cross-Validation F1: {study.best_value:.4f}")


[32m[I 2026-02-21 16:07:32,723][0m A new study created in memory with name: no-name-3840456c-dd2b-417d-8353-8a54de1e8097[0m


=== Optuna Best Parameters ===
{'max_depth': 9, 'learning_rate': 0.28923071630444935, 'gamma': 1.2663559946940675e-05, 'min_child_weight': 7, 'subsample': 0.9937843996822882, 'colsample_bytree': 0.9433842385679996}
Best Cross-Validation F1: 0.3594


In [5]:
print("Training final optimized SMOTE model on full training set...")
best_est = XGBClassifier(
    n_estimators=200, # Increased for final fit
    **study.best_params,
    n_jobs=-1,
    random_state=config.RANDOM_SEED
)

final_clf = SMOTEMultiOutputClassifier(best_est, random_state=config.RANDOM_SEED)
final_clf.fit(X_train_np, y_train_np)

test_preds = final_clf.predict(X_test_np)
final_f1 = f1_score(y_test_np, test_preds, average='micro')
print(f"\nFINAL TEST F1 Micro Score (Optuna + SMOTE): {final_f1:.4f}")

# Save Model
out_path = config.MODELS_DIR / "xgboost_optuna_smote.pkl"
joblib.dump(final_clf, out_path)
print(f"Saved optimized model to {out_path}")


Training final optimized SMOTE model on full training set...



FINAL TEST F1 Micro Score (Optuna + SMOTE): 0.4254
Saved optimized model to /Users/yasha/Desktop/projects/pokemon_type_predictor/models/xgboost_optuna_smote.pkl
