In [None]:
!pip install optuna

In [None]:
import optuna
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    f1_score
)
import matplotlib.pyplot as plt

# Read the dataset and swap the rest-category labels for integer codes.
# pd.factorize returns (codes, uniques); only the codes are kept.
nba_data = pd.read_csv("nba_rest_data.csv")
rest_codes, _rest_labels = pd.factorize(nba_data['rest_category'])
nba_data['rest_category'] = rest_codes

# One full experiment is run per seed (values were randomly generated)
seeds = [2287, 1680, 8936, 1425, 9675]

# Per-seed test metrics, accumulated so they can be averaged afterwards
all_accuracies, all_f1_scores, all_roc_aucs = [], [], []

# Run the complete tune -> train -> evaluate experiment once per seed so the
# test metrics can later be averaged over several different team splits.
for seed in seeds:
    print(f"\nRunning experiment with seed = {seed}")

    # Split by team (not by row): every row of a given team ends up
    # entirely in the training set or entirely in the test set.
    unique_teams = nba_data['Team'].unique()
    train_teams, test_teams = train_test_split(
        unique_teams, test_size=0.2, random_state=seed
    )

    train_data = nba_data[nba_data['Team'].isin(train_teams)]
    test_data = nba_data[nba_data['Team'].isin(test_teams)]

    # The integer-coded rest category is the only feature; WIN is the label.
    X_train = train_data[['rest_category']].values
    y_train = train_data['WIN'].values
    X_test = test_data[['rest_category']].values
    y_test = test_data['WIN'].values

    # Class imbalance ratio (negatives per positive), passed to XGBoost as
    # scale_pos_weight. Assumes WIN is a 0/1 integer column with both classes
    # present in the training split -- TODO confirm.
    neg, pos = np.bincount(y_train)
    scale_pos_weight = neg / pos

    def objective(trial):
        """Score one Optuna trial by mean 10-fold cross-validated ROC AUC.

        Args:
            trial: The Optuna trial used to sample the hyperparameters.

        Returns:
            The mean ROC AUC over the validation folds. As a side effect the
            mean early-stopped number of boosting rounds is stored on the
            trial as the ``mean_best_rounds`` user attribute.
        """
        param = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'max_depth': trial.suggest_int('max_depth', 2, 15),
            'eta': trial.suggest_float('eta', 0.005, 0.3),
            'gamma': trial.suggest_float('gamma', 0, 10.0),
            'subsample': trial.suggest_float('subsample', 0.3, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
            'lambda': trial.suggest_float('lambda', 1e-3, 10),
            'alpha': trial.suggest_float('alpha', 1e-3, 10),
            'scale_pos_weight': scale_pos_weight,
            'seed': seed
        }

        # Upper bound on boosting rounds. Sampled once per trial (it is a
        # single trial-level parameter) and after the other suggestions so
        # the sampler sees the same call order as before.
        num_boost = trial.suggest_int("num_boost_round", 100, 1000)

        # NOTE(review): folds are stratified by label only and are not
        # grouped by team, unlike the outer train/test split -- confirm this
        # is intended.
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
        scores = []
        rounds_per_fold = []

        for train_idx, val_idx in skf.split(X_train, y_train):
            X_t, X_val = X_train[train_idx], X_train[val_idx]
            y_t, y_val = y_train[train_idx], y_train[val_idx]

            dtrain = xgb.DMatrix(X_t, label=y_t)
            dval = xgb.DMatrix(X_val, label=y_val)

            # NOTE(review): early stopping monitors the same fold that is
            # scored below, which makes the fold score slightly optimistic.
            bst = xgb.train(
                param,
                dtrain,
                num_boost_round=num_boost,
                evals=[(dval, "eval")],
                early_stopping_rounds=30,
                verbose_eval=False
            )

            # Score the best iteration explicitly so the result does not
            # depend on whether the installed XGBoost does this by default.
            rounds_used = bst.best_iteration + 1
            preds = bst.predict(dval, iteration_range=(0, rounds_used))
            scores.append(roc_auc_score(y_val, preds))
            rounds_per_fold.append(rounds_used)

        avg_score = np.mean(scores)

        # Remember how many rounds early stopping actually kept, so the final
        # model can be trained with the model size that was validated here.
        trial.set_user_attr(
            "mean_best_rounds", int(round(float(np.mean(rounds_per_fold))))
        )

        print(f"Trial {trial.number}: ROC AUC={avg_score:.4f}, Params={param}")
        return avg_score

    # Optuna hyperparameter tuning
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed)
    )
    study.optimize(objective, n_trials=200)

    print("\nBest hyperparameters:")
    print(study.best_params)

    best_params = study.best_params.copy()
    # "num_boost_round" is only the cap used during tuning and is not a
    # booster parameter; the final model uses the early-stopped round count
    # recorded by the best trial instead of the raw cap.
    best_params.pop("num_boost_round")
    num_boost_round_final = study.best_trial.user_attrs["mean_best_rounds"]

    # study.best_params contains only the suggested values, so the fixed
    # settings used in every trial (including scale_pos_weight) are re-added
    # here to keep the final model consistent with what was tuned.
    best_params.update({
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'scale_pos_weight': scale_pos_weight,
        'seed': seed
    })

    dtrain_final = xgb.DMatrix(X_train, label=y_train)
    dtest_final = xgb.DMatrix(X_test, label=y_test)

    xgb_model = xgb.train(
        params=best_params,
        dtrain=dtrain_final,
        num_boost_round=num_boost_round_final
    )

    # Predicted probabilities on the held-out teams, thresholded at 0.5
    preds_final = xgb_model.predict(dtest_final)
    preds_binary_final = (preds_final > 0.5).astype(int)

    accuracy_final = accuracy_score(y_test, preds_binary_final)
    roc_auc_final = roc_auc_score(y_test, preds_final)
    f1_final = f1_score(y_test, preds_binary_final)

    print(f"\nFinal Model Accuracy: {accuracy_final * 100:.2f}%")
    print(f"Final ROC AUC Score: {roc_auc_final:.4f}")
    print(f"Final F1 Score: {f1_final:.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, preds_binary_final))

    print("\nClassification Report:")
    print(classification_report(y_test, preds_binary_final))

    xgb.plot_importance(xgb_model)
    plt.title(f"Feature Importance (Seed {seed})")
    plt.tight_layout()
    plt.show()

    # Save metrics for averaging
    all_accuracies.append(accuracy_final)
    all_f1_scores.append(f1_final)
    all_roc_aucs.append(roc_auc_final)

# Summarise the runs: mean of each test metric over all seeds
mean_accuracy = np.mean(all_accuracies)
mean_f1 = np.mean(all_f1_scores)
mean_roc_auc = np.mean(all_roc_aucs)

print("\n=== Average Metrics Across All Seeds ===")
print(f"Mean Accuracy: {mean_accuracy * 100:.2f}%")
print(f"Mean F1 Score: {mean_f1:.4f}")
print(f"Mean ROC AUC Score: {mean_roc_auc:.4f}")