In [8]:
# XGBoost Refinement Notebook

# %% -------------------- Imports and Setup --------------------
import pandas as pd
import numpy as np
import sys
import pathlib
import random
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.model_selection import RepeatedStratifiedKFold





# Seed NumPy's global RNG and Python's `random` module for reproducibility.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Make the project's `src` directory importable. The project root is taken to
# be the parent of the current working directory (assumes the notebook is run
# from a first-level subfolder of the project -- TODO confirm).
project_root = pathlib.Path().resolve().parent
src_dir = str(project_root / 'src')
# Guard the append so re-running this cell does not add duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

import util


In [9]:

# %% -------------------- Load and Preprocess Data --------------------
# Read the raw train/test CSVs through the project helper.
train_df, test_df = util.load_data('../data/train.csv', '../data/test.csv')

# Pull the label out of the training frame and drop the 'id' column from both
# feature sets (presumably a row identifier rather than a predictor -- verify).
target = 'rainfall'
train_X = train_df.drop(columns=[target, 'id'])
train_y = train_df[target]
test_df_no_id = test_df.drop(columns=['id'])

# Preprocess both frames with the project helper, then pass the test frame
# through util.align_columns with the processed training frame as reference.
df_train, df_test = util.preprocess_train_test(train_X, test_df_no_id)
df_test = util.align_columns(df_train, df_test)


Train shape: (2190, 13), Test shape: (730, 12)


In [10]:

# %% -------------------- Optuna Hyperparameter Tuning --------------------
def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of an XGBoost classifier.

    Draws one hyperparameter configuration from ``trial`` and scores it on the
    module-level ``df_train`` / ``train_y`` using 5-fold stratified
    cross-validation repeated 3 times (15 fits per trial).

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        The mean of the per-fold ``roc_auc`` scores.
    """
    # Hyperparameters searched by Optuna (sampling order is kept as-is so a
    # seeded sampler produces the same sequence of suggestions).
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0),
    }
    # Settings held constant across all trials.
    fixed = {'random_state': SEED, 'eval_metric': 'auc'}

    classifier = XGBClassifier(**tuned, **fixed)
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=SEED)
    fold_aucs = cross_val_score(
        classifier,
        df_train,
        train_y,
        cv=splitter,
        scoring='roc_auc',
        n_jobs=-1,
    )
    return fold_aucs.mean()

# Maximize the objective with a TPE sampler seeded by SEED, for 50 trials.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=SEED),
    direction='maximize',
)
study.optimize(objective, show_progress_bar=True, n_trials=50)

# Banner and best parameter dict, each on its own line.
print("\n===== Best Parameters from Optuna =====", study.best_params, sep="\n")


[I 2025-03-09 20:28:45,678] A new study created in memory with name: no-name-63fbd55d-d4fc-430c-9d8e-4ba84dfef3f5


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-03-09 20:28:46,697] Trial 0 finished with value: 0.8838627010849234 and parameters: {'n_estimators': 1436, 'max_depth': 10, 'learning_rate': 0.026975154833351143, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'reg_alpha': 0.2904180608409973, 'reg_lambda': 4.330880728874676}. Best is trial 0 with value: 0.8838627010849234.
[I 2025-03-09 20:28:48,755] Trial 1 finished with value: 0.8874279835390946 and parameters: {'n_estimators': 2003, 'max_depth': 8, 'learning_rate': 0.005242693862597309, 'subsample': 0.9849549260809971, 'colsample_bytree': 0.9162213204002109, 'gamma': 1.0616955533913808, 'reg_alpha': 0.9091248360355031, 'reg_lambda': 0.9170225492671691}. Best is trial 1 with value: 0.8874279835390946.
[I 2025-03-09 20:28:49,966] Trial 2 finished with value: 0.8855985783763561 and parameters: {'n_estimators': 1260, 'max_depth': 6, 'learning_rate': 0.013518080333310006, 'subsample': 0.645614570099021, 'colsample_bytree': 0.

In [None]:

# %% -------------------- Train and Predict with Optimized XGBoost --------------------
# Refit on the full training set with the best hyperparameters found by Optuna,
# plus the same fixed seed and eval metric that the tuning objective used.
best_xgb_model = XGBClassifier(**study.best_params, random_state=SEED, eval_metric='auc')
best_xgb_model.fit(df_train, train_y)


# Predict on test set. Column 1 of predict_proba is the probability of the
# second class in `classes_` (the positive class, assuming a 0/1 target).
test_probs = best_xgb_model.predict_proba(df_test)[:, 1]

# Write submission
util.write_submission(test_df, test_probs, id_column='id', output_path='submission_xgboost_optuna.csv', prediction_column='rainfall')

# Report the tuning score. study.best_value is the mean cross-validated ROC AUC
# of the best Optuna trial, not a score of the refit model on held-out data,
# so the message says so explicitly.
print(f"Best cross-validated ROC AUC (Optuna): {study.best_value:.4f}")

NameError: name 'X_val' is not defined

In [None]:

# %% -------------------- Feature Importance Plot --------------------
def plot_feature_importance(model, feature_names, top_n=20):
    """Bar-plot the ``top_n`` largest feature importances of a fitted model.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Sequence of names, one per entry of
            ``model.feature_importances_`` and in the same order.
        top_n: Maximum number of features to show, most important first.
            ``None`` shows every feature.

    Raises:
        ValueError: If ``feature_names`` and the importances differ in length
            (bars could otherwise be mislabeled), or if ``top_n`` is negative
            (a negative slice would silently drop features from the end).
    """
    importances = np.asarray(model.feature_importances_)
    names = list(feature_names)
    if len(names) != len(importances):
        raise ValueError(
            f"feature_names has {len(names)} entries but the model reports "
            f"{len(importances)} feature importances"
        )
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n must be non-negative or None, got {top_n}")

    # Indices of the features sorted by descending importance, truncated to top_n.
    order = np.argsort(importances)[::-1][:top_n]
    positions = np.arange(len(order))

    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(positions, importances[order], align='center')
    ax.set_title('Top Feature Importances')
    ax.set_xlabel('Feature')
    ax.set_ylabel('Importance')
    ax.set_xticks(positions)
    ax.set_xticklabels([names[i] for i in order], rotation=90)
    fig.tight_layout()
    plt.show()

# Plot importances of the refit model, labeled with the training columns.
plot_feature_importance(model=best_xgb_model, feature_names=df_train.columns)
