# Hyperparameter Tuning with Optuna

In this notebook, we will use **Optuna** to tune the hyperparameters of the **XGBoost Classifier**. We aim to maximize the **F1 Score** on the validation set.

We will track the optimization process using **MLflow**.

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
import optuna
import mlflow
import mlflow.xgboost
from sklearn.metrics import f1_score, accuracy_score, log_loss

# Never truncate columns when a DataFrame is displayed
pd.options.display.max_columns = None

## 1. Load Processed Data

In [None]:
data_path = '../data/processed'


def _read_split(csv_path):
    """Read one processed CSV and split it into (features, target).

    The 'Churn' column is removed from the returned feature frame and
    handed back separately as the target.
    """
    features = pd.read_csv(csv_path)
    target = features.pop('Churn')
    return features, target


X_train, y_train = _read_split(f'{data_path}/train_processed.csv')
X_val, y_val = _read_split(f'{data_path}/val_processed.csv')

# Report feature-matrix shapes (target column already removed)
print(f"Train shape: {X_train.shape}")
print(f"Val shape: {X_val.shape}")

## 2. Setup MLflow

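Set the tracking URI to `http://localhost:5000` (an MLflow tracking server must be reachable at that address), select the experiment, and disable autologging so that only the values we log explicitly are recorded.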

In [None]:
tracking_uri = "http://localhost:5000"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("XGBoost Hyperparameter Tuning")

# Disable autologging to prevent conflicts or errors during Optuna loop
mlflow.autolog(disable=True)

# Also switch off the flavor-specific autologgers. This stays best-effort, but
# each flavor is handled on its own (a failure in one no longer skips the
# other), only ordinary exceptions are caught (a bare `except:` would also
# swallow KeyboardInterrupt), and any failure is reported instead of hidden.
for flavor_name in ("xgboost", "sklearn"):
    try:
        getattr(mlflow, flavor_name).autolog(disable=True)
    except Exception as exc:
        print(f"Warning: could not disable mlflow.{flavor_name} autologging: {exc!r}")

print(f"MLflow tracking URI: {tracking_uri}")

## 3. Define Objective Function

The objective function defines the hyperparameter space and the metric to optimize.

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost candidate and score it on validation.

    Samples a hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on ``X_train``/``y_train``, and logs the configuration
    together with the validation F1 score and accuracy to an MLflow run.

    Args:
        trial: The Optuna trial used to sample hyperparameter values.

    Returns:
        The F1 score of the fitted model on ``X_val``/``y_val``.
    """
    # Tunable values; learning rate and both regularisation terms are sampled
    # on a log scale.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
    }
    # Settings that are identical for every trial
    fixed_settings = {
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'random_state': 42,
        'n_jobs': -1,
    }
    params = {**search_space, **fixed_settings}

    classifier = XGBClassifier(**params)

    # nested=True allows this run to be started while another MLflow run is active
    with mlflow.start_run(nested=True):
        classifier.fit(X_train, y_train)
        val_predictions = classifier.predict(X_val)

        scores = {
            "f1_score": f1_score(y_val, val_predictions),
            "accuracy": accuracy_score(y_val, val_predictions),
        }

        mlflow.log_params(params)
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

    # The study maximizes this value
    return scores["f1_score"]

## 4. Run Optimization

We will run 50 trials.

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All. TPESampler is the sampler Optuna uses by default,
# so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# objective() starts its runs with nested=True; opening a parent run here is
# what actually groups the per-trial runs under a single entry in MLflow
# instead of leaving them as unrelated top-level runs.
with mlflow.start_run(run_name="Optuna XGBoost Study"):
    study.optimize(objective, n_trials=50)

print("Optimization finished.")

## 5. Analyze Results

Print the best hyperparameters and score, then retrain a model with those hyperparameters on the training set and log it to MLflow.

In [None]:
print(f"Best F1 Score: {study.best_value}")
print("Best Parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Log best model to MLflow (as a final run)
with mlflow.start_run(run_name="Best XGBoost Model"):
    # Tuned values plus the same fixed settings objective() trains with.
    # n_jobs is included so the final fit is configured exactly like the trials.
    best_params = {
        **study.best_params,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'random_state': 42,
        'n_jobs': -1,
    }

    mlflow.log_params(best_params)
    mlflow.log_metric("f1_score", study.best_value)

    best_model = XGBClassifier(**best_params)
    best_model.fit(X_train, y_train)

    # The metric above comes from the trial's model, not from this refit.
    # Score the retrained model as well so the run records how the artifact
    # that is actually saved performs on the validation set.
    retrained_f1 = f1_score(y_val, best_model.predict(X_val))
    mlflow.log_metric("retrained_f1_score", retrained_f1)
    print(f"Retrained model F1 Score: {retrained_f1}")

    mlflow.xgboost.log_model(best_model, "model")
    print("Best model logged to MLflow.")