# Model Optimization: SMOTE & Optuna (Bayesian Tuning)

In this notebook, we aim to maximize **F1-Score** by:
1.  **Handling Imbalance**: Applying **SMOTE** (Synthetic Minority Over-sampling Technique).
2.  **Advanced Tuning**: Using **Optuna** for Bayesian Hyperparameter Optimization of XGBoost.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

%matplotlib inline

# Set MLflow Tracking URI
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Churn_Prediction_Optimization")

  return FileStore(store_uri, store_uri)
2026/01/06 16:31:36 INFO mlflow.tracking.fluent: Experiment with name 'Churn_Prediction_Optimization' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:d:/MLOPS PROJECT CHURN PRED/experiment/../mlruns/221989432216233886', creation_time=1767697296536, experiment_id='221989432216233886', last_update_time=1767697296536, lifecycle_stage='active', name='Churn_Prediction_Optimization', tags={}>

## 1. Load Data & Preprocessing (Same as Baseline)

In [2]:
df = pd.read_csv('../customer_churn_dataset/train.csv')
X = df.drop('churn', axis=1)
y = df['churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Reuse Preprocessing Logic ---
def impute_internet_service(X_data, knn_model=None, scaler=None, is_train=True):
    X = X_data.copy()
    impute_features = ['monthly_charges', 'total_charges', 'tenure']
    if is_train:
        scaler = StandardScaler()
        scaler.fit(X[impute_features])
    X_scaled = scaler.transform(X[impute_features])
    mask_missing = X['internet_service'].isnull()
    if is_train:
        X_train_knn = X_scaled[~mask_missing]
        y_train_knn = X.loc[~mask_missing, 'internet_service']
        knn_model = KNeighborsClassifier(n_neighbors=5)
        knn_model.fit(X_train_knn, y_train_knn)
    if mask_missing.sum() > 0:
        X_missing_knn = X_scaled[mask_missing]
        imputed_values = knn_model.predict(X_missing_knn)
        X.loc[mask_missing, 'internet_service'] = imputed_values
    return X, knn_model, scaler

# 1. Impute
X_train_imp, knn_imputer, knn_scaler = impute_internet_service(X_train, is_train=True)
X_test_imp, _, _ = impute_internet_service(X_test, knn_model=knn_imputer, scaler=knn_scaler, is_train=False)

# 2. Encode & Scale
numerical_cols = ['tenure', 'monthly_charges', 'total_charges']
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
if 'customer_id' in categorical_cols: categorical_cols.remove('customer_id')

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler_final = StandardScaler()

X_train_enc = ohe.fit_transform(X_train_imp[categorical_cols])
X_test_enc = ohe.transform(X_test_imp[categorical_cols])
X_train_sc = scaler_final.fit_transform(X_train_imp[numerical_cols])
X_test_sc = scaler_final.transform(X_test_imp[numerical_cols])

X_train_final = np.hstack([X_train_sc, X_train_enc])
X_test_final = np.hstack([X_test_sc, X_test_enc])

print("Data Processed. Train Shape:", X_train_final.shape)

Data Processed. Train Shape: (12800, 16)


## 2. Define Optuna Objective Function
We want to find params that give the **Best F1 Score** on Cross-Validation.

In [3]:
def objective(trial):
    # 1. Propose Hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eval_metric': 'logloss',
        'n_jobs': -1
    }
    
    # 2. Build Pipeline with SMOTE + XGBoost
    # We use ImbPipeline to ensure SMOTE is only applied to the TRAIN fold during CV
    model = XGBClassifier(**params)
    
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])
    
    # 3. Cross-Validation (3-Fold to save time, use 5 for more robustness)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train_final, y_train, cv=cv, scoring='f1', n_jobs=-1)
    
    return scores.mean()

## 3. Run Optuna Optimization

In [4]:
# Create Study
study = optuna.create_study(direction="maximize", study_name="XGB_Imbalanced_Optimization")

# Run for 20 Trials
print("Starting Optuna Optimization...")
study.optimize(objective, n_trials=20)

print("Best F1:", study.best_value)
print("Best Params:", study.best_params)

[I 2026-01-06 16:31:36,777] A new study created in memory with name: XGB_Imbalanced_Optimization


Starting Optuna Optimization...


[I 2026-01-06 16:31:43,848] Trial 0 finished with value: 0.6062382765289511 and parameters: {'n_estimators': 326, 'learning_rate': 0.1831080288279283, 'max_depth': 8, 'subsample': 0.9861469714312541, 'colsample_bytree': 0.8846580310428939, 'gamma': 2.6240457449311623, 'min_child_weight': 3}. Best is trial 0 with value: 0.6062382765289511.
[I 2026-01-06 16:31:50,459] Trial 1 finished with value: 0.5981530316582121 and parameters: {'n_estimators': 394, 'learning_rate': 0.29234709207556403, 'max_depth': 10, 'subsample': 0.5559897550811289, 'colsample_bytree': 0.7025151970779693, 'gamma': 2.5787844250369347, 'min_child_weight': 8}. Best is trial 0 with value: 0.6062382765289511.
[I 2026-01-06 16:31:55,376] Trial 2 finished with value: 0.6071243117909534 and parameters: {'n_estimators': 296, 'learning_rate': 0.21483319448157145, 'max_depth': 6, 'subsample': 0.8231898156331081, 'colsample_bytree': 0.6732275508840055, 'gamma': 4.990604069677028, 'min_child_weight': 4}. Best is trial 2 with va

Best F1: 0.6132354054371815
Best Params: {'n_estimators': 154, 'learning_rate': 0.13693357019100774, 'max_depth': 4, 'subsample': 0.96601593175214, 'colsample_bytree': 0.8613139733454416, 'gamma': 2.556094525951737, 'min_child_weight': 4}


## 4. Train & Log Best Model
Now we take the best params found by Optuna, retrain on the FULL train set (with SMOTE), and evaluate on Test.

In [5]:
best_params = study.best_params
best_params['eval_metric'] = 'logloss'

with mlflow.start_run(run_name="XGBoost_Optuna_SMOTE"):
    # Log Params
    mlflow.log_params(best_params)
    mlflow.log_param("optimization", "Optuna")
    
    # Apply SMOTE to Full Train Set one last time before final training
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_final, y_train)
    
    # Train Final Model
    best_model = XGBClassifier(**best_params)
    best_model.fit(X_train_resampled, y_train_resampled)
    
    # Evaluate on Test
    y_pred = best_model.predict(X_test_final)
    y_prob = best_model.predict_proba(X_test_final)[:, 1]
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    
    # Log Metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("auc_roc", roc)
    
    mlflow.sklearn.log_model(best_model, name="best_xgb_optuna")
    
    print("\n--- Final Evaluation (Optuna Best) ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {roc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


--- Final Evaluation (Optuna Best) ---
Accuracy: 0.7688
F1 Score: 0.5916
AUC-ROC: 0.7522

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      2102
           1       0.75      0.49      0.59      1098

    accuracy                           0.77      3200
   macro avg       0.76      0.70      0.72      3200
weighted avg       0.77      0.77      0.75      3200

