In [3]:
import time
import os
import pandas as pd
import numpy as np
import joblib
import mlflow
import dagshub
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.base import clone

dagshub.init(repo_owner='abdsalam25', repo_name='churn-project', mlflow=True)
mlflow.set_experiment("Optuna_Churn_Optimization")

try:
    df = pd.read_csv("../churn_data.csv")
except FileNotFoundError:
    df = pd.read_csv("churn_data.csv")

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
df = df.dropna()

X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn']

categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessing = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

def objective_ridge(trial, preprocessing, X_train, y_train, use_pca):
    alpha = trial.suggest_float("alpha", 0.1, 10.0, log=True)
    steps = [('prep', clone(preprocessing))]
    if use_pca:
        n_components = trial.suggest_float("n_components", 0.8, 0.99)
        steps.append(('pca', PCA(n_components=n_components)))
    steps.append(('clf', RidgeClassifier(alpha=alpha)))
    pipeline = Pipeline(steps)
    return cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1").mean()

def objective_gb(trial, preprocessing, X_train, y_train, use_pca):
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 8)
    steps = [('prep', clone(preprocessing))]
    if use_pca:
        n_components = trial.suggest_float("n_components", 0.8, 0.99)
        steps.append(('pca', PCA(n_components=n_components)))
    steps.append(('clf', GradientBoostingClassifier(
        learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, random_state=42)))
    pipeline = Pipeline(steps)
    return cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1").mean()

def objective_xgb(trial, preprocessing, X_train, y_train, use_pca):
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 8)
    steps = [('prep', clone(preprocessing))]
    if use_pca:
        n_components = trial.suggest_float("n_components", 0.8, 0.99)
        steps.append(('pca', PCA(n_components=n_components)))
    steps.append(('clf', XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', learning_rate=learning_rate,
        n_estimators=n_estimators, max_depth=max_depth, random_state=42)))
    pipeline = Pipeline(steps)
    return cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1").mean()

def objective_lgbm(trial, preprocessing, X_train, y_train, use_pca):
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    num_leaves = trial.suggest_int("num_leaves", 20, 50)
    steps = [('prep', clone(preprocessing))]
    if use_pca:
        n_components = trial.suggest_float("n_components", 0.8, 0.99)
        steps.append(('pca', PCA(n_components=n_components)))
    steps.append(('clf', LGBMClassifier(
        learning_rate=learning_rate, n_estimators=n_estimators, num_leaves=num_leaves,
        random_state=42, verbose=-1)))
    pipeline = Pipeline(steps)
    return cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1").mean()

objectives = {
    "Ridge": objective_ridge,
    "GradientBoosting": objective_gb,
    "XGBoost": objective_xgb,
    "LightGBM": objective_lgbm
}

best_global_f1 = 0
best_global_pipeline = None
best_global_name = ""

for model_name, obj_func in objectives.items():
    for use_pca in [False, True]:
        run_name = f"{model_name}_Optuna_PCA_{use_pca}"
        print(f"Optimizing {run_name}...")
        
        study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
        study.optimize(lambda trial: obj_func(trial, preprocessing, X_train, y_train, use_pca), n_trials=10)
        
        best_params = study.best_params
        steps = [('prep', clone(preprocessing))]
        
        if use_pca:
            steps.append(('pca', PCA(n_components=best_params["n_components"])))
            
        if model_name == "Ridge":
            clf = RidgeClassifier(alpha=best_params["alpha"])
        elif model_name == "GradientBoosting":
            clf = GradientBoostingClassifier(learning_rate=best_params["learning_rate"], 
                                           n_estimators=best_params["n_estimators"], 
                                           max_depth=best_params["max_depth"], random_state=42)
        elif model_name == "XGBoost":
            clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                              learning_rate=best_params["learning_rate"],
                              n_estimators=best_params["n_estimators"],
                              max_depth=best_params["max_depth"], random_state=42)
        elif model_name == "LightGBM":
            clf = LGBMClassifier(learning_rate=best_params["learning_rate"],
                               n_estimators=best_params["n_estimators"],
                               num_leaves=best_params["num_leaves"], random_state=42, verbose=-1)
        
        steps.append(('clf', clf))
        final_pipeline = Pipeline(steps)
        final_pipeline.fit(X_train, y_train)
        
        y_pred = final_pipeline.predict(X_test)
        test_f1 = f1_score(y_test, y_pred)
        
        if test_f1 > best_global_f1:
            best_global_f1 = test_f1
            best_global_pipeline = final_pipeline
            best_global_name = run_name

        with mlflow.start_run(run_name=run_name):
            mlflow.log_params(best_params)
            mlflow.log_param("pca", use_pca)
            mlflow.log_metric("cv_f1", study.best_value)
            mlflow.log_metric("test_f1", test_f1)
            mlflow.sklearn.log_model(final_pipeline, "model")

models_dir = "../models"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)

save_path = os.path.join(models_dir, "best_model_optuna.joblib")
joblib.dump(best_global_pipeline, save_path)

print(f"Global Best Model: {best_global_name}")
print(f"Test F1 Score: {best_global_f1}")
print(f"Model saved to: {save_path}")

2025/12/17 21:02:11 INFO mlflow.tracking.fluent: Experiment with name 'Optuna_Churn_Optimization' does not exist. Creating a new experiment.
[I 2025-12-17 21:02:11,317] A new study created in memory with name: no-name-e834d755-0d0c-40b0-a356-7fecb3b11d41
[I 2025-12-17 21:02:11,379] Trial 0 finished with value: 0.5776023295647147 and parameters: {'alpha': 0.5611516415334507}. Best is trial 0 with value: 0.5776023295647147.
[I 2025-12-17 21:02:11,428] Trial 1 finished with value: 0.5776003433913804 and parameters: {'alpha': 7.969454818643932}. Best is trial 0 with value: 0.5776023295647147.


Optimizing Ridge_Optuna_PCA_False...


[I 2025-12-17 21:02:11,523] Trial 2 finished with value: 0.5772698861802202 and parameters: {'alpha': 2.9106359131330706}. Best is trial 0 with value: 0.5776023295647147.
[I 2025-12-17 21:02:11,571] Trial 3 finished with value: 0.5778054571636754 and parameters: {'alpha': 1.575132049977973}. Best is trial 3 with value: 0.5778054571636754.
[I 2025-12-17 21:02:11,619] Trial 4 finished with value: 0.5783777850811295 and parameters: {'alpha': 0.20513382630874502}. Best is trial 4 with value: 0.5783777850811295.
[I 2025-12-17 21:02:11,666] Trial 5 finished with value: 0.5783777850811295 and parameters: {'alpha': 0.20511104188433976}. Best is trial 4 with value: 0.5783777850811295.
[I 2025-12-17 21:02:11,714] Trial 6 finished with value: 0.5783727642080002 and parameters: {'alpha': 0.13066739238053282}. Best is trial 4 with value: 0.5783777850811295.
[I 2025-12-17 21:02:11,760] Trial 7 finished with value: 0.5768421623279226 and parameters: {'alpha': 5.3994844097874335}. Best is trial 4 with

üèÉ View run Ridge_Optuna_PCA_False at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/b068d3b7ac91455a9a29a0e1d9ede090
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1
Optimizing Ridge_Optuna_PCA_True...


[I 2025-12-17 21:02:18,036] Trial 0 finished with value: 0.5805272354555286 and parameters: {'alpha': 0.5611516415334507, 'n_components': 0.9806357182178841}. Best is trial 0 with value: 0.5805272354555286.
[I 2025-12-17 21:02:18,089] Trial 1 finished with value: 0.5770814240663605 and parameters: {'alpha': 2.9106359131330706, 'n_components': 0.913745111997437}. Best is trial 0 with value: 0.5805272354555286.
[I 2025-12-17 21:02:18,140] Trial 2 finished with value: 0.5705692525691622 and parameters: {'alpha': 0.20513382630874502, 'n_components': 0.8296389588638785}. Best is trial 0 with value: 0.5805272354555286.
[I 2025-12-17 21:02:18,192] Trial 3 finished with value: 0.5774325833461113 and parameters: {'alpha': 0.13066739238053282, 'n_components': 0.9645734676972377}. Best is trial 0 with value: 0.5805272354555286.
[I 2025-12-17 21:02:18,241] Trial 4 finished with value: 0.5763267242120809 and parameters: {'alpha': 1.5930522616241016, 'n_components': 0.9345337897812487}. Best is tria

üèÉ View run Ridge_Optuna_PCA_True at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/7c5f2ebe279d4b629a1df9f5e5cca3ea
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1


[I 2025-12-17 21:02:31,083] A new study created in memory with name: no-name-34f641d5-8f1a-4fe0-97f5-adfea4223578


Optimizing GradientBoosting_Optuna_PCA_False...


[I 2025-12-17 21:02:36,577] Trial 0 finished with value: 0.543969488991717 and parameters: {'learning_rate': 0.11861663446573512, 'n_estimators': 193, 'max_depth': 7}. Best is trial 0 with value: 0.543969488991717.
[I 2025-12-17 21:02:37,508] Trial 1 finished with value: 0.5781217350532329 and parameters: {'learning_rate': 0.1836109604171406, 'n_estimators': 73, 'max_depth': 3}. Best is trial 1 with value: 0.5781217350532329.
[I 2025-12-17 21:02:41,775] Trial 2 finished with value: 0.5631508529165009 and parameters: {'learning_rate': 0.026844247528777843, 'n_estimators': 180, 'max_depth': 6}. Best is trial 1 with value: 0.5781217350532329.
[I 2025-12-17 21:02:43,638] Trial 3 finished with value: 0.5562654413267446 and parameters: {'learning_rate': 0.21534104756085318, 'n_estimators': 53, 'max_depth': 8}. Best is trial 1 with value: 0.5781217350532329.
[I 2025-12-17 21:02:45,061] Trial 4 finished with value: 0.5651894624187351 and parameters: {'learning_rate': 0.2514083658321223, 'n_est

üèÉ View run GradientBoosting_Optuna_PCA_False at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/212e7d5211154d3d8433b4539ee0af75
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1
Optimizing GradientBoosting_Optuna_PCA_True...


[I 2025-12-17 21:03:17,358] Trial 0 finished with value: 0.5483147933486534 and parameters: {'learning_rate': 0.11861663446573512, 'n_estimators': 193, 'max_depth': 7, 'n_components': 0.913745111997437}. Best is trial 0 with value: 0.5483147933486534.
[I 2025-12-17 21:03:20,355] Trial 1 finished with value: 0.567567917920123 and parameters: {'learning_rate': 0.055245405728306586, 'n_estimators': 73, 'max_depth': 3, 'n_components': 0.9645734676972377}. Best is trial 1 with value: 0.567567917920123.
[I 2025-12-17 21:03:27,149] Trial 2 finished with value: 0.563953510842895 and parameters: {'learning_rate': 0.18432335340553055, 'n_estimators': 156, 'max_depth': 3, 'n_components': 0.9842828719107789}. Best is trial 1 with value: 0.567567917920123.
[I 2025-12-17 21:03:30,137] Trial 3 finished with value: 0.549846213232618 and parameters: {'learning_rate': 0.2514083658321223, 'n_estimators': 82, 'max_depth': 4, 'n_components': 0.8348468568721524}. Best is trial 1 with value: 0.56756791792012

üèÉ View run GradientBoosting_Optuna_PCA_True at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/fbef950bbdce409492a836e1018265c0
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1
Optimizing XGBoost_Optuna_PCA_False...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-12-17 21:04:20,092] Trial 0 finished with value: 0.5600245038589485 and parameters: {'learning_rate': 0.11861663446573512, 'n_estimators': 193, 'max_depth': 7}. Best is trial 0 with value: 0.5600245038589485.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-12-17 21:04:20,266] Trial 1 finished with value: 0.5911189800122066 and parameters: {'learning_rate': 0.1836109604171406, 'n_estimators': 73, 'max_depth': 3}. Best is trial 1 with value: 0.5911189800122066

üèÉ View run XGBoost_Optuna_PCA_False at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/ac5f9bf2f90b4c3283d453142fbc8c74
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1


[I 2025-12-17 21:04:30,625] A new study created in memory with name: no-name-d9e111bc-fcdc-4146-a576-d383ff3b80ce


Optimizing XGBoost_Optuna_PCA_True...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-12-17 21:04:31,558] Trial 0 finished with value: 0.5411986230901323 and parameters: {'learning_rate': 0.11861663446573512, 'n_estimators': 193, 'max_depth': 7, 'n_components': 0.913745111997437}. Best is trial 0 with value: 0.5411986230901323.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-12-17 21:04:31,759] Trial 1 finished with value: 0.5729273490006614 and parameters: {'learning_rate': 0.055245405728306586, 'n_estimators': 73, 'max_depth': 3, 'n_compone

üèÉ View run XGBoost_Optuna_PCA_True at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/e565a47737b44f528cf3b3e23609d4cb
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1


[I 2025-12-17 21:04:44,686] A new study created in memory with name: no-name-912b2612-6acf-47ae-9277-a2f3a54b1321


Optimizing LightGBM_Optuna_PCA_False...


[I 2025-12-17 21:04:47,328] Trial 0 finished with value: 0.5543789604222434 and parameters: {'learning_rate': 0.11861663446573512, 'n_estimators': 193, 'num_leaves': 42}. Best is trial 0 with value: 0.5543789604222434.
[I 2025-12-17 21:04:47,973] Trial 1 finished with value: 0.5614512916895226 and parameters: {'learning_rate': 0.1836109604171406, 'n_estimators': 73, 'num_leaves': 24}. Best is trial 1 with value: 0.5614512916895226.
[I 2025-12-17 21:04:50,297] Trial 2 finished with value: 0.5667507455671172 and parameters: {'learning_rate': 0.026844247528777843, 'n_estimators': 180, 'num_leaves': 38}. Best is trial 2 with value: 0.5667507455671172.
[I 2025-12-17 21:04:51,228] Trial 3 finished with value: 0.5493087564449123 and parameters: {'learning_rate': 0.21534104756085318, 'n_estimators': 53, 'num_leaves': 50}. Best is trial 2 with value: 0.5667507455671172.
[I 2025-12-17 21:04:52,033] Trial 4 finished with value: 0.5656453533216154 and parameters: {'learning_rate': 0.25140836583212

üèÉ View run LightGBM_Optuna_PCA_False at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/b2e3b9e1c5064c1ea661b5d40fda0d7b
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1
Optimizing LightGBM_Optuna_PCA_True...


[I 2025-12-17 21:05:06,334] Trial 0 finished with value: 0.5273478966392485 and parameters: {'learning_rate': 0.11861663446573512, 'n_estimators': 193, 'num_leaves': 42, 'n_components': 0.913745111997437}. Best is trial 0 with value: 0.5273478966392485.
[I 2025-12-17 21:05:06,828] Trial 1 finished with value: 0.5663673774784885 and parameters: {'learning_rate': 0.055245405728306586, 'n_estimators': 73, 'num_leaves': 21, 'n_components': 0.9645734676972377}. Best is trial 1 with value: 0.5663673774784885.
[I 2025-12-17 21:05:07,733] Trial 2 finished with value: 0.546567282269197 and parameters: {'learning_rate': 0.18432335340553055, 'n_estimators': 156, 'num_leaves': 20, 'n_components': 0.9842828719107789}. Best is trial 1 with value: 0.5663673774784885.
[I 2025-12-17 21:05:08,312] Trial 3 finished with value: 0.5471196040836594 and parameters: {'learning_rate': 0.2514083658321223, 'n_estimators': 82, 'num_leaves': 25, 'n_components': 0.8348468568721524}. Best is trial 1 with value: 0.56

üèÉ View run LightGBM_Optuna_PCA_True at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1/runs/943d3f08d73e488e9afa9c6c86a3d123
üß™ View experiment at: https://dagshub.com/abdsalam25/churn-project.mlflow/#/experiments/1
Global Best Model: Ridge_Optuna_PCA_True
Test F1 Score: 0.5802650957290133
Model saved to: ../models/best_model_optuna.joblib


In [2]:
!pip install optuna 

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
Downloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
[2K   [38;2;114;156;31m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2/2[0m [optuna]
[1A[2KSuccessfully installed colorlog-6.10.1 optuna-4.6.0
