In [1]:
!pip install optuna xgboost lightgbm "mlflow<3"



In [2]:
# =============================================================================
# FULL PIPELINE with OPTUNA
# - Build preprocessing
# - Stratified train/test split
# - Train & log 4 models WITHOUT PCA (Ridge, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test MAE
# - Save, load, and compare the global best model
# =============================================================================

import os
import sys
import numpy as np
import pandas as pd
import joblib
import mlflow
import mlflow.sklearn
import time

from dotenv import load_dotenv

from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from mlflow.models import infer_signature

import optuna
from optuna.samplers import TPESampler

from sklearn.base import clone

project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Import shared components
from churning_pipeline import (
    build_preprocessing,
    make_estimator_for_name,
)

start_time = time.monotonic()

optuna.logging.set_verbosity(optuna.logging.WARNING)

mlflow.set_experiment("Churn_Prediction_Optuna_Tuning")
print("‚úÖ Libraries imported and MLflow configured.")


# =============================================================================
# STEP 1: Build Full ML Preprocessing Pipeline
# =============================================================================

preprocessing = build_preprocessing()
print("‚úì STEP 1: Preprocessing pipeline created.")


# =============================================================================
# STEP 2: Split Data into Stratified Train and Test Sets
# =============================================================================

# Load Data
data_dir = "../data" if os.path.exists("../data") else "data"
train_path = os.path.join(data_dir, "strat_train_set.csv")
test_path = os.path.join(data_dir, "strat_test_set.csv")

# Read csv files
churning_train = pd.read_csv(train_path)
churning_test = pd.read_csv(test_path)

# Separate features and target variable
# Target variable: "exited"
X_train = churning_train.drop("exited", axis=1)
y_train = churning_train["exited"].copy()

X_test = churning_test.drop("exited", axis=1)
y_test = churning_test["exited"].copy()

X_train.columns = X_train.columns.str.lower()
X_test.columns = X_test.columns.str.lower()

print(f"‚úÖ Data loaded successfully!")
print(f"   Training set (X_train): {X_train.shape}")
print(f"   Test set (X_test):  {X_test.shape}")


# =============================================================================
# STEP 3: Configure MLflow
# =============================================================================

# 1. Load .env file
load_dotenv(override=True)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

# 2. Check if MLFLOW_TRACKING_URI exists
if not MLFLOW_TRACKING_URI:
    print("‚ö†Ô∏è WARNING: MLFLOW_TRACKING_URI is not set. Using local MLflow setup.")
else:
    # Set environment variables for authentication
    if MLFLOW_TRACKING_USERNAME:
        os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
    if MLFLOW_TRACKING_PASSWORD:
        os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

    # 3. Connect to Dagshub MLflow Tracking Server
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

    experiment_name = "Churn_Prediction_Optuna_Tuning"
    mlflow.set_experiment(experiment_name)

    print(f"‚úì STEP 4: MLflow configured. Experiment name: '{experiment_name}'")


# =============================================================================
# STEP 4: Define Optuna Objective Functions (NO PCA)
# =============================================================================

def objective_ridge(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, RidgeClassifier(alpha=alpha))
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_hgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        HistGradientBoostingClassifier(
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_xgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        XGBClassifier(
            objective="binary:logistic",
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            tree_method="hist",
            n_jobs=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_lgbm(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        LGBMClassifier(
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            n_jobs=-1,
            verbose=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()

print("‚úì STEP 4: Optuna objective functions defined (Simpler Version).")

# =============================================================================
# STEP 5: Run Optuna Studies for Each Model (NO PCA)
# =============================================================================

model_names = ["ridge", "histgradientboosting", "xgboost", "lightgbm"]
objective_functions = {
    "ridge": objective_ridge,
    "histgradientboosting": objective_hgb,
    "xgboost": objective_xgb,
    "lightgbm": objective_lgbm,
}

results = {}

for name in model_names:
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} (NO PCA) - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42),
        study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: objective_functions[name](trial, preprocessing, X_train, y_train),
        n_trials=10,
        show_progress_bar=True
    )

    best_cv_f1 = study.best_value
    best_params = study.best_params
    print(f"\nüèÜ Best {name.upper()} CV F1: {best_cv_f1:.4f}")
    print(f"   Best params: {best_params}")
    
    preprocessing_clone = clone(preprocessing)

    if name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            RidgeClassifier(alpha=best_params["ridge__alpha"])
        )
    elif name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            HistGradientBoostingClassifier(
                learning_rate=best_params["hgb__learning_rate"],
                max_depth=best_params["hgb__max_depth"],
                random_state=42
            )
        )
    elif name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            XGBClassifier(
                objective="reg:squarederror",
                random_state=42,
                n_estimators=best_params["xgb__n_estimators"],
                learning_rate=best_params["xgb__learning_rate"],
                max_depth=best_params["xgb__max_depth"],
                tree_method="hist",
                n_jobs=-1,
            )
        )
    elif name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            LGBMClassifier(
                random_state=42,
                n_estimators=best_params["lgbm__n_estimators"],
                learning_rate=best_params["lgbm__learning_rate"],
                num_leaves=best_params["lgbm__num_leaves"],
                n_jobs=-1,
                verbose=-1,
            )
        )

    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    print(f"   Test F1 Score: {test_f1:.4f}")
    print(f"   Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")

    results[name] = {"pipeline": final_model, "test_f1": test_f1, "cv_f1": best_cv_f1}

    with mlflow.start_run(run_name=f"{name}_optuna_tuned"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_params(best_params)
        
        # Log Metrics
        mlflow.log_metric("cv_f1_mean", best_cv_f1)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.log_metric("test_accuracy", test_acc)
        mlflow.log_metric("test_tp", tp)
        mlflow.log_metric("test_tn", tn)
        mlflow.log_metric("test_fp", fp)
        mlflow.log_metric("test_fn", fn)

        # Log Model
        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="churn_model",
            signature=signature,
            input_example=X_train.iloc[:5],
            registered_model_name=f"{name}_optuna_tuned",
        )

print("\n‚úì STEP 5: All 4 models optimized and logged.")


# =============================================================================
# STEP 6: PCA Optuna Objectives
# =============================================================================

def objective_ridge_pca(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, PCA(n_components=pca_components), RidgeClassifier(alpha=alpha))
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_hgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        HistGradientBoostingClassifier(
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_xgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        XGBClassifier(
            objective="binary:logistic",
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            tree_method="hist",
            n_jobs=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_lgbm_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        LGBMClassifier(
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            n_jobs=-1,
            verbose=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()

print("‚úì STEP 6: Optuna objective functions defined (PCA Version).")


# =============================================================================
# STEP 7: Run Optuna Studies for PCA Models
# =============================================================================

pca_model_names = ["ridge_with_pca", "histgradientboosting_with_pca", "xgboost_with_pca", "lightgbm_with_pca"]
pca_objective_functions = {
    "ridge_with_pca": objective_ridge_pca,
    "histgradientboosting_with_pca": objective_hgb_pca,
    "xgboost_with_pca": objective_xgb_pca,
    "lightgbm_with_pca": objective_lgbm_pca,
}

pca_results = {}

for name in pca_model_names:
    base_name = name.replace("_with_pca", "")
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42),
        study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: pca_objective_functions[name](trial, preprocessing, X_train, y_train),
        n_trials=10,
        show_progress_bar=True
    )

    best_cv_f1_pca = study.best_value
    best_params = study.best_params
    print(f"\nüèÜ Best {name.upper()} CV F1: {best_cv_f1_pca:.4f}")
    print(f"   Best params: {best_params}")
    
    preprocessing_clone = clone(preprocessing)

    best_pca_components = best_params["pca__n_components"]

    if base_name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_pca_components),
            RidgeClassifier(alpha=best_params["ridge__alpha"])
        )
        
    elif base_name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_pca_components),
            HistGradientBoostingClassifier(
                learning_rate=best_params["hgb__learning_rate"],
                max_depth=best_params["hgb__max_depth"],
                random_state=42
            )
        )
        
    elif base_name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_pca_components),
            XGBClassifier(
                objective="binary:logistic",
                eval_metric="logloss",
                random_state=42,
                n_estimators=best_params["xgb__n_estimators"],
                learning_rate=best_params["xgb__learning_rate"],
                max_depth=best_params["xgb__max_depth"],
                tree_method="hist",
                n_jobs=-1,
            )
        )
        
    elif base_name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_pca_components),
            LGBMClassifier(
                objective="binary",
                random_state=42,
                n_estimators=best_params["lgbm__n_estimators"],
                learning_rate=best_params["lgbm__learning_rate"],
                num_leaves=best_params["lgbm__num_leaves"],
                n_jobs=-1,
                verbose=-1,
            )
        )

    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)
    
    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    print(f"   Test F1 Score: {test_f1:.4f}")
    print(f"   Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")

    pca_results[name] = {"pipeline": final_model, "test_f1": test_f1, "cv_f1": best_cv_f1_pca}

    with mlflow.start_run(run_name=f"{name}_optuna"):
        mlflow.log_param("model_family", base_name)
        mlflow.log_param("uses_pca", True)
        mlflow.log_params(best_params)
        
        # Log Metrics
        mlflow.log_metric("cv_f1_mean", best_cv_f1_pca)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.log_metric("test_accuracy", test_acc)
        mlflow.log_metric("test_tp", tp)
        mlflow.log_metric("test_tn", tn)
        mlflow.log_metric("test_fp", fp)
        mlflow.log_metric("test_fn", fn)

        # Log Model
        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="churn_model_pca",
            signature=signature,
            input_example=X_train.iloc[:5],
            registered_model_name=f"{base_name}_pipeline_with_pca_optuna",
        )

print("\n‚úì STEP 7: All 4 PCA models optimized and logged.")

# =============================================================================
# STEP 8: Choose GLOBAL Best Model
# =============================================================================

all_results = {}
all_results.update(results)
all_results.update(pca_results)

global_best_name = max(all_results, key=lambda k: all_results[k]["test_f1"])
global_best_f1 = all_results[global_best_name]["test_f1"]
global_best_cv_f1 = all_results[global_best_name]["cv_f1"]
global_best_pipeline = all_results[global_best_name]["pipeline"]

uses_pca = "with_pca" in global_best_name

print("\n" + "=" * 80)
print("üèÜ GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)")
print("=" * 80)
print(f"Global best model key : {global_best_name}")
print(f"Global best CV F1     : {global_best_cv_f1:.4f}")
print(f"Global best Test F1   : {global_best_f1:.4f}")
print(f"Uses PCA              : {uses_pca}")


# =============================================================================
# STEP 9: Save, Load, and Compare the GLOBAL Best Model
# =============================================================================

# 1. Make sure models/ directory exists
models_dir = "models"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"üìÅ Created directory: {models_dir}")

# 2. Define save function
def save_model(model, filename):
    joblib.dump(model, filename)
    print(f"‚úì Model saved to {filename}")

print("\n" + "-" * 80)
print("Saving GLOBAL best Optuna-tuned model...")
print("-" * 80)


save_path = os.path.join(models_dir, "global_best_model_optuna.pkl")
save_model(global_best_pipeline, filename=save_path)

print("\nDone:")
print(f"- GLOBAL best model key : {global_best_name}")
print(f"- GLOBAL best CV F1     : {global_best_cv_f1:.4f}")
print(f"- GLOBAL best Test F1   : {global_best_f1:.4f}")

end_time = time.monotonic()
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60
print(f"\n Elapsed Execution Time: {minutes} minutes and {seconds:.2f} seconds")

‚úÖ Libraries imported and MLflow configured.
‚úì STEP 1: Preprocessing pipeline created.
‚úÖ Data loaded successfully!
   Training set (X_train): (8000, 12)
   Test set (X_test):  (2000, 12)
‚úì STEP 4: MLflow configured. Experiment name: 'Churn_Prediction_Optuna_Tuning'
‚úì STEP 4: Optuna objective functions defined (Simpler Version).

Optimizing RIDGE (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


üèÜ Best RIDGE CV F1: 0.2439
   Best params: {'ridge__alpha': 0.14936568554617632}
   Test F1 Score: 0.2238
   Confusion Matrix: TP=64, TN=1492, FP=101, FN=343


Registered model 'ridge_optuna_tuned' already exists. Creating a new version of this model...
2025/12/18 15:53:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_optuna_tuned, version 3
Created version '3' of model 'ridge_optuna_tuned'.


üèÉ View run ridge_optuna_tuned at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/8601902409eb461596e8a691127d87dc
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

Optimizing HISTGRADIENTBOOSTING (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


üèÜ Best HISTGRADIENTBOOSTING CV F1: 0.5901
   Best params: {'hgb__learning_rate': 0.05308767414437037, 'hgb__max_depth': 8}
   Test F1 Score: 0.5918
   Confusion Matrix: TP=195, TN=1536, FP=57, FN=212


Registered model 'histgradientboosting_optuna_tuned' already exists. Creating a new version of this model...
2025/12/18 15:55:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_optuna_tuned, version 3
Created version '3' of model 'histgradientboosting_optuna_tuned'.


üèÉ View run histgradientboosting_optuna_tuned at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/1996443f585e4a8ab28ff903398e65c8
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

Optimizing XGBOOST (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


üèÜ Best XGBOOST CV F1: 0.5802
   Best params: {'xgb__learning_rate': 0.12713516576204176, 'xgb__max_depth': 6, 'xgb__n_estimators': 100}
   Test F1 Score: 0.5913
   Confusion Matrix: TP=191, TN=1545, FP=48, FN=216


Registered model 'xgboost_optuna_tuned' already exists. Creating a new version of this model...
2025/12/18 15:55:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_optuna_tuned, version 3
Created version '3' of model 'xgboost_optuna_tuned'.


üèÉ View run xgboost_optuna_tuned at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/17af64ee78a54c50bb83bc33da59ed2e
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

Optimizing LIGHTGBM (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]




üèÜ Best LIGHTGBM CV F1: 0.5791
   Best params: {'lgbm__learning_rate': 0.1397987726295555, 'lgbm__num_leaves': 29, 'lgbm__n_estimators': 100}




   Test F1 Score: 0.6061
   Confusion Matrix: TP=207, TN=1524, FP=69, FN=200


Registered model 'lightgbm_optuna_tuned' already exists. Creating a new version of this model...
2025/12/18 15:56:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_optuna_tuned, version 3
Created version '3' of model 'lightgbm_optuna_tuned'.


üèÉ View run lightgbm_optuna_tuned at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/22d69e11595b490599df516a61e527c4
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

‚úì STEP 5: All 4 models optimized and logged.
‚úì STEP 6: Optuna objective functions defined (PCA Version).

Optimizing RIDGE_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


üèÜ Best RIDGE_WITH_PCA CV F1: 0.2607
   Best params: {'ridge__alpha': 0.115279871282324, 'pca__n_components': 0.9872918866945795}
   Test F1 Score: 0.2222
   Confusion Matrix: TP=62, TN=1504, FP=89, FN=345


Registered model 'ridge_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 15:58:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_with_pca_optuna, version 3
Created version '3' of model 'ridge_pipeline_with_pca_optuna'.


üèÉ View run ridge_with_pca_optuna at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/fcee9a8a54f04adda9782646a3be78d6
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

Optimizing HISTGRADIENTBOOSTING_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]

Exception ignored in: <function ResourceTracker.__del__ at 0x106b89c60>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes



üèÜ Best HISTGRADIENTBOOSTING_WITH_PCA CV F1: 0.5633
   Best params: {'hgb__learning_rate': 0.12713516576204176, 'hgb__max_depth': 6, 'pca__n_components': 0.9041805371447998}
   Test F1 Score: 0.5947
   Confusion Matrix: TP=201, TN=1525, FP=68, FN=206


Registered model 'histgradientboosting_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 16:00:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_with_pca_optuna, version 3
Created version '3' of model 'histgradientboosting_pipeline_with_pca_optuna'.


üèÉ View run histgradientboosting_with_pca_optuna at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/1481b18a54fd4f7b9adb01f98e2861f1
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

Optimizing XGBOOST_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]

Exception ignored in: <function ResourceTracker.__del__ at 0x107549c60>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes



üèÜ Best XGBOOST_WITH_PCA CV F1: 0.5462
   Best params: {'xgb__learning_rate': 0.1388621853293064, 'xgb__max_depth': 3, 'xgb__n_estimators': 250, 'pca__n_components': 0.9153471711318563}
   Test F1 Score: 0.5576
   Confusion Matrix: TP=184, TN=1524, FP=69, FN=223


Registered model 'xgboost_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 16:04:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_with_pca_optuna, version 3
Created version '3' of model 'xgboost_pipeline_with_pca_optuna'.


üèÉ View run xgboost_with_pca_optuna at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/bccdd1156323403185f736c0724c5fd0
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

Optimizing LIGHTGBM_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]

Exception ignored in: <function ResourceTracker.__del__ at 0x1054f9c60>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x103129c60>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes



üèÜ Best LIGHTGBM_WITH_PCA CV F1: 0.5465
   Best params: {'lgbm__learning_rate': 0.17486639612006327, 'lgbm__num_leaves': 32, 'lgbm__n_estimators': 100, 'pca__n_components': 0.9165064058868091}




   Test F1 Score: 0.5526
   Confusion Matrix: TP=184, TN=1518, FP=75, FN=223


Registered model 'lightgbm_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 16:08:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_with_pca_optuna, version 3
Created version '3' of model 'lightgbm_pipeline_with_pca_optuna'.


üèÉ View run lightgbm_with_pca_optuna at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2/runs/d8a500efae554b3aaec97efd6786de3d
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/2

‚úì STEP 7: All 4 PCA models optimized and logged.

üèÜ GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)
Global best model key : lightgbm
Global best CV F1     : 0.5791
Global best Test F1   : 0.6061
Uses PCA              : False

--------------------------------------------------------------------------------
Saving GLOBAL best Optuna-tuned model...
--------------------------------------------------------------------------------
‚úì Model saved to models/global_best_model_optuna.pkl

Done:
- GLOBAL best model key : lightgbm
- GLOBAL best CV F1     : 0.5791
- GLOBAL best Test F1   : 0.6061

 Elapsed Execution Time: 15 minutes and 21.47 seconds
