In [1]:
!pip install xgboost lightgbm "mlflow<3"



In [2]:
# =============================================================================
# FULL PIPELINE:
# - Build preprocessing
# - Load pre-split stratified train/test sets
# - Train & log 4 models WITHOUT PCA (Ridge, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test F1
# - Save, load, and compare the global best model
# =============================================================================

import os
import sys
import time

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd

from dotenv import load_dotenv

from mlflow.models import infer_signature

from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Put the parent directory on the import path (once) so that modules living
# there can be imported — presumably this is where churning_pipeline resides.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import shared components
from churning_pipeline import (
    build_preprocessing,
    make_estimator_for_name,
)

# Reference point for the elapsed-time report printed at the end of the script.
start_time = time.monotonic()

# Select the MLflow experiment by name.
# NOTE(review): this call runs before STEP 4 loads .env and calls
# mlflow.set_tracking_uri(), and STEP 4 selects a different experiment name
# when a tracking URI is configured — confirm this early call is intended.
mlflow.set_experiment("Churn_Prediction_Baseline")
print("‚úÖ MLflow experiment set to 'Churn_Prediction_Baseline'")


# =============================================================================
# STEP 1: Build Full ML Preprocessing Pipeline
# =============================================================================

# Build the preprocessing object with the helper imported from
# churning_pipeline; it supplies the first step of every model pipeline
# assembled below.
preprocessing = build_preprocessing()
print("‚úì STEP 1: Preprocessing pipeline created.")


# =============================================================================
# STEP 2: Load Stratified Train and Test Sets
# =============================================================================

# Locate the data directory: use ../data when it exists, otherwise ./data.
data_dir = "../data" if os.path.exists("../data") else "data"
train_path = os.path.join(data_dir, "strat_train_set.csv")
test_path = os.path.join(data_dir, "strat_test_set.csv")

# Read the pre-split train and test sets.
churning_train = pd.read_csv(train_path)
churning_test = pd.read_csv(test_path)

# Normalise column names to lower case BEFORE selecting the target, so the
# "exited" lookups below are case-insensitive as well (a CSV header such as
# "Exited" previously raised KeyError because lower-casing happened too late).
churning_train.columns = churning_train.columns.str.lower()
churning_test.columns = churning_test.columns.str.lower()

# Separate features and target variable.
# Target variable: "exited"
X_train = churning_train.drop("exited", axis=1)
y_train = churning_train["exited"].copy()

X_test = churning_test.drop("exited", axis=1)
y_test = churning_test["exited"].copy()

print(f"‚úÖ Data loaded successfully!")
print(f"   Training set (X_train): {X_train.shape}")
print(f"   Test set (X_test):  {X_test.shape}")


# =============================================================================
# STEP 3: Define 4 Model Pipelines (WITHOUT PCA)
# =============================================================================

# One pipeline per model family, keyed by the family name.
# Each pipeline gets its own clone of the preprocessing object: make_pipeline()
# stores the instance it is handed, so passing `preprocessing` directly would
# make every pipeline share (and refit) one and the same transformer object.
models = {}
for name in ["ridge", "histgradientboosting", "xgboost", "lightgbm"]:
    est = make_estimator_for_name(name)
    models[name] = make_pipeline(clone(preprocessing), est)

print("‚úì STEP 3: 4 baseline model pipelines defined.")


# =============================================================================
# STEP 4: Configure MLflow (e.g., Dagshub) via .env
# =============================================================================

# Read MLflow settings from a .env file; override=True lets values from the
# file replace variables that already exist in the process environment.
load_dotenv(override=True)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

if MLFLOW_TRACKING_URI:
    # Remote tracking: export whichever credentials are present.
    # NOTE(review): both values were just read from os.environ via os.getenv,
    # so writing them back looks redundant — confirm before removing.
    if MLFLOW_TRACKING_USERNAME:
        os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
    if MLFLOW_TRACKING_PASSWORD:
        os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

    # Point MLflow at the configured server, then select the experiment that
    # all subsequent runs are logged under.
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

    experiment_name = "Churn_Prediction_Multi_Models"
    mlflow.set_experiment(experiment_name)

    print(f"‚úì STEP 4: MLflow configured. Experiment name: '{experiment_name}'")
else:
    # No tracking URI: only warn; tracking URI and experiment stay as they were.
    print("‚ö†Ô∏è WARNING: MLFLOW_TRACKING_URI is not set. Using local MLflow setup.")


# =============================================================================
# STEP 5: Train, Evaluate, and Log 4 Baseline Models (NO PCA)
# =============================================================================

# Per-model record: fitted pipeline, test F1 and mean CV F1 (read by STEP 8).
results = {}

print(f"\n{'=' * 80}")
print(f"üöÄ STEP 5: Training & Logging 4 Baseline Models (Without PCA)")
print(f"{'=' * 80}")

for name, pipeline in models.items():
    print(f"\nüîπ Training model: {name}")

    # (1) 3-fold cross-validated F1 on the training data.
    cv_scores = cross_val_score(pipeline, X_train, y_train, scoring="f1", cv=3, n_jobs=-1)
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
    print(f"   CV F1 Score: {cv_mean:.4f} (+/- {cv_std:.4f})")

    # (2) Refit on the complete training set, then predict the test set.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # (3) Test-set metrics; the raveled confusion matrix is unpacked as
    #     tn, fp, fn, tp.
    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    print(f"   Test F1 Score: {test_f1:.4f}")
    print(f"   Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")

    results[name] = dict(pipeline=pipeline, test_f1=test_f1, cv_f1_mean=cv_mean)

    # (4) One MLflow run per model: parameters, metrics and the fitted pipeline.
    with mlflow.start_run(run_name=f"{name}_baseline"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)

        # The last pipeline step is the estimator; its hyper-parameters are
        # logged under "<step name>__<param>" keys.
        est_step_name, est = pipeline.steps[-1]
        est_params = {
            f"{est_step_name}__{param}": value
            for param, value in est.get_params().items()
        }
        mlflow.log_params(est_params)

        # Core metrics first, then the confusion-matrix counts.
        for metric_name, metric_value in (
            ("cv_f1_mean", cv_mean),
            ("cv_f1_std", cv_std),
            ("test_f1", test_f1),
            ("test_accuracy", test_acc),
            ("test_tp", tp),
            ("test_tn", tn),
            ("test_fp", fp),
            ("test_fn", fn),
        ):
            mlflow.log_metric(metric_name, metric_value)

        # Log the fitted pipeline with a signature inferred from the training
        # features and the pipeline's predictions on them; registered_model_name
        # also registers it in the model registry.
        signature = infer_signature(X_train, pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="churn_model",
            signature=signature,
            input_example=X_train.iloc[:5],
            registered_model_name=f"{name}_pipeline_no_pca",
        )

print("\n‚úì STEP 5: All 4 baseline models trained and logged successfully.")

# =============================================================================
# STEP 7: Train, Evaluate, and Log PCA Versions of ALL 4 Models
# =============================================================================

# Per-model record for the PCA variants, keyed "<name>_with_pca" (read by STEP 8).
pca_results = {}

print("\n" + "=" * 80)
print(f"üöÄ STEP 7: Training PCA-Augmented Models (Target: F1 Score)")
print("=" * 80)

for name in models.keys():
    print(f"\nüîπ Training PCA model: {name}")

    # 1. Get untrained estimator
    est = make_estimator_for_name(name)

    # 2. Build pipeline with PCA.
    #    The preprocessing object is cloned so this pipeline owns its own
    #    transformer instead of sharing (and refitting) the instance that other
    #    pipelines may also hold. PCA(n_components=0.95) keeps as many
    #    components as needed to explain 95% of the variance.
    pca_pipeline = make_pipeline(
        clone(preprocessing),
        PCA(n_components=0.95),
        est,
    )

    # 3. Cross Validation (CV): 3-fold F1 on the training data
    cv_scores_pca = cross_val_score(
        pca_pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    cv_mean_pca = cv_scores_pca.mean()
    cv_std_pca = cv_scores_pca.std()

    print(f"   CV F1 Score: {cv_mean_pca:.4f} (+/- {cv_std_pca:.4f})")

    # 4. Fit on Full Training Set
    pca_pipeline.fit(X_train, y_train)

    # 5. Evaluate on Test Set
    y_pred_pca = pca_pipeline.predict(X_test)

    test_f1_pca = f1_score(y_test, y_pred_pca)
    # Accuracy is computed (and logged below) so the PCA runs carry the same
    # metric set as the baseline runs of STEP 5.
    test_acc_pca = accuracy_score(y_test, y_pred_pca)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_pca).ravel()

    model_key = f"{name}_with_pca"
    pca_results[model_key] = {
        "pipeline": pca_pipeline,
        "test_f1": test_f1_pca,
        "cv_f1_mean": cv_mean_pca,
    }

    print(f"   Test F1 Score: {test_f1_pca:.4f}")
    print(f"   Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")

    # 6. Log to MLflow (one run per PCA model)
    with mlflow.start_run(run_name=model_key):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", True)

        # Log the final estimator's hyper-parameters as "<step name>__<param>"
        est_step_name = list(pca_pipeline.named_steps.keys())[-1]
        est_step = pca_pipeline.named_steps[est_step_name]
        est_params = {f"{est_step_name}__{k}": v for k, v in est_step.get_params().items()}
        mlflow.log_params(est_params)

        # Log PCA parameters
        pca_step = pca_pipeline.named_steps["pca"]
        mlflow.log_param("pca__n_components", pca_step.n_components)

        # Log core metrics (same keys as the baseline runs)
        mlflow.log_metric("cv_f1_mean", cv_mean_pca)
        mlflow.log_metric("cv_f1_std", cv_std_pca)
        mlflow.log_metric("test_f1", test_f1_pca)
        mlflow.log_metric("test_accuracy", test_acc_pca)

        # Log confusion matrix components
        mlflow.log_metric("test_tp", tp)
        mlflow.log_metric("test_tn", tn)
        mlflow.log_metric("test_fp", fp)
        mlflow.log_metric("test_fn", fn)

        # Log the fitted pipeline; the signature is inferred from the training
        # features and the pipeline's predictions on them.
        signature_pca = infer_signature(X_train, pca_pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=pca_pipeline,
            artifact_path="churn_model_pca",
            signature=signature_pca,
            input_example=X_train.iloc[:5],
            registered_model_name=f"{name}_pipeline_with_pca",
        )

print("\n‚úì STEP 7: All 4 PCA models trained and logged.")


# =============================================================================
# STEP 8: Choose GLOBAL Best Model (with or without PCA)
# =============================================================================


# Merge the two result tables: baseline entries first, PCA entries second.
all_results = {**results, **pca_results}

# Winner = the candidate with the highest Test F1.
# NOTE(review): the test set is used here to pick the model, so the Test F1
# reported for the winner is optimistically biased; selecting on "cv_f1_mean"
# would keep the test score an unbiased estimate — confirm intent.
global_best_name = max(
    all_results,
    key=lambda model_key: all_results[model_key]["test_f1"],
)

# Pull the winner's record apart for reporting and saving.
best_entry = all_results[global_best_name]
global_best_f1 = best_entry["test_f1"]
global_best_cv_f1 = best_entry["cv_f1_mean"]
global_best_pipeline = best_entry["pipeline"]

# PCA candidates are the ones whose key contains "with_pca".
uses_pca = "with_pca" in global_best_name

banner = "=" * 80
print("\n" + banner)
print("üèÜ GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)")
print(banner)
for report_line in (
    f"Global best model key : {global_best_name}",
    f"Global best CV F1     : {global_best_cv_f1:.4f}",
    f"Global best Test F1   : {global_best_f1:.4f}",
    f"Uses PCA              : {uses_pca}",
):
    print(report_line)


# =============================================================================
# STEP 9: Save, Load, and Compare the GLOBAL Best Model
# =============================================================================


# 1. Make sure the models/ directory exists.
#    The path is relative, so it is resolved against the current working
#    directory; the message is printed only when the directory is created here.
models_dir = "models"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"üìÅ Created directory: {models_dir}")

# 2. Define save function
def save_model(model, filename):
    """Persist ``model`` to ``filename`` with joblib and print a confirmation.

    Args:
        model: Object to serialize (the script passes the selected pipeline).
        filename: Destination path handed to ``joblib.dump``.
    """
    joblib.dump(value=model, filename=filename)
    confirmation = f"‚úì Model saved to {filename}"
    print(confirmation)

rule = "-" * 80
print("\n" + rule)
print("Saving GLOBAL best model...")
print(rule)

# Persist the selected pipeline under models_dir.
save_path = os.path.join(models_dir, "global_best_model.pkl")
save_model(global_best_pipeline, save_path)

# Final summary of the selected model (F1 metrics).
print("\nDone:")
for summary_line in (
    f"- GLOBAL best model key : {global_best_name}",
    f"- GLOBAL best CV F1     : {global_best_cv_f1:.4f}",
    f"- GLOBAL best Test F1   : {global_best_f1:.4f}",
):
    print(summary_line)

# Wall-clock duration since start_time, split into whole minutes and the
# remaining (fractional) seconds.
end_time = time.monotonic()
elapsed_time = end_time - start_time
whole_minutes, seconds = divmod(elapsed_time, 60)
minutes = int(whole_minutes)
print(f"\n‚è±Ô∏è Elapsed Time: {minutes} minutes and {seconds:.2f} seconds")

‚úÖ MLflow experiment set to 'Churn_Prediction_Baseline'
‚úì STEP 1: Preprocessing pipeline created.
‚úÖ Data loaded successfully!
   Training set (X_train): (8000, 12)
   Test set (X_test):  (2000, 12)
‚úì STEP 3: 4 baseline model pipelines defined.
‚úì STEP 4: MLflow configured. Experiment name: 'Churn_Prediction_Multi_Models'

üöÄ STEP 5: Training & Logging 4 Baseline Models (Without PCA)

üîπ Training model: ridge
   CV F1 Score: 0.2407 (+/- 0.0111)
   Test F1 Score: 0.2152
   Confusion Matrix: TP=58, TN=1519, FP=74, FN=349


Registered model 'ridge_pipeline_no_pca' already exists. Creating a new version of this model...
2025/12/18 19:04:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_no_pca, version 4
Created version '4' of model 'ridge_pipeline_no_pca'.


üèÉ View run ridge_baseline at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/8e0206cb309a4a5ab9ee418d69bb2c9c
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

üîπ Training model: histgradientboosting
   CV F1 Score: 0.5913 (+/- 0.0023)
   Test F1 Score: 0.5870
   Confusion Matrix: TP=199, TN=1521, FP=72, FN=208


Registered model 'histgradientboosting_pipeline_no_pca' already exists. Creating a new version of this model...
2025/12/18 19:04:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_no_pca, version 3
Created version '3' of model 'histgradientboosting_pipeline_no_pca'.


üèÉ View run histgradientboosting_baseline at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/80db14cfeaa4426b8468dbdb0886cb39
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

üîπ Training model: xgboost
   CV F1 Score: 0.5804 (+/- 0.0129)
   Test F1 Score: 0.5805
   Confusion Matrix: TP=191, TN=1533, FP=60, FN=216


Registered model 'xgboost_pipeline_no_pca' already exists. Creating a new version of this model...
2025/12/18 19:05:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_no_pca, version 3
Created version '3' of model 'xgboost_pipeline_no_pca'.


üèÉ View run xgboost_baseline at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/87b53d4d3d2f43169ed7f6282784957e
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

üîπ Training model: lightgbm




   CV F1 Score: 0.5906 (+/- 0.0111)




   Test F1 Score: 0.5855
   Confusion Matrix: TP=190, TN=1541, FP=52, FN=217


Registered model 'lightgbm_pipeline_no_pca' already exists. Creating a new version of this model...
2025/12/18 19:05:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_no_pca, version 3
Created version '3' of model 'lightgbm_pipeline_no_pca'.


üèÉ View run lightgbm_baseline at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/9ec47f0fdd0440eda8a0ca55f576f698
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

‚úì STEP 5: All 4 baseline models trained and logged successfully.

üöÄ STEP 7: Training PCA-Augmented Models (Target: F1 Score)

üîπ Training PCA model: ridge
   CV F1 Score: 0.2211 (+/- 0.0067)
   Test F1 Score: 0.2058
   Confusion Matrix: TP=50, TN=1564, FP=29, FN=357


Registered model 'ridge_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:06:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_with_pca, version 3
Created version '3' of model 'ridge_pipeline_with_pca'.


üèÉ View run ridge_with_pca at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/8c2ed25b7dd94b36bd959e8d522a997d
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

üîπ Training PCA model: histgradientboosting
   CV F1 Score: 0.5096 (+/- 0.0139)
   Test F1 Score: 0.5526
   Confusion Matrix: TP=176, TN=1539, FP=54, FN=231


Registered model 'histgradientboosting_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:07:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_with_pca, version 3
Created version '3' of model 'histgradientboosting_pipeline_with_pca'.


üèÉ View run histgradientboosting_with_pca at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/49bfe128eaaa4d638f7a3918d15ef113
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

üîπ Training PCA model: xgboost
   CV F1 Score: 0.5064 (+/- 0.0141)
   Test F1 Score: 0.5483
   Confusion Matrix: TP=176, TN=1534, FP=59, FN=231


Registered model 'xgboost_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:07:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_with_pca, version 3
Created version '3' of model 'xgboost_pipeline_with_pca'.


üèÉ View run xgboost_with_pca at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/46bd4fb19ea44e168731f3bd19786eb4
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

üîπ Training PCA model: lightgbm




   CV F1 Score: 0.5286 (+/- 0.0210)




   Test F1 Score: 0.5597
   Confusion Matrix: TP=178, TN=1542, FP=51, FN=229


Registered model 'lightgbm_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:08:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_with_pca, version 3
Created version '3' of model 'lightgbm_pipeline_with_pca'.


üèÉ View run lightgbm_with_pca at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0/runs/60fed03c82d7479789e56d65ed2a0a61
üß™ View experiment at: https://dagshub.com/williamzhang430/Churning_Model.mlflow/#/experiments/0

‚úì STEP 7: All 4 PCA models trained and logged.

üèÜ GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)
Global best model key : histgradientboosting
Global best CV F1     : 0.5913
Global best Test F1   : 0.5870
Uses PCA              : False

--------------------------------------------------------------------------------
Saving GLOBAL best model...
--------------------------------------------------------------------------------
‚úì Model saved to models/global_best_model.pkl

Done:
- GLOBAL best model key : histgradientboosting
- GLOBAL best CV F1     : 0.5913
- GLOBAL best Test F1   : 0.5870

‚è±Ô∏è Elapsed Time: 4 minutes and 16.70 seconds
