# Model Experimentation with MLflow

In this notebook, we will:
1.  **Preprocessing**: Implement the full pipeline (KNN Imputation -> One-Hot Encoding -> Scaling).
2.  **Experiment**: Train Logistic Regression, Random Forest, and XGBoost.
3.  **Track**: Use MLflow to log parameters, metrics, and models.

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

%matplotlib inline

# MLflow configuration: a local file-based tracking store.
# "../mlruns" is resolved relative to the kernel's working directory, which puts the
# store in the PROJECT ROOT so `mlflow ui` run from a terminal there finds these runs.
MLFLOW_TRACKING_URI = "file:../mlruns"
MLFLOW_EXPERIMENT_NAME = "Churn_Prediction_Experiments"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='file:d:/MLOPS PROJECT CHURN PRED/experiment/../mlruns/124876933937949882', creation_time=1767696233341, experiment_id='124876933937949882', last_update_time=1767696233341, lifecycle_stage='active', name='Churn_Prediction_Experiments', tags={'mlflow.experimentKind': 'custom_model_development'}>

## 1. Load and Split Data

In [26]:
# Read the raw training table.
df = pd.read_csv('../customer_churn_dataset/train.csv')

# Binary target: 1 where the label is exactly 'Yes', 0 for every other value.
y = df['churn'].eq('Yes').astype('int64')
# Features: every column except the label.
X = df.drop(columns='churn')

# Split BEFORE any preprocessing so nothing fitted later can see the test rows.
# 80/20 split, stratified on the target, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

Train Shape: (12800, 10)
Test Shape: (3200, 10)


## 2. Preprocessing Pipeline Implementation
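The KNN imputer for `internet_service` is fit on the training split only; the fitted imputer is then applied unchanged to the test split, so preprocessing never sees test-set information.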

In [27]:
def impute_internet_service(X_data, knn_model=None, scaler=None, is_train=True):
    """
    Fill missing 'internet_service' values using a KNN classifier.

    The classifier predicts 'internet_service' from the standardized columns
    'monthly_charges', 'total_charges' and 'tenure'.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Frame containing 'internet_service' and the three feature columns above.
        It is not modified; a copy is imputed and returned.
    knn_model : object with ``predict``, optional
        Fitted classifier to reuse. Ignored (a new one is fitted) when
        ``is_train=True``; required when ``is_train=False`` and there are
        missing values to fill.
    scaler : object with ``transform``, optional
        Fitted scaler to reuse. Ignored (a new one is fitted) when
        ``is_train=True``; required when ``is_train=False``.
    is_train : bool, default True
        If True, fit a new StandardScaler and a 5-neighbour KNeighborsClassifier
        on the rows whose 'internet_service' is present. If False, use the
        passed ``scaler`` and ``knn_model`` as-is.

    Returns
    -------
    tuple
        ``(X, knn_model, scaler)``: the imputed copy of ``X_data`` plus the
        scaler and classifier that were used, for reuse on other splits.

    Raises
    ------
    ValueError
        If ``is_train=False`` and ``scaler`` is None, or if values need to be
        imputed but no ``knn_model`` is available.
    """
    impute_features = ['monthly_charges', 'total_charges', 'tenure']

    # Fail early with a clear message instead of an AttributeError on None.
    if not is_train and scaler is None:
        raise ValueError(
            "impute_internet_service(is_train=False) requires the fitted scaler "
            "returned by a previous is_train=True call."
        )

    X = X_data.copy()

    # Standard-scale the features used as KNN inputs.
    if is_train:
        scaler = StandardScaler()
        scaler.fit(X[impute_features])

    X_scaled = scaler.transform(X[impute_features])

    # Label-indexed mask for the DataFrame, positional copy for the scaled array.
    mask_missing = X['internet_service'].isnull()
    missing_rows = mask_missing.to_numpy()

    if is_train:
        # Fit the classifier only on rows where the target category is known.
        knn_model = KNeighborsClassifier(n_neighbors=5)
        knn_model.fit(
            X_scaled[~missing_rows],
            X.loc[~mask_missing, 'internet_service'],
        )

    # Predict only if there is something to fill.
    if missing_rows.any():
        if knn_model is None:
            raise ValueError(
                "impute_internet_service(is_train=False) requires the fitted "
                "knn_model returned by a previous is_train=True call."
            )
        X.loc[mask_missing, 'internet_service'] = knn_model.predict(X_scaled[missing_rows])

    return X, knn_model, scaler

In [28]:
# Fit the imputer (feature scaler + KNN classifier) on the training split only.
print("Imputing Training Data...")
X_train_imp, knn_imputer, knn_scaler = impute_internet_service(X_data=X_train, is_train=True)

# Reuse the objects fitted on train for the test split; nothing is re-fitted here.
print("Imputing Test Data...")
X_test_imp, _, _ = impute_internet_service(
    X_test,
    knn_model=knn_imputer,
    scaler=knn_scaler,
    is_train=False,
)

# Total number of missing cells left across all columns of each split.
print("Missing in Train:", X_train_imp.isna().to_numpy().sum())
print("Missing in Test:", X_test_imp.isna().to_numpy().sum())

Imputing Training Data...
Imputing Test Data...
Missing in Train: 0
Missing in Test: 0


In [29]:
# Column groups for the final feature matrix.
numerical_cols = ['tenure', 'monthly_charges', 'total_charges']
# Every object-typed column is treated as categorical, except the customer_id
# identifier (if present), which is usually not useful for prediction.
categorical_cols = [
    col
    for col in X.select_dtypes(include=['object']).columns
    if col != 'customer_id'
]

# One-hot encode categoricals: fit on train, reuse on test.
# Categories not seen during fitting are ignored at transform time.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train_imp[categorical_cols])
X_train_encoded = ohe.transform(X_train_imp[categorical_cols])
X_test_encoded = ohe.transform(X_test_imp[categorical_cols])

# Names of the generated one-hot columns, in output order.
encoded_features = ohe.get_feature_names_out(categorical_cols)

# Standardize numeric columns: statistics come from the training split only.
scaler_final = StandardScaler()
scaler_final.fit(X_train_imp[numerical_cols])
X_train_scaled = scaler_final.transform(X_train_imp[numerical_cols])
X_test_scaled = scaler_final.transform(X_test_imp[numerical_cols])

# Final matrices: scaled numeric columns first, then the one-hot columns.
X_train_final = np.concatenate([X_train_scaled, X_train_encoded], axis=1)
X_test_final = np.concatenate([X_test_scaled, X_test_encoded], axis=1)

print("Final Train Shape:", X_train_final.shape)

Final Train Shape: (12800, 16)


## 3. Run Experiments with MLflow

In [30]:
def run_experiment(model, name, params):
    """
    Train `model`, evaluate it on the hold-out split and record the run in MLflow.

    The function reads the notebook-level variables `X_train_final`, `y_train`,
    `X_test_final` and `y_test`; the preprocessing cells must have been run first.

    Parameters
    ----------
    model : estimator
        Unfitted estimator exposing `fit` and `predict` (and optionally
        `predict_proba`). It is fitted in place.
    name : str
        Used both as the MLflow run name and as the logged model's name.
    params : dict
        Hyperparameters to record with the run (logged as given; they are not
        applied to `model` here).

    Logged metrics
    --------------
    accuracy, precision, recall, f1_score, and auc_roc. `auc_roc` is logged only
    when the model provides `predict_proba`; it is omitted rather than recorded
    as 0, which would look like a real (worse-than-random) score when runs are
    compared.
    """
    with mlflow.start_run(run_name=name):
        # Log Params
        mlflow.log_params(params)

        # Train
        model.fit(X_train_final, y_train)

        # Predict
        y_pred = model.predict(X_test_final)

        # Metrics. zero_division=0 yields the same value as the default (0) when
        # a class is never predicted, without the undefined-metric warning.
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1_score": f1_score(y_test, y_pred, zero_division=0),
        }

        # AUC needs positive-class probabilities; skip it when unavailable.
        if hasattr(model, 'predict_proba'):
            y_prob = model.predict_proba(X_test_final)[:, 1]
            metrics["auc_roc"] = roc_auc_score(y_test, y_prob)

        # Log Metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Log Model
        # FIX: explicitly using 'name' argument instead of 'artifact_path' for MLflow 3.x+
        mlflow.sklearn.log_model(model, name=name)

        print(f"Finished {name}: Acc={metrics['accuracy']:.4f}, F1={metrics['f1_score']:.4f}")

In [31]:
# Hyperparameters per model; the same dict configures the estimator and is logged to MLflow.
lr_params = dict(max_iter=1000, solver='lbfgs')
rf_params = dict(n_estimators=100, max_depth=10, random_state=42)
xgb_params = dict(n_estimators=100, learning_rate=0.1, eval_metric='logloss')

# One MLflow run per model, in order:
# 1. Logistic Regression, 2. Random Forest, 3. XGBoost
experiment_plan = [
    ("Logistic_Regression", LogisticRegression, lr_params),
    ("Random_Forest", RandomForestClassifier, rf_params),
    ("XGBoost", XGBClassifier, xgb_params),
]

for run_label, estimator_cls, hyperparams in experiment_plan:
    run_experiment(estimator_cls(**hyperparams), run_label, hyperparams)

Finished Logistic_Regression: Acc=0.7341, F1=0.5347
Finished Random_Forest: Acc=0.7784, F1=0.5876
Finished XGBoost: Acc=0.7788, F1=0.5889
