# E-Commerce Customer Churn Prediction
## Model Training & Evaluation

**Author:** Muhammad Abdullah  
**Project:** ML Fundamentals - Customer Churn Prediction

---

### Objectives:
1. Engineer features and preprocess the data
2. Train multiple ML models with MLflow tracking
3. Perform hyperparameter tuning
4. Evaluate and compare models
5. Generate SHAP explanations
6. Save best model for deployment

In [None]:
# Import libraries
import sys
from pathlib import Path

# Add project root to path
# The root is taken to be the parent of the current working directory, and it
# is placed first on sys.path. This runs before the project imports further
# down — presumably so that `config` and `src` resolve; keep this ordering.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

# No category or module filter is given, so this suppresses all warnings for
# the rest of the session.
warnings.filterwarnings('ignore')

# Import project modules
from config import get_config, MODELS_DIR, PROCESSED_DATA_DIR, FIGURES_DIR
from src.data import DataLoader, DataPreprocessor
from src.features import FeatureEngineer
from src.models import ModelTrainer, ModelEvaluator, ModelExplainer

print('All modules imported successfully!')

In [None]:
# Read the project configuration and echo the data-split settings
config = get_config()
print('Configuration loaded:')
data_cfg = config['data']
print(f"- Target column: {data_cfg['target_column']}")
print(f"- Test size: {data_cfg['test_size']}")
print(f"- Random state: {data_cfg['random_state']}")

## 1. Load and Prepare Data

In [None]:
# Data loader built from the shared configuration
loader = DataLoader(config)

# Prefer the parquet file written by the EDA step; if it is missing,
# fall back to the raw data.
try:
    df = loader.load_processed_data('eda_data.parquet')
except FileNotFoundError:
    print('Loading raw data...')
    df = loader.load_raw_data()
else:
    print('Loaded processed data from EDA')

print(f'Data shape: {df.shape}')
df.head()

In [None]:
# Run the loader's validation and print the summary fields it returns
validation = loader.validate_data(df)
validation_lines = [
    'Data Validation Results:',
    f"- Total rows: {validation['total_rows']}",
    f"- Total columns: {validation['total_columns']}",
    f"- Duplicates: {validation['duplicates']}",
    # The target distribution is optional in the result, hence the default
    f"- Target distribution: {validation.get('target_distribution', 'N/A')}",
]
print('\n'.join(validation_lines))

## 2. Feature Engineering

In [None]:
# Build the engineered feature set from the loaded frame
fe = FeatureEngineer(config)
df_features = fe.create_all_features(df)

# Column counts before and after engineering
n_cols_before = len(df.columns)
print(f'Original features: {n_cols_before}')
n_cols_after = len(df_features.columns)
print(f'After feature engineering: {n_cols_after}')

# Names of the features the engineer reports having created
created_features = fe.get_created_features()
print(f'\nNew features created: {created_features}')

## 3. Data Preprocessing

In [None]:
# Preprocessor that drives the cleaning steps below and the later transforms
preprocessor = DataPreprocessor(config)

# Cleaning pipeline: clean -> impute -> treat outliers. Each stage keeps its
# own intermediate frame.
# NOTE(review): these steps run on the full frame before the train/val/test
# split. If imputation values or IQR bounds are estimated from the data, that
# would leak held-out statistics into training — confirm in DataPreprocessor.
df_clean = preprocessor.clean_data(df_features)
df_imputed = preprocessor.handle_missing_values(df_clean)

# Outlier handling with method='iqr' and action='clip'
outlier_options = {'method': 'iqr', 'action': 'clip'}
df_processed = preprocessor.handle_outliers(df_imputed, **outlier_options)

print(f'Processed data shape: {df_processed.shape}')

In [None]:
# Train/validation/test split on the configured target column (stratify=True)
target_col = config['data']['target_column']

split_parts = loader.get_train_test_split(df_processed, target_col=target_col, stratify=True)
X_train, X_val, X_test, y_train, y_val, y_test = split_parts

print(f'Training set: {X_train.shape}')
print(f'Validation set: {X_val.shape}')
print(f'Test set: {X_test.shape}')

# Mean of the target in each split, printed as a percentage
train_rate, val_rate, test_rate = y_train.mean(), y_val.mean(), y_test.mean()
print(f'\nChurn rate - Train: {train_rate:.2%}, Val: {val_rate:.2%}, Test: {test_rate:.2%}')

In [None]:
# fit_transform on the training split only; the other two splits are passed
# through transform with the same preprocessor.
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed, X_test_transformed = (
    preprocessor.transform(held_out) for held_out in (X_val, X_test)
)

feature_names = preprocessor.get_feature_names()
n_transformed = len(feature_names)
print(f'Transformed features: {n_transformed}')

# Show only the first ten names to keep the output short
name_preview = feature_names[:10]
print(f'Feature names: {name_preview}...')

In [None]:
# Save preprocessor and feature names
# Create the target directory first: joblib.dump does not create missing
# parent directories, so on a fresh checkout without a models folder the
# dump below would raise FileNotFoundError.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# preprocessor.preprocessor is the inner object held by DataPreprocessor;
# it is dumped after the fit_transform call on the training split.
joblib.dump(preprocessor.preprocessor, MODELS_DIR / 'preprocessor.joblib')
joblib.dump(feature_names, MODELS_DIR / 'feature_names.joblib')
print('Preprocessor and feature names saved!')

## 4. Model Training with MLflow

In [None]:
# Trainer built from the shared configuration
trainer = ModelTrainer(config)

print('Training all models with MLflow tracking...')
divider = '=' * 50
print(divider)

# Fit on the training split, passing the validation split along
models = trainer.train_all_models(X_train_transformed, y_train, X_val=X_val_transformed, y_val=y_val)

trained_names = list(models.keys())
print(f'\nTrained {len(models)} models: {trained_names}')

In [None]:
# Combine three named models into an ensemble with method='voting'
ensemble_members = ['random_forest', 'xgboost', 'lightgbm']
ensemble = trainer.create_ensemble(X_train_transformed, y_train, models=ensemble_members, method='voting')
print('Ensemble model created!')

## 5. Hyperparameter Tuning

In [None]:
# Hyperparameter search for the XGBoost model on the training split
print('Tuning XGBoost hyperparameters...')

n_tuning_trials = 30  # Lower this for a faster run
tuning_result = trainer.hyperparameter_tuning(
    X_train_transformed, y_train, model_name='xgboost', n_trials=n_tuning_trials
)
best_xgb, best_params = tuning_result

print(f'\nBest XGBoost parameters: {best_params}')

## 6. Model Evaluation

In [None]:
# Evaluator built from the shared configuration
evaluator = ModelEvaluator(config)

print('Model Performance on Test Set:')
print('=' * 60)

# Score every model the trainer holds against the test split
trained_models = trainer.get_all_trained_models()
comparison_df = evaluator.evaluate_all_models(trained_models, X_test_transformed, y_test)

# Last expression of the cell, so the notebook renders the table
comparison_df

In [None]:
# Plot model comparison
# comparison_df is the results table returned by
# evaluator.evaluate_all_models on the test split.
evaluator.plot_model_comparison(comparison_df)

In [None]:
# ROC curves for the trainer's models, computed on the test split
roc_candidates = trainer.get_all_trained_models()
evaluator.plot_roc_curves(roc_candidates, X_test_transformed, y_test)

In [None]:
# Precision-recall curves for the trainer's models, computed on the test split
pr_candidates = trainer.get_all_trained_models()
evaluator.plot_precision_recall_curves(pr_candidates, X_test_transformed, y_test)

In [None]:
# Pick the winning model by F1, using the validation split rather than the test split
selection = trainer.get_best_model(X_val_transformed, y_val, metric='f1')
best_name, best_model, best_score = selection

print(f'Best Model: {best_name}')
print(f'Best F1 Score: {best_score:.4f}')

In [None]:
# Confusion matrix of the selected model on the test split
evaluator.plot_confusion_matrix(best_model, X_test_transformed, y_test, model_name=best_name)

In [None]:
# Text classification report for the selected model on the test split
print('Classification Report:')
header_rule = '=' * 50
print(header_rule)

report = evaluator.get_classification_report(best_model, X_test_transformed, y_test)
print(report)

In [None]:
# Search for the decision threshold that maximises F1 on the validation split
threshold_search = evaluator.find_optimal_threshold(best_model, X_val_transformed, y_val, metric='f1')
optimal_threshold, best_f1 = threshold_search
print(f'Optimal threshold: {optimal_threshold:.2f} (F1: {best_f1:.4f})')

## 7. Model Explainability (SHAP)

In [None]:
# Explainer built from the shared configuration
explainer = ModelExplainer(config)

# The first 100 entries of the transformed training data serve as background
shap_background = X_train_transformed[:100]

# NOTE(review): model_type is fixed to 'tree' although best_model is chosen
# at runtime — confirm the selected model is always tree-based.
explainer.setup_shap_explainer(best_model, shap_background, model_type='tree')

print('SHAP explainer initialized!')

In [None]:
# SHAP values for the first 500 test entries only, to keep the run fast
shap_sample = X_test_transformed[:500]
shap_values = explainer.calculate_shap_values(shap_sample, feature_names=feature_names)

print(f'SHAP values calculated: {shap_values.shape}')

In [None]:
# SHAP summary plot over the first 500 test entries, limited to 15 features
summary_sample = X_test_transformed[:500]
explainer.plot_shap_summary(summary_sample, max_display=15)

In [None]:
# SHAP bar plot (feature importance) over the first 500 test entries, limited to 15 features
bar_sample = X_test_transformed[:500]
explainer.plot_shap_bar(bar_sample, max_display=15)

In [None]:
# SHAP-based feature importance table from the explainer
importance_df = explainer.get_feature_importance_shap()
print('Top 15 Most Important Features (SHAP):')

# Last expression of the cell, so the notebook renders the first 15 rows
top_features = importance_df.head(15)
top_features

In [None]:
# Explain one prediction: a one-element slice keeps the 2-D shape of the input
sample_idx = 0
sample_end = sample_idx + 1
sample = X_test_transformed[sample_idx:sample_end]

# Five strongest risk factors reported by the explainer for that sample
risk_factors = explainer.get_top_risk_factors(sample, top_n=5)
print('Top Risk Factors for Sample Customer:')
risk_factors

## 8. Save Best Model

In [None]:
# Persist the selected model under the generic 'best_model' name
model_path = trainer.save_model(best_model, 'best_model')
print(f'Best model saved to: {model_path}')

# Keep a second copy filed under the model's own name
trainer.save_model(best_model, best_name)

# Write out every model the trainer holds, keyed by its name
all_trained = trainer.get_all_trained_models()
for trained_name in all_trained:
    trainer.save_model(all_trained[trained_name], trained_name)

print('All models saved!')

In [None]:
# Collect the facts a consumer of the saved model needs and persist them
best_metrics = comparison_df.loc[best_name].to_dict()
n_features = len(feature_names)
n_training_samples = len(X_train)

metadata = {
    'best_model': best_name,
    'metrics': best_metrics,
    'optimal_threshold': optimal_threshold,
    'feature_names': feature_names,
    'n_features': n_features,
    'training_samples': n_training_samples,
}

metadata_path = MODELS_DIR / 'model_metadata.joblib'
joblib.dump(metadata, metadata_path)
print('Model metadata saved!')
print(metadata)

## 9. Training Summary

In [None]:
# Final recap of the run: dataset sizes, trained models, the selected model's
# scores, the top SHAP features and the saved artifacts.
print('='*60)
print('TRAINING SUMMARY')
print('='*60)

# best_score is the F1 returned by trainer.get_best_model on the validation
# split, whereas the comparison_df metrics come from the test-set evaluation.
# Each number is labelled with its split so they are not read as comparable.
print(f'''
DATASET:
  - Total samples: {len(df_processed)}
  - Features (after engineering): {len(feature_names)}
  - Train/Val/Test split: {len(X_train)}/{len(X_val)}/{len(X_test)}

MODELS TRAINED:
  - {list(trainer.get_all_trained_models().keys())}

BEST MODEL: {best_name}
  - F1 Score (validation): {best_score:.4f}
  - ROC-AUC (test): {comparison_df.loc[best_name, 'roc_auc']:.4f}
  - Precision (test): {comparison_df.loc[best_name, 'precision']:.4f}
  - Recall (test): {comparison_df.loc[best_name, 'recall']:.4f}
  - Optimal Threshold (tuned on validation): {optimal_threshold:.2f}

TOP 5 IMPORTANT FEATURES:
{importance_df.head().to_string()}

ARTIFACTS SAVED:
  - Best model: {MODELS_DIR / 'best_model.joblib'}
  - Preprocessor: {MODELS_DIR / 'preprocessor.joblib'}
  - Feature names: {MODELS_DIR / 'feature_names.joblib'}
  - MLflow runs: models/mlflow/
  - Figures: reports/figures/
''')

print('\nModel training complete! Ready for deployment.')