# Customer Churn Prediction - End-to-End Analysis

This notebook contains a complete churn prediction pipeline:

1. **Import Libraries**
2. **Data Generation and Loading**
3. **Exploratory Data Analysis (EDA)**
4. **Data Preprocessing and Feature Engineering**
5. **Model Training and Comparison**
6. **Model Evaluation**
7. **Model Explainability (SHAP)**
8. **Save Model and Processor**

## 1. Import Libraries

In [None]:
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import auc, roc_curve
# Silence every warning category so library notices do not clutter cell output.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# plotting cells below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Notebook-wide plot appearance: style sheet, default figure size, base font size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Prepend the parent directory to the module search path; presumably this is
# what lets the `src` package imported below resolve when the notebook runs
# from a subdirectory. Must stay ahead of the `from src...` imports.
import sys
sys.path.insert(0, '..')

# Project-local helpers used throughout the notebook.
from src.data_generator import generate_churn_dataset
from src.preprocessing import ChurnDataProcessor
from src.model_trainer import ChurnModelTrainer, print_classification_report
from src.explainability import (
    ChurnExplainer, plot_confusion_matrix, 
    plot_roc_curve, plot_precision_recall_curve
)

print('Libraries loaded successfully')

## 2. Data Generation and Loading

In [None]:
# Build the synthetic churn dataset. A fixed `random_state` is passed,
# presumably so repeated runs produce the same rows (confirm in src.data_generator).
df = generate_churn_dataset(n_samples=7000, random_state=42)

# Persist a CSV copy; the preprocessing section reloads the data from this path.
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# `../data` exists first — otherwise a fresh checkout fails with OSError here.
data_path = Path('../data/telco_churn.csv')
data_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(data_path, index=False)

print(f'Dataset created: {df.shape[0]:,} rows, {df.shape[1]} columns')
print(f'Churn rate: {(df["Churn"] == "Yes").mean():.2%}')
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Structural overview of the frame: size, dtype mix, and columns with gaps.
n_rows, n_cols = df.shape
missing_per_column = df.isnull().sum()

print('Dataset Information')
print('=' * 50)
print(f'Number of rows: {n_rows:,}')
print(f'Number of columns: {n_cols}')
print('\nData Types:')
print(df.dtypes.value_counts())
print('\nMissing Values:')
# Only columns that actually contain missing values are listed.
print(missing_per_column[missing_per_column > 0])

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pin the class order explicitly. `value_counts()` sorts by frequency, so the
# positional labels/colors below would be attached to the wrong class if
# churned customers ever outnumbered retained ones.
churn_order = ['No', 'Yes']
churn_palette = {'No': '#2ecc71', 'Yes': '#e74c3c'}
churn_counts = df['Churn'].value_counts().reindex(churn_order, fill_value=0)
colors = [churn_palette[label] for label in churn_order]

# Left panel: share of retained vs churned customers.
axes[0].pie(churn_counts.values, labels=['Retained', 'Churned'],
            autopct='%1.1f%%', colors=colors, explode=[0, 0.05],
            shadow=True, startangle=90)
axes[0].set_title('Churn Distribution', fontsize=14, fontweight='bold')

# Right panel: absolute counts, drawn in the same class order as the pie.
sns.countplot(data=df, x='Churn', hue='Churn', order=churn_order, hue_order=churn_order,
              palette=churn_palette, ax=axes[1], legend=False)
axes[1].set_title('Churn Counts', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Churn Status')
axes[1].set_ylabel('Customer Count')

# Write the count above each bar.
for p in axes[1].patches:
    axes[1].annotate(f'{int(p.get_height()):,}',
                     (p.get_x() + p.get_width() / 2., p.get_height()),
                     ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

churn_palette = {'No': '#2ecc71', 'Yes': '#e74c3c'}

# One histogram panel per numeric column: (column, panel title, x-axis label).
panels = [
    ('tenure', 'Customer Tenure', 'Months'),
    ('MonthlyCharges', 'Monthly Charges', '$'),
    ('TotalCharges', 'Total Charges', '$'),
]

for panel_ax, (column, title, xlabel) in zip(axes, panels):
    # Distribution split by churn status, with a KDE overlay.
    sns.histplot(data=df, x=column, hue='Churn', kde=True, ax=panel_ax,
                 palette=churn_palette)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

In [None]:
categorical_cols = ['Contract', 'PaymentMethod', 'InternetService', 'TechSupport', 'OnlineSecurity']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# Boolean churn flag computed once; its per-group mean is the churn rate.
is_churned = df['Churn'].eq('Yes')

for panel_ax, col in zip(axes, categorical_cols):
    # Churn rate (%) per category, highest first.
    churn_rate = is_churned.groupby(df[col]).mean().mul(100)
    churn_rate = churn_rate.sort_values(ascending=False)

    # Passing `palette` without `hue` is deprecated in seaborn 0.13; assign the
    # category to `hue` and suppress the legend, as the churn count plot does.
    sns.barplot(x=churn_rate.index, y=churn_rate.values, hue=churn_rate.index,
                palette='RdYlGn_r', legend=False, ax=panel_ax)
    panel_ax.set_title(f'{col} vs Churn Rate', fontsize=12, fontweight='bold')
    panel_ax.set_ylabel('Churn Rate (%)')
    panel_ax.tick_params(axis='x', rotation=45)

    # Write the rate above each bar.
    for p in panel_ax.patches:
        panel_ax.annotate(f'{p.get_height():.1f}%',
                          (p.get_x() + p.get_width() / 2., p.get_height()),
                          ha='center', va='bottom', fontsize=10)

# Hide every grid cell that did not receive a plot, however many columns are listed.
for unused_ax in axes[len(categorical_cols):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations are computed over the numeric columns only.
df_numeric = df.select_dtypes(include=[np.number])
correlation = df_numeric.corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(correlation, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation,
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Data Preprocessing and Feature Engineering

In [None]:
# Reload the dataset from the CSV written earlier and split it into
# train/test feature matrices and targets.
# NOTE(review): this rebinds `df`, replacing the in-memory frame the EDA cells used.
processor = ChurnDataProcessor()
df = processor.load_data('../data/telco_churn.csv')
X_train, X_test, y_train, y_test = processor.prepare_data(
    df,
    test_size=0.2,
    random_state=42,
)

feature_names = list(X_train.columns)

print('Data preparation complete')
print(f'Number of features: {X_train.shape[1]}')
# Only the first ten feature names are shown to keep the output short.
print(f'Feature list: {feature_names[:10]}...')

In [None]:
# Show the leading rows of the prepared training features as the cell output.
print('Training Set - First 5 Rows:')
X_train.head(5)

## 5. Model Training and Comparison

In [None]:
# Train every candidate model and collect per-model results keyed by model name.
# NOTE(review): `use_sampling='smote'` presumably enables SMOTE resampling of
# the training data — confirm in src.model_trainer.
trainer = ChurnModelTrainer()
results = trainer.train_all_models(X_train, y_train, X_test, y_test, use_sampling='smote')

In [None]:
# Tabulate the per-model metrics; the frame is rendered as the cell output.
comparison_df = trainer.compare_models()
print('Model Comparison:\n' + '=' * 80)
comparison_df

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
x = np.arange(len(comparison_df))

# Derive the bar width from the number of metrics so each model's group always
# spans 0.75 of its slot (0.15 per bar for the five metrics listed above).
group_span = 0.75
width = group_span / len(metrics)

# One bar series per metric, shifted sideways within each model's group.
for i, metric in enumerate(metrics):
    ax.bar(x + i * width, comparison_df[metric], width, label=metric)

ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
# Centre each tick under its group regardless of how many metrics are plotted.
ax.set_xticks(x + width * (len(metrics) - 1) / 2)
ax.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
ax.legend(loc='lower right')

# Keep the zoomed 0.5-1.0 view when every score fits, but lower the floor when
# any score is below 0.5 — a fixed floor would draw those bars as missing.
scores = comparison_df[metrics].to_numpy(dtype=float)
lowest_score = np.nanmin(scores) if scores.size else 0.5
y_floor = min(0.5, np.floor(lowest_score * 10) / 10)
ax.set_ylim(y_floor, 1.0)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pick the winner by ROC AUC, then report on its test-set predictions.
best_model_name, best_model = trainer.select_best_model(metric='roc_auc')
best_predictions = results[best_model_name]['y_pred']
print_classification_report(y_test, best_predictions)

## 6. Model Evaluation

In [None]:
# Keep the selected model's result entry around; later cells read from it.
best_results = results[best_model_name]
best_confusion = best_results['confusion_matrix']
plot_confusion_matrix(best_confusion)

In [None]:
# ROC curve for the selected model, from its stored test-set scores.
best_scores = best_results['y_proba']
plot_roc_curve(y_test, best_scores)

In [None]:
# Precision-recall curve for the selected model, from the same stored scores.
best_scores = best_results['y_proba']
plot_precision_recall_curve(y_test, best_scores)

In [None]:
# `roc_curve` and `auc` are imported in the notebook's import cell.
fig, ax = plt.subplots(figsize=(10, 8))

# One ROC curve per trained model, labelled with its area under the curve.
for name, res in results.items():
    # Name the unused thresholds explicitly instead of assigning to `_`,
    # which IPython uses for the previous cell output.
    fpr, tpr, _thresholds = roc_curve(y_test, res['y_proba'])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal reference line (TPR equal to FPR).
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve Comparison', fontsize=14, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Model Explainability (SHAP)

In [None]:
# Wrap the selected model for SHAP analysis and score the test set.
feature_names = X_train.columns.tolist()
explainer = ChurnExplainer(best_model, feature_names)

# NOTE(review): the explainer type is hard-coded to 'tree' while `best_model`
# is whichever model won on ROC AUC — confirm ChurnExplainer copes if the
# winner is not tree-based.
explainer.create_explainer(X_train, explainer_type='tree')
shap_values = explainer.calculate_shap_values(X_test)

In [None]:
# SHAP summary plot over the test set, capped at 15 displayed features.
summary_display_limit = 15
explainer.plot_summary(X_test, max_display=summary_display_limit)

In [None]:
# SHAP bar plot over the test set, capped at 15 displayed features.
bar_display_limit = 15
explainer.plot_bar(X_test, max_display=bar_display_limit)

In [None]:
# Ten highest-ranked features according to the explainer; shown as cell output.
top_features = explainer.get_top_features(top_n=10)
print('Top 10 Most Important Features:\n' + '=' * 40)
top_features

In [None]:
# Locate the test-set row with the largest predicted churn score.
best_scores = best_results['y_proba']
high_risk_idx = np.argmax(best_scores)

print(f'High Risk Customer (Index: {high_risk_idx})')
print(f'Churn Probability: {best_scores[high_risk_idx]:.2%}')
print('\nCustomer Features:')
# `high_risk_idx` is a position within the scores, so look the row up positionally.
print(X_test.iloc[high_risk_idx])

In [None]:
# Explain the single highest-scoring customer found in the previous cell.
customer_position = high_risk_idx
explainer.explain_instance(X_test, customer_position)

In [None]:
# Per-customer explanations for the three highest-scoring test customers.
high_risk_explanations = explainer.explain_high_risk_customers(
    X_test,
    best_results['y_proba'],
    top_n=3,
)

print('Top 3 High Risk Customers Analysis:')
print('=' * 60)

for customer_idx, explanation in high_risk_explanations.items():
    print(f'\nCustomer Index: {customer_idx}')
    print(f'   Churn Probability: {explanation["churn_probability"]:.2%}')
    print('   Risk Factors:')

    # Report at most the first three factors for each customer.
    leading_factors = explanation['top_factors'][:3]
    for factor in leading_factors:
        # Anything other than 'increases' is reported as decreasing risk.
        if factor['direction'] == 'increases':
            direction = 'increases'
        else:
            direction = 'decreases'
        print(f'   - {factor["feature"]}: {factor["shap_value"]:.3f} ({direction} risk)')

## 8. Save Model and Processor

In [None]:
# `joblib` and `Path` are imported in the notebook's import cell.

# Create the output directory (and any missing parents) before writing into it.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the trainer's model and the fitted preprocessing object.
trainer.save_model(str(models_dir / 'best_model.joblib'))
processor.save_processor(str(models_dir / 'processor.joblib'))

# Summary of the selected model: its name, headline metrics, and the feature
# columns it was trained on. Key order matches the metrics listed below.
metric_keys = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
model_results = {
    'model_name': best_model_name,
    **{key: best_results[key] for key in metric_keys},
    'feature_columns': X_train.columns.tolist(),
}

joblib.dump(model_results, str(models_dir / 'model_results.joblib'))

print('All artifacts saved:')
print('   - best_model.joblib')
print('   - processor.joblib')
print('   - model_results.joblib')

## Project Summary

### Completed Tasks:
1. Generated synthetic dataset with 7,000 customer records
2. Performed comprehensive exploratory data analysis
3. Engineered 10+ new features
4. Trained and compared 5 different models
5. Selected best performing model based on ROC AUC
6. Implemented SHAP for model explainability
7. Persisted model and preprocessing artifacts

### Next Steps:
- Run the Streamlit app from the project root (`churn_prediction/`): `streamlit run app/streamlit_app.py`
- Use the dashboard for predictions
- Update model with real data when available

In [None]:
# Closing banner: headline metrics of the selected model plus the launch command.
banner = '=' * 60
summary_lines = [
    banner,
    'CHURN PREDICTION PROJECT COMPLETE',
    banner,
    f'\nBest Model: {best_model_name}',
    f'ROC AUC Score: {best_results["roc_auc"]:.4f}',
    f'F1 Score: {best_results["f1"]:.4f}',
    '\nTo launch Streamlit dashboard:',
    '   cd churn_prediction && streamlit run app/streamlit_app.py',
]
print('\n'.join(summary_lines))