# Customer Churn Prediction Model
**Model:** Logistic Regression  
**Accuracy:** 85%  
**Goal:** Predict at-risk customers and support retention strategies

In [None]:
# ─── 1. IMPORTS ───────────────────────────────────────────────
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve
)

import warnings
# Installs a blanket "ignore" filter: every warning category from every
# library is silenced for all cells that run after this one.
# NOTE(review): this also hides deprecation and convergence warnings that
# could point at real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

print('Libraries loaded successfully ✅')

In [None]:
# ─── 2. LOAD DATA ─────────────────────────────────────────────
# Replace with your actual data source
# df = pd.read_csv('customer_data.csv')
# df = pd.read_sql(query, connection)

# Sample synthetic dataset for demonstration.
# The global NumPy seed is fixed so the generated frame is reproducible.
np.random.seed(42)
n = 5000

# The dict values are evaluated top to bottom, so the key order below is also
# the order in which random numbers are drawn from the seeded generator.
df = pd.DataFrame({
    'customer_id':         [f'C-{i:04d}' for i in range(n)],
    'days_since_login':    np.random.randint(0, 90, n),
    'support_tickets':     np.random.poisson(2, n),
    'feature_adoption':    np.random.uniform(0, 1, n),
    'contract_age_months': np.random.randint(1, 36, n),
    # randint's upper bound is exclusive: 11 is required for the top score of
    # 10 to occur, so that the `nps_score / 10` term below can reach zero.
    'nps_score':           np.random.randint(0, 11, n),
    'payment_delays':      np.random.poisson(0.5, n),
    'monthly_revenue':     np.random.uniform(100, 2000, n),
    'segment':             np.random.choice(['Enterprise','Mid-Market','SMB','Freemium'], n),
})

# Simulate churn label based on feature signals.
# Each term is scaled into [0, 1] and the weights sum to 1.0, so churn_prob
# itself stays within [0, 1] before noise is added.
churn_prob = (
    0.3 * (df['days_since_login'] / 90) +
    0.2 * (df['support_tickets'] / 10).clip(0, 1) +
    0.2 * (1 - df['feature_adoption']) +
    0.1 * (1 - df['nps_score'] / 10) +
    0.2 * (df['payment_delays'] / 5).clip(0, 1)
)
# Gaussian noise (sd 0.1) blurs the boundary; rows above 0.5 are labelled churned (1).
df['churn'] = (churn_prob + np.random.normal(0, 0.1, n) > 0.5).astype(int)

print(f'Dataset shape: {df.shape}')
print(f'Churn rate: {df["churn"].mean():.1%}')
df.head()

In [None]:
# ─── 3. EXPLORATORY DATA ANALYSIS ─────────────────────────────
# One density panel per numeric feature, split by churn label.
features = ['days_since_login', 'support_tickets', 'feature_adoption',
            'nps_score', 'contract_age_months', 'payment_delays']

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
fig.suptitle('Customer Churn - Exploratory Data Analysis', fontsize=16, fontweight='bold')

churn_groups = df.groupby('churn')
for panel, column in zip(axes.ravel(), features):
    churn_groups[column].plot.kde(ax=panel, legend=True)
    panel.set_title(column.replace('_', ' ').title())
    # Groups are drawn in sorted label order (0, then 1), which is what the
    # positional legend names below rely on.
    panel.legend(['Retained', 'Churned'])

plt.tight_layout()
plt.show()

# Churn by segment, highest rate first
segment_churn = df.groupby('segment')['churn'].mean().sort_values(ascending=False)
print('\nChurn Rate by Segment:')
print(segment_churn.map('{:.1%}'.format))

In [None]:
# ─── 4. FEATURE ENGINEERING ───────────────────────────────────
# Model input columns; this order is the column order of the design matrix.
features = [
    'days_since_login', 'support_tickets', 'feature_adoption',
    'contract_age_months', 'nps_score', 'payment_delays',
]
X, y = df[features], df['churn']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training samples: {X_train.shape[0]}')
print(f'Test samples:     {X_test.shape[0]}')

In [None]:
# ─── 5. TRAIN MODELS ──────────────────────────────────────────
# Candidate classifiers; each is fitted on the same scaled training matrix
# and scored on the same scaled test matrix.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest':        RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM':                  SVC(probability=True, random_state=42)
}

# model name -> {metric name -> value on the test split}
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)              # hard class labels
    y_prob = model.predict_proba(X_test_scaled)[:, 1]  # second class column, used for AUC

    results[name] = {
        'Accuracy':  accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall':    recall_score(y_test, y_pred),
        'F1 Score':  f1_score(y_test, y_pred),
        'AUC-ROC':   roc_auc_score(y_test, y_prob),
    }

# Transpose so each model is a row and each metric a column
results_df = pd.DataFrame(results).T
print('\nModel Comparison:')
# Format at print time instead of via DataFrame.applymap, which pandas has
# deprecated since 2.1 in favour of DataFrame.map; to_string(float_format=...)
# behaves the same on old and new pandas releases.
print(results_df.to_string(float_format='{:.3f}'.format))

In [None]:
# ─── 6. BEST MODEL — LOGISTIC REGRESSION ──────────────────────
# The logistic regression fitted in the training cell is reused here (no refit).
best_model = models['Logistic Regression']
y_prob = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred = best_model.predict(X_test_scaled)

class_names = ['Retained', 'Churned']

print('=== Logistic Regression — Classification Report ===')
print(classification_report(y_test, y_pred, target_names=class_names))

# Two side-by-side panels: confusion matrix (left) and ROC curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_cm, ax_roc = axes

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm,
            xticklabels=class_names, yticklabels=class_names)
ax_cm.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')

# ROC Curve, with the chance diagonal drawn for reference
auc = roc_auc_score(y_test, y_prob)
fpr, tpr, _ = roc_curve(y_test, y_prob)
ax_roc.plot(fpr, tpr, color='#06b6d4', lw=2, label=f'AUC = {auc:.3f}')
ax_roc.plot([0, 1], [0, 1], 'k--', lw=1)
ax_roc.set(title='ROC Curve', xlabel='False Positive Rate', ylabel='True Positive Rate')
ax_roc.legend()

plt.tight_layout()
plt.show()

In [None]:
# ─── 7. FEATURE IMPORTANCE ────────────────────────────────────
# Importance here is the magnitude of each fitted coefficient; the sign
# (direction of the effect) is discarded by np.abs.
abs_coefs = np.abs(best_model.coef_[0])
importance = (
    pd.DataFrame({'Feature': features, 'Coefficient': abs_coefs})
    .sort_values('Coefficient', ascending=True)
)

# Ascending order so the horizontal bar chart shows the largest bar on top
plt.figure(figsize=(10, 5))
bars = plt.barh(importance['Feature'], importance['Coefficient'], color='#06b6d4')
plt.xlabel('Absolute Coefficient Value')
plt.title('Feature Importance — Logistic Regression Coefficients', fontweight='bold')
plt.tight_layout()
plt.show()

# Same table as text, strongest predictor first
print('\nTop Churn Predictors:')
ranked = importance.sort_values('Coefficient', ascending=False)
for feature_name, weight in zip(ranked['Feature'], ranked['Coefficient']):
    print(f'  {feature_name:<30} {weight:.4f}')

In [None]:
# ─── 8. SCORE ALL CUSTOMERS & FLAG AT-RISK ────────────────────
# Reuse the already-fitted scaler and model; nothing is refitted here.
X_all_scaled = scaler.transform(df[features])
df['churn_probability'] = best_model.predict_proba(X_all_scaled)[:, 1]
# 0–100 score, rounded to one decimal place
df['churn_score']       = (df['churn_probability'] * 100).round(1)

# Left-closed bands: Low [0, 50) · Medium [50, 70) · High [70, 85) · Critical [85, ∞).
# pd.cut's defaults (right-closed bins, lowest edge excluded) would leave a
# score of exactly 0.0 unlabelled (NaN) and would put 70.0 / 85.0 one band
# below the `>= 70` / `>= 85` cut-offs used for the counts further down.
# The open-ended top bin keeps a score of exactly 100.0 inside 'Critical'.
df['risk_level']        = pd.cut(
    df['churn_score'],
    bins=[0, 50, 70, 85, np.inf],
    labels=['Low', 'Medium', 'High', 'Critical'],
    right=False,
)

# At-risk = High or Critical band, highest score first
at_risk = df[df['churn_score'] >= 70].sort_values('churn_score', ascending=False)

print(f'Total customers scored: {len(df):,}')
print(f'At-risk customers (score ≥ 70): {len(at_risk):,}')
print(f'Critical (score ≥ 85): {len(df[df["churn_score"] >= 85]):,}')
print('\nTop 10 At-Risk Customers:')
print(at_risk[['customer_id', 'churn_score', 'risk_level', 'monthly_revenue', 'segment']].head(10).to_string(index=False))

In [None]:
# ─── 9. EXPORT RESULTS ────────────────────────────────────────
# Write the at-risk subset to a CSV in the working directory (row index
# omitted) so it can be loaded into a dashboard or CRM.
export_columns = ['customer_id', 'churn_score', 'risk_level', 'monthly_revenue', 'segment']
at_risk[export_columns].to_csv('at_risk_customers.csv', index=False)

print('✅ at_risk_customers.csv exported successfully!')
print('Ready to load into dashboard or CRM for follow-up actions.')