# Customer Churn Prediction

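This notebook walks through an end-to-end workflow for predicting customer churn on a bank customer dataset: loading the data, exploratory analysis, feature engineering, and training, evaluating and saving Random Forest and Logistic Regression classifiers.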

In [None]:
# Import libraries
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay, classification_report
from sklearn.model_selection import train_test_split

# plot_roc_curve was removed in scikit-learn 1.2; keep the name importable on
# older releases without breaking this cell on newer ones.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None
# Silence library warnings to keep cell output readable.
# NOTE(review): this blanket filter also hides deprecation warnings from
# scikit-learn / matplotlib; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Create the output folders that later cells write figures and models into,
# so a fresh checkout survives "Restart Kernel -> Run All".
for output_dir in ('./images/eda', './images/results', './models'):
    os.makedirs(output_dir, exist_ok=True)

# Set style for plots
# Matplotlib renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' and
# later dropped the old name; use whichever one this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

## 1. Data Import and Overview

In [None]:
# Load the raw customer table from disk and preview it
data_path = './data/bank_data.csv'
df = pd.read_csv(data_path)

# Report (rows, columns) before showing the first records
print(f"Dataset shape: {df.shape}")
df.head(5)

In [None]:
# Data info
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Count the null entries in every column and print the per-column totals
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)

In [None]:
# Binary target: 0 when Attrition_Flag is "Existing Customer", 1 for any other value
is_not_existing = df['Attrition_Flag'].ne("Existing Customer")
df['Churn'] = is_not_existing.astype('int64')

# Show how many rows fall into each class
churn_counts = df['Churn'].value_counts()
print(f"Churn distribution:\n{churn_counts}")

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Churn distribution
# Draw one bar per class. The bin edges are placed at -0.5 / 0.5 / 1.5 so the
# two bars are centred on the class values 0 and 1 and line up with the ticks
# (bins=2 would span [0, 0.5] and [0.5, 1], leaving the ticks on the bar edges).
fig, ax = plt.subplots(figsize=(8, 6))
df['Churn'].hist(bins=[-0.5, 0.5, 1.5], edgecolor='black', ax=ax)
ax.set_title('Customer Churn Distribution')
ax.set_xlabel('Churn (0: Existing, 1: Attrited)')
ax.set_ylabel('Count')
ax.set_xticks([0, 1])

# Persist the figure before displaying it
fig.savefig('./images/eda/churn_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Histogram of customer ages (30 bins), drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
df['Customer_Age'].hist(ax=ax, bins=30, alpha=0.7, edgecolor='black')
ax.set_title('Customer Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')

# Write the image to disk, then render it inline
fig.savefig('./images/eda/customer_age_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Bar chart with one bar per marital status value
fig, ax = plt.subplots(figsize=(8, 6))
status_counts = df['Marital_Status'].value_counts()
status_counts.plot.bar(ax=ax)
ax.set_title('Marital Status Distribution')
ax.set_xlabel('Marital Status')
ax.set_ylabel('Count')

# Tilt the category labels so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45)

fig.savefig('./images/eda/marital_status_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Histogram of Total_Trans_Amt using 50 bins
fig, ax = plt.subplots(figsize=(10, 6))
df['Total_Trans_Amt'].hist(ax=ax, bins=50, alpha=0.7, edgecolor='black')
ax.set(
    title='Total Transaction Amount Distribution',
    xlabel='Total Transaction Amount',
    ylabel='Count',
)

# Save first, then display
fig.savefig('./images/eda/total_trans_amt_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, shown as a heatmap
fig, ax = plt.subplots(figsize=(20, 10))

# Only numeric dtypes take part in the correlation matrix
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

# Diverging palette centred on zero; cell annotations are switched off
sns.heatmap(correlation_matrix, ax=ax, cmap='coolwarm', center=0, annot=False)
ax.set_title('Correlation Heatmap')

fig.savefig('./images/eda/heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Feature Engineering

In [None]:
# Define categorical columns for encoding
cat_columns = [
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category'
]

# Encode categorical variables using mean of response (target encoding).
# The category -> churn-rate mapping is learned from the TRAINING rows only;
# computing it on the full frame would leak test-set labels into the features.
# train_test_split selects rows purely from (n_samples, test_size, random_state),
# so using the same settings as the train-test split cell further down
# (test_size=0.3, random_state=42) yields exactly the same training rows.
# Keep the two calls in sync if either is changed.
train_positions, _test_positions = train_test_split(
    np.arange(len(df)), test_size=0.3, random_state=42
)
train_rows = df.iloc[train_positions]

# Fallback for values with no training-set mean (unseen category or missing value)
train_churn_rate = train_rows['Churn'].mean()

for col in cat_columns:
    groups = train_rows.groupby(col)['Churn'].mean()
    df[col + '_Churn'] = df[col].map(groups).fillna(train_churn_rate)

print("Encoded categorical columns:")
for col in cat_columns:
    print(f"{col}_Churn")

In [None]:
# Assemble the model inputs: raw numeric columns followed by the
# churn-rate encodings of the categorical columns.
numeric_features = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]
encoded_features = [
    'Gender_Churn',
    'Education_Level_Churn',
    'Marital_Status_Churn',
    'Income_Category_Churn',
    'Card_Category_Churn',
]
keep_cols = numeric_features + encoded_features

# Feature matrix and binary target
X = df[keep_cols]
y = df['Churn']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Row counts of each partition
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

## 4. Model Training

In [None]:
# Fit both classifiers on the training partition.
# fit() returns the estimator itself, so construction and fitting are chained.

# Random Forest with 500 trees
rfc = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)

# Logistic Regression with a raised iteration cap
lrc = LogisticRegression(max_iter=3000, random_state=42).fit(X_train, y_train)

print("Models trained successfully!")

## 5. Model Predictions

In [None]:
# Predict class labels on both partitions with each fitted model
partitions = (X_train, X_test)

# Random Forest predictions (train, test)
y_train_preds_rf, y_test_preds_rf = (rfc.predict(part) for part in partitions)

# Logistic Regression predictions (train, test)
y_train_preds_lr, y_test_preds_lr = (lrc.predict(part) for part in partitions)

print("Predictions completed!")

## 6. Model Evaluation

In [None]:
# Print test and train classification reports for each model, in that order
report_inputs = [
    ('Random Forest Results', y_test_preds_rf, y_train_preds_rf),
    ('\nLogistic Regression Results', y_test_preds_lr, y_train_preds_lr),
]

for heading, test_preds, train_preds in report_inputs:
    print(heading)
    print('Test Results')
    print(classification_report(y_test, test_preds))
    print('Train Results')
    print(classification_report(y_train, train_preds))

In [None]:
# Plot ROC curves
# Both models are drawn on one Axes so their test-set curves can be compared.
# RocCurveDisplay.from_estimator replaces plot_roc_curve, which was removed
# in scikit-learn 1.2.
fig_roc, ax = plt.subplots(figsize=(15, 8))
RocCurveDisplay.from_estimator(rfc, X_test, y_test, ax=ax, alpha=0.8)
RocCurveDisplay.from_estimator(lrc, X_test, y_test, ax=ax, alpha=0.8)
ax.set_title('ROC Curves')
# Save the ROC figure alongside the other result images
fig_roc.savefig('./images/results/roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

# Feature importance for Random Forest
importances = rfc.feature_importances_
# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Plot feature importance on its own figure (no leftover empty subplot)
n_features = X.shape[1]
fig_imp, ax_imp = plt.subplots(figsize=(20, 5))
ax_imp.set_title("Feature Importance")
ax_imp.bar(range(n_features), importances[indices])
ax_imp.set_xticks(range(n_features))
ax_imp.set_xticklabels([X.columns[i] for i in indices], rotation=90)
fig_imp.savefig('./images/results/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Save Models

In [None]:
# Serialise each fitted estimator to its own file with joblib
model_targets = (
    (rfc, './models/rfc_model.pkl'),
    (lrc, './models/logistic_model.pkl'),
)
for fitted_model, target_path in model_targets:
    joblib.dump(fitted_model, target_path)

print("Models saved successfully!")