# **Churn Prediction: Final Refined Version**
This notebook implements data preprocessing, exploratory data analysis (EDA), and model evaluation for customer churn prediction using Random Forest and XGBoost classifiers.

## **Step 1: Import Libraries**

In [13]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Use seaborn's 'whitegrid' style for the plots drawn later in the notebook.
sns.set_style('whitegrid')


## **Step 2: Load and Inspect Data**

In [None]:

# Read the raw table and immediately discard the columns this notebook does not
# use: the client identifier, the two pre-computed Naive Bayes classifier
# columns, and 'Avg_Open_To_Buy'.
data = pd.read_csv('BankChurners.csv').drop(
    columns=[
        'CLIENTNUM',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
        'Avg_Open_To_Buy',
    ]
)

# Show which columns survived the drop
print("Updated Feature Set:", data.columns)

# Quick look at the first rows, the table size, and missing values per column
print("Data Overview:")
display(data.head())
print("Shape of data:", data.shape)
null_counts = data.isna().sum()
print("Null Values:")
print(null_counts)


In [None]:

# Data preprocessing: remove 'Credit_Limit', which this notebook treats as a
# redundant feature. errors='ignore' keeps the cell re-runnable when the column
# has already been dropped (no KeyError).
redundant_columns = ['Credit_Limit']
data = data.drop(columns=redundant_columns, errors='ignore')
print("Dropped 'Credit_Limit'. Remaining columns:", data.columns, sep="\n")


## **Step 3: Encode Categorical Columns and Clean Data**

In [None]:

# Encode the target and the binary Gender column as integers.
# Series.map is used rather than Series.replace: replace() relies on silent
# dtype downcasting (deprecated in recent pandas) and would leave any
# unexpected label in place as a string. With map(), an unmapped label becomes
# NaN and the astype('int64') cast then fails immediately instead of letting a
# mixed-type column reach the models.
churn_codes = {'Attrited Customer': 1, 'Existing Customer': 0}
gender_codes = {'F': 1, 'M': 0}
data['Attrition_Flag'] = data['Attrition_Flag'].map(churn_codes).astype('int64')
data['Gender'] = data['Gender'].map(gender_codes).astype('int64')

# One-hot encode the multi-level categorical features in a single call.
# get_dummies(columns=...) appends '<column>_<level>' indicator columns and
# removes the original categorical columns, so no separate drop is needed.
categorical_cols = ['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category']
data = pd.get_dummies(data, columns=categorical_cols)

# The 'Unknown' level carries no information of its own, so its indicator is
# discarded wherever it was generated.
unknown_indicator_cols = [
    '{}_Unknown'.format(col)
    for col in categorical_cols
    if '{}_Unknown'.format(col) in data.columns
]
data = data.drop(columns=unknown_indicator_cols)

print("Cleaned Data:")
display(data.head())
print("Updated Shape:", data.shape)


## **Step 4: Exploratory Data Analysis**

In [None]:

# Visualize class imbalance
fig, ax = plt.subplots()
sns.countplot(x='Attrition_Flag', data=data, ax=ax)
ax.set_title("Churn Distribution")
plt.show()

# Boxplots for numerical features to identify patterns.
# The target itself is excluded: plotting Attrition_Flag against Attrition_Flag
# carries no information.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
boxplot_features = numerical_features.drop('Attrition_Flag', errors='ignore')

# Size the grid from the number of features instead of assuming a fixed 4x4
# layout, which raises as soon as there are more than 16 numeric columns.
n_cols = 4
n_rows = max(1, int(np.ceil(len(boxplot_features) / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 2.5 * n_rows), squeeze=False)
panels = axes.ravel()
for ax, col in zip(panels, boxplot_features):
    sns.boxplot(x='Attrition_Flag', y=col, data=data, ax=ax)
    ax.set_title(f"{col} by Churn")
# Hide the unused panels of the last row
for ax in panels[len(boxplot_features):]:
    ax.set_visible(False)
fig.tight_layout()
plt.show()

# Scatter plot for Total_Trans_Ct vs Total_Trans_Amt
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='Total_Trans_Ct', y='Total_Trans_Amt', data=data, ax=ax)
ax.set_title('Relationship Between Transaction Count and Amount')
plt.show()


## **Step 5: Train-Test Split and Class Balancing with SMOTE**

In [None]:

# Separate the churn label from the predictors
y = data['Attrition_Flag']
X = data.drop(columns=['Attrition_Flag'])

# Hold out 30% of the rows for testing; stratifying on y keeps the churn rate
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Oversample the training partition only, so the test set keeps the original
# class distribution.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Class Distribution After SMOTE:", y_train_res.value_counts(), sep="\n")


## **Step 6: Random Forest Model**

In [None]:
# Fit a depth-limited random forest on the SMOTE-resampled training data
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train_res, y_train_res)

# Predict on the untouched test split
y_pred_rf = rf_model.predict(X_test)

# Compute the headline metrics once, then report them
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)

print("Random Forest Metrics:")
print(f"Accuracy: {rf_accuracy:.2f}")
print(f"Precision: {rf_precision:.2f}")
print(f"Recall: {rf_recall:.2f}")
print(f"F1 Score: {rf_f1:.2f}")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_rf), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred_rf), sep="\n")


## **Step 7: XGBoost Model**

In [None]:
# Fit a gradient-boosted tree model on the SMOTE-resampled training data.
# NOTE(review): X_train_res / y_train_res have already been class-balanced by
# SMOTE, so scale_pos_weight=6 up-weights the churn class a second time.
# Confirm this extra bias toward the positive class is intentional.
xgb_model = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train_res, y_train_res)

# Predict on the untouched test split
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.2f}")
print(f"Precision: {precision_score(y_test, y_pred_xgb):.2f}")
print(f"Recall: {recall_score(y_test, y_pred_xgb):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred_xgb):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))
# The report is a pre-aligned multi-line table; printing it on the same line as
# its label shifts the header row, so it gets its own print call (same layout
# as the Random Forest cell).
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))



## **Step 8: Feature Importance**

In [None]:

# Pair each feature name with the importance score reported by the fitted
# XGBoost model, highest score first.
importances = xgb_model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features
top_ten = importance_df.head(10)
plt.figure(figsize=(12, 8))
sns.barplot(data=top_ten, x='Importance', y='Feature')
plt.title("Top 10 Feature Importances")
plt.show()


## **Step 9: Feature Redundancy Test**
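In this section, we evaluate whether `Total_Trans_Amt` can be safely dropped without significantly affecting model performance, on the hypothesis that it is largely redundant with `Total_Trans_Ct`. To test this, we train the same XGBoost model twice, once with and once without the feature, and compare accuracy and F1-score on the same test split.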

In [None]:

def _fit_and_score_xgb(features, target, sampler):
    """Train and evaluate one XGBoost churn model on a stratified 70/30 split.

    The training partition is resampled with ``sampler`` before fitting; the
    test partition is left untouched. The split seed and model hyper-parameters
    are fixed so that runs on different feature sets are directly comparable.

    Parameters
    ----------
    features : pandas.DataFrame
        Predictor columns.
    target : pandas.Series
        Binary churn label aligned with ``features``.
    sampler : object
        Resampler exposing ``fit_resample(X, y)`` (here: a SMOTE instance).

    Returns
    -------
    dict
        Keys ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``X_train_res``,
        ``y_train_res``, ``model``, ``y_pred``, ``accuracy`` and ``f1``.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=0.3, random_state=42, stratify=target
    )
    X_tr_res, y_tr_res = sampler.fit_resample(X_tr, y_tr)

    model = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
    model.fit(X_tr_res, y_tr_res)
    y_hat = model.predict(X_te)

    return {
        'X_train': X_tr,
        'X_test': X_te,
        'y_train': y_tr,
        'y_test': y_te,
        'X_train_res': X_tr_res,
        'y_train_res': y_tr_res,
        'model': model,
        'y_pred': y_hat,
        'accuracy': accuracy_score(y_te, y_hat),
        'f1': f1_score(y_te, y_hat),
    }


# Order in which a run's artefacts are unpacked into notebook-level names below
_RUN_KEYS = ('X_train', 'X_test', 'y_train', 'y_test',
             'X_train_res', 'y_train_res', 'model', 'y_pred')

# Redefine features and target variable to avoid earlier modifications
X = data.drop(columns=['Attrition_Flag'])  # All features
y = data['Attrition_Flag']                # Target variable

# Section: Evaluating Redundant Features (Total_Trans_Ct vs Total_Trans_Amt)

# Handle class imbalance with SMOTE (same sampler configuration for both runs)
smote = SMOTE(random_state=42)

# 1. Train model with both features
X_full = X.copy()  # Original features
y_full = y.copy()

full_run = _fit_and_score_xgb(X_full, y_full, smote)
(X_train_full, X_test_full, y_train_full, y_test_full,
 X_train_res_full, y_train_res_full, xgb_full, y_pred_full) = (full_run[key] for key in _RUN_KEYS)

print("Model Performance with Both Features:")
print(f"Accuracy: {full_run['accuracy']:.2f}")
print(f"F1-Score: {full_run['f1']:.2f}")

# 2. Train model without Total_Trans_Amt
X_reduced = X.drop(columns=['Total_Trans_Amt'])  # Drop redundant feature

reduced_run = _fit_and_score_xgb(X_reduced, y_full, smote)
(X_train_red, X_test_red, y_train_red, y_test_red,
 X_train_res_red, y_train_res_red, xgb_red, y_pred_red) = (reduced_run[key] for key in _RUN_KEYS)

print("\nModel Performance Without Total_Trans_Amt:")
print(f"Accuracy: {reduced_run['accuracy']:.2f}")
print(f"F1-Score: {reduced_run['f1']:.2f}")

# 3. Compare results (positive difference = the full feature set scores higher)
print("\nComparison of Model Performance:")
print(f"Accuracy Difference: {full_run['accuracy'] - reduced_run['accuracy']:.4f}")
print(f"F1-Score Difference: {full_run['f1'] - reduced_run['f1']:.4f}")


In [None]:
# Derive relative-change features: each Q4-vs-Q1 change ratio is divided by the
# corresponding transaction total. The totals are shifted by one so the
# denominator can never be zero.
amt_denominator = data['Total_Trans_Amt'] + 1
ct_denominator = data['Total_Trans_Ct'] + 1

data['Rel_Amt_Change'] = data['Total_Amt_Chng_Q4_Q1'].div(amt_denominator)
data['Rel_Ct_Change'] = data['Total_Ct_Chng_Q4_Q1'].div(ct_denominator)

# Summary statistics of the two new columns
relative_change_cols = ['Rel_Amt_Change', 'Rel_Ct_Change']
print(data[relative_change_cols].describe())


In [None]:
# Customers whose relative amount change OR relative count change lies above
# the 95th percentile of the respective column.
amt_cutoff = data['Rel_Amt_Change'].quantile(0.95)
ct_cutoff = data['Rel_Ct_Change'].quantile(0.95)

exceeds_amt = data['Rel_Amt_Change'].gt(amt_cutoff)
exceeds_ct = data['Rel_Ct_Change'].gt(ct_cutoff)
high_change_customers = data[exceeds_amt | exceeds_ct]

# Churn label breakdown within that group
print("High Change Customers:\n", high_change_customers['Attrition_Flag'].value_counts())
