# **Churn Prediction: Final Refined Version**
This notebook implements data preprocessing, thorough EDA, and model evaluation for churn prediction using Random Forest and XGBoost.

## **Step 1: Import Libraries**

In [4]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

sns.set_style('whitegrid')


## **Step 2: Load and Inspect Data**

In [5]:
import pandas as pd 

# Load dataset
data = pd.read_csv('BankChurners.csv')

# Drop CLIENTNUM and Naive Bayes classifier columns
data.drop(columns=['CLIENTNUM', 
                   'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                   'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
                   'Avg_Open_To_Buy'],
          inplace=True)

# Confirm updated features
print("Updated Feature Set:", data.columns)

# Overview
print("Data Overview:")
display(data.head())
print("Shape of data:", data.shape)
print("Null Values:")
print(data.isnull().sum())


# Encode target variable and binary features
data['Attrition_Flag'] = data['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})
data['Gender'] = data['Gender'].replace({'F': 1, 'M': 0})

# One-hot encode categorical features
categorical_cols = ['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category']
for col in categorical_cols:
    if 'Unknown' in data[col].unique():
        data = pd.concat([data, pd.get_dummies(data[col], prefix=col).drop(columns='{}_Unknown'.format(col))], axis=1)
    else:
        data = pd.concat([data, pd.get_dummies(data[col], prefix=col)], axis=1)

# Drop original categorical columns
data.drop(columns=categorical_cols, inplace=True)

print("Cleaned Data:")
display(data.head())
print("Updated Shape:", data.shape)



Updated Feature Set: Index(['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
       'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
      dtype='object')
Data Overview:


Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,2.594,1887,20,2.333,0.0
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,1.405,1171,20,2.333,0.76
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,2.175,816,28,2.5,0.0


Shape of data: (10127, 19)
Null Values:
Attrition_Flag              0
Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64
Cleaned Data:


  data['Attrition_Flag'] = data['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})
  data['Gender'] = data['Gender'].replace({'F': 1, 'M': 0})


Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,...,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,0,45,0,3,39,5,1,3,12691.0,777,...,True,False,False,False,True,False,True,False,False,False
1,0,49,1,5,44,6,1,2,8256.0,864,...,False,False,True,False,False,True,True,False,False,False
2,0,51,0,3,36,4,1,0,3418.0,0,...,False,True,False,False,True,False,True,False,False,False
3,0,40,1,4,34,3,4,1,3313.0,2517,...,False,False,True,False,False,False,True,False,False,False
4,0,40,0,3,21,5,1,0,4716.0,0,...,True,False,False,False,True,False,True,False,False,False


Updated Shape: (10127, 33)


## **Step 3: Encode Categorical Columns and Clean Data**

In [6]:

# Encode target variable and binary features
data['Attrition_Flag'] = data['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})
data['Gender'] = data['Gender'].replace({'F': 1, 'M': 0})

# One-hot encode categorical features
categorical_cols = ['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category']
for col in categorical_cols:
    if 'Unknown' in data[col].unique():
        data = pd.concat([data, pd.get_dummies(data[col], prefix=col).drop(columns='{}_Unknown'.format(col))], axis=1)
    else:
        data = pd.concat([data, pd.get_dummies(data[col], prefix=col)], axis=1)

# Drop original categorical columns
data.drop(columns=categorical_cols, inplace=True)

print("Cleaned Data:")
display(data.head())
print("Updated Shape:", data.shape)


KeyError: 'Education_Level'

## **Step 4: Exploratory Data Analysis**

In [None]:

# Visualize class imbalance
sns.countplot(x='Attrition_Flag', data=data)
plt.title("Churn Distribution")
plt.show()

# Boxplots for numerical features to identify patterns
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_features, 1):
    plt.subplot(4, 4, i)
    sns.boxplot(y=data[col], x=data['Attrition_Flag'])
    plt.title(f"{col} by Churn")
plt.tight_layout()
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot for Total_Trans_Ct vs Total_Trans_Amt
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Total_Trans_Ct', y='Total_Trans_Amt', data=data)
plt.title('Relationship Between Transaction Count and Amount')
plt.show()


## **Step 5: Train-Test Split and Class Balancing with SMOTE**

In [None]:

# Split data
X = data.drop(columns=['Attrition_Flag'])
y = data['Attrition_Flag']

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# SMOTE for imbalance handling
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Class Distribution After SMOTE:")
print(y_train_res.value_counts())


## **Step 6: Random Forest Model**

In [None]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train_res, y_train_res)

y_pred_rf = rf_model.predict(X_test)

print("Random Forest Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print(f"Precision: {precision_score(y_test, y_pred_rf):.2f}")
print(f"Recall: {recall_score(y_test, y_pred_rf):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))


## **Step 7: XGBoost Model**

In [None]:

xgb_model = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train_res, y_train_res)

y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.2f}")
print(f"Precision: {precision_score(y_test, y_pred_xgb):.2f}")
print(f"Recall: {recall_score(y_test, y_pred_xgb):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred_xgb):.2f}")
print("Classification Report:", classification_report(y_test, y_pred_xgb))


## **Step 8: Feature Importance**

In [None]:

importances = xgb_model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
plt.title("Top 10 Feature Importances")
plt.show()


## **Step 9: Feature Redundancy Test**
In this section, we evaluate whether `Total_Trans_Amt` can be safely dropped without significantly impacting the model performance. This is done by training and comparing the XGBoost model with and without the feature.

In [None]:

# Redefine features and target variable to avoid earlier modifications
X = data.drop(columns=['Attrition_Flag'])  # All features
y = data['Attrition_Flag']                # Target variable

# Section: Evaluating Redundant Features (Total_Trans_Ct vs Total_Trans_Amt)

# 1. Train model with both features
X_full = X.copy()  # Original features
y_full = y.copy()

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full, test_size=0.3, random_state=42, stratify=y_full)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res_full, y_train_res_full = smote.fit_resample(X_train_full, y_train_full)

# Train XGBoost with both features
xgb_full = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
xgb_full.fit(X_train_res_full, y_train_res_full)
y_pred_full = xgb_full.predict(X_test_full)

# Evaluate
print("Model Performance with Both Features:")
print(f"Accuracy: {accuracy_score(y_test_full, y_pred_full):.2f}")
print(f"F1-Score: {f1_score(y_test_full, y_pred_full):.2f}")

# 2. Train model without Total_Trans_Amt
X_reduced = X.drop(columns=['Total_Trans_Amt'])  # Drop redundant feature

X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X_reduced, y_full, test_size=0.3, random_state=42, stratify=y_full)

# Handle class imbalance
X_train_res_red, y_train_res_red = smote.fit_resample(X_train_red, y_train_red)

# Train XGBoost without Total_Trans_Amt
xgb_red = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
xgb_red.fit(X_train_res_red, y_train_res_red)
y_pred_red = xgb_red.predict(X_test_red)

# Evaluate
print("\nModel Performance Without Total_Trans_Amt:")
print(f"Accuracy: {accuracy_score(y_test_red, y_pred_red):.2f}")
print(f"F1-Score: {f1_score(y_test_red, y_pred_red):.2f}")

# 3. Compare results
print("\nComparison of Model Performance:")
print(f"Accuracy Difference: {accuracy_score(y_test_full, y_pred_full) - accuracy_score(y_test_red, y_pred_red):.4f}")
print(f"F1-Score Difference: {f1_score(y_test_full, y_pred_full) - f1_score(y_test_red, y_pred_red):.4f}")
# Print the confusion matrix for both models
print("Confusion Matrix with Both Features:")
print(confusion_matrix(y_test_full, y_pred_full))

print("\nConfusion Matrix without Total_Trans_Amt:")
print(confusion_matrix(y_test_red, y_pred_red))

# Plot confusion matrices
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test_full, y_pred_full), annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('Confusion Matrix with Both Features')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

sns.heatmap(confusion_matrix(y_test_red, y_pred_red), annot=True, fmt='d', cmap='Blues', ax=axes[1])
axes[1].set_title('Confusion Matrix without Total_Trans_Amt')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')

plt.tight_layout()
plt.show()


In [None]:
# Step 9: Analyze High Change Customers

# 1. Calculate Rel_Amt_Change and Rel_Ct_Change if not already present
if 'Rel_Amt_Change' not in data.columns or 'Rel_Ct_Change' not in data.columns:
    print("Creating Rel_Amt_Change and Rel_Ct_Change features...")
    data['Rel_Amt_Change'] = data['Total_Amt_Chng_Q4_Q1'] / (data['Total_Trans_Amt'] + 1)  # Avoid division by zero
    data['Rel_Ct_Change'] = data['Total_Ct_Chng_Q4_Q1'] / (data['Total_Trans_Ct'] + 1)

# 2. Identify high-change customers
high_change_customers = data[
    (data['Rel_Amt_Change'] > data['Rel_Amt_Change'].quantile(0.95)) | 
    (data['Rel_Ct_Change'] > data['Rel_Ct_Change'].quantile(0.95))
]

# 3. Count churned vs. non-churned customers
print("High Change Customers Churn Analysis:")
print(high_change_customers['Attrition_Flag'].value_counts())

# 4. Visualize High Change Customers
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.countplot(x='Attrition_Flag', data=high_change_customers, palette='coolwarm')
plt.title('Churn Distribution Among High Change Customers')
plt.xlabel('Churn Flag (0 = No Churn, 1 = Churned)')
plt.ylabel('Count')
plt.show()


In [None]:
# Step 10: Test Model Performance with High-Change Features

# Add Rel_Amt_Change and Rel_Ct_Change to the feature set
X_new = data.drop(columns=['Attrition_Flag'])  # Use all current features including new ones
y = data['Attrition_Flag']

# Split the data
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42, stratify=y)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train XGBoost model
xgb_model = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train_res, y_train_res)

# Evaluate model performance
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Model Performance with High-Change Features:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1-Score: {f1:.2f}")

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Evaluate model performance
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Model Performance with High-Change Features:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1-Score: {f1:.2f}")

# Print the Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Confusion Matrix
import matplotlib.pyplot as plt
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Churn', 'Churn'], yticklabels=['No Churn', 'Churn'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# Drop Redundant Features
X_reduced = X.drop(columns=['Total_Revolving_Bal'])

# Train-Test Split with Reduced Features
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(
    X_reduced, y, test_size=0.3, random_state=42, stratify=y
)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res_red, y_train_res_red = smote.fit_resample(X_train_red, y_train_red)

# Train the XGBoost model
xgb_reduced = XGBClassifier(
    scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42
)
xgb_reduced.fit(X_train_res_red, y_train_res_red)

# Evaluate the Model
y_pred_red = xgb_reduced.predict(X_test_red)
print("Performance After Dropping Redundant Features:")
print("Accuracy:", accuracy_score(y_test_red, y_pred_red))
print("Recall:", recall_score(y_test_red, y_pred_red))
print("F1-Score:", f1_score(y_test_red, y_pred_red))
print("ROC-AUC:", roc_auc_score(y_test_red, xgb_reduced.predict_proba(X_test_red)[:, 1]))

# Confusion Matrix
cm = confusion_matrix(y_test_red, y_pred_red)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.title("Confusion Matrix After Dropping Features")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
from sklearn.metrics import (
    accuracy_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    confusion_matrix
)


# Drop Redundant Features
X_reduced = X.drop(columns=['Total_Revolving_Bal']) 

X_reduced.columns



In [None]:
from sklearn.metrics import (
    accuracy_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    confusion_matrix
)

# Drop Redundant Features

# Train-Test Split with Reduced Features
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(
    X_reduced, y, test_size=0.3, random_state=42, stratify=y
)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res_red, y_train_res_red = smote.fit_resample(X_train_red, y_train_red)

# Train the XGBoost model
xgb_reduced = XGBClassifier(
    scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42
)
xgb_reduced.fit(X_train_res_red, y_train_res_red)

# Evaluate the Model
y_pred_red = xgb_reduced.predict(X_test_red)
print("Performance After Dropping Redundant Features:")
print("Accuracy:", accuracy_score(y_test_red, y_pred_red))
print("Recall:", recall_score(y_test_red, y_pred_red))
print("F1-Score:", f1_score(y_test_red, y_pred_red))
print("ROC-AUC:", roc_auc_score(y_test_red, xgb_reduced.predict_proba(X_test_red)[:, 1]))

# Confusion Matrix
cm = confusion_matrix(y_test_red, y_pred_red)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.title("Confusion Matrix After Dropping Features")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Prepare the data
X = data.drop(columns=['Attrition_Flag'])
y = data['Attrition_Flag']

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-Validation Function
def evaluate_model(model, X, y, model_name):
    scores = cross_val_score(model, X, y, cv=skf, scoring='recall')
    print(f"{model_name} Cross-Validation Recall Scores: {scores}")
    print(f"{model_name} Mean Recall: {scores.mean():.4f}")

# Initialize Models
xgb_model = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Evaluate Models
print("Evaluating XGBoost...")
evaluate_model(xgb_model, X, y, "XGBoost")

print("\nEvaluating Random Forest...")
evaluate_model(rf_model, X, y, "Random Forest")


In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Step 2: Load Dataset into New DataFrame
data_path = 'BankChurners.csv'  # Replace with actual file path
data = pd.read_csv(data_path)
print("Dataset Loaded Successfully!")
print(f"Shape: {data.shape}\n")
print(data.head())

# Step 3: Clean and Preprocess Data
# Drop client-specific or unnecessary columns
data.drop(columns=['CLIENTNUM'], inplace=True)


# Encode target variable
data['Attrition_Flag'] = data['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})

# Encode Gender
data['Gender'] = data['Gender'].replace({'F': 1, 'M': 0})

# One-Hot Encode Categorical Features
data = pd.get_dummies(data, columns=['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category'], drop_first=True)

# Step 4: Handle Feature Redundancy
# Drop redundant features: Credit_Limit, Avg_Open_To_Buy, and Avg_Utilization_Ratio
print("Dropping redundant features: ['Credit_Limit', 'Avg_Open_To_Buy', 'Avg_Utilization_Ratio']")
data.drop(columns=['Credit_Limit', 'Avg_Open_To_Buy', 'Avg_Utilization_Ratio'], inplace=True)

# Step 5: Feature Engineering - Create Derived Features
data['Rel_Amt_Change'] = data['Total_Amt_Chng_Q4_Q1'] / data['Total_Trans_Amt']
data['Rel_Ct_Change'] = data['Total_Ct_Chng_Q4_Q1'] / data['Total_Trans_Ct']

# Fill any resulting NaN or infinite values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

print(f"Updated Data Shape: {data.shape}\n")

# Step 6: Split Data into Train and Test Sets
X = data.drop(columns=['Attrition_Flag'])  # Features
y = data['Attrition_Flag']  # Target

# Train-Test Split with Stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Class Distribution After SMOTE:")
print(y_train_res.value_counts())

# Step 7: Model Training and Evaluation Function
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    
    print(f"\nModel: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("F1-Score:", f1_score(y_test, y_pred))
    if y_pred_proba is not None:
        print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Churn', 'Churn'], yticklabels=['No Churn', 'Churn'])
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    # Classification Report
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 8: Train Models
# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
train_and_evaluate_model(rf_model, X_train_res, X_test, y_train_res, y_test, "Random Forest")

# XGBoost
xgb_model = XGBClassifier(scale_pos_weight=6, n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
train_and_evaluate_model(xgb_model, X_train_res, X_test, y_train_res, y_test, "XGBoost")

# Step 9: Cross-Validation for XGBoost
print("\nPerforming Cross-Validation on XGBoost...")
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='recall')
print("XGBoost Cross-Validation Recall Scores:", xgb_cv_scores)
print("Mean Recall (Cross-Validation):", np.mean(xgb_cv_scores))

# Step 10: High-Change Customer Analysis
# Identify High Change Customers
high_change_customers = data[
    (data['Rel_Amt_Change'] > data['Rel_Amt_Change'].quantile(0.95)) |
    (data['Rel_Ct_Change'] > data['Rel_Ct_Change'].quantile(0.95))
]

print("\nHigh Change Customers Churn Analysis:")
print(high_change_customers['Attrition_Flag'].value_counts())

sns.countplot(x='Attrition_Flag', data=high_change_customers, palette='coolwarm')
plt.title('High-Change Customers Churn Distribution')
plt.show()

print("\nScript Completed Successfully!")

# Feature Importance for Random Forest
rf_importances = rf_model.feature_importances_
rf_indices = np.argsort(rf_importances)[::-1]

plt.figure(figsize=(12, 6))
plt.title("Feature Importances - Random Forest")
plt.bar(range(X.shape[1]), rf_importances[rf_indices], align="center")
plt.xticks(range(X.shape[1]), X.columns[rf_indices], rotation=90)
plt.tight_layout()
plt.show()

# Feature Importance for XGBoost
xgb_importances = xgb_model.feature_importances_
xgb_indices = np.argsort(xgb_importances)[::-1]

plt.figure(figsize=(12, 6))
plt.title("Feature Importances - XGBoost")
plt.bar(range(X.shape[1]), xgb_importances[xgb_indices], align="center")
plt.xticks(range(X.shape[1]), X.columns[xgb_indices], rotation=90)
plt.tight_layout()
plt.show()
