In [2]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from imblearn.over_sampling import SMOTE

# Step 2: Load Dataset into New DataFrame
data_path = 'BankChurners.csv'  # Replace with actual file path
data = pd.read_csv(data_path)
print("Dataset Loaded Successfully!")
print(f"Shape: {data.shape}\n")
print(data.head())

# Step 3: Exploratory Data Analysis (EDA)
print("\n--- Data Information ---")
print(data.info())

print("\n--- Summary Statistics ---")
print(data.describe())

# Visualize missing values
plt.figure(figsize=(10, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Values in Dataset")
plt.show()

# Visualize target variable distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='Attrition_Flag', data=data, palette='coolwarm')
plt.title("Target Variable Distribution")
plt.show()

# Visualize numerical feature distributions
numerical_features = data.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(15, 10))
data[numerical_features].hist(bins=20, figsize=(15, 10), layout=(4, 5))
plt.tight_layout()
plt.show()

# Step 4: Clean and Preprocess Data
# Drop client-specific or unnecessary columns
data.drop(columns=['CLIENTNUM'], inplace=True)

# Encode target variable
data['Attrition_Flag'] = data['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})

# Encode Gender
data['Gender'] = data['Gender'].replace({'F': 1, 'M': 0})

# One-Hot Encode Categorical Features
data = pd.get_dummies(data, columns=['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category'], drop_first=True)

# Step 5: Handle Feature Redundancy
# Drop redundant features: Credit_Limit, Avg_Open_To_Buy, and Avg_Utilization_Ratio
print("Dropping redundant features: ['Credit_Limit', 'Avg_Open_To_Buy', 'Avg_Utilization_Ratio']")
data.drop(columns=['Credit_Limit', 'Avg_Open_To_Buy', 'Avg_Utilization_Ratio'], inplace=True)

# Step 6: Feature Scaling
scaler = StandardScaler()
numerical_features = data.select_dtypes(include=[np.number]).columns.drop('Attrition_Flag')
data_scaled = pd.DataFrame(scaler.fit_transform(data[numerical_features]), columns=numerical_features)
print("\nData Scaled Successfully!")

# Step 7: Apply Clustering Methods
# K-Means Clustering
X = data_scaled.copy()
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
X['Cluster'] = kmeans.labels_

# Evaluate Clustering
silhouette_avg = silhouette_score(X.drop(columns=['Cluster']), kmeans.labels_)
print(f"\nK-Means Silhouette Score: {silhouette_avg:.2f}")

# Visualize Clusters using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.drop(columns=['Cluster']))
X['PCA1'] = X_pca[:, 0]
X['PCA2'] = X_pca[:, 1]

plt.figure(figsize=(8, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=X, palette='coolwarm')
plt.title("K-Means Clustering Results (PCA Visualization)")
plt.show()

# Step 8: Analyze Cluster Characteristics
X['Cluster'] = kmeans.labels_
X['Attrition_Flag'] = data['Attrition_Flag'].reset_index(drop=True)

print("\n--- Cluster Churn Rate ---")
cluster_summary = X.groupby('Cluster')['Attrition_Flag'].mean()
print(cluster_summary)

print("\n--- Cluster Size ---")
cluster_size = X['Cluster'].value_counts()
print(cluster_size)

# Visualize Cluster Characteristics
plt.figure(figsize=(10, 6))
cluster_summary.plot(kind='bar', color='skyblue')
plt.title("Average Churn Rate by Cluster")
plt.xlabel("Cluster")
plt.ylabel("Average Churn Rate")
plt.show()

print("\nClustering Analysis Completed Successfully!")


FileNotFoundError: [Errno 2] No such file or directory: 'BankChurners.csv'