In [4]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

In [None]:
# 🎯 Path to the raw CSV, relative to the working directory.
# A relative path keeps the notebook independent of any one machine's folder
# layout; run the notebook with the repository root as the working directory.
file_path = "data/raw/Bank_Churn.csv"

# 📥 Read the CSV file into a DataFrame (raises if the file cannot be found or parsed)
df = pd.read_csv(file_path)

# ✅ Only reached when the read above succeeded
print("✅ Your file has been successfully read into a DataFrame!")


In [None]:
# 📊 Summarise the dataset: its dimensions first, then pandas' per-column report
dataset_shape = df.shape
print("\n📘 Dataset Overview")
print("Shape:", dataset_shape)
print("\n📝 Dataset Information:")
df.info()  # prints its report itself; there is no return value to display

In [None]:
# 📈 Descriptive statistics as produced by DataFrame.describe() with default settings
print("\n📊 Basic Statistics:")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# 🔍 Count the missing entries in every column
print("\n🚨 Missing Values:")
missing_per_column = df.isna().sum()  # isna() is the same check as isnull()
print(missing_per_column)

# 1. 👥 Customer Demographics Analysis

## 1.1 🎂 Age Distribution  

In [None]:

print("\n🔍 Analyzing Age Distribution by Churn Status...")

# Histogram of Age coloured by the Exited flag, with a box plot in the margin
age_hist_options = {
    'x': 'Age',
    'color': 'Exited',
    'marginal': 'box',
    'title': '🎂 Age Distribution by Churn Status',
    'labels': {'Exited': 'Churned'},
}
fig = px.histogram(df, **age_hist_options)
fig.show()

## 1.2 🌍 Geographic Distribution  

In [None]:
print("🔍 Analyzing Churn Rate by Geography...")

# Per-geography row count and mean of Exited; the mean is rescaled to a percentage
geo_churn = (
    df.groupby('Geography')['Exited']
    .agg(['count', 'mean'])
    .reset_index()
    .assign(churn_rate=lambda stats: stats['mean'] * 100)
)

# Bar chart of the percentage, with each bar's value written above it
fig = px.bar(
    geo_churn,
    x='Geography',
    y='churn_rate',
    title='🌍 Churn Rate by Geography',
    labels={'churn_rate': 'Churn Rate (%)'},
    text='churn_rate',
).update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.show()

# 2. 💰 Financial Analysis  

## 2.1 💳 Balance Distribution  

In [None]:
print("🔍 Analyzing Balance Distribution by Geography and Churn Status...")

# One box per (Geography, Exited) combination, showing the spread of Balance
balance_box_options = {
    'x': 'Geography',
    'y': 'Balance',
    'color': 'Exited',
    'title': '💳 Balance Distribution by Geography and Churn Status',
    'labels': {'Balance': 'Account Balance'},
}
fig = px.box(df, **balance_box_options)
fig.show()

## 2.2 📊 Credit Score Analysis  

In [None]:
print("🔍 Analyzing Credit Score vs Balance...")

# Scatter of CreditScore against Balance: colour encodes Exited, marker size
# encodes Age, and CustomerId is added to the hover tooltip
scatter_options = {
    'x': 'CreditScore',
    'y': 'Balance',
    'color': 'Exited',
    'size': 'Age',
    'title': '📊 Credit Score vs Balance',
    'labels': {'Exited': 'Churned'},
    'hover_data': ['CustomerId'],
}
fig = px.scatter(df, **scatter_options)
fig.show()

# 3. 🛍️ Product Analysis  

## 3.1 📦 Number of Products 

In [None]:
print("🔍 Analyzing Churn Rate by Number of Products...")

# Row count and mean of Exited for each NumOfProducts value, plus the mean as a percentage
product_churn = (
    df.groupby('NumOfProducts')['Exited']
    .agg(['count', 'mean'])
    .reset_index()
    .assign(churn_rate=lambda stats: stats['mean'] * 100)
)

# Bar chart of the percentage, with each bar's value written above it
fig = px.bar(
    product_churn,
    x='NumOfProducts',
    y='churn_rate',
    title='🛍️ Churn Rate by Number of Products',
    labels={'churn_rate': 'Churn Rate (%)'},
    text='churn_rate',
).update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.show()

# 4. 📈 Customer Engagement Analysis  

## 4.1 🔄 Active Member Status 

In [None]:
print("🔍 Analyzing Active vs Inactive Members...")

# Row count and mean of Exited per IsActiveMember value; churn_rate is the mean
# as a percentage (computed here, although the pie below only uses the counts)
active_churn = (
    df.groupby('IsActiveMember')['Exited']
    .agg(['count', 'mean'])
    .reset_index()
    .assign(churn_rate=lambda stats: stats['mean'] * 100)
)

# Donut chart of how many rows fall under each IsActiveMember value
fig = px.pie(
    active_churn,
    values='count',
    names='IsActiveMember',
    title='🔄 Distribution of Active vs Inactive Members',
    hole=0.4,
).update_traces(textposition='inside', textinfo='percent+label')
fig.show()

# 5. 📊 Correlation Analysis  

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
print("🔍 Generating Correlation Matrix...")

# Non-numeric columns are excluded before computing the correlations
numeric_df = df.select_dtypes(include=[np.number])
correlation = numeric_df.corr()

# Fix the colour range to [-1, 1] so the scale is centred on zero correlation
heatmap_options = {
    'title': '📊 Correlation Matrix',
    'color_continuous_scale': 'RdBu',
    'range_color': (-1, 1),
}
fig = px.imshow(correlation, **heatmap_options)
fig.update_layout(coloraxis_colorbar={'title': "Correlation"})
fig.show()

# 6. 🧩 Customer Segmentation  

## 6.2 💸 Create Balance Segments  

In [None]:
print("🔍 Creating Balance Segments for Segmentation...")

# Cut Balance into (up to) four quantile bins once. duplicates='drop' merges bins
# whose edges coincide, so fewer than four bins may come back.
balance_bins = pd.qcut(df['Balance'], q=4, duplicates='drop')

# Count the bins from the categories rather than from unique(): unique() also
# reports NaN when Balance has missing values, which would ask for one label too many.
num_bins = len(balance_bins.cat.categories)

# Keep only as many labels as there are bins.
# NOTE: the list is truncated from the end, so when bins collapse the top segment
# takes the last remaining label (e.g. 'Medium-High' when there are three bins).
labels = ['Low', 'Medium-Low', 'Medium-High', 'High'][:num_bins]

# Relabel the interval categories in order; rows with a missing Balance stay NaN
df['BalanceSegment'] = balance_bins.cat.rename_categories(labels)

print(df[['Balance', 'BalanceSegment']].head())

In [None]:
# Number of rows that landed in each balance segment
segment_counts = df['BalanceSegment'].value_counts()
print(segment_counts)

## 6.3 📊 Analyze Segments  

In [None]:
# Creating Age Groups
print("🔍 Creating Age Groups for Segmentation...")

# Left-closed bins ([18, 30), [30, 40), ...) because right=False.
# The last edge is open-ended so ages of 90 and above land in a '90+' group;
# with a closing edge of 90 they fell outside every bin, became NaN and were
# silently dropped by the later group-by on AgeGroup.
age_bins = [18, 30, 40, 50, 60, 70, 80, 90, np.inf]
age_labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# Ages below 18 (or missing) still get no group; report them instead of losing them quietly
unbinned_rows = int(df['AgeGroup'].isna().sum())
if unbinned_rows:
    print(f"⚠️ {unbinned_rows} rows have no AgeGroup (Age missing or below {age_bins[0]})")

print(df[['Age', 'AgeGroup']].head())

In [None]:
print("🔍 Analyzing Customer Segments...")

# Row count and mean of Exited for every observed (AgeGroup, BalanceSegment) pair;
# observed=True leaves out category combinations that never occur in the data
segment_analysis = (
    df.groupby(['AgeGroup', 'BalanceSegment'], observed=True)['Exited']
    .agg(['count', 'mean'])
    .reset_index()
    .assign(churn_rate=lambda stats: stats['mean'] * 100)
)

# Treemap: tile area follows the row count, tile colour follows the churn percentage
treemap_options = {
    'path': ['AgeGroup', 'BalanceSegment'],
    'values': 'count',
    'color': 'churn_rate',
    'color_continuous_scale': 'RdBu',
    'title': '🧩 Customer Segmentation Analysis',
    'hover_data': {'churn_rate': ':.2f'},
}
fig = px.treemap(segment_analysis, **treemap_options)
fig.show()


## 📊 Analysis Questions and Solutions

In [None]:
# Compare churners with non-churners, one figure per attribute.
# This cell reuses the `df`, `sns` and `plt` set up in the earlier cells instead of
# re-importing and re-reading the CSV: reloading replaced `df` and discarded the
# AgeGroup / BalanceSegment columns that earlier cells added to it.

# Separate churners and non-churners
churners = df[df['Exited'] == 1]
non_churners = df[df['Exited'] == 0]

# List of categorical and numerical columns
categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts', 'Contact']
numerical_cols = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary', 'Tenure']

# A column that is not in the data would abort the whole loop; skip it and say so
missing_cols = [col for col in categorical_cols + numerical_cols if col not in df.columns]
if missing_cols:
    print(f"⚠️ Skipping columns not found in the dataset: {missing_cols}")

# Legend text for the hue levels that seaborn labels '0' and '1';
# any other level keeps the label seaborn gave it
churn_names = {'0': 'Non-Churners', '1': 'Churners'}

# Analyze categorical attributes: counts per category, split by Exited
for col in categorical_cols:
    if col in missing_cols:
        continue
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(x=col, hue='Exited', data=df, palette='Set2', ax=ax)
    ax.set_title(f'Distribution of {col} by Churn Status')
    # Rename the legend entries while keeping seaborn's own handles. Passing only
    # `labels` pairs them with artists purely by order, which can attach the
    # text to the wrong bars.
    handles, hue_labels = ax.get_legend_handles_labels()
    ax.legend(
        handles=handles,
        labels=[churn_names.get(label, label) for label in hue_labels],
        title='Exited',
    )
    plt.show()

# Analyze numerical attributes: overlaid density estimates for the two groups
for col in numerical_cols:
    if col in missing_cols:
        continue
    fig, ax = plt.subplots(figsize=(10, 5))
    # fill=True replaces the deprecated shade=True
    sns.kdeplot(churners[col], label='Churners', fill=True, ax=ax)
    sns.kdeplot(non_churners[col], label='Non-Churners', fill=True, ax=ax)
    ax.set_title(f'Distribution of {col} by Churn Status')
    ax.legend()
    plt.show()

# 💡 Save Insights 

In [None]:
print("💾 Saving Insights to JSON file...")

# Headline figures for the dashboard. 'top_churn_segment' is the row of
# segment_analysis with the highest churn_rate, converted to a plain dict.
insights = {
    'total_customers': len(df),
    'churn_rate': round(df['Exited'].mean() * 100, 2),
    'avg_balance': round(df['Balance'].mean(), 2),
    'avg_age': round(df['Age'].mean(), 2),
    'top_churn_segment': segment_analysis.sort_values('churn_rate', ascending=False).iloc[0].to_dict()
}

# 📝 Export insights for dashboard
# The path is relative to the working directory (expected to be the repository
# root) so the notebook is not tied to one machine's folder layout.
insights_path = Path("data/processed/insights.json")
# Create the output folder when it is missing instead of failing inside open()
insights_path.parent.mkdir(parents=True, exist_ok=True)
with insights_path.open('w', encoding='utf-8') as f:
    json.dump(insights, f, indent=4)

print("✅ Insights have been successfully saved to 'insights.json'!")