In [2]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset Loading

In [None]:
# Read the bank customer churn CSV from the Kaggle input directory
DATASET_CSV = "/kaggle/input/bank-customer-churn-dataset/Bank Customer Churn Prediction.csv"
Churn = pd.read_csv(DATASET_CSV)

In [None]:
# Preview the first five rows to sanity-check what was loaded
Churn.head(n=5)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes)
Churn.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the frame's columns
Churn.describe()

In [None]:
# Number of missing entries in each column (isna is the canonical spelling of isnull)
Churn.isna().sum()

In [None]:
# Count rows that are exact repeats of an earlier row
Churn.duplicated(keep='first').sum()

## Feature Engineering
Here are the new features we're going to derive for analysis:

### 1. Tenure Flags
 - Create `tenure_bucket` to identify recent customers, who may have a higher risk of churn.

In [None]:
# Bucket tenure into three bands.
# pd.cut builds right-closed intervals by default, so the first bin would be
# (0, 2] and every customer with tenure == 0 would get NaN instead of a bucket.
# include_lowest=True closes the first interval on the left so that 0 is
# assigned to '0–2 yrs', matching the label.
Churn['tenure_bucket'] = pd.cut(
    Churn['tenure'],
    bins=[0, 2, 5, 10],
    labels=['0–2 yrs', '3–5 yrs', '6–10 yrs'],
    include_lowest=True
)

### 2. Engagement Flags
 - Derive `single_product_flag` and `multiple_products_flag`, indicating low and high engagement.

In [None]:
# Engagement flags based on how many products the customer holds
products_held = Churn['products_number']

# single_product_flag: 1 when exactly one product is held, else 0
Churn['single_product_flag'] = products_held.eq(1).astype(int)

# multiple_products_flag: 1 when more than one product is held, else 0
Churn['multiple_products_flag'] = products_held.gt(1).astype(int)

   - Combine `credit_card` and `active_member` to derive `credit_card_active_flag`, to indicate active engagement with credit products.

In [None]:
# credit_card_active_flag: 1 only when the customer has a credit card AND is an active member
has_card = Churn['credit_card'].eq(1)
is_active = Churn['active_member'].eq(1)
Churn['credit_card_active_flag'] = (has_card & is_active).astype(int)

### 3. Financial Features
- Measure relative wealth with `balance_salary_ratio`, derived from `balance` and `estimated_salary`.

In [None]:
# balance_salary_ratio: account balance relative to estimated salary.
# The denominator is shifted by 1, presumably so a salary of 0 cannot cause a
# division by zero.
# (The 75th-percentile balance threshold is computed in the high_balance_flag
# cell where it is actually used, so it is not duplicated here.)
Churn['balance_salary_ratio'] = Churn['balance'] / (Churn['estimated_salary'] + 1)

   - Derive `high_balance_flag`, identifying high-value customers with balance above the 75th percentile.

In [None]:
# high_balance_flag: 1 for customers whose balance is strictly above the 75th percentile
high_balance_thresh = Churn['balance'].quantile(q=0.75)
Churn['high_balance_flag'] = Churn['balance'].gt(high_balance_thresh).astype(int)

### 4. Age Features
   - `age_group` to segment customers into age brackets.

In [None]:
# age_group: right-closed bins (17, 29], (29, 44], (44, 59], (59, 120]
age_edges = [17, 29, 44, 59, 120]
age_labels = ['18-29', '30-44', '45-59', '60+']
Churn['age_group'] = pd.cut(Churn['age'], bins=age_edges, labels=age_labels)

   - `senior_flag` to identify customers aged **60+**.

In [None]:
# senior_flag: 1 when age is 60 or above, else 0
Churn['senior_flag'] = Churn['age'].ge(60).astype(int)

### 5. Risk Features
   - Combine low engagement, low balance, and inactivity to mark customers at higher churn risk with `high_risk_flag`.

In [None]:
# high_risk_flag: 1 only when all three risk signals hold for the same customer
few_products = Churn['products_number'].le(1)
below_median_balance = Churn['balance'].lt(Churn['balance'].median())
inactive = Churn['active_member'].eq(0)
Churn['high_risk_flag'] = (few_products & below_median_balance & inactive).astype(int)

### 6. Drop Unnecessary Features

In [None]:
#drop unnecessary features
# Guarded so the cell is safe to re-run: on a second run `customer_id` is
# already gone, and an unguarded version would overwrite `churn_copy` with the
# stripped frame and then raise KeyError on the drop.
if 'customer_id' in Churn.columns:
    churn_copy = Churn.copy() #keep a copy (with customer_id) for Power BI model
    Churn = Churn.drop(columns=['customer_id'])

## Exploratory Data Analysis (EDA)
The **goal** of this analysis is to understand **who churns**, **why they churn**, and **which signals *matter most***.

### 1. Target Variable Overview
What percentage of customers churn?

In [None]:
#reporting
# Both series are sorted by class label so that position 0 is always the
# retained class (0) and position 1 the churned class (1). Without the sort,
# value_counts orders by frequency, and the colours assigned by position in
# the chart that consumes churn_pct would depend on which class is larger.
churn_count = Churn['churn'].value_counts().sort_index()
churn_pct = Churn['churn'].value_counts(normalize=True).sort_index() * 100

print('\nChurn Count:')
print(f"Retained: {churn_count[0]}")
print(f"Churned: {churn_count[1]}")

In [None]:
# Bar chart of the churn_pct series (percentage of customers per churn class)
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(churn_pct.index, churn_pct.values, width=0.5,
              color=['skyblue', 'salmon'], zorder=2)

# Light grid drawn behind the bars (zorder 0 < 2) on a tinted background
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Distribution', fontsize=16)
ax.set_xlabel('Churn')
ax.set_ylabel('Percentage (%)')

ax.set_xticks([0, 1])
ax.set_xticklabels(['Retained', 'Churned'])

# Write each bar's value just above its top edge, centred horizontally
for bar in ax.patches:
    pct = bar.get_height()
    x_mid = bar.get_x() + bar.get_width() / 2.
    ax.text(x_mid, pct, f'{pct:.1f}%', ha='center', va='bottom')

plt.show()

### 2. Demographic vs. Churn Analysis
Does churn increase with age?

In [None]:
# Churn rate (% of customers who churned) within each age group.
# The mean of the `churn` column per group is that group's churn rate
# (assumes churn is coded 0 = retained, 1 = churned, as in the reporting cell).
# value_counts() on age_group alone only shows how customers are spread across
# age groups, which does not answer whether churn increases with age.
age_pct = Churn.groupby('age_group', observed=True)['churn'].mean() * 100

#visualization
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(
    age_pct.index.astype(str),
    age_pct.values,
    color=['lightskyblue', 'skyblue', 'salmon', 'lightsalmon'],
    zorder=2
)
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Rate by Age Group', fontsize=16)
ax.set_ylabel('Churn Rate (%)')
ax.set_xlabel('Age Group')

# Label each bar with its churn rate
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height,
            f'{height:.1f}%', ha='center', va='bottom')

plt.show()

Are there gender-based churn differences?

In [None]:
# Number of churned customers per gender: summing the `churn` column within
# each gender counts the churners (assumes churn is coded 0/1).
# A plain value_counts() on gender would report all customers per gender,
# which does not match the "Churn Count" header printed below.
gender_count = Churn.groupby('gender')['churn'].sum()

print('\nChurn Count:')
print(f"Male: {gender_count['Male']}")
print(f"Female: {gender_count['Female']}")

In [None]:
# Churn rate (% churned) within each gender: mean of the `churn` column per
# group (assumes churn is coded 0/1). value_counts() on gender would only show
# the gender split of the customer base, not churn differences.
gender_pct = Churn.groupby('gender')['churn'].mean() * 100

#visualization
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(
    gender_pct.index,
    gender_pct.values,
    width=0.5,
    color=['skyblue', 'salmon'],
    zorder=2
)
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Rate by Gender', fontsize=16)
ax.set_ylabel('Churn Rate (%)')
ax.set_xlabel('Gender')

# Label each bar with its churn rate
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height,
            f'{height:.1f}%', ha='center', va='bottom')

plt.show()

Do some countries churn more than others?

In [None]:
# Churn rate (% churned) within each country: mean of the `churn` column per
# group (assumes churn is coded 0/1). value_counts() on country would only show
# where customers are located, not which countries churn more.
country_pct = Churn.groupby('country')['churn'].mean() * 100

#visualization
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    country_pct.index,
    country_pct.values,
    color=['skyblue', 'salmon', 'lightsalmon'],
    zorder=2
)
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Rate by Country', fontsize=16)
ax.set_ylabel('Churn Rate (%)')
ax.set_xlabel('Country')

# Label each bar with its churn rate
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height,
            f'{height:.1f}%', ha='center', va='bottom')

plt.show()

### 3. Tenure and Loyalty Analysis
Do newer customers churn more?

In [None]:
# Churn rate (% churned) within each tenure bucket: mean of the `churn` column
# per group (assumes churn is coded 0/1). value_counts() on tenure_bucket would
# only show how many customers sit in each bucket, not how often they churn.
tenure_churn_pct = Churn.groupby('tenure_bucket', observed=True)['churn'].mean() * 100

#visualization
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    tenure_churn_pct.index.astype(str),
    tenure_churn_pct.values,
    color=['skyblue', 'salmon', 'lightsalmon'],
    zorder=2
)
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Rate by Account Tenure', fontsize=16)
ax.set_ylabel('Churn Rate (%)')
ax.set_xlabel('Tenure Group')

# Label each bar with its churn rate
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height,
            f'{height:.1f}%', ha='center', va='bottom')

plt.show()

Is long-term loyalty protective?

In [None]:
# Churn rate (% churned) per tenure bucket, computed in this cell so the trend
# line shows churn itself rather than how customers are spread across buckets
# (assumes churn is coded 0/1).
tenure_trend_pct = Churn.groupby('tenure_bucket', observed=True)['churn'].mean() * 100
tenure_labels = tenure_trend_pct.index.astype(str)

#visualization
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    tenure_labels,
    tenure_trend_pct.values,
    color='darksalmon',
    marker='o'
)
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Trend Across Customer Tenure', fontsize=16)
ax.set_ylabel('Churn Rate (%)')
ax.set_xlabel('Tenure Group')

# Annotate every point with its churn rate
for x, y in zip(tenure_labels, tenure_trend_pct.values):
    ax.text(x, y, f'{y:.1f}%', ha='center', va='bottom')

plt.show()

### 4. Financial Behavior Analysis
Do low-balance customers churn more?

In [None]:
# Box plot of account balance, one box per churn class
fig, ax = plt.subplots(figsize=(8, 6))
box_style = dict(palette=['skyblue', 'salmon'], legend=False, zorder=2)
sns.boxplot(data=Churn, x='churn', y='balance', hue='churn', ax=ax, **box_style)

# Light grid behind the boxes on a tinted background
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Account Balance by Churn', fontsize=16)
ax.set_xlabel('Churn')
ax.set_ylabel('Balance')

# Replace the 0/1 tick labels with readable class names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Retained', 'Churned'])

plt.show()

Are high-salary customers more stable?

In [None]:
# Box plot of estimated salary, one box per churn class
fig, ax = plt.subplots(figsize=(8, 6))
box_style = dict(palette=['skyblue', 'salmon'], legend=False, zorder=2)
sns.boxplot(data=Churn, x='churn', y='estimated_salary', hue='churn', ax=ax, **box_style)

# Light grid behind the boxes on a tinted background
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Estimated Salary by Churn', fontsize=16)
ax.set_xlabel('Churn')
ax.set_ylabel('Estimated Salary')

# Replace the 0/1 tick labels with readable class names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Retained', 'Churned'])

plt.show()

Does low credit score correlate with churn?

In [None]:
# Box plot of credit score, one box per churn class
fig, ax = plt.subplots(figsize=(8, 6))
box_style = dict(palette=['skyblue', 'salmon'], legend=False, zorder=2)
sns.boxplot(data=Churn, x='churn', y='credit_score', hue='churn', ax=ax, **box_style)

# Light grid behind the boxes on a tinted background
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Credit Score by Churn', fontsize=16)
ax.set_xlabel('Churn')
ax.set_ylabel('Credit Score')

# Replace the 0/1 tick labels with readable class names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Retained', 'Churned'])

plt.show()

### 5. Product and Engagement Analysis
Do customers with more products churn less?

In [None]:
# Churn rate (% churned) for each number of products held: mean of the `churn`
# column per group (assumes churn is coded 0/1). value_counts() on
# products_number would only show how many customers hold 1, 2, 3 or 4
# products, and its frequency ordering made the colour-to-bar mapping depend on
# the data. groupby returns the groups sorted by product count.
products_pct = Churn.groupby('products_number')['churn'].mean() * 100

# visualization
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(
    products_pct.index,
    products_pct.values,
    color=['lightsalmon', 'salmon', 'skyblue', 'lightskyblue'],
    zorder=2
)
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Rate by Number of Products', fontsize=16)
ax.set_ylabel('Churn Rate (%)')
ax.set_xlabel('Number of Products')

ax.set_xticks([1, 2, 3, 4])

# Label each bar with its churn rate
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height,
            f'{height:.1f}%', ha='center', va='bottom')

plt.show()


Are inactive members more likely to churn?

In [None]:
# Churn rate (% churned) for inactive (0) and active (1) members: mean of the
# `churn` column per group (assumes churn is coded 0/1). value_counts() on
# active_member would only show the inactive/active split of the customer
# base, not whether inactive members churn more.
active_pct = Churn.groupby('active_member')['churn'].mean() * 100

# visualization
fig, ax = plt.subplots(figsize=(6, 5))
ax.bar(
    active_pct.index,
    active_pct.values,
    color=['salmon', 'skyblue'],
    zorder=2
)
ax.grid(True, which='both', color='lightgrey', zorder=0)
ax.set_facecolor('aliceblue')

ax.set_title('Churn Rate by Active Member', fontsize=16)
ax.set_ylabel('Churn Rate (%)')
ax.set_xlabel('Active Member')

ax.set_xticks([0, 1])
ax.set_xticklabels(['Inactive', 'Active'])

# Label each bar with its churn rate
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height,
            f'{height:.1f}%', ha='center', va='bottom')

plt.show()

### 6. Cross Analysis

### 7. Correlation Analysis

## Final Insights

## Prediction Model