# **Exploratory Data Analysis (EDA) on Churn Dataset**
This notebook performs a comprehensive EDA, including univariate and bivariate analysis, outlier detection, and correlation analysis.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset.
# NOTE(review): this is an absolute, environment-specific path; the notebook
# only runs where this exact file exists.
file_path = "/mnt/data/Churn_Modelling (2).csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Plain-text heading followed by the frame's schema summary (dtypes, non-null counts).
print("## Dataset Overview")
df.info()

# Final expression of the cell: the first rows of the frame.
df.head()

In [None]:
# Univariate analysis: one histogram (with a KDE overlay) per numeric feature.
sns.set_style("whitegrid")
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Skip the first two numeric columns (RowNumber & CustomerId in this dataset).
hist_cols = num_cols[2:]

# Size the grid from the number of columns actually plotted rather than
# hard-coding 3x3: a fixed grid raises IndexError when there are more columns
# than panels and leaves blank framed panels when there are fewer.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(hist_cols) // n_grid_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(15, 4 * n_grid_rows),  # 4 inches per row -> (15, 12) for three rows
)
axes = axes.flatten()

for ax, col in zip(axes, hist_cols):
    sns.histplot(df[col], kde=True, ax=ax, bins=30)
    ax.set_title(f"Distribution of {col}")

# Hide panels left over when the column count is not a multiple of the grid width.
for ax in axes[len(hist_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Categorical variables: number of rows per level of each variable.
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts', 'Exited']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, cat_cols):
    # Passing `palette` without `hue` is deprecated in seaborn 0.13 (it emits a
    # FutureWarning on every call) and is scheduled for removal. Assign the x
    # variable to `hue` instead. An explicit list of colours, one per level,
    # keeps the mapping discrete even when the column is numeric (e.g. 0/1 flags).
    level_colors = sns.color_palette("coolwarm", n_colors=df[col].nunique())
    sns.countplot(data=df, x=col, hue=col, palette=level_colors, dodge=False, ax=ax)

    # The hue legend would only repeat the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f"Count of {col}")

plt.tight_layout()
plt.show()

In [None]:
# Bivariate analysis: churn ('Exited') counts within each categorical variable.
# Select by name rather than by position so the target is excluded wherever it
# sits in cat_cols.
churn_by_cols = [col for col in cat_cols if col != 'Exited']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, churn_by_cols):
    sns.countplot(x=df[col], hue=df['Exited'], palette="coolwarm", ax=ax)
    ax.set_title(f"Churn Distribution by {col}")

# Five variables on a six-panel grid: hide the unused panel(s) so the figure
# does not show an empty framed axes.
for ax in axes[len(churn_by_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Outlier detection using the IQR rule
def detect_outliers_iqr(data, column, factor=1.5):
    """Count the values in ``data[column]`` that fall outside the IQR fences.

    A value is an outlier when it is strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of the column in ``data``.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences. The default
        reproduces the original fixed 1.5 * IQR fences.

    Returns
    -------
    int
        Number of rows whose value lies outside the fences. Missing values
        never compare as outliers, so they are not counted.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)

    # Count straight from the boolean mask; no filtered copy of the frame is needed.
    is_outlier = (values < q1 - spread) | (values > q3 + spread)
    return int(is_outlier.sum())

# Tally IQR outliers for every numeric column after the first two,
# keyed by column name.
outlier_counts = dict(
    (name, detect_outliers_iqr(df, name)) for name in num_cols[2:]
)

print("## Outlier Detection using IQR")
print(outlier_counts)

In [None]:
# Correlation analysis between numeric features.
# Use the same selection as the other analysis cells (num_cols[2:]): the first
# two numeric columns are row/customer identifiers in this dataset, so their
# correlations carry no meaning and only crowd the annotated heatmap.
corr_matrix = df[num_cols[2:]].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()