# E-Commerce Customer Churn Prediction
## Exploratory Data Analysis (EDA)

**Author:** Muhammad Abdullah  
**Project:** ML Fundamentals - Customer Churn Prediction

---

### Objectives:
1. Load and understand the dataset
2. Analyze data quality (missing values, duplicates)
3. Explore feature distributions
4. Analyze relationships with target variable
5. Identify patterns and insights for modeling

In [None]:
# Import libraries
import sys
from pathlib import Path

# Add project root to path (inserted first so it takes precedence on sys.path).
# NOTE(review): this assumes the kernel's working directory is a direct child
# of the project root (e.g. a notebooks/ folder) - confirm before running
# from elsewhere, because every data/report path below is built from it.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings

# Suppress every warning to keep the notebook output clean.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries; consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')

# Set display options: show all columns and up to 100 rows of a DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set style for the matplotlib/seaborn figures produced below
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print('Libraries imported successfully!')

## 1. Load Data

In [None]:
# Load the dataset
# Update the path to your actual dataset location
DATA_PATH = project_root / 'data' / 'raw' / 'ecommerce_customer_churn.csv'

# Alternative: If using Excel file
# DATA_PATH = project_root / 'data' / 'raw' / 'E Commerce Dataset.xlsx'

try:
    # Compare the suffix case-insensitively so a '.CSV' file is not handed
    # to the Excel reader by mistake.
    if DATA_PATH.suffix.lower() == '.csv':
        df = pd.read_csv(DATA_PATH)
    else:
        df = pd.read_excel(DATA_PATH, sheet_name='E Comm')
except FileNotFoundError:
    print(f'File not found at {DATA_PATH}')
    print('Please place your dataset in the data/raw folder')
    # Create sample data for demonstration so the rest of the notebook can
    # still run. The values are random, so any patterns found are not real.
    print('\nCreating sample data for demonstration...')
    # Fixed seed keeps the synthetic data reproducible; the column order
    # below determines the order of random draws, so do not reorder it.
    np.random.seed(42)
    n_samples = 5000
    df = pd.DataFrame({
        'CustomerID': range(1, n_samples + 1),
        'Churn': np.random.choice([0, 1], n_samples, p=[0.83, 0.17]),
        'Tenure': np.random.randint(0, 61, n_samples),
        'PreferredLoginDevice': np.random.choice(['Mobile Phone', 'Computer', 'Phone'], n_samples),
        'CityTier': np.random.choice([1, 2, 3], n_samples),
        'WarehouseToHome': np.random.uniform(5, 35, n_samples),
        'PreferredPaymentMode': np.random.choice(['Debit Card', 'Credit Card', 'E wallet', 'COD', 'UPI'], n_samples),
        'Gender': np.random.choice(['Male', 'Female'], n_samples),
        'HourSpendOnApp': np.random.uniform(0, 5, n_samples),
        'NumberOfDeviceRegistered': np.random.randint(1, 7, n_samples),
        'PreferedOrderCat': np.random.choice(['Laptop & Accessory', 'Mobile', 'Fashion', 'Grocery', 'Others'], n_samples),
        'SatisfactionScore': np.random.randint(1, 6, n_samples),
        'MaritalStatus': np.random.choice(['Single', 'Married', 'Divorced'], n_samples),
        'NumberOfAddress': np.random.randint(1, 11, n_samples),
        'Complain': np.random.choice([0, 1], n_samples, p=[0.72, 0.28]),
        'OrderAmountHikeFromlastYear': np.random.uniform(11, 26, n_samples),
        'CouponUsed': np.random.randint(0, 16, n_samples),
        'OrderCount': np.random.randint(1, 16, n_samples),
        'DaySinceLastOrder': np.random.randint(0, 46, n_samples),
        'CashbackAmount': np.random.uniform(0, 325, n_samples)
    })
    print(f'Sample data created: {df.shape}')
else:
    # Only reached when one of the readers succeeded.
    print('Dataset loaded successfully!')
    print(f'Shape: {df.shape[0]} rows, {df.shape[1]} columns')

In [None]:
# First look at the data: render the first 10 rows as the cell output
df.head(10)

In [None]:
# Basic info: column names, non-null counts and dtypes.
# df.info() writes its report itself, so it is called bare rather than
# wrapped in print().
print('Dataset Info:')
print('=' * 50)
df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) rendered as the
# cell output. With default arguments pandas summarizes the numeric columns
# of a mixed-dtype frame.
df.describe()

## 2. Data Quality Analysis

In [None]:
# Check for missing values: count and percentage of nulls per column,
# sorted so the most incomplete features come first.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
}).sort_values('Missing %', ascending=False)

print('Missing Values Analysis:')
print('=' * 50)

# Report only the affected features; when there are none, print the
# plain-language message instead of an "Empty DataFrame" repr.
features_with_missing = missing_df[missing_df['Missing Count'] > 0]
if features_with_missing.empty:
    print('No missing values found!')
else:
    print(features_with_missing)

In [None]:
# Visualize missing values: bar chart of the percentage missing per feature.
# Nothing is drawn when the dataset has no missing values at all.
total_missing = missing_df['Missing Count'].sum()
if total_missing > 0:
    has_missing = missing_df['Missing Count'] > 0
    # reset_index() moves the feature names out of the index into a column
    # named 'index', which is what the x axis refers to.
    plot_data = missing_df.loc[has_missing].reset_index()
    fig = px.bar(
        plot_data,
        x='index',
        y='Missing %',
        title='Missing Values by Feature',
        labels={'index': 'Feature', 'Missing %': 'Percentage Missing'},
    )
    fig.show()

In [None]:
# Check for duplicates: rows identical to an earlier row across all columns,
# reported as a count and as a share of the dataset.
duplicates = df.duplicated().sum()
duplicate_share = duplicates / len(df) * 100
print(f'Duplicate rows: {duplicates} ({duplicate_share:.2f}%)')

## 3. Target Variable Analysis

In [None]:
# Target variable distribution: class counts and overall churn rate.
# target_col is reused by the later analysis cells.
target_col = 'Churn'
target_values = df[target_col]

print('Target Variable Distribution:', '=' * 50, sep='\n')
print(target_values.value_counts())

# The mean of the target equals the churn rate when it is coded 0/1.
churn_rate_pct = target_values.mean() * 100
print(f'\nChurn Rate: {churn_rate_pct:.2f}%')

In [None]:
# Visualize target distribution: bar chart (left) and pie chart (right)
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'xy'}, {'type': 'domain'}]])

# value_counts() orders classes by frequency, not by value, so the counts are
# re-aligned to an explicit 0/1 order. This keeps each label and colour
# attached to the right class even if churners are the majority.
class_labels = {0: 'No Churn', 1: 'Churn'}
class_colors = ['#2ecc71', '#e74c3c']
counts = df[target_col].value_counts().reindex(list(class_labels), fill_value=0)
labels = list(class_labels.values())

# Bar chart
fig.add_trace(
    go.Bar(x=labels, y=counts.values, marker_color=class_colors),
    row=1, col=1
)

# Pie chart
fig.add_trace(
    go.Pie(labels=labels, values=counts.values,
           marker=dict(colors=class_colors)),
    row=1, col=2
)

fig.update_layout(title='Churn Distribution', showlegend=False, height=400)
fig.show()

## 4. Feature Analysis

In [None]:
# Identify feature types by dtype: numeric columns vs object (string) columns.
# The customer identifier and the target are not features, so they are
# excluded from the numerical list.
non_feature_cols = {'CustomerID', target_col}

numerical_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name not in non_feature_cols
]
categorical_cols = list(df.select_dtypes(include=['object']).columns)

print(f'Numerical features ({len(numerical_cols)}): {numerical_cols}')
print(f'\nCategorical features ({len(categorical_cols)}): {categorical_cols}')

In [None]:
# Numerical features distribution: one histogram per feature on a 3-wide grid
n_cols = 3
# Ceiling division gives the rows needed; keep at least one row so that
# plt.subplots() does not fail when there are no numerical features.
n_rows = max(1, (len(numerical_cols) + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of axes, so flatten() is safe
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    df[col].hist(bins=30, ax=ax, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col}', fontsize=12)
    ax.set_xlabel('')

# Hide empty subplots left over in the last row
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()

# Create the output folder first; savefig() does not create missing directories
figures_dir = project_root / 'reports' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'numerical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Categorical features distribution: one bar chart of category counts per
# feature on a 2-wide grid
n_cols = 2
# Ceiling division gives the rows needed; keep at least one row so that
# plt.subplots() does not fail when there are no categorical features.
n_rows = max(1, (len(categorical_cols) + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of axes, so flatten() is safe
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4*n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    df[col].value_counts().plot(kind='bar', ax=ax, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col}', fontsize=12)
    ax.set_xlabel('')
    ax.tick_params(axis='x', rotation=45)

# Hide empty subplots left over in the last row
for ax in axes[len(categorical_cols):]:
    ax.set_visible(False)

plt.tight_layout()

# Create the output folder first; savefig() does not create missing directories
figures_dir = project_root / 'reports' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'categorical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Feature vs Target Analysis

In [None]:
# Numerical features vs Churn: one boxplot per feature, split by churn status.
# The grid is sized from the number of features so none is silently dropped
# (a fixed 3x4 grid would cut off everything after the 12th feature).
n_cols = 4
n_rows = max(1, (len(numerical_cols) + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of axes, so flatten() is safe
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4*n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    df.boxplot(column=col, by=target_col, ax=ax)
    # Set the title and x label after plotting so they replace pandas' defaults
    ax.set_title(col)
    ax.set_xlabel('Churn')

# Hide empty subplots left over in the last row
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.suptitle('Numerical Features by Churn Status', fontsize=14, y=1.02)
plt.tight_layout()

# Create the output folder first; savefig() does not create missing directories
figures_dir = project_root / 'reports' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'numerical_vs_churn.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Categorical features vs Churn: grouped histogram of each categorical
# feature, coloured by churn status.
#
# The colour column must hold strings so that plotly treats it as discrete
# and the '0'/'1' keys of color_discrete_map match. The conversion has to be
# applied to the column itself; target_col is only the column *name*, and a
# plain str has no .astype().
if df[target_col].dtype != 'object':
    churn_as_text = df[target_col].astype(str)
else:
    churn_as_text = df[target_col]
plot_df = df.assign(**{target_col: churn_as_text})

for col in categorical_cols:
    fig = px.histogram(
        plot_df, x=col, color=target_col,
        barmode='group',
        title=f'{col} by Churn Status',
        color_discrete_map={'0': '#2ecc71', '1': '#e74c3c'}
    )
    fig.show()

In [None]:
# Churn rate by categorical features: for every categorical column, plot the
# percentage of churners within each category.
for col in categorical_cols:
    # Mean of the target per category, scaled to percent
    churn_rate = df.groupby(col)[target_col].mean() * 100
    categories = churn_rate.index
    rates = churn_rate.values

    bar_options = {
        'title': f'Churn Rate by {col}',
        'labels': {'x': col, 'y': 'Churn Rate (%)'},
        'color': rates,
        # Reversed red-yellow-green scale: higher churn is drawn in red
        'color_continuous_scale': 'RdYlGn_r',
    }
    fig = px.bar(x=categories, y=rates, **bar_options)
    fig.show()

## 6. Correlation Analysis

In [None]:
# Correlation matrix for numerical features, with the target appended so its
# correlations with every feature appear in the same heatmap.
corr_cols = [*numerical_cols, target_col]
corr_matrix = df[corr_cols].corr()

heatmap_options = dict(
    text_auto='.2f',          # annotate each cell with two decimals
    aspect='auto',
    title='Correlation Matrix',
    color_continuous_scale='RdBu_r',
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.update_layout(width=900, height=700)
fig.show()

In [None]:
# Correlation with target: take the target's column from the correlation
# matrix, drop its self-correlation and order from most positive to most
# negative.
target_corr = (
    corr_matrix.loc[:, target_col]
    .drop(index=target_col)
    .sort_values(ascending=False)
)

corr_values = target_corr.values
feature_names = target_corr.index

fig = px.bar(
    x=corr_values,
    y=feature_names,
    orientation='h',
    title='Correlation with Churn',
    labels={'x': 'Correlation', 'y': 'Feature'},
    color=corr_values,
    color_continuous_scale='RdBu_r',
)
fig.update_layout(height=500)
fig.show()

# The series is sorted descending, so the head holds the most positive
# values and the tail the most negative ones.
print('Top positive correlations with Churn:')
print(target_corr.head())
print('\nTop negative correlations with Churn:')
print(target_corr.tail())

## 7. Key Insights Summary

In [None]:
# Print a text summary of the EDA findings computed in the cells above.
print('='*60)
print('KEY INSIGHTS FROM EDA')
print('='*60)

# Count real features: every column except the identifier and the target,
# without assuming that both are present in the loaded file.
feature_count = len([c for c in df.columns if c not in ('CustomerID', target_col)])

churn_rate = df[target_col].mean()

# Either class can be the minority, so test the smaller of the two shares.
minority_share = min(churn_rate, 1 - churn_rate)
imbalance_note = 'Yes - minority class needs handling' if minority_share < 0.3 else 'Moderate'

# Rank by absolute correlation so strong negative relationships count as
# indicators too; the signed values are kept in the printed output.
strongest = target_corr.abs().sort_values(ascending=False).index[:3]
top_indicators = target_corr.loc[strongest]
# Indent every line of the multi-line rendering to match the list layout
top_indicators_text = '\n   '.join(top_indicators.to_string().splitlines())

print(f'''
1. DATASET OVERVIEW:
   - Total samples: {len(df)}
   - Features: {feature_count} (excluding ID and target)
   - Churn rate: {churn_rate*100:.2f}%

2. DATA QUALITY:
   - Missing values: {df.isnull().sum().sum()}
   - Duplicates: {df.duplicated().sum()}
   - Class imbalance: {imbalance_note}

3. TOP CHURN INDICATORS (by absolute correlation):
   {top_indicators_text}

4. FEATURE ENGINEERING OPPORTUNITIES:
   - Create tenure-based segments
   - Calculate engagement scores
   - Build recency/frequency metrics
   - Create satisfaction risk flags

5. MODELING RECOMMENDATIONS:
   - Handle class imbalance (SMOTE/class weights)
   - Use ensemble methods (XGBoost, LightGBM)
   - Perform hyperparameter tuning
   - Apply cross-validation
''')

In [None]:
# Save the data for the modeling stage as Parquet.
# NOTE(review): no cell shown here modifies df, so this appears to be the
# data as loaded - confirm that "processed" is the intended destination.
processed_dir = project_root / 'data' / 'processed'
# to_parquet() does not create missing directories, so make sure it exists
processed_dir.mkdir(parents=True, exist_ok=True)
df.to_parquet(processed_dir / 'eda_data.parquet', index=False)
print('Data saved for modeling!')