# Customer Churn Prediction - Exploratory Data Analysis

This notebook provides comprehensive exploratory data analysis for the customer churn prediction project.

## Objectives:
- Understand data structure and quality
- Identify patterns and relationships
- Discover insights for feature engineering
- Assess data readiness for modeling

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet plus seaborn's "husl" colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Pandas display: set both limits to None so wide frames are not truncated
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

print("Libraries imported successfully!")

In [None]:
# Load the data
import sys
import os

# Make the project's source directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path each time.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_preprocessing import DataPreprocessor

# Generate sample data for analysis.
# N_SAMPLES is the value passed as `n_samples`; change it here to resize the run.
N_SAMPLES = 10000
preprocessor = DataPreprocessor()
df = preprocessor.generate_sample_data(n_samples=N_SAMPLES)
# NOTE(review): whether generate_sample_data is seeded is not visible from this
# notebook - confirm, otherwise the numbers below change on every run.

print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
df.info()

## 1. Data Overview and Quality Assessment

In [None]:
# Basic statistics: summary stats, missing values, dtypes and cardinality of
# the text (object-dtype) columns.
print("=== BASIC STATISTICS ===")
print(df.describe())

print("\n=== MISSING VALUES ===")
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]
if columns_with_missing.empty:
    # Say so explicitly instead of printing a bare "Series([], dtype: int64)"
    print("No missing values found.")
else:
    print(columns_with_missing)

print("\n=== DATA TYPES ===")
print(df.dtypes)

print("\n=== UNIQUE VALUES ===")
for col in df.select_dtypes(include=['object']).columns:
    n_unique = df[col].nunique()  # computed once, reused for the threshold check
    print(f"{col}: {n_unique} unique values")
    # Only list the actual values for low-cardinality columns
    if n_unique < 10:
        print(f"  Values: {df[col].unique()}")
    print()

In [None]:
# Target variable analysis: headline churn numbers plus a bar and a pie chart
# of the class balance.
print("=== TARGET VARIABLE ANALYSIS ===")
churn_rate = df['churned'].mean()
print(f"Overall churn rate: {churn_rate:.2%}")
print(f"Churned customers: {df['churned'].sum():,}")
print(f"Retained customers: {(df['churned'] == 0).sum():,}")

# Count each class once, in a FIXED order (0 = retained, 1 = churned).
# value_counts() orders by frequency, so plotting it directly would swap the bar
# order and colours whenever churners outnumber retained customers (making the
# bar chart disagree with the pie chart), and indexing its result with [0] / [1]
# raises KeyError when one class is absent from the data.
churn_counts = pd.Series(
    [(df['churned'] == 0).sum(), (df['churned'] == 1).sum()],
    index=[0, 1],
)
class_colors = ['skyblue', 'lightcoral']  # retained, churned - shared by both charts

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count plot
churn_counts.plot(kind='bar', ax=axes[0], color=class_colors)
axes[0].set_title('Customer Churn Distribution')
axes[0].set_xlabel('Churned (0=No, 1=Yes)')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=0)

# Pie chart (labels follow the same retained/churned order as churn_counts)
labels = ['Retained', 'Churned']
sizes = churn_counts.tolist()
axes[1].pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=class_colors)
axes[1].set_title('Customer Churn Percentage')

plt.tight_layout()
plt.show()

## 2. Numerical Variables Analysis

In [None]:
# Histograms of the numeric features, one panel per column, each annotated
# with its mean and median.
numerical_cols = ['age', 'tenure_months', 'monthly_charges', 'total_charges', 'support_calls', 'late_payments']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for panel, col in zip(axes, numerical_cols):
    pretty_name = col.replace("_", " ").title()
    values = df[col]

    # Distribution plot
    panel.hist(values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    panel.set_title(f'Distribution of {pretty_name}')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Frequency')

    # Reference lines: mean in red, median in green
    markers = (
        ('Mean', values.mean(), 'red'),
        ('Median', values.median(), 'green'),
    )
    for stat_label, stat_value, line_color in markers:
        panel.axvline(stat_value, color=line_color, linestyle='--', alpha=0.7,
                      label=f'{stat_label}: {stat_value:.1f}')
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Box plots of every numeric feature split by churn status, to compare the two
# groups and spot outliers. Uses `numerical_cols` defined in the previous cell.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for panel, col in zip(axes, numerical_cols):
    pretty_name = col.replace("_", " ").title()
    df.boxplot(column=col, by='churned', ax=panel)
    # Labels are applied after the boxplot call, as in the original cell order
    panel.set_title(f'{pretty_name} by Churn Status')
    panel.set_xlabel('Churned (0=No, 1=Yes)')
    panel.set_ylabel(pretty_name)

plt.suptitle('Numerical Variables by Churn Status', y=1.02)
plt.tight_layout()
plt.show()

## 3. Categorical Variables Analysis

In [None]:
# Per-category churn proportions for each categorical feature, drawn as grouped
# bars (one panel per feature).
categorical_cols = ['gender', 'contract_type', 'payment_method', 'internet_service', 'online_security', 'tech_support']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

bar_colors = ['skyblue', 'lightcoral']

for panel, col in zip(axes, categorical_cols):
    pretty_name = col.replace("_", " ").title()

    # Row-normalised crosstab: within each category, the share of each churn value
    churn_share = pd.crosstab(df[col], df['churned'], normalize='index')

    churn_share.plot(kind='bar', ax=panel, color=bar_colors)
    panel.set_title(f'Churn Rate by {pretty_name}')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Proportion')
    panel.legend(['Retained', 'Churned'])
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Tabular churn summary for each categorical feature: customers per category,
# how many churned, and the churn rate (also as a percentage), highest rate first.
print("=== CHURN RATE BY CATEGORICAL VARIABLES ===")

column_labels = {'count': 'Total_Customers', 'sum': 'Churned_Count', 'mean': 'Churn_Rate'}

for col in categorical_cols:
    print(f"\n{col.upper()}:")
    churn_by_category = (
        df.groupby(col)['churned']
        .agg(['count', 'sum', 'mean'])
        .round(3)
        .rename(columns=column_labels)
    )
    # Percentage is derived from the already-rounded rate
    churn_by_category['Churn_Rate_Pct'] = (churn_by_category['Churn_Rate'] * 100).round(1)
    print(churn_by_category.sort_values('Churn_Rate', ascending=False))

## 4. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric features and the churn flag
numerical_df = df[[*numerical_cols, 'churned']]
correlation_matrix = numerical_df.corr()

# Heatmap; the mask hides the upper triangle including the diagonal
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Numerical Variables')
plt.tight_layout()
plt.show()

# Feature-to-churn correlations, ranked by absolute strength (sign preserved)
print("\n=== CORRELATIONS WITH CHURN ===")
feature_vs_churn = correlation_matrix['churned'].drop('churned')
churn_correlations = feature_vs_churn.sort_values(key=lambda s: s.abs(), ascending=False)
print(churn_correlations)

## 5. Advanced Analysis

In [None]:
# Customer segmentation analysis: bucket customers by tenure and by monthly
# charge, then compare churn rates across the resulting grid.
# Adds two columns to df: 'tenure_segment' and 'charge_segment'.
print("=== CUSTOMER SEGMENTATION ANALYSIS ===")

# Create tenure segments.
# pd.cut bins are right-closed, i.e. (0, 12], (12, 24], ... - without
# include_lowest=True a customer with tenure_months == 0 would fall outside
# every bin and get NaN instead of the 'New (0-12m)' label.
df['tenure_segment'] = pd.cut(df['tenure_months'],
                             bins=[0, 12, 24, 48, float('inf')],
                             labels=['New (0-12m)', 'Growing (12-24m)', 'Mature (24-48m)', 'Veteran (48m+)'],
                             include_lowest=True)

# Create charge segments (quartiles of monthly_charges)
df['charge_segment'] = pd.qcut(df['monthly_charges'],
                              q=4,
                              labels=['Low', 'Medium', 'High', 'Premium'])

# Analyze churn by segments.
# Both keys are categoricals; observed=False is passed explicitly so the full
# tenure x charge grid is reported regardless of the pandas version's default.
segment_analysis = (
    df.groupby(['tenure_segment', 'charge_segment'], observed=False)['churned']
    .agg(['count', 'mean'])
    .round(3)
)
segment_analysis.columns = ['Customer_Count', 'Churn_Rate']
print(segment_analysis)

# Visualize segment analysis: mean of the 0/1 churn flag per cell = churn rate
pivot_table = df.pivot_table(values='churned', index='tenure_segment', columns='charge_segment',
                             aggfunc='mean', observed=False)

plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, cmap='Reds', fmt='.3f', cbar_kws={'label': 'Churn Rate'})
plt.title('Churn Rate by Tenure and Charge Segments')
plt.ylabel('Tenure Segment')
plt.xlabel('Charge Segment')
plt.tight_layout()
plt.show()

In [None]:
# Customer lifetime value analysis: derive simple revenue metrics and compare
# them between churned and retained customers.
# Adds two columns to df: 'avg_monthly_revenue' and 'clv_estimate'.
print("=== CUSTOMER LIFETIME VALUE ANALYSIS ===")

# Calculate basic CLV metrics.
# A zero tenure would turn the division into inf (or NaN for 0/0); those values
# would then be written to the exported CSV. Treat zero tenure as "undefined"
# (NaN) instead.
df['avg_monthly_revenue'] = df['total_charges'] / df['tenure_months'].replace(0, np.nan)
df['clv_estimate'] = df['monthly_charges'] * 24  # 2-year estimate

# CLV by churn status
clv_analysis = df.groupby('churned').agg({
    'monthly_charges': ['mean', 'sum'],
    'total_charges': ['mean', 'sum'],
    'clv_estimate': ['mean', 'sum'],
    'tenure_months': 'mean'
}).round(2)

print("CLV Analysis by Churn Status:")
print(clv_analysis)

# Visualize CLV distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# CLV by churn status
df.boxplot(column='clv_estimate', by='churned', ax=axes[0])
axes[0].set_title('CLV Distribution by Churn Status')
axes[0].set_xlabel('Churned (0=No, 1=Yes)')
axes[0].set_ylabel('Estimated CLV ($)')

# Revenue at risk: total estimated CLV held by each group
clv_by_status = df.groupby('churned')['clv_estimate'].sum()
churned_clv = clv_by_status.get(1, 0.0)   # 0.0 if no customer churned
retained_clv = clv_by_status.get(0, 0.0)  # 0.0 if every customer churned

axes[1].bar(['Retained', 'Churned'], [retained_clv, churned_clv],
           color=['skyblue', 'lightcoral'])
axes[1].set_title('Total CLV by Churn Status')
axes[1].set_ylabel('Total CLV ($)')

# Add value labels slightly above each bar
for i, v in enumerate([retained_clv, churned_clv]):
    axes[1].text(i, v + churned_clv * 0.01, f'${v:,.0f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print(f"\nRevenue at risk from churned customers: ${churned_clv:,.2f}")
print(f"Percentage of total CLV at risk: {(churned_clv / (churned_clv + retained_clv)) * 100:.1f}%")

## 6. Key Insights and Recommendations

In [None]:
# Text summary of the EDA findings. Relies on `correlation_matrix` and the
# 'clv_estimate' column created in earlier cells.
print("=== KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS ===")
print()

# 1. Overall churn insights
churn_rate = df['churned'].mean()
n_churned = df['churned'].sum()
print("1. CHURN OVERVIEW:")
print(f"   • Overall churn rate: {churn_rate:.1%}")
print(f"   • {n_churned:,} out of {len(df):,} customers have churned")
print()

# 2. High-risk segments (category with the highest mean churn)
print("2. HIGH-RISK SEGMENTS:")
contract_churn = df.groupby('contract_type')['churned'].mean()
payment_churn = df.groupby('payment_method')['churned'].mean()
high_risk_contract = contract_churn.idxmax()
high_risk_payment = payment_churn.idxmax()
print(f"   • Highest risk contract type: {high_risk_contract} ({contract_churn.max():.1%} churn rate)")
print(f"   • Highest risk payment method: {high_risk_payment} ({payment_churn.max():.1%} churn rate)")
print()

# 3. Financial impact, summed over churned customers only
print("3. FINANCIAL IMPACT:")
churned_customers = df[df['churned'] == 1]
monthly_loss = churned_customers['monthly_charges'].sum()
clv_loss = churned_customers['clv_estimate'].sum()
print(f"   • Immediate monthly revenue loss: ${monthly_loss:,.2f}")
print(f"   • Estimated CLV loss: ${clv_loss:,.2f}")
print(f"   • Average CLV per churned customer: ${clv_loss / n_churned:,.2f}")
print()

# 4. Correlation insights: three strongest by magnitude, reported with sign
print("4. KEY CORRELATIONS WITH CHURN:")
churn_column = correlation_matrix['churned']
top_correlations = churn_column.drop('churned').abs().sort_values(ascending=False).head(3)
for var in top_correlations.index:
    signed_value = churn_column[var]
    direction = "positive" if signed_value > 0 else "negative"
    print(f"   • {var}: {direction} correlation ({signed_value:.3f})")
print()

# 5. Recommendations (fixed text; only the churn rate is computed)
print("5. RECOMMENDATIONS FOR MODELING:")
print("   • Focus on contract type and payment method as key features")
print("   • Consider tenure-based features for customer lifecycle modeling")
print("   • Include support calls and late payments as behavioral indicators")
print("   • Segment customers by CLV for targeted retention strategies")
print(f"   • Address class imbalance in modeling (churn rate: {churn_rate:.1%})")
print()

print("=== END OF EXPLORATORY DATA ANALYSIS ===")

In [None]:
# Save processed data for modeling.
# The frame written here includes the columns added by the segmentation and CLV
# cells (tenure_segment, charge_segment, avg_monthly_revenue, clv_estimate).
OUTPUT_PATH = '../data/processed/eda_processed_data.csv'

# to_csv does not create missing parent directories, so make sure the target
# folder exists first (`os` is imported in the data-loading cell).
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

df.to_csv(OUTPUT_PATH, index=False)
print("✅ Processed data saved for modeling pipeline")