# Comprehensive Exploratory Data Analysis (EDA)
## Debt Collection ML System

This notebook provides in-depth exploratory data analysis for the debt collection dataset, focusing on understanding patterns that influence repayment behavior.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: seaborn-flavoured matplotlib style sheet plus the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# DataFrame display: lift both the column-count and the line-width limits
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print("Libraries imported successfully!")

## 1. Data Loading and Initial Inspection

In [None]:
# Load data
import sys
import os

# Locations are relative to the notebook's working directory.
SRC_DIR = '../src'
RAW_DATA_PATH = '../data/raw/debt_collection_data.csv'
N_SAMPLES = 10000  # rows requested from the generator when no cached CSV exists

# Make the project's packages importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data.data_generator import DebtCollectionDataGenerator

# Reuse the cached CSV when it exists; otherwise generate a dataset and cache it
if os.path.exists(RAW_DATA_PATH):
    df = pd.read_csv(RAW_DATA_PATH)
    print("Loaded existing data")
else:
    generator = DebtCollectionDataGenerator(n_samples=N_SAMPLES)
    df = generator.generate_dataset()

    # Save data so later runs take the fast path above
    os.makedirs(os.path.dirname(RAW_DATA_PATH), exist_ok=True)
    df.to_csv(RAW_DATA_PATH, index=False)
    print("Generated new data")

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Basic dataset information: size, dtypes, and columns containing nulls
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")

print("\n=== DATA TYPES ===")
print(df.dtypes)

print("\n=== MISSING VALUES ===")
missing_data = df.isna().sum()
missing_percent = missing_data.div(len(df)).mul(100)
missing_df = pd.concat(
    [missing_data.rename('Missing Count'), missing_percent.rename('Missing Percentage')],
    axis=1,
)
# Only columns that actually contain nulls are listed
print(missing_df.loc[missing_df['Missing Count'] > 0])

## 2. Target Variable Analysis

In [None]:
# Target variable distribution: absolute counts and percentage shares per outcome
outcome_counts = df['Outcome'].value_counts()
outcome_pct = df['Outcome'].value_counts(normalize=True) * 100

# Side-by-side figure: counts as bars (left), shares as a pie (right)
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Outcome Distribution (Count)', 'Outcome Distribution (Percentage)'),
    specs=[[{"type": "bar"}, {"type": "pie"}]],
)

count_bars = go.Bar(x=outcome_counts.index, y=outcome_counts.values, name='Count')
share_pie = go.Pie(labels=outcome_pct.index, values=outcome_pct.values, name='Percentage')
for column_number, trace in enumerate((count_bars, share_pie), start=1):
    fig.add_trace(trace, row=1, col=column_number)

fig.update_layout(height=400, title_text="Target Variable Analysis")
fig.show()

# Text summary mirroring the charts, in the same order as the counts
print("Outcome Distribution:")
pct_in_count_order = outcome_pct.reindex(outcome_counts.index)
for label, n_rows, share in zip(outcome_counts.index, outcome_counts, pct_in_count_order):
    print(f"{label}: {n_rows} ({share:.1f}%)")

## 3. Demographic Analysis

In [None]:
# Demographic overview: two overlaid histograms (top row) and two stacked
# percentage bar charts (bottom row), all split by outcome
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
outcome_labels = df['Outcome'].unique()

# Top row: one semi-transparent histogram per outcome on each panel.
# NOTE(review): log10 presumes strictly positive Income values; confirm against the data.
histogram_panels = (
    (axes[0, 0], lambda rows: rows['Age'],
     'Age Distribution by Outcome', 'Age'),
    (axes[0, 1], lambda rows: np.log10(rows['Income']),
     'Income Distribution by Outcome (Log Scale)', 'Log10(Income)'),
)
for panel, pick_values, panel_title, x_label in histogram_panels:
    for label in outcome_labels:
        rows_for_label = df[df['Outcome'] == label]
        panel.hist(pick_values(rows_for_label), alpha=0.7, label=label, bins=30)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.legend()

# Bottom row: outcome mix within each category, rows normalised to 100%
occupation_outcome = pd.crosstab(df['Occupation'], df['Outcome'], normalize='index') * 100
region_outcome = pd.crosstab(df['Region'], df['Outcome'], normalize='index') * 100
stacked_panels = (
    (axes[1, 0], occupation_outcome, 'Outcome Distribution by Occupation'),
    (axes[1, 1], region_outcome, 'Outcome Distribution by Region'),
)
for panel, outcome_shares, panel_title in stacked_panels:
    outcome_shares.plot(kind='bar', ax=panel, stacked=True)
    panel.set_title(panel_title)
    panel.set_ylabel('Percentage')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Financial Metrics Analysis

In [None]:
# Working copy of the data with two derived ratios appended
df_analysis = df.assign(
    Debt_to_Income_Ratio=df['Outstanding_Balance'].div(df['Income']),
    Loan_Utilization=df['Outstanding_Balance'].div(df['Loan_Amount']),
)

# Metrics compared across outcomes (Loan_Utilization is derived above but not plotted here)
financial_metrics = ['Outstanding_Balance', 'Loan_Amount', 'Credit_Score', 'Debt_to_Income_Ratio']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

# One box plot per metric, grouped by outcome
for panel, metric in zip(axes, financial_metrics):
    df_analysis.boxplot(column=metric, by='Outcome', ax=panel)
    panel.set_title(f'{metric} by Outcome')
    panel.set_xlabel('Outcome')

# Set after the loop so it replaces the super-title pandas adds for grouped box plots
plt.suptitle('Financial Metrics Analysis', y=1.02)
plt.tight_layout()
plt.show()

# Descriptive statistics of the same metrics, one table per outcome
print("\n=== FINANCIAL METRICS BY OUTCOME ===")
for outcome in df['Outcome'].unique():
    print(f"\n{outcome.upper()}:")
    outcome_rows = df_analysis['Outcome'] == outcome
    print(df_analysis.loc[outcome_rows, financial_metrics].describe())

## 5. Behavioral Patterns Analysis

In [None]:
# Communication behavior analysis on a 2x2 plotly grid
panel_titles = ('Days Past Due Distribution', 'Response Rate by Outcome', 
                'Number of Calls vs Response Rate', 'Contact Channel Effectiveness')
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

# Row subsets per outcome, reused by the histogram and scatter panels.
# Traces are added in the same order as before so legend order and colours match.
rows_by_outcome = {label: df[df['Outcome'] == label] for label in df['Outcome'].unique()}

# Top-left: Days Past Due histograms, one trace per outcome
for label, rows in rows_by_outcome.items():
    dpd_histogram = go.Histogram(x=rows['Days_Past_Due'], name=f'{label}', opacity=0.7)
    fig.add_trace(dpd_histogram, row=1, col=1)

# Top-right: mean response rate per outcome
response_by_outcome = df.groupby('Outcome')['Response_Rate'].mean()
mean_response_bars = go.Bar(
    x=response_by_outcome.index,
    y=response_by_outcome.values,
    name='Avg Response Rate',
)
fig.add_trace(mean_response_bars, row=1, col=2)

# Bottom-left: number of calls against response rate, one marker trace per outcome
for label, rows in rows_by_outcome.items():
    calls_vs_response = go.Scatter(
        x=rows['Number_of_Calls'],
        y=rows['Response_Rate'],
        mode='markers',
        name=f'{label}',
        opacity=0.6,
    )
    fig.add_trace(calls_vs_response, row=2, col=1)

# Bottom-right: outcome mix (in %) within each last-contact channel
channel_effectiveness = df.groupby(['Last_Contact_Channel', 'Outcome']).size().unstack(fill_value=0)
contacts_per_channel = channel_effectiveness.sum(axis=1)
channel_effectiveness_pct = channel_effectiveness.div(contacts_per_channel, axis=0) * 100

for outcome_column in channel_effectiveness_pct.columns:
    channel_bars = go.Bar(
        x=channel_effectiveness_pct.index,
        y=channel_effectiveness_pct[outcome_column],
        name=f'{outcome_column}',
    )
    fig.add_trace(channel_bars, row=2, col=2)

fig.update_layout(height=800, title_text="Behavioral Patterns Analysis")
fig.show()

## 6. Correlation Analysis

In [None]:
# Numeric feature columns, leaving out the Customer_ID identifier
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col != 'Customer_ID'
]

# Pairwise correlation matrix of those columns
corr_matrix = df[numeric_cols].corr()

# Heatmap of the lower triangle only: the mask hides the diagonal and everything above it
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8}, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.show()

# Collect every unordered feature pair whose |r| exceeds 0.7.
# triu_indices(k=1) walks the strict upper triangle row by row, i.e. the same
# (i, j) order as a nested "for i / for j in range(i+1, n)" loop.
feature_names = corr_matrix.columns
corr_values = corr_matrix.to_numpy()
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
high_corr_pairs = [
    (feature_names[r], feature_names[c], corr_values[r, c])
    for r, c in zip(upper_rows, upper_cols)
    if abs(corr_values[r, c]) > 0.7
]

print("\n=== HIGHLY CORRELATED FEATURES (|r| > 0.7) ===")
for left_feature, right_feature, r_value in high_corr_pairs:
    print(f"{left_feature} <-> {right_feature}: {r_value:.3f}")

## 7. Risk Segmentation Analysis

In [None]:
# Create risk segments based on multiple factors
def create_risk_segments(row):
    """Classify one customer record into 'Low Risk', 'Medium Risk' or 'High Risk'.

    Points are accumulated from four factors (1 to 10 points in total):

    * Days_Past_Due: 1 to 4 points (thresholds at 30, 90 and 180 days).
    * Credit_Score: 0 to 3 points (thresholds at 750, 650 and 550).
    * Response_Rate: 0 to 2 points (thresholds at 50 and 20).
    * Outstanding_Balance / Income: 1 point when the ratio exceeds 0.5.

    A total of at most 3 is 'Low Risk', at most 6 is 'Medium Risk', and anything
    higher is 'High Risk'.

    Args:
        row: Mapping-like record (e.g. a DataFrame row or a dict) providing
            'Days_Past_Due', 'Credit_Score', 'Response_Rate',
            'Outstanding_Balance' and 'Income'.

    Returns:
        str: The risk segment label.
    """
    score = 0

    # Days past due (up to 4 of the 10 points)
    if row['Days_Past_Due'] > 180:
        score += 4
    elif row['Days_Past_Due'] > 90:
        score += 3
    elif row['Days_Past_Due'] > 30:
        score += 2
    else:
        score += 1

    # Credit score (up to 3 points); 750 and above adds nothing
    if row['Credit_Score'] < 550:
        score += 3
    elif row['Credit_Score'] < 650:
        score += 2
    elif row['Credit_Score'] < 750:
        score += 1

    # Response rate (up to 2 points)
    # NOTE(review): thresholds 20/50 presume a 0-100 percentage scale; confirm.
    if row['Response_Rate'] < 20:
        score += 2
    elif row['Response_Rate'] < 50:
        score += 1

    # Outstanding balance relative to income (1 point).
    # A zero income is handled explicitly: with plain Python numbers the division
    # would raise ZeroDivisionError. Any positive balance against no income is
    # treated as an unbounded ratio; a zero balance adds nothing.
    income = row['Income']
    balance = row['Outstanding_Balance']
    if income == 0:
        debt_ratio = float('inf') if balance > 0 else 0.0
    else:
        debt_ratio = balance / income
    if debt_ratio > 0.5:
        score += 1

    # Classify risk
    if score <= 3:
        return 'Low Risk'
    elif score <= 6:
        return 'Medium Risk'
    else:
        return 'High Risk'

# Score every customer row and store the resulting segment label
df_analysis['Risk_Segment'] = df_analysis.apply(create_risk_segments, axis=1)

# Outcome mix (in %) within each risk segment
risk_outcome = pd.crosstab(
    df_analysis['Risk_Segment'], df_analysis['Outcome'], normalize='index'
).mul(100)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
segment_pie_ax, outcome_bar_ax = axes

# Left: share of customers in each segment
risk_counts = df_analysis['Risk_Segment'].value_counts()
segment_pie_ax.pie(risk_counts.values, labels=risk_counts.index, autopct='%1.1f%%')
segment_pie_ax.set_title('Risk Segment Distribution')

# Right: stacked outcome percentages per segment
risk_outcome.plot(kind='bar', ax=outcome_bar_ax, stacked=True)
outcome_bar_ax.set_title('Outcome Distribution by Risk Segment')
outcome_bar_ax.set_ylabel('Percentage')
outcome_bar_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\n=== RISK SEGMENT ANALYSIS ===")
print(risk_outcome.round(1))

## 8. Time-based Analysis

In [None]:
# Days Past Due analysis
dpd_bins = [0, 30, 60, 90, 180, 365, float('inf')]
dpd_labels = ['0-30', '31-60', '61-90', '91-180', '181-365', '365+']
# include_lowest=True closes the first interval on the left, so accounts at exactly
# 0 days past due fall into '0-30' instead of being left uncategorised (NaN).
df_analysis['DPD_Category'] = pd.cut(
    df_analysis['Days_Past_Due'], bins=dpd_bins, labels=dpd_labels, include_lowest=True
)

# Payment behavior by DPD category (outcome mix in %, rows normalised to 100%)
dpd_outcome = pd.crosstab(df_analysis['DPD_Category'], df_analysis['Outcome'], normalize='index') * 100


def _plot_category_bars(ax, series, title, ylabel):
    """Draw one bar per index entry of `series` on `ax`, labelled with the index values."""
    positions = range(len(series))
    ax.bar(positions, series.values)
    ax.set_xticks(positions)
    ax.set_xticklabels(series.index, rotation=45)
    ax.set_title(title)
    ax.set_ylabel(ylabel)


fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# DPD category distribution (sort_index keeps the buckets in bin order)
dpd_counts = df_analysis['DPD_Category'].value_counts().sort_index()
_plot_category_bars(axes[0, 0], dpd_counts,
                    'Distribution of Days Past Due Categories', 'Count')

# Payment rate by DPD category.
# observed=False is passed explicitly so empty buckets keep their slot on the
# axis regardless of the pandas version's default for categorical groupers.
payment_rate_by_dpd = (
    df_analysis.groupby('DPD_Category', observed=False)['Payment_Made_Last_30_Days'].mean() * 100
)
_plot_category_bars(axes[0, 1], payment_rate_by_dpd,
                    'Recent Payment Rate by DPD Category', 'Payment Rate (%)')

# Outcome distribution by DPD category
dpd_outcome.plot(kind='bar', ax=axes[1, 0], stacked=True)
axes[1, 0].set_title('Outcome Distribution by DPD Category')
axes[1, 0].set_ylabel('Percentage')
axes[1, 0].tick_params(axis='x', rotation=45)

# Average outstanding balance by DPD category
avg_balance_by_dpd = (
    df_analysis.groupby('DPD_Category', observed=False)['Outstanding_Balance'].mean()
)
_plot_category_bars(axes[1, 1], avg_balance_by_dpd,
                    'Average Outstanding Balance by DPD Category', 'Outstanding Balance')

plt.tight_layout()
plt.show()

## 9. Key Insights and Business Recommendations

In [None]:
# Generate key insights
def _best_paid_segment(frame, column):
    """Return (group label, paid rate in %) for the `column` group with the highest share of 'Paid' outcomes."""
    # Computed once per column; both the winning label and its rate come from this result
    paid_share = frame.groupby(column)['Outcome'].apply(lambda x: (x == 'Paid').mean())
    return paid_share.idxmax(), paid_share.max() * 100


def _paid_rate_for_segment(frame, segment):
    """Return the percentage of 'Paid' outcomes among rows of `frame` in the given Risk_Segment."""
    in_segment = frame['Risk_Segment'] == segment
    return frame.loc[in_segment, 'Outcome'].eq('Paid').mean() * 100


insights = []

# Target distribution insight
paid_rate = (df['Outcome'] == 'Paid').mean() * 100
insights.append(f"Overall payment rate: {paid_rate:.1f}%")

# Best performing segments
best_occupation, best_occupation_rate = _best_paid_segment(df, 'Occupation')
insights.append(f"Best performing occupation: {best_occupation} ({best_occupation_rate:.1f}% payment rate)")

best_region, best_region_rate = _best_paid_segment(df, 'Region')
insights.append(f"Best performing region: {best_region} ({best_region_rate:.1f}% payment rate)")

# Communication insights
best_channel, best_channel_rate = _best_paid_segment(df, 'Last_Contact_Channel')
insights.append(f"Most effective contact channel: {best_channel} ({best_channel_rate:.1f}% payment rate)")

# Risk insights (relies on the Risk_Segment column already present on df_analysis)
high_risk_payment_rate = _paid_rate_for_segment(df_analysis, 'High Risk')
low_risk_payment_rate = _paid_rate_for_segment(df_analysis, 'Low Risk')
insights.append(f"Payment rate difference: Low Risk ({low_risk_payment_rate:.1f}%) vs High Risk ({high_risk_payment_rate:.1f}%)")

# Financial insights
# NOTE(review): presumes the outcome labels are exactly 'Paid' and 'Not Paid'; confirm against the data.
avg_debt_ratio_paid = df_analysis[df_analysis['Outcome'] == 'Paid']['Debt_to_Income_Ratio'].mean()
avg_debt_ratio_not_paid = df_analysis[df_analysis['Outcome'] == 'Not Paid']['Debt_to_Income_Ratio'].mean()
insights.append(f"Debt-to-income ratio: Paid customers ({avg_debt_ratio_paid:.2f}) vs Non-paid ({avg_debt_ratio_not_paid:.2f})")

print("=== KEY INSIGHTS ===")
for i, insight in enumerate(insights, 1):
    print(f"{i}. {insight}")

# Business recommendations
recommendations = [
    f"Focus collection efforts on {best_channel} channel for higher success rates",
    f"Prioritize {best_occupation} customers as they show highest payment propensity",
    "Implement early intervention for customers with debt-to-income ratio > 0.5",
    "Develop specialized strategies for high-risk segments (DPD > 90 days)",
    "Leverage response rate as a key predictor - customers with <20% response rate need different approach",
    f"Expand operations in {best_region} region due to higher collection success"
]

print("\n=== BUSINESS RECOMMENDATIONS ===")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

## 10. Data Quality Assessment

In [None]:
# Data quality assessment: the project's checker report plus four summary charts
from data.data_preprocessor import DataQualityChecker

quality_checker = DataQualityChecker()
quality_report = quality_checker.assess_data_quality(df)

print(quality_checker.generate_quality_report())

# 2x2 grid of data quality visuals
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
missing_ax, duplicate_ax = axes[0, 0], axes[0, 1]
cardinality_ax, dtype_ax = axes[1, 0], axes[1, 1]

# Top-left: null counts for the columns that have any, or a placeholder message
missing_data = df.isna().sum().loc[lambda null_counts: null_counts > 0]
if missing_data.empty:
    missing_ax.text(0.5, 0.5, 'No Missing Data', ha='center', va='center', transform=missing_ax.transAxes)
else:
    bar_positions = range(len(missing_data))
    missing_ax.bar(bar_positions, missing_data.values)
    missing_ax.set_xticks(bar_positions)
    missing_ax.set_xticklabels(missing_data.index, rotation=45)
    missing_ax.set_ylabel('Missing Count')
missing_ax.set_title('Missing Data by Column')

# Top-right: fully duplicated rows versus the rest
duplicate_count = df.duplicated().sum()
unique_count = len(df) - duplicate_count
duplicate_ax.pie([unique_count, duplicate_count],
                 labels=['Unique', 'Duplicates'], autopct='%1.1f%%')
duplicate_ax.set_title(f'Duplicate Records ({duplicate_count} duplicates)')

# Bottom-left: distinct values per column, highest first
cardinality = df.nunique().sort_values(ascending=False)
column_positions = range(len(cardinality))
cardinality_ax.bar(column_positions, cardinality.values)
cardinality_ax.set_xticks(column_positions)
cardinality_ax.set_xticklabels(cardinality.index, rotation=45)
cardinality_ax.set_title('Feature Cardinality')
cardinality_ax.set_ylabel('Unique Values')

# Bottom-right: how many columns use each dtype
dtype_counts = df.dtypes.value_counts()
dtype_ax.pie(dtype_counts.values, labels=dtype_counts.index, autopct='%1.1f%%')
dtype_ax.set_title('Data Types Distribution')

plt.tight_layout()
plt.show()

print(f"\nOverall Data Quality Score: {quality_report['quality_score']:.1f}/100")

## Summary

This comprehensive EDA has revealed several key patterns in the debt collection data:

1. **Target Distribution**: The dataset shows a realistic class imbalance, typical of debt collection scenarios
2. **Demographic Patterns**: Clear differences in payment behavior across age groups, occupations, and regions
3. **Financial Indicators**: Strong correlation between credit scores, debt-to-income ratios, and payment outcomes
4. **Behavioral Insights**: Communication patterns and response rates are strong predictors of payment behavior
5. **Risk Segmentation**: Clear risk tiers can be established based on multiple factors
6. **Data Quality**: High-quality dataset with minimal missing data and realistic distributions

These insights will inform our feature engineering and model development strategies.