# Credit Risk Model - Exploratory Data Analysis

This notebook performs exploratory data analysis (EDA) for the credit risk modeling project.
We'll analyze customer transaction data and develop risk proxies based on RFM (Recency, Frequency, Monetary) metrics.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from datetime import datetime, timedelta
import sys
import os

# Add src to path so the project-local `data_processing` module can be imported.
# The entry is relative, so it is resolved against the current working directory.
sys.path.append('../src')

from data_processing import RFMAnalyzer, FeatureEngineer

# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings('ignore')
# Global plot styling applied to the matplotlib/seaborn figures below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

## 2. Data Loading and Initial Exploration

In [None]:
# For demonstration, we'll create synthetic data
# In practice, you would load your actual transaction data here

def create_synthetic_data(n_customers=1000, seed=42):
    """
    Create synthetic transaction data for demonstration.

    Every customer is randomly assigned one of four behaviour profiles
    ('high_value', 'medium_value', 'low_value', 'churned'). The profile
    controls how many transactions the customer gets, the range their
    amounts are drawn from, and how far back in time the transactions
    tend to lie (the mean of an exponential "days back" distribution).

    Args:
        n_customers: Number of customers to simulate.
        seed: Seed passed to ``np.random.seed`` so the data is reproducible.

    Returns:
        pd.DataFrame with one row per transaction and the columns
        'customer_id', 'transaction_date', 'amount' and 'customer_type'
        (the latter is the sampled profile, kept for validation only).
    """
    np.random.seed(seed)

    # profile -> ((min, max-exclusive) transaction count, amount range, mean days back)
    profiles = {
        'high_value': ((50, 200), (100, 1000), 30),   # Recent transactions
        'medium_value': ((20, 80), (50, 500), 60),
        'low_value': ((5, 30), (20, 200), 120),
        'churned': ((1, 10), (10, 100), 300),
    }
    profile_names = list(profiles)
    profile_probs = [0.15, 0.35, 0.35, 0.15]

    # Relative weight of each hour of the day. The raw weights add up to
    # 1.04, which np.random.choice rejects ("probabilities do not sum to 1"),
    # so normalise them once here, outside the loops.
    hour_weights = np.array([
        0.01, 0.01, 0.01, 0.01, 0.01, 0.02,  # 0-5
        0.03, 0.04, 0.05, 0.06, 0.07, 0.08,  # 6-11
        0.09, 0.08, 0.07, 0.06, 0.05, 0.04,  # 12-17
        0.05, 0.06, 0.05, 0.04, 0.03, 0.02   # 18-23
    ], dtype=float)
    hour_probs = hour_weights / hour_weights.sum()

    end_date = datetime(2024, 12, 31)
    max_days_back = 730  # Max 2 years back

    columns = ['customer_id', 'transaction_date', 'amount', 'customer_type']
    data = []

    for customer_number in range(1, n_customers + 1):
        customer = f"CUST_{customer_number:04d}"

        # Simulate different customer behaviors
        customer_type = str(np.random.choice(profile_names, p=profile_probs))
        count_range, amount_range, recency_bias = profiles[customer_type]
        n_transactions = np.random.randint(*count_range)

        for _ in range(n_transactions):
            # Bias transaction dates based on customer type
            days_back = min(float(np.random.exponential(recency_bias)), max_days_back)
            transaction_date = end_date - timedelta(days=days_back)

            # Add some time of day variation (cast: datetime wants a built-in int)
            hour = int(np.random.choice(24, p=hour_probs))
            transaction_date = transaction_date.replace(hour=hour)

            amount = np.random.uniform(*amount_range)

            data.append({
                'customer_id': customer,
                'transaction_date': transaction_date,
                'amount': round(float(amount), 2),
                'customer_type': customer_type  # For validation only
            })

    # Passing the columns keeps the schema even when no rows were generated.
    return pd.DataFrame(data, columns=columns)

# Create synthetic data
# No seed is passed, so the generator's default seed is used.
df = create_synthetic_data(n_customers=1000)
print(f"Created dataset with {len(df)} transactions for {df['customer_id'].nunique()} customers")
print(f"Date range: {df['transaction_date'].min()} to {df['transaction_date'].max()}")
# Last expression of the cell: the notebook renders the first five rows.
df.head()

## 3. Basic Data Exploration

In [None]:
# Basic statistics
# Headline figures: row count, distinct customers, covered date span,
# total and mean transaction value.
print("Dataset Info:")
print(f"Total transactions: {len(df):,}")
print(f"Unique customers: {df['customer_id'].nunique():,}")
print(f"Date range: {df['transaction_date'].min().date()} to {df['transaction_date'].max().date()}")
print(f"Total transaction value: ${df['amount'].sum():,.2f}")
print(f"Average transaction value: ${df['amount'].mean():.2f}")

# Summary statistics of the 'amount' column as produced by Series.describe().
print("\nTransaction Amount Statistics:")
print(df['amount'].describe())

In [None]:
# Four-panel overview of the raw transactions:
#   top row    -> amount histogram, log1p(amount) histogram
#   bottom row -> transactions per calendar day, transactions per hour of day
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
amount_ax, log_amount_ax = axes[0]
volume_ax, hour_ax = axes[1]

# Raw amounts
amount_ax.hist(df['amount'], bins=50, alpha=0.7, edgecolor='black')
amount_ax.set_title('Distribution of Transaction Amounts')
amount_ax.set_xlabel('Amount ($)')
amount_ax.set_ylabel('Frequency')

# Same amounts on a log1p scale
log_amounts = np.log1p(df['amount'])
log_amount_ax.hist(log_amounts, bins=50, alpha=0.7, edgecolor='black')
log_amount_ax.set_title('Log-Transformed Transaction Amounts')
log_amount_ax.set_xlabel('Log(Amount + 1)')
log_amount_ax.set_ylabel('Frequency')

# Number of transactions per calendar day
transaction_day = df['transaction_date'].dt.date
daily_transactions = df.groupby(transaction_day).size()
volume_ax.plot(daily_transactions.index, daily_transactions.values)
volume_ax.set_title('Daily Transaction Volume')
volume_ax.set_xlabel('Date')
volume_ax.set_ylabel('Number of Transactions')
volume_ax.tick_params(axis='x', rotation=45)

# Number of transactions per hour of day, ordered by hour
transaction_hour = df['transaction_date'].dt.hour
hourly_dist = transaction_hour.value_counts().sort_index()
hour_ax.bar(hourly_dist.index, hourly_dist.values, alpha=0.7)
hour_ax.set_title('Transaction Distribution by Hour')
hour_ax.set_xlabel('Hour of Day')
hour_ax.set_ylabel('Number of Transactions')

plt.tight_layout()
plt.show()

## 4. RFM Analysis

In [None]:
# Set up the analyzer with a fixed reference date of 2024-12-31
rfm_analyzer = RFMAnalyzer(reference_date=datetime(2024, 12, 31))

# Strip the synthetic 'customer_type' label before computing RFM metrics
unlabeled_transactions = df.drop(columns='customer_type')
rfm_data = rfm_analyzer.calculate_rfm(
    unlabeled_transactions,
    customer_id_col='customer_id',
    transaction_date_col='transaction_date',
    amount_col='amount'
)

print("RFM Metrics calculated:")
print(rfm_data.head())

# Summary statistics of the four RFM metric columns
rfm_metric_columns = ['recency', 'frequency', 'monetary_total', 'monetary_avg']
print("\nRFM Statistics:")
print(rfm_data[rfm_metric_columns].describe())

In [None]:
# Histogram of each RFM metric, one panel per metric
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, panel title, x-axis label), listed in row-major panel order
rfm_panels = [
    ('recency', 'Recency Distribution (Days since last transaction)', 'Days'),
    ('frequency', 'Frequency Distribution (Number of transactions)', 'Number of Transactions'),
    ('monetary_total', 'Monetary Distribution (Total spent)', 'Total Amount ($)'),
    ('monetary_avg', 'Average Transaction Amount Distribution', 'Average Amount ($)'),
]

for panel, (column, panel_title, x_label) in zip(axes.flat, rfm_panels):
    panel.hist(rfm_data[column], bins=30, alpha=0.7, edgecolor='black')
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of Customers')

plt.tight_layout()
plt.show()

In [None]:
# Turn the raw RFM metrics into scores
rfm_scores = rfm_analyzer.create_rfm_scores(rfm_data)

print("RFM Scores created:")
score_preview_columns = ['customer_id', 'r_score', 'f_score', 'm_score', 'rfm_score']
print(rfm_scores[score_preview_columns].head())

# One bar chart per score: how many customers received each score value
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel, score in zip(axes, ['r_score', 'f_score', 'm_score']):
    score_counts = rfm_scores[score].value_counts().sort_index()
    panel.bar(score_counts.index, score_counts.values, alpha=0.7)
    panel.set_title(f'{score.upper()} Distribution')
    panel.set_xlabel('Score')
    panel.set_ylabel('Number of Customers')
    # Ticks are fixed at 1..5
    panel.set_xticks(range(1, 6))

plt.tight_layout()
plt.show()

## 5. Risk Segmentation

In [None]:
# Create risk segments
risk_segments = rfm_analyzer.create_risk_segments(rfm_scores)

print("Risk Segments created:")
print("\nSegment Distribution:")
segment_dist = risk_segments['segment'].value_counts()
print(segment_dist)

print("\nRisk Category Distribution:")
risk_dist = risk_segments['risk_category'].value_counts()
print(risk_dist)

print("\nDefault Proxy Distribution:")
default_dist = risk_segments['default_proxy'].value_counts()
# value_counts() only lists values that occur, so look the classes up with
# .get(): a class without customers is reported as 0 instead of raising KeyError.
good_count = default_dist.get(0, 0)
bad_count = default_dist.get(1, 0)
# Guard the percentage against an empty result (no customers at all).
customer_total = max(len(risk_segments), 1)
print(f"Good customers (0): {good_count} ({good_count/customer_total*100:.1f}%)")
print(f"Bad customers (1): {bad_count} ({bad_count/customer_total*100:.1f}%)")

In [None]:
# Visualize segments: three pies (customer segment, risk category, default proxy)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Customer segments
segment_counts = risk_segments['segment'].value_counts()
axes[0].pie(segment_counts.values, labels=segment_counts.index, autopct='%1.1f%%', startangle=90)
axes[0].set_title('Customer Segments Distribution')

# Risk categories
# value_counts() orders categories by frequency, not by severity, so the
# colour is chosen per label; assigning colours by position would paint
# whichever category is largest green.
risk_counts = risk_segments['risk_category'].value_counts()
risk_colors = {'Low': 'green', 'Medium': 'orange', 'High': 'red'}
colors = [risk_colors.get(category, 'gray') for category in risk_counts.index]
axes[1].pie(risk_counts.values, labels=risk_counts.index, autopct='%1.1f%%',
           colors=colors, startangle=90)
axes[1].set_title('Risk Categories Distribution')

# Default proxy
# Same ordering caveat: derive label and colour from the actual class value so
# "Good"/"Bad" are never swapped and a missing class cannot cause a length
# mismatch between the wedges and their labels.
default_counts = risk_segments['default_proxy'].value_counts()
default_style = {0: ('Good (0)', 'lightblue'), 1: ('Bad (1)', 'lightcoral')}
labels = [default_style.get(flag, (str(flag), 'lightgray'))[0] for flag in default_counts.index]
colors = [default_style.get(flag, (str(flag), 'lightgray'))[1] for flag in default_counts.index]
axes[2].pie(default_counts.values, labels=labels, autopct='%1.1f%%',
           colors=colors, startangle=90)
axes[2].set_title('Default Proxy Distribution')

plt.tight_layout()
plt.show()

## 6. Feature Engineering

In [None]:
# Set up the feature engineering helper
feature_engineer = FeatureEngineer()

# Build transaction features from the data without the synthetic label column
feature_input = df.drop(columns='customer_type')
transaction_features = feature_engineer.create_transaction_features(feature_input)

print("Transaction Features created:")
print(transaction_features.head())

print("\nFeature Statistics:")
print(transaction_features.describe())

In [None]:
# Combine all features for modeling
# NOTE(review): prepare_model_data is project code from data_processing;
# presumably it joins the two per-customer tables — confirm there.
model_data = feature_engineer.prepare_model_data(risk_segments, transaction_features)

# The "features" count below is the number of columns in model_data, i.e. it
# counts every column the frame holds, not only model inputs.
print(f"Model dataset created with {len(model_data)} customers and {len(model_data.columns)} features")
print("\nFeature columns:")
print(model_data.columns.tolist())

# Class balance of the proxy target used in the rest of the notebook.
print("\nTarget variable distribution:")
print(model_data['default_proxy'].value_counts())

## 7. Feature Analysis and Correlations

In [None]:
# Analyze feature correlations with target
numeric_features = model_data.select_dtypes(include=[np.number]).columns
correlations = (
    model_data[numeric_features]
    .corr()['default_proxy']
    # The target correlates perfectly (1.0) with itself; drop it so it does
    # not take the first slot of every "top features" ranking.
    .drop('default_proxy')
    # Rank by strength of the relationship, regardless of sign.
    .sort_values(key=abs, ascending=False)
)

print("Feature correlations with default_proxy:")
print(correlations.head(15))

# Visualize top correlations
top_correlations = correlations.head(10)
plt.figure(figsize=(10, 6))
# Red bars mark negative correlations, blue bars positive ones.
colors = ['red' if x < 0 else 'blue' for x in top_correlations.values]
plt.barh(range(len(top_correlations)), top_correlations.values, color=colors, alpha=0.7)
plt.yticks(range(len(top_correlations)), top_correlations.index)
plt.xlabel('Correlation with Default Proxy')
plt.title('Top 10 Feature Correlations with Default Risk')
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Overlayed histograms of selected features, split by risk category
key_features = ['recency', 'frequency', 'monetary_total', 'transaction_velocity']
risk_levels = ['Low', 'Medium', 'High']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for panel, feature in zip(axes, key_features):
    for risk_cat in risk_levels:
        in_category = model_data['risk_category'] == risk_cat
        data = model_data.loc[in_category, feature]
        # Categories without any customers get no histogram
        if data.empty:
            continue
        panel.hist(data, alpha=0.6, label=f'{risk_cat} Risk', bins=20)

    panel.set_title(f'{feature} Distribution by Risk Category')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')
    panel.legend()

plt.tight_layout()
plt.show()

## 8. Validation Against Synthetic Labels

In [None]:
# Compare the derived risk categories with the synthetic customer types.
# (Only possible because the synthetic data carries the label it was built from.)

# One row per customer with its synthetic label, joined onto the model data
customer_types = df.groupby('customer_id', as_index=False)['customer_type'].first()
validation_data = pd.merge(model_data, customer_types, on='customer_id', how='left')

synthetic_type = validation_data['customer_type']
assigned_risk = validation_data['risk_category']

# Absolute counts
cross_tab = pd.crosstab(synthetic_type, assigned_risk)
print("Cross-tabulation: Synthetic Customer Type vs Risk Category")
print(cross_tab)

# Row-normalised shares: each customer type sums to 100%
cross_tab_pct = 100 * pd.crosstab(synthetic_type, assigned_risk, normalize='index')
print("\nPercentage breakdown:")
print(cross_tab_pct.round(1))

# Heatmap of the percentage table
plt.figure(figsize=(10, 6))
sns.heatmap(
    cross_tab_pct,
    annot=True,
    fmt='.1f',
    cmap='RdYlBu_r',
    cbar_kws={'label': 'Percentage'},
)
plt.title('Customer Type vs Risk Category (% within each customer type)')
plt.ylabel('Synthetic Customer Type')
plt.xlabel('Predicted Risk Category')
plt.tight_layout()
plt.show()

## 9. Summary and Insights

In [None]:
# Text summary of the notebook's results, followed by the planned next steps.
print("=== EXPLORATORY DATA ANALYSIS SUMMARY ===")
print(f"\n1. Dataset Overview:")
print(f"   - Total transactions: {len(df):,}")
print(f"   - Unique customers: {df['customer_id'].nunique():,}")
print(f"   - Average transactions per customer: {len(df) / df['customer_id'].nunique():.1f}")
print(f"   - Total transaction value: ${df['amount'].sum():,.2f}")

print(f"\n2. RFM Analysis Results:")
print(f"   - Average recency: {rfm_data['recency'].mean():.1f} days")
print(f"   - Average frequency: {rfm_data['frequency'].mean():.1f} transactions")
print(f"   - Average monetary value: ${rfm_data['monetary_total'].mean():.2f}")

print(f"\n3. Risk Segmentation:")
risk_dist = model_data['risk_category'].value_counts()
# Report in severity order; categories that do not occur are skipped.
for category in ['Low', 'Medium', 'High']:
    if category in risk_dist.index:
        count = risk_dist[category]
        pct = count / len(model_data) * 100
        print(f"   - {category} Risk: {count} customers ({pct:.1f}%)")

print(f"\n4. Default Proxy:")
# The mean of the proxy column is the share of customers flagged 1.
default_rate = model_data['default_proxy'].mean() * 100
print(f"   - Default rate: {default_rate:.1f}%")
print(f"   - Good customers: {(model_data['default_proxy'] == 0).sum()}")
print(f"   - Bad customers: {(model_data['default_proxy'] == 1).sum()}")

print(f"\n5. Key Insights:")
# Exclude the target itself: its self-correlation of 1.0 would otherwise be
# listed as the top "predictive feature". errors='ignore' keeps this working
# when the target is not part of the series.
feature_correlations = correlations.drop('default_proxy', errors='ignore')
top_corr_features = feature_correlations.head(5).index.tolist()
print(f"   - Top predictive features: {', '.join(top_corr_features)}")
print(f"   - Model-ready dataset: {len(model_data)} customers, {len(model_data.columns)} features")
print(f"   - Ready for model training and validation")

print("\n=== NEXT STEPS ===")
print("1. Train multiple models (Logistic Regression, Random Forest, XGBoost)")
print("2. Evaluate model performance using appropriate metrics")
print("3. Implement model interpretability and explainability")
print("4. Deploy the best model via API")
print("5. Set up monitoring and model drift detection")

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('../data/processed', exist_ok=True)

# Save processed data for model training
model_data.to_csv('../data/processed/model_data.csv', index=False)
print("Model data saved to '../data/processed/model_data.csv'")

# Save RFM analysis results
risk_segments.to_csv('../data/processed/rfm_analysis.csv', index=False)
print("RFM analysis saved to '../data/processed/rfm_analysis.csv'")