# Project Nova: EDA and Fairness Analysis

This notebook provides exploratory data analysis and fairness evaluation for the Nova Credit Scoring Engine.

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import warnings
# Silence library warnings so cell output stays readable.
# NOTE: this is a blanket filter - it hides every warning category.
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Notebook banner
banner_rule = "=" * 50
print("📊 Project Nova - EDA and Fairness Analysis")
print(banner_rule)

## Load Dataset

In [None]:
# Load the generated partner dataset and, when present, its metadata sidecar.
# `df` stays None when the CSV is missing so later cells can guard on
# `df is not None` instead of failing.
data_path = Path('../data/partners.csv')
meta_path = Path('../data/metadata.json')

# Bind both names up front so neither is left undefined on any path.
df = None
metadata = None

if data_path.exists():
    df = pd.read_csv(data_path)
    print(f"✅ Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

    # Load metadata if available
    if meta_path.exists():
        # Read JSON as UTF-8 explicitly rather than relying on the platform locale.
        with open(meta_path, encoding='utf-8') as f:
            metadata = json.load(f)
        print(f"📋 Metadata: {metadata}")
else:
    print("❌ Dataset not found. Please run the data generation first:")
    print("   python src/generate_data.py --n 50000 --seed 42 --out data/partners.csv")

## Data Overview

In [None]:
if df is not None:
    print("🔍 Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print("\n📈 Basic Statistics:")
    display(df.describe())

In [None]:
if df is not None:
    print("🎯 Target Variable Distribution:")
    # value_counts() sorts by frequency and omits classes that never occur.
    # Reindexing to a fixed [0, 1] order keeps the lookups below from raising
    # KeyError and guarantees the pie labels match the right class.
    target_dist = df['defaulted_12m'].value_counts().reindex([0, 1], fill_value=0)
    total_rows = len(df)
    print(f"Non-defaulted: {target_dist[0]:,} ({target_dist[0]/total_rows*100:.1f}%)")
    print(f"Defaulted: {target_dist[1]:,} ({target_dist[1]/total_rows*100:.1f}%)")

    # Side-by-side view: absolute counts (left) and class shares (right)
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Count plot
    sns.countplot(data=df, x='defaulted_12m', ax=axes[0])
    axes[0].set_title('Default Distribution (Count)')
    axes[0].set_xlabel('Defaulted in 12 months')

    # Pie chart - wedge order follows the [0, 1] index fixed above
    axes[1].pie(target_dist.values, labels=['No Default', 'Default'], autopct='%1.1f%%')
    axes[1].set_title('Default Distribution (Percentage)')

    plt.tight_layout()
    plt.show()

## Demographic Analysis

In [None]:
# Sensitive attributes examined throughout the notebook. Bound outside the
# `df` guard because the Nova score cell reads this list under a different
# guard, so it must exist even when the raw dataset was not loaded.
sensitive_attrs = ['gender', 'region', 'role']

if df is not None:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    # One bar chart of category counts per sensitive attribute (panels 0-2)
    for i, attr in enumerate(sensitive_attrs):
        df[attr].value_counts().plot(kind='bar', ax=axes[i])
        axes[i].set_title(f'Distribution by {attr.title()}')
        axes[i].set_xlabel(attr.title())
        axes[i].set_ylabel('Count')
        axes[i].tick_params(axis='x', rotation=45)

    # Age distribution fills the last panel of the 2x2 grid
    df['age'].hist(bins=20, ax=axes[3])
    axes[3].set_title('Age Distribution')
    axes[3].set_xlabel('Age')
    axes[3].set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## Fairness Analysis by Sensitive Attributes

In [None]:
if df is not None:
    # Imported once per cell run instead of on every loop iteration.
    from scipy.stats import chi2_contingency

    # Significance level used to flag a difference between groups
    alpha = 0.05

    # Default rates by sensitive attributes
    print("⚖️ Default Rates by Sensitive Attributes:")
    print("=" * 50)

    for attr in sensitive_attrs:
        # Group size and mean default rate for each category of the attribute
        default_rates = df.groupby(attr)['defaulted_12m'].agg(['count', 'mean']).round(4)
        default_rates.columns = ['Count', 'Default_Rate']
        print(f"\n{attr.upper()}:")
        print(default_rates)

        # Chi-square test of independence between the attribute and default.
        # One test is run per attribute; p-values are not adjusted for
        # multiple comparisons.
        contingency = pd.crosstab(df[attr], df['defaulted_12m'])
        chi2, p_value, _, _ = chi2_contingency(contingency)
        print(f"Chi-square test p-value: {p_value:.4f}")
        if p_value < alpha:
            print("⚠️  Significant difference detected!")
        else:
            print("✅ No significant difference")

In [None]:
if df is not None:
    # One panel per sensitive attribute showing the mean default rate of each group
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for i, attr in enumerate(sensitive_attrs):
        panel = axes[i]
        attr_label = attr.title()
        group_rates = df.groupby(attr)['defaulted_12m'].mean()

        group_rates.plot(kind='bar', ax=panel, color='skyblue', edgecolor='black')
        panel.set_title(f'Default Rate by {attr_label}')
        panel.set_xlabel(attr_label)
        panel.set_ylabel('Default Rate')
        panel.tick_params(axis='x', rotation=45)

        # Annotate each bar with its rate, nudged slightly above the bar top
        for bar_pos, rate in enumerate(group_rates.values):
            panel.text(bar_pos, rate + 0.002, f'{rate:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

## Feature Analysis

In [None]:
if df is not None:
    # Key numerical features (also used by the correlation cell that follows)
    key_features = ['earnings_monthly', 'trips_weekly', 'on_time_rate', 'cancel_rate',
                    'customer_rating', 'income_volatility', 'tenure_months']

    # Feature distributions by default status, one box plot per feature
    fig, axes = plt.subplots(3, 3, figsize=(20, 15))
    axes = axes.flatten()

    # zip() stops at the shorter sequence, so features beyond the grid size
    # are skipped rather than raising an IndexError.
    for ax, feature in zip(axes, key_features):
        sns.boxplot(data=df, x='defaulted_12m', y=feature, ax=ax)
        ax.set_title(f'{feature.replace("_", " ").title()} by Default Status')
        ax.set_xlabel('Defaulted')

    # Drop every panel that did not receive a plot. Slicing by the feature
    # count keeps this correct if key_features grows or shrinks.
    for unused_ax in axes[len(key_features):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

In [None]:
if df is not None:
    # Correlation of each key feature with the default flag, strongest first
    print("🔗 Feature Correlations with Default:")
    corr_matrix = df[key_features + ['defaulted_12m']].corr()
    correlations = corr_matrix['defaulted_12m'].drop('defaulted_12m')
    correlations = correlations.sort_values(key=abs, ascending=False)

    # Negative correlations are drawn in red, all others in green
    bar_colors = ['red' if x < 0 else 'green' for x in correlations]

    plt.figure(figsize=(10, 6))
    corr_ax = correlations.plot(kind='barh', color=bar_colors)
    corr_ax.set_title('Feature Correlations with Default Risk')
    corr_ax.set_xlabel('Correlation Coefficient')
    corr_ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\nTop correlations:")
    top_correlations = correlations.head(10)
    for feature, corr in top_correlations.items():
        print(f"{feature:25}: {corr:7.4f}")

## Model Results Analysis (if available)

In [None]:
# Load model results if available
results_files = {
    'baseline': '../reports/metrics_baseline.json',
    'fair': '../reports/metrics_fair.json'
}

fairness_files = {
    'baseline': '../reports/fairness_baseline.json',
    'fair': '../reports/fairness_fair.json'
}


def load_json_reports(report_paths, label):
    """Load every existing JSON report listed in ``report_paths``.

    Args:
        report_paths: Mapping of model name to the path of its JSON report.
        label: Report kind used in the status messages
            (e.g. ``"model metrics"``).

    Returns:
        Dict mapping model name to the parsed JSON content. Only models whose
        file exists are included; a missing file is reported, not raised.
    """
    loaded = {}
    for model_name, filepath in report_paths.items():
        report_path = Path(filepath)
        if not report_path.exists():
            print(f"❌ {model_name} {label} not found")
            continue
        # Read JSON as UTF-8 explicitly rather than relying on the platform locale.
        with open(report_path, encoding='utf-8') as f:
            loaded[model_name] = json.load(f)
        print(f"✅ Loaded {model_name} {label}")
    return loaded


model_results = load_json_reports(results_files, 'model metrics')
fairness_results = load_json_reports(fairness_files, 'fairness metrics')

In [None]:
if model_results:
    print("📊 Model Performance Comparison:")
    print("=" * 40)

    # One row per model, one column per metric
    metrics_df = pd.DataFrame(model_results).T
    display(metrics_df.round(4))

    # A side-by-side chart only makes sense with at least two models
    if len(model_results) > 1:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.flatten()

        key_metrics = ['auc', 'accuracy', 'precision', 'recall']

        for panel, metric in zip(axes, key_metrics):
            # Leave the panel empty when the metric was not reported
            if metric not in metrics_df.columns:
                continue
            metric_label = metric.upper()
            metrics_df[metric].plot(kind='bar', ax=panel, color=['skyblue', 'lightcoral'])
            panel.set_title(f'{metric_label} Comparison')
            panel.set_ylabel(metric_label)
            panel.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

In [None]:
if fairness_results:
    print("⚖️ Fairness Metrics Comparison:")
    print("=" * 40)

    # (short column prefix, key prefix used in the fairness report)
    metric_prefixes = (
        ('dem_parity', 'demographic_parity'),
        ('eq_odds', 'equalized_odds'),
    )

    # Collect the per-attribute fairness metrics that each report contains
    fairness_summary = {}
    for model_name, results in fairness_results.items():
        summary_row = {}
        for attr in ['gender', 'region', 'role']:
            for short_prefix, report_prefix in metric_prefixes:
                report_key = f'{report_prefix}_{attr}'
                if report_key in results:
                    summary_row[f'{short_prefix}_{attr}'] = results[report_key]
        fairness_summary[model_name] = summary_row

    if fairness_summary:
        fairness_df = pd.DataFrame(fairness_summary).T
        display(fairness_df.round(4))

        # Plot fairness comparison only when there are several models with data
        if len(fairness_summary) > 1 and not fairness_df.empty:
            fig, axes = plt.subplots(1, 2, figsize=(15, 5))

            # (column marker, panel title): left = demographic parity, right = equalized odds
            panels = (
                ('dem_parity', 'Demographic Parity Difference'),
                ('eq_odds', 'Equalized Odds Difference'),
            )
            for panel, (marker, panel_title) in zip(axes, panels):
                metric_cols = [col for col in fairness_df.columns if marker in col]
                if not metric_cols:
                    continue
                fairness_df[metric_cols].plot(kind='bar', ax=panel)
                panel.set_title(panel_title)
                panel.set_ylabel('Difference')
                # Dashed reference line at a difference of zero
                panel.axhline(y=0, color='red', linestyle='--', alpha=0.7)
                panel.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

            plt.tight_layout()
            plt.show()

## Nova Scores Analysis (if available)

In [None]:
# Nova score exports, one CSV per model variant; missing files are skipped
scores_files = {
    'baseline': '../data/partners_scores_baseline.csv',
    'fair': '../data/partners_scores_fair.csv'
}

scores_data = {}

for model_name, filepath in scores_files.items():
    scores_path = Path(filepath)
    if not scores_path.exists():
        print(f"❌ {model_name} Nova scores not found")
        continue
    scores_data[model_name] = pd.read_csv(scores_path)
    print(f"✅ Loaded {model_name} Nova scores")

In [None]:
if scores_data:
    print("📊 Nova Scores Analysis:")

    for model_name, data in scores_data.items():
        # Score files without a nova_score column are skipped entirely
        if 'nova_score' not in data.columns:
            continue

        nova = data['nova_score']
        model_title = model_name.title()

        print(f"\n{model_name.upper()} MODEL:")
        print(f"Nova Score Range: {nova.min():.0f} - {nova.max():.0f}")
        print(f"Mean Nova Score: {nova.mean():.1f}")
        print(f"Std Nova Score: {nova.std():.1f}")

        # 2x2 grid: overall histogram first, then one box plot per sensitive attribute
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.flatten()

        overall_ax = axes[0]
        nova.hist(bins=30, ax=overall_ax, alpha=0.7, edgecolor='black')
        overall_ax.set_title(f'{model_title} - Nova Score Distribution')
        overall_ax.set_xlabel('Nova Score')
        overall_ax.set_ylabel('Frequency')

        # sensitive_attrs is defined in the Demographic Analysis cell
        for i, attr in enumerate(sensitive_attrs):
            if attr in data.columns and i < 3:
                attr_ax = axes[i + 1]
                sns.boxplot(data=data, x=attr, y='nova_score', ax=attr_ax)
                attr_ax.set_title(f'Nova Score by {attr.title()}')
                attr_ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        # Overlaid score histograms for non-defaulted vs defaulted partners
        if 'defaulted_12m' in data.columns:
            plt.figure(figsize=(10, 6))
            for default_status, status_label in ((0, 'Non-defaulted'), (1, 'Defaulted')):
                subset = data[data['defaulted_12m'] == default_status]
                plt.hist(subset['nova_score'], bins=30, alpha=0.7, label=status_label)

            plt.title(f'{model_title} - Nova Score by Default Status')
            plt.xlabel('Nova Score')
            plt.ylabel('Frequency')
            plt.legend()
            plt.grid(alpha=0.3)
            plt.show()

## Summary and Recommendations

In [None]:
print("📋 ANALYSIS SUMMARY")
print("=" * 50)

if df is not None:
    overall_rate = df['defaulted_12m'].mean()
    print("\n📊 Dataset Overview:")
    print(f"• Total partners: {len(df):,}")
    print(f"• Overall default rate: {overall_rate*100:.1f}%")

    print("\n👥 Demographics:")
    for attr in sensitive_attrs:
        print(f"• {attr.title()}: {df[attr].nunique()} categories")

    print("\n⚖️ Fairness Observations:")
    for attr in sensitive_attrs:
        # Spread between the best and worst group default rate for this attribute
        group_rates = df.groupby(attr)['defaulted_12m'].mean()
        lowest, highest = group_rates.min(), group_rates.max()
        gap = highest - lowest
        print(f"• {attr.title()} default rate range: {lowest*100:.1f}% - {highest*100:.1f}% (diff: {gap*100:.1f}pp)")
        if gap > 0.02:  # More than 2 percentage points
            print("  ⚠️  Potential fairness concern detected")

if model_results:
    print("\n🤖 Model Performance:")
    for model_name, metrics in model_results.items():
        # Falls back to 'N/A' when the report has no 'auc' entry
        print(f"• {model_name.title()}: AUC = {metrics.get('auc', 'N/A')}")

print("\n💡 Recommendations:")
for recommendation in (
    "• Monitor fairness metrics across all sensitive attributes",
    "• Consider additional bias mitigation techniques if needed",
    "• Validate model performance on holdout data",
    "• Implement continuous monitoring in production",
):
    print(recommendation)

## Next Steps

1. **Run the full pipeline**: `python run_project.py`
2. **Analyze results**: Re-run this notebook after model training
3. **Experiment**: Try different fairness mitigation strategies
4. **Deploy**: Implement the Nova scoring system
5. **Monitor**: Set up ongoing fairness and performance monitoring

For more details, check the project README and documentation.