# MILESTONE TWO: Model Development, Evaluation, and Deployment (PART B)

**Course**: DSC8201 - Data Science Lifecycle  
**Project**: Financial Credit Scoring & Fairness Auditing  
**Student**: Atuhaire (B35093)  
**Date**: December 2025

---

## Table of Contents
1. [Model Selection & Justification](#section1)
2. [Model Development & Experiment Tracking](#section2)
3. [Model Evaluation & Interpretation](#section3)
4. [Fairness Analysis](#section4)
5. [Summary](#section5)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os
from pathlib import Path
import joblib

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, roc_curve, precision_recall_curve
)

# Classical ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Imbalanced data handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Optional dependencies
# ---------------------
# Each group below is best-effort: when a package cannot be imported the
# notebook keeps running and records the outcome in a *_AVAILABLE flag.
# `except Exception` is used instead of a bare `except:` so that any
# import-time failure is still tolerated, while KeyboardInterrupt and
# SystemExit are no longer swallowed (e.g. interrupting a slow import).

# Deep Learning (optional)
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    TF_AVAILABLE = True
except Exception:
    TF_AVAILABLE = False
    print("⚠️ TensorFlow not available, skipping deep learning models")

# Model Explainability
try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False
    print("⚠️ SHAP not available")

try:
    from lime.lime_tabular import LimeTabularExplainer
    LIME_AVAILABLE = True
except Exception:
    LIME_AVAILABLE = False
    print("⚠️ LIME not available")

# MLflow for experiment tracking
try:
    import mlflow
    import mlflow.sklearn
    MLFLOW_AVAILABLE = True
except Exception:
    MLFLOW_AVAILABLE = False
    print("⚠️ MLflow not available")

# Fairness libraries
try:
    from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference
    from fairlearn.reductions import ExponentiatedGradient, DemographicParity
    FAIRLEARN_AVAILABLE = True
except Exception:
    FAIRLEARN_AVAILABLE = False
    print("⚠️ Fairlearn not available")

# Add src to path
# Makes the project's local modules (../src relative to the working directory)
# importable. The membership check keeps sys.path free of duplicate entries
# when this cell is re-run in the same kernel.
src_dir = str(Path.cwd().parent / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
# NOTE(review): these star imports presumably provide print_section_header,
# which later cells call - confirm against src/utils.py.
from utils import *
from preprocessing import *

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Set options
pd.set_option('display.max_columns', None)  # show every column when displaying frames
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("✅ Libraries imported successfully!")
print(f"Working Directory: {os.getcwd()}")

### Load Preprocessed Data

In [None]:
print_section_header("LOADING PREPROCESSED DATA")

# Load the clean dataset from ../data/cleaned relative to the working directory.
data_path = Path.cwd().parent / 'data' / 'cleaned' / 'Atuhaire.csv'

# Fail fast, with guidance, when the cleaned file has not been produced yet.
if not data_path.exists():
    print(f"❌ Dataset not found at {data_path}")
    print("   Please run the data preparation notebook first!")
    raise FileNotFoundError(f"Dataset not found: {data_path}")

df = pd.read_csv(data_path)
print("✅ Dataset loaded successfully!")
print(f"   Shape: {df.shape}")
print(f"   File: {data_path}")

# Display basic info
print("\nDataset Info:")
print(f"  Columns: {len(df.columns)}")
print(f"  Rows: {len(df):,}")
# deep=True also measures the contents of object columns, not just pointers.
print(f"  Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Last expression of the cell: rendered as a preview by the notebook.
df.head()

---
<a id='section1'></a>
## 1. Model Selection & Justification [8 Marks]

### 1.1 Model Categories & Selection Criteria

In [None]:
print_section_header("MODEL SELECTION FRAMEWORK")

# Static, human-readable rationale for the model shortlist. Nothing here is
# computed from the data; the figures quoted in the text are hard-coded.
print("""
🎯 MODEL SELECTION STRATEGY FOR CREDIT SCORING:

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SELECTION CRITERIA:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. HYPOTHESIS ALIGNMENT:
   • H₁: Financial attributes predict credit default
   • Need models that capture both linear and non-linear relationships
   • Require probability outputs for risk scoring

2. DATA CHARACTERISTICS:
   • Size: 40,000 samples (medium-sized dataset)
   • Features: ~50-60 after encoding (tabular data)
   • Class Imbalance: ~15-20% default rate (will use SMOTE)
   • Mixed feature types: Numerical + Categorical (encoded)

3. ETHICAL & REGULATORY CONSIDERATIONS:
   • Model must be interpretable (regulatory requirement)
   • Need to detect and mitigate bias
   • Explainability critical for loan rejections
   • Must comply with fair lending laws

4. INTERPRETABILITY REQUIREMENTS:
   • High: Regulatory compliance, customer explanations
   • Use SHAP/LIME for complex models
   • Feature importance must be calculable

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SELECTED MODELS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📊 CATEGORY 1: CLASSICAL MACHINE LEARNING

   1. Logistic Regression
      ✓ Highly interpretable (coefficients = feature importance)
      ✓ Fast training and prediction
      ✓ Probability calibration built-in
      ✓ Industry standard for credit scoring
      ✗ Assumes linear relationships
      → Use as baseline model
   
   2. Random Forest Classifier
      ✓ Handles non-linear relationships
      ✓ Feature importance via Gini/entropy
      ✓ Robust to outliers
      ✓ No scaling required
      ✗ Less interpretable than logistic regression
      → Good for ensemble and feature selection
   
   3. XGBoost (Gradient Boosting)
      ✓ State-of-the-art performance on tabular data
      ✓ Handles class imbalance well (scale_pos_weight)
      ✓ Feature importance available
      ✓ Regularization prevents overfitting
      ✓ SHAP integration
      → Expected best performer
   
   4. LightGBM
      ✓ Faster than XGBoost on large datasets
      ✓ Handles categorical features natively
      ✓ Memory efficient
      ✓ Similar performance to XGBoost
      → Alternative to XGBoost

🧠 CATEGORY 2: DEEP LEARNING (Optional)

   5. Deep Neural Network (DNN)
      ✓ Can learn complex patterns
      ✓ Automatic feature interaction learning
      ✗ Requires more data for optimal performance
      ✗ Less interpretable (use SHAP)
      ✗ Longer training time
      → Include if performance gains are significant

📈 CATEGORY 3: FAIRNESS-AWARE ML

   6. Fairness-Constrained Classifier
      ✓ Explicitly optimizes for fairness metrics
      ✓ Uses Exponentiated Gradient method
      ✓ Addresses demographic parity
      ✗ May sacrifice some accuracy for fairness
      → Critical for compliance

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
JUSTIFICATION SUMMARY:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

✅ HYPOTHESIS: Multiple models test different relationships (linear vs non-linear)
✅ DATA SIZE: 40K samples suitable for both classical ML and shallow DNNs
✅ ETHICS: Fairness-aware models + SHAP explainability ensure compliance
✅ INTERPRETABILITY: Mix of transparent (LogReg) and explainable (XGB+SHAP) models

Final Model Selection: Will be based on:
  1. Predictive Performance (AUC-ROC, F1-score)
  2. Fairness Metrics (Disparate Impact Ratio)
  3. Interpretability (SHAP values, feature importance)
  4. Business Requirements (speed, explainability)
""")

### 1.2 Prepare Data for Modeling

In [None]:
print_section_header("DATA PREPARATION FOR MODELING")

# Separate features and target
# Exclude sensitive attributes from features (fairness requirement)
sensitive_features = ['gender', 'age_group', 'marital_status']  # Keep for fairness analysis only

# Identify target
target_col = 'default_status'

# Get all column names
all_cols = df.columns.tolist()

# Features to exclude from modeling: the target, the row identifier, and any
# sensitive attribute that is actually present in the frame.
exclude_cols = [target_col, 'applicant_id'] + [col for col in sensitive_features if col in all_cols]

# Select features
feature_cols = [col for col in all_cols if col not in exclude_cols]

# Columns really dropped as sensitive/ID. The target is reported on its own
# line and names absent from the frame are not counted, so that
# total = features + target + excluded in the summary below.
excluded_present = [col for col in all_cols if col in exclude_cols and col != target_col]

print("\n📋 Feature Selection:")
print(f"   Total columns: {len(all_cols)}")
print(f"   Target: {target_col}")
print(f"   Features for modeling: {len(feature_cols)}")
print(f"   Excluded (sensitive/ID): {len(excluded_present)}")

# Create X and y (copies, so later edits do not write back into df)
X = df[feature_cols].copy()
y = df[target_col].copy()

# Store sensitive attributes separately for fairness analysis
sensitive_attrs = df[[col for col in sensitive_features if col in df.columns]].copy()

print("\n✅ Data prepared:")
print(f"   X shape: {X.shape}")
print(f"   y shape: {y.shape}")
print(f"   Class distribution: {y.value_counts().to_dict()}")

In [None]:
# Train / Validation / Test split (60% / 20% / 20% of the full data)
print("\n📊 Creating Train-Validation-Test Split...\n")

# The sensitive attributes are split in the SAME calls as X and y, so the
# resulting frames are row-aligned by construction instead of relying on a
# second call with matching random_state/stratify arguments. The previous
# guard `len(sensitive_attrs) > 0` counted rows (not columns) and was therefore
# always true for a non-empty dataset; it is no longer needed.

# First split: 80% train+val, 20% test
X_temp, X_test, y_temp, y_test, sensitive_temp, sensitive_test = train_test_split(
    X, y, sensitive_attrs, test_size=0.20, random_state=42, stratify=y
)

# Second split: 75% train, 25% validation (of the 80%)
X_train, X_val, y_train, y_val, sensitive_train, sensitive_val = train_test_split(
    X_temp, y_temp, sensitive_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"Train Set: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"  - Default rate: {y_train.mean()*100:.2f}%")
print(f"\nValidation Set: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"  - Default rate: {y_val.mean()*100:.2f}%")
print(f"\nTest Set: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.1f}%)")
print(f"  - Default rate: {y_test.mean()*100:.2f}%")
print("\n✅ Stratified split ensures consistent class distribution")

### 1.3 Address Class Imbalance with SMOTE

In [None]:
print_section_header("HANDLING CLASS IMBALANCE")

print("\n📊 Original Class Distribution:")
print(f"   No Default (0): {(y_train == 0).sum():,} ({(y_train == 0).mean()*100:.2f}%)")
print(f"   Default (1): {(y_train == 1).sum():,} ({(y_train == 1).mean()*100:.2f}%)")
print(f"   Imbalance Ratio: {(y_train == 0).sum() / (y_train == 1).sum():.2f}:1")

# Apply SMOTE to the training data only; validation and test sets are not
# resampled in this cell.
# NOTE(review): sensitive_train is not passed through SMOTE, so it is not
# row-aligned with X_train_balanced - keep that in mind for fairness checks.
smote = SMOTE(random_state=42, sampling_strategy=0.7)  # Increase minority class to 70% of majority
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("\n✅ After SMOTE Resampling:")
print(f"   No Default (0): {(y_train_balanced == 0).sum():,} ({(y_train_balanced == 0).mean()*100:.2f}%)")
print(f"   Default (1): {(y_train_balanced == 1).sum():,} ({(y_train_balanced == 1).mean()*100:.2f}%)")
print(f"   New Imbalance Ratio: {(y_train_balanced == 0).sum() / (y_train_balanced == 1).sum():.2f}:1")
print(f"\nTraining data size increased from {len(X_train):,} to {len(X_train_balanced):,}")

---
**Continued in next cells...**