# Credit Risk Modeling: Static vs Sequential Approaches

**Objective**: Compare Logistic Regression, XGBoost, and LSTM models for predicting
the probability of loan default using mobile money transaction data.

**Target**: Binary classification -- default vs non-default (among borrowers only).

**Models**:
1. Logistic Regression (static, user-level features)
2. XGBoost (static, user-level features)
3. LSTM (sequential, per-transaction features)

## 1. Setup

In [None]:
import sys
import os

# Put the parent of the current working directory on sys.path so that the
# `src` package can be imported when the notebook runs from notebooks/.
parent_dir = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_dir)
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

from src.models import (
    CreditRiskDataLoader, LogisticRegressionModel,
    XGBoostModel, LSTMModel, ModelEvaluator, set_random_seeds
)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

set_random_seeds(42)

%matplotlib inline
plt.rcParams['figure.dpi'] = 100
sns.set_style('whitegrid')

print('Setup complete.')

## 2. Data Exploration

In [None]:
# Read the two raw user-level tables for a first look at the data
features_csv = os.path.join(project_root, 'data/user_features.csv')
summaries_csv = os.path.join(project_root, 'data/user_summaries.csv')
df_features = pd.read_csv(features_csv)
df_summaries = pd.read_csv(summaries_csv)

print(f'User features: {df_features.shape}')
print(f'User summaries: {df_summaries.shape}')
print(f'\nCredit risk label distribution (all users):')
label_counts = df_summaries['credit_risk_label'].value_counts().sort_index()
print(label_counts)

In [None]:
# Visualize target distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# All users. Names and colors are looked up by label value (not by position)
# so each bar keeps the right name/color even when a label is missing from the
# data; unexpected labels fall back to their raw value and a neutral grey.
labels_all = df_summaries['credit_risk_label'].value_counts().sort_index()
label_names = {-1: 'No Loans', 0: 'Good', 1: 'Late', 2: 'Default'}
label_colors = {-1: '#95a5a6', 0: '#2ecc71', 1: '#f39c12', 2: '#e74c3c'}
ax1.bar([label_names.get(k, str(k)) for k in labels_all.index], labels_all.values,
        color=[label_colors.get(k, '#7f8c8d') for k in labels_all.index])
ax1.set_title('Credit Risk Labels (All Users)')
ax1.set_ylabel('Count')
for i, v in enumerate(labels_all.values):
    ax1.text(i, v + 50, str(v), ha='center')

# Borrowers only (binary target): label 2 is the positive (default) class.
borrowers = df_summaries[df_summaries['credit_risk_label'] != -1]
binary = (borrowers['credit_risk_label'] == 2).astype(int)
# Reindex so both classes always appear (count 0 if absent); otherwise the two
# fixed tick labels below would not match the number of bar heights.
counts = binary.value_counts().reindex([0, 1], fill_value=0)
n_borrowers = len(borrowers)
ax2.bar(['Non-Default', 'Default'], counts.values,
        color=['#3498db', '#e74c3c'])
ax2.set_title('Binary Target (Borrowers Only)')
ax2.set_ylabel('Count')
for i, v in enumerate(counts.values):
    # Guard the share computation against an empty borrower table.
    share = v / n_borrowers * 100 if n_borrowers else 0.0
    ax2.text(i, v + 20, f'{v} ({share:.1f}%)', ha='center')

plt.tight_layout()
plt.show()

print(f'\nTotal borrowers: {len(borrowers)}')
print(f'Default rate: {binary.mean():.3f} ({binary.sum()} defaults)')

In [None]:
# Count borrowers per credit archetype
print('Credit archetypes (borrowers):')
archetype_counts = borrowers['credit_archetype'].value_counts()
print(archetype_counts)

## 3. Data Preparation

In [None]:
# Point the loader at the data files under the project root
loader_paths = {
    'features_path': os.path.join(project_root, 'data/user_features.csv'),
    'summaries_path': os.path.join(project_root, 'data/user_summaries.csv'),
    'transactions_dir': os.path.join(project_root, 'data/user_transactions'),
}
loader = CreditRiskDataLoader(**loader_paths)

# Build the train/test splits used by the static (user-level) models
static_data = loader.prepare_static_splits()

# Summarize split sizes, class balance and the feature set
static_feature_names = static_data['feature_names']
print(f"Training samples: {len(static_data['y_train'])}")
print(f"Test samples: {len(static_data['y_test'])}")
print(f"Default rate (train): {static_data['y_train'].mean():.4f}")
print(f"Default rate (test): {static_data['y_test'].mean():.4f}")
print(f"Features: {len(static_feature_names)}")
print(f"Scale pos weight: {loader.get_scale_pos_weight():.2f}")
print(f"\nFeature list:")
for position, name in enumerate(static_feature_names, start=1):
    print(f"  {position:2d}. {name}")

In [None]:
# Feature correlation with target (top features)
X_train_df = static_data['X_train'].copy()
# Attach the labels by row position. Assigning a pandas Series directly would
# align on index labels and silently insert NaNs if X_train and y_train carry
# different indexes; the model cells pair the two positionally, so do the same.
X_train_df['default'] = np.asarray(static_data['y_train'])
# Absolute Pearson correlation of every feature with the target, strongest first.
correlations = X_train_df.corr()['default'].drop('default').abs().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
top_corr = correlations.head(20)
ax.barh(range(len(top_corr)), top_corr.values, color='#3498db')
ax.set_yticks(range(len(top_corr)))
ax.set_yticklabels(top_corr.index, fontsize=9)
ax.set_xlabel('Absolute Correlation with Default')
ax.set_title('Top 20 Features by Correlation with Default')
ax.invert_yaxis()  # strongest correlation at the top of the chart
plt.tight_layout()
plt.show()

## 4. Logistic Regression

In [None]:
# Fit the class-weighted logistic regression on the scaled training features
X_train_lr, y_train_lr = static_data['X_train_scaled'], static_data['y_train']
lr_model = LogisticRegressionModel(class_weight='balanced')
lr_model.fit(X_train_lr, y_train_lr)

# Report mean +/- std of each cross-validation metric
print('Logistic Regression - 5-Fold Cross-Validation:')
lr_cv = lr_model.cross_validate(X_train_lr, y_train_lr)
for metric in lr_cv:
    fold_scores = lr_cv[metric]
    print(f'  {metric}: {np.mean(fold_scores):.4f} +/- {np.std(fold_scores):.4f}')

In [None]:
# Test predictions
# Import the metric functions here: this is the first cell that uses them, and
# without the import a top-to-bottom run fails with NameError.
from sklearn.metrics import roc_auc_score, average_precision_score

lr_proba = lr_model.predict_proba(static_data['X_test_scaled'])

print('Logistic Regression - Test Set Performance:')
print(f'  AUC-ROC: {roc_auc_score(static_data["y_test"], lr_proba):.4f}')
print(f'  AUC-PR: {average_precision_score(static_data["y_test"], lr_proba):.4f}')

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Coefficient table from the model wrapper; plot its first 15 rows
lr_coefs = lr_model.get_coefficients(static_data['feature_names'])
top_coefs = lr_coefs.head(15)
bar_positions = range(len(top_coefs))

# Positive coefficients in red, everything else in blue
colors = [
    '#e74c3c' if coef > 0 else '#3498db'
    for coef in top_coefs['coefficient']
]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_coefs['coefficient'], color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_coefs['feature'], fontsize=9)
ax.set(
    xlabel='Coefficient',
    title='Logistic Regression - Top 15 Features by |Coefficient|',
)
ax.invert_yaxis()
ax.axvline(x=0, color='black', linewidth=0.5)  # zero reference line
plt.tight_layout()
plt.show()

print('Red = increases default probability, Blue = decreases')

## 5. XGBoost Classifier

In [None]:
# Train XGBoost
from sklearn.model_selection import train_test_split

# Hold out a stratified 20% of the TRAINING data for validation instead of
# passing the test set as X_val/y_val. If fit() uses the validation data for
# early stopping or model selection (not visible from this notebook), feeding
# it the test set would leak test information and inflate the test metrics.
X_xgb_fit, X_xgb_val, y_xgb_fit, y_xgb_val = train_test_split(
    static_data['X_train_scaled'], static_data['y_train'],
    test_size=0.2, stratify=static_data['y_train'], random_state=42,
)

xgb_model = XGBoostModel(scale_pos_weight=loader.get_scale_pos_weight())
xgb_model.fit(
    X_xgb_fit, y_xgb_fit,
    X_val=X_xgb_val, y_val=y_xgb_val,
)

# Cross-validation (on the full training split)
print('XGBoost - 5-Fold Cross-Validation:')
xgb_cv = xgb_model.cross_validate(static_data['X_train_scaled'], static_data['y_train'])
for metric, values in xgb_cv.items():
    print(f'  {metric}: {np.mean(values):.4f} +/- {np.std(values):.4f}')

In [None]:
# Score the held-out test set with the fitted XGBoost model
xgb_proba = xgb_model.predict_proba(static_data['X_test_scaled'])
y_test_static = static_data["y_test"]

print('XGBoost - Test Set Performance:')
xgb_auc_roc = roc_auc_score(y_test_static, xgb_proba)
print(f'  AUC-ROC: {xgb_auc_roc:.4f}')
xgb_auc_pr = average_precision_score(y_test_static, xgb_proba)
print(f'  AUC-PR: {xgb_auc_pr:.4f}')

In [None]:
# Importance table from the model wrapper; plot its first 15 rows
xgb_importance = xgb_model.get_feature_importance(static_data['feature_names'])
top_imp = xgb_importance.head(15)
bar_positions = range(len(top_imp))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_imp['importance'], color='#2ecc71')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_imp['feature'], fontsize=9)
ax.set(xlabel='Importance', title='XGBoost - Top 15 Feature Importances')
ax.invert_yaxis()  # first row of the table at the top of the chart
plt.tight_layout()
plt.show()

## 6. LSTM Model

In [None]:
# Load the per-transaction sequences (sequence length 50, cached as .npz)
sequence_cache = os.path.join(project_root, 'data/lstm_sequences.npz')
seq_data = loader.load_sequences(max_seq_len=50, cache_path=sequence_cache)

# Summarize array shapes, class balance and the per-step feature set
lstm_feature_names = seq_data['feature_names']
print(f"Train sequences shape: {seq_data['X_train_seq'].shape}")
print(f"Test sequences shape: {seq_data['X_test_seq'].shape}")
print(f"Train default rate: {seq_data['y_train'].mean():.4f}")
print(f"Test default rate: {seq_data['y_test'].mean():.4f}")
print(f"LSTM features ({len(lstm_feature_names)}):")
for position, name in enumerate(lstm_feature_names, start=1):
    print(f"  {position:2d}. {name}")

In [None]:
# Build and train LSTM
from sklearn.model_selection import train_test_split

# Model input is (timesteps, features), taken from the training array's
# second and third dimensions.
input_shape = (seq_data['X_train_seq'].shape[1], seq_data['X_train_seq'].shape[2])
# Up-weight the positive (default) class by the loader's scale_pos_weight.
class_weights = {0: 1.0, 1: loader.get_scale_pos_weight()}

# Hold out a stratified 20% of the TRAINING sequences for validation instead
# of passing the test set as X_val/y_val. If fit() monitors the validation
# data (early stopping / checkpointing -- not visible from this notebook),
# using the test set there would leak into the reported test metrics.
X_lstm_fit, X_lstm_val, y_lstm_fit, y_lstm_val = train_test_split(
    seq_data['X_train_seq'], seq_data['y_train'],
    test_size=0.2, stratify=seq_data['y_train'], random_state=42,
)

lstm_model = LSTMModel()
lstm_model.build_model(input_shape)
lstm_model.model.summary()

history = lstm_model.fit(
    X_lstm_fit, y_lstm_fit,
    X_val=X_lstm_val, y_val=y_lstm_val,
    epochs=100, batch_size=32,
    class_weight=class_weights,
)

In [None]:
# Plot train vs validation curves for loss (left) and AUC (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# (axis, train key, val key, train label, val label, y-label, title)
history_panels = [
    (ax1, 'loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss', 'Training Loss'),
    (ax2, 'auc', 'val_auc', 'Train AUC', 'Val AUC', 'AUC', 'Training AUC'),
]
for panel, train_key, val_key, train_label, val_label, y_label, panel_title in history_panels:
    panel.plot(history.history[train_key], label=train_label)
    panel.plot(history.history[val_key], label=val_label)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Score the held-out test sequences with the trained LSTM
lstm_proba = lstm_model.predict_proba(seq_data['X_test_seq'])
y_test_seq = seq_data["y_test"]

print('LSTM - Test Set Performance:')
lstm_auc_roc = roc_auc_score(y_test_seq, lstm_proba)
print(f'  AUC-ROC: {lstm_auc_roc:.4f}')
lstm_auc_pr = average_precision_score(y_test_seq, lstm_proba)
print(f'  AUC-PR: {lstm_auc_pr:.4f}')

In [None]:
# 5-fold cross-validation of the LSTM on the training sequences (slow)
print('LSTM - 5-Fold Cross-Validation:')
lstm_cv_settings = dict(
    n_splits=5, epochs=100, batch_size=32,
    class_weight=class_weights,
)
lstm_cv = lstm_model.cross_validate(
    seq_data['X_train_seq'], seq_data['y_train'], **lstm_cv_settings
)
for metric in lstm_cv:
    fold_scores = lstm_cv[metric]
    print(f'  {metric}: {np.mean(fold_scores):.4f} +/- {np.std(fold_scores):.4f}')

## 7. Model Comparison

In [None]:
# Create evaluator with the shared test set.
# The LSTM predictions come from seq_data while the evaluator is keyed on
# static_data['y_test'], so first confirm both test splits carry the same
# labels in the same order. Equal labels are a necessary (not sufficient)
# condition for the two splits to cover the same users.
y_test_static_arr = np.asarray(static_data['y_test']).ravel()
y_test_seq_arr = np.asarray(seq_data['y_test']).ravel()
if not np.array_equal(y_test_static_arr, y_test_seq_arr):
    raise ValueError(
        'Static and sequence test labels differ; LSTM predictions cannot be '
        'scored against static_data["y_test"]. Check that both splits cover '
        'the same users in the same order.'
    )

evaluator = ModelEvaluator(static_data['y_test'])
evaluator.add_model('Logistic Regression', lr_proba)
evaluator.add_model('XGBoost', xgb_proba)
evaluator.add_model('LSTM', lstm_proba)

# Comparison table
comparison = evaluator.get_comparison_table()
print('\nModel Comparison (Test Set):')
print(comparison.round(4).to_string())

In [None]:
# Collect each model's CV metrics as "mean +/- std" strings, one row per model
cv_by_model = {
    'Logistic Regression': lr_cv,
    'XGBoost': xgb_cv,
    'LSTM': lstm_cv,
}
cv_results = {
    model_name: {
        metric: f"{np.mean(fold_scores):.4f} +/- {np.std(fold_scores):.4f}"
        for metric, fold_scores in model_cv.items()
    }
    for model_name, model_cv in cv_by_model.items()
}

# Transpose so models are rows and metrics are columns
cv_df = pd.DataFrame(cv_results).T
print('\nCross-Validation Results (5-Fold):')
print(cv_df.to_string())

In [None]:
# ROC curves on the left panel, precision-recall curves on the right
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(14, 6))
evaluator.plot_roc_curves(ax=roc_ax)
evaluator.plot_pr_curves(ax=pr_ax)
plt.tight_layout()
plt.show()

In [None]:
# Draw the evaluator's confusion-matrix figure
confusion_figsize = (16, 4)
evaluator.plot_confusion_matrices(figsize=confusion_figsize)
plt.show()

In [None]:
# Compare LR coefficients with XGBoost importances (top_n=15)
evaluator.plot_feature_importance_comparison(lr_coefs, xgb_importance, top_n=15)
plt.show()

In [None]:
# One threshold-analysis panel per model, left to right
model_names = ['Logistic Regression', 'XGBoost', 'LSTM']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel_idx, name in enumerate(model_names):
    evaluator.plot_threshold_analysis(name, ax=axes[panel_idx])
plt.tight_layout()
plt.show()

In [None]:
# Classification reports: delegate to the evaluator's report printer.
# NOTE(review): presumably covers every model registered via add_model and
# uses a fixed decision threshold -- confirm in src.models.ModelEvaluator.
evaluator.print_classification_reports()

## 8. Analysis

### Key Findings

**Static vs Sequential Performance:**
- Compare AUC-ROC and AUC-PR scores across models to determine whether
  the LSTM's ability to model temporal transaction patterns provides
  meaningful lift over user-level aggregates used by LR and XGBoost.

**Feature Importance Insights:**
- Loan-specific features (repayment_to_loan_ratio, has_any_repayment, etc.)
  are expected to be strong predictors given the target is loan default.
- Balance dynamics and transaction patterns may provide additional signal.

**Class Imbalance:**
- With a default rate of roughly 9%, AUC-PR (Average Precision) is more
  informative than AUC-ROC for assessing model quality on the minority class.

**Practical Considerations:**
- Logistic Regression offers full interpretability via coefficients.
- XGBoost typically offers the best accuracy-to-complexity tradeoff.
- LSTM captures sequential patterns but requires more data and compute.

In [None]:
# Persist the test-set comparison table next to the other data files
comparison_csv = os.path.join(project_root, 'data/model_comparison.csv')
comparison.to_csv(comparison_csv)
print('Comparison table saved to data/model_comparison.csv')