# 03 - Credit Risk Modeling

This notebook builds baseline predictive models for credit default using the processed German Credit dataset. We train Logistic Regression and Random Forest models, evaluate them with classification reports and ROC-AUC, and summarize the findings.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Directory holding the processed dataset, relative to the notebook's
# working directory. It is created (including parents) when missing so
# later path operations do not fail on a fresh checkout.
DATA_DIR = Path('../data/processed')
DATA_DIR.mkdir(parents=True, exist_ok=True)

## 1. Load Cleaned German Credit Data (My Version)

In [None]:
# NOTE: despite its name, `df` is the path to the CSV, not a DataFrame;
# the loaded frame is stored in `credit_data_model`.
df = DATA_DIR / 'german_credit_clean.csv'
if not df.exists():
    # Nothing is loaded here; the later cells check for
    # `credit_data_model` before doing any work.
    print('Cleaned dataset not found. Please run 02_exploratory_analysis.ipynb first.')
else:
    credit_data_model = pd.read_csv(df)
    print('Loaded processed German Credit dataset:', credit_data_model.shape)

## 2. Data Preprocessing (Steps I Applied)

In [None]:
if 'credit_data_model' in locals():
    # 'default' is assumed to be the target column of the cleaned dataset.
    target = 'default'
    X = credit_data_model.drop(columns=[target])
    y = credit_data_model[target]

    # Partition the features into numeric and "everything else". Taking the
    # categorical list as the complement of the numeric one guarantees that
    # bool / category / string-dtype columns are one-hot encoded instead of
    # being silently discarded by ColumnTransformer, whose default
    # remainder is 'drop'.
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    # Scale numeric features; one-hot encode categoricals, ignoring
    # categories at predict time that were not seen during fit.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ])

## 3. Split Data into Training and Test Sets

In [None]:
if 'X' in locals():
    # Hold out 20% of the rows for evaluation. The split is stratified on
    # the target so the class ratio is the same in both partitions; with
    # an imbalanced target a plain random split can skew the small test
    # set and make the reported metrics noisy. (Stratification requires at
    # least two rows per class.) random_state keeps the split reproducible.
    X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print('Training set:', X_train_data.shape, 'Test set:', X_test_data.shape)

## 4. Logistic Regression Baseline Model

In [None]:
if 'X_train_data' in locals():
    # Baseline model: the preprocessing step followed by a logistic
    # regression, fitted end-to-end on the training partition.
    lr_pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000)),
        ]
    )
    lr_pipeline.fit(X_train_data, y_train_data)

    # Hard labels feed the classification report; the second
    # predict_proba column is used as the score for ROC-AUC.
    y_pred_lr = lr_pipeline.predict(X_test_data)
    y_prob_lr = lr_pipeline.predict_proba(X_test_data)[:, 1]

    print('Logistic Regression Classification Report:')
    print(classification_report(y_test_data, y_pred_lr))
    print('ROC-AUC:', roc_auc_score(y_test_data, y_prob_lr))

## 5. Random Forest Model & Feature Importance

In [None]:
if 'X_train_data' in locals():
    # Same preprocessing as the baseline, followed by a 100-tree random
    # forest with a fixed seed for reproducibility.
    rf_pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
        ]
    )
    rf_pipeline.fit(X_train_data, y_train_data)

    # Hard labels feed the classification report; the second
    # predict_proba column is used as the score for ROC-AUC.
    y_pred_rf = rf_pipeline.predict(X_test_data)
    y_prob_rf = rf_pipeline.predict_proba(X_test_data)[:, 1]
    print('Random Forest Classification Report:')
    print(classification_report(y_test_data, y_pred_rf))
    print('ROC-AUC:', roc_auc_score(y_test_data, y_prob_rf))

    # Feature importance: the forest is trained on the preprocessor's
    # output, so importances are paired with the transformed feature names
    # (scaled numerics and one-hot columns), not the raw input columns.
    rf_model = rf_pipeline.named_steps['classifier']
    importance = rf_model.feature_importances_
    feature_names = rf_pipeline.named_steps['preprocessor'].get_feature_names_out()
    feature_importance = pd.Series(importance, index=feature_names).sort_values(ascending=False)
    print('Random Forest feature importances calculated.')
    print('Top 10 features by importance:')
    print(feature_importance.head(10))

## 6. My Insights and Interpretation
- Compare Logistic Regression vs Random Forest performance.
- Highlight important predictors of default.
- Discuss implications for credit risk management.