# Credit Risk Modelling: Data Modelling

In [10]:
import argparse

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
import lightgbm

from category_encoders.ordinal import OrdinalEncoder


## Loading data

In [11]:
# Read the raw, semicolon-separated dataset; the trailing expression shows (rows, columns).
df = pd.read_csv(
    filepath_or_buffer="../../data/raw/dataset.csv",
    sep=";",
)
df.shape

(99976, 43)

# Train/Test split

In [12]:
# Rows with a known `default` value form the training set;
# rows where `default` is missing form the test set.
df_train = df.loc[df["default"].notna()]
df_test = df.loc[df["default"].isna()]

# Features

In [13]:
# Columns that go through the ordinal encoder before reaching the classifier.
CATEGORICAL_FEATURES = [
    "merchant_category",
    "merchant_group",
    "name_in_email",
]

# Columns handed to the classifier as they are.
NUMERICAL_FEATURES = [
    "account_amount_added_12_24m",
    "account_days_in_dc_12_24m",
    "account_days_in_rem_12_24m",
    "account_days_in_term_12_24m",
    "account_incoming_debt_vs_paid_0_24m",
    "account_status",
    "account_worst_status_0_3m",
    "account_worst_status_12_24m",
    "account_worst_status_3_6m",
    "account_worst_status_6_12m",
    "age",
    "avg_payment_span_0_12m",
    "avg_payment_span_0_3m",
    "max_paid_inv_0_12m",
    "max_paid_inv_0_24m",
    "num_active_div_by_paid_inv_0_12m",
    "num_active_inv",
    "num_arch_dc_0_12m",
    "num_arch_dc_12_24m",
    "num_arch_ok_0_12m",
    "num_arch_ok_12_24m",
    "num_arch_rem_0_12m",
    "num_unpaid_bills",
    "status_last_archived_0_24m",
    "status_2nd_last_archived_0_24m",
    "status_3rd_last_archived_0_24m",
    "status_max_archived_0_6_months",
    "status_max_archived_0_12_months",
    "status_max_archived_0_24_months",
    "recovery_debt",
    "sum_capital_paid_account_0_12m",
    "sum_capital_paid_account_12_24m",
    "sum_paid_inv_0_12m",
    "time_hours",
    "worst_status_active_inv",
]

# Known columns that are deliberately kept out of the model inputs
# (this list also holds the `default` target column).
OTHER_FEATURES = [
    "has_paid",
    "num_arch_written_off_0_12m",
    "num_arch_written_off_12_24m",
    "uuid",
    "default",
]

# Derived lists: model inputs first, then every known column.
MODEL_FEATURES = [*CATEGORICAL_FEATURES, *NUMERICAL_FEATURES]
VALID_FEATURES = [*MODEL_FEATURES, *OTHER_FEATURES]

# Sanity check: no column name may be listed more than once across the groups.
assert len(VALID_FEATURES) == len(set(VALID_FEATURES))


# Model

In [14]:
# Two-step pipeline: column-wise preprocessing followed by a LightGBM classifier.
model = Pipeline(steps=[
    ("feature_preprocessor", ColumnTransformer(transformers=[
        # Note: ordinal encoding works quite well for models based on decision trees
        ("categorical", OrdinalEncoder(handle_missing="return_nan"), CATEGORICAL_FEATURES),
        # Numerical features are forwarded to the classifier without any changes
        ("numerical", "passthrough", NUMERICAL_FEATURES),
    ])),
    ("classifier", lightgbm.LGBMClassifier(
        n_estimators=400,
        num_leaves=12,
        max_depth=4,
        learning_rate=0.02,
        colsample_bytree=0.5,
    )),
])

# Cross validation

In [15]:
def run_cross_validation(df_train):
    """Evaluate the module-level ``model`` pipeline with 10-fold cross-validation.

    For each fold the pipeline is re-fitted on the training part and scored
    with ROC AUC on the held-out part. The per-fold scores and their mean
    (+/- two standard deviations) are printed; nothing is returned.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Labelled rows. Must contain the ``default`` column (used as the
        target) and every column the ``model`` pipeline reads.
    """
    # KFold is used without shuffling, so the folds are consecutive row blocks.
    kfold = KFold(n_splits=10)

    # The whole frame is handed to the pipeline, which is expected to pick
    # the columns it needs itself.
    X = df_train
    y = df_train.default

    results = []

    for index, (train_index, test_index) in enumerate(kfold.split(X), start=1):
        # KFold yields *positional* indices, so X and y must both be sliced
        # with .iloc. Plain ``y[...]`` is label-based on an integer index and
        # would misalign (or raise KeyError) whenever df_train's index is not
        # exactly 0..n-1, e.g. after filtering rows out of a bigger frame.
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        probas = model.predict_proba(X_val)
        # Column 1 holds the predicted probability of the positive class.
        positive_label_proba = probas[:, 1]

        auc_score = roc_auc_score(y_val, positive_label_proba)
        print(f"[Fold #{index}] ROC AUC Score: {auc_score:.3f}")
        results.append(auc_score)

    print(f"ROC AUC Score: {np.mean(results):.3f} (+/-{2 * np.std(results):.3f})")

In [16]:
# Cross-validate the pipeline on the labelled rows; per-fold and overall ROC AUC are printed below.
run_cross_validation(df_train)

[Fold #1] ROC AUC Score: 0.928
[Fold #2] ROC AUC Score: 0.901
[Fold #3] ROC AUC Score: 0.919
[Fold #4] ROC AUC Score: 0.915
[Fold #5] ROC AUC Score: 0.890
[Fold #6] ROC AUC Score: 0.891
[Fold #7] ROC AUC Score: 0.924
[Fold #8] ROC AUC Score: 0.919
[Fold #9] ROC AUC Score: 0.926
[Fold #10] ROC AUC Score: 0.906
ROC AUC Score: 0.912 (+/-0.027)
