# Create an XGBoost model

Using the data created in the previous notebook, we will try to predict the probability of default in the next twelve months. Other situations, such as loan ongoing and loan being repaid will be counted as a non-default

In [1]:
import numpy as np
import pandas as pd

path_to_csv = "../data/mortgage_default_transformed.csv"
df = pd.read_csv(path_to_csv)
df.set_index(df.pop("Unnamed: 0"), inplace=True)

X = df.dropna(axis=0)
y = X.pop("result")

In [2]:
# custom features
X["time_since_start"] = X.time - X.orig_time
X["time_until_end"] = X.mat_time - X.time
X["mortgage_duration"] = X.mat_time - X.orig_time
X["balance_change"] = X.balance_time-X.balance_orig_time
X["LTV_change"] = X.LTV_time - X.LTV_orig_time
X["interest_change"] = X.interest_rate_time - X.Interest_Rate_orig_time
# X["interest_change_as_fraction"] = X.interest_change / X.Interest_Rate_orig_time
X["hpi_change"] = X.hpi_time - X.hpi_orig_time

In [3]:
# select useful columns
used_columns = [
    "time_since_start", "time_until_end", "mortgage_duration",
    "balance_time", "balance_orig_time", "balance_change",
    "LTV_time", "LTV_orig_time", "LTV_change",
    "interest_rate_time", "Interest_Rate_orig_time", "interest_change",
    #"interest_change_as_fraction",
    "hpi_time", "hpi_orig_time",
    "hpi_change", "uer_time"
]

In [4]:
X_data = X.loc[:, used_columns]
X_data.head()

Unnamed: 0_level_0,time_since_start,time_until_end,mortgage_duration,balance_time,balance_orig_time,balance_change,LTV_time,LTV_orig_time,LTV_change,interest_rate_time,Interest_Rate_orig_time,interest_change,hpi_time,hpi_orig_time,hpi_change,uer_time
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,33,87,120,41061.95,45000.0,-3938.05,24.483867,69.4,-44.916133,9.2,9.2,0.0,225.1,87.03,138.07,4.7
1,51,69,120,30585.43,45000.0,-14414.57,27.202731,69.4,-42.197269,9.2,9.2,0.0,150.91,87.03,63.88,9.2
2,28,93,121,60882.42,63750.0,-2867.58,34.007232,81.8,-47.792768,10.5,10.5,0.0,225.1,97.99,127.11,4.7
3,39,82,121,59023.8,63750.0,-4726.2,48.394751,81.8,-33.405249,9.25,10.5,-1.25,153.35,97.99,55.36,9.0
4,60,61,121,53400.71,63750.0,-10349.29,35.644306,81.8,-46.155694,10.5,10.5,0.0,188.37,97.99,90.38,6.2


In [5]:
# split the data for testing the model
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X_data, y,
    train_size=0.7,
    test_size=0.3)

In [6]:
# we will try to select a better model using accuracy scoring
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def score_model(X_train, y_train, X_valid, y_valid, n_estimators, learning_rate=0.1):
    model = XGBClassifier(
                n_estimators = n_estimators,
                learning_rate = learning_rate,
                n_jobs = 5,
                eval_metric = 'mlogloss'
            )
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    
    accuracy = accuracy_score(predictions, y_valid)
    return accuracy

  from pandas import MultiIndex, Int64Index


In [8]:
# shut up warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# we will try different number of estimators to see, which is the best fit
estimator_count = [10, 25, 36, 50, 100]

last_accuracy = 0
for n_estimators in estimator_count:
    accuracy = score_model(
        X_train, y_train, X_valid, y_valid,
        n_estimators=n_estimators,
        learning_rate=0.5 # let's speed things up
    )
    print(f"The accuracy with n_estimators is {n_estimators} with an accuracy of {accuracy}")

The accuracy with n_estimators is 10 with an accuracy of 0.8278959550314119
The accuracy with n_estimators is 25 with an accuracy of 0.825967155295933
The accuracy with n_estimators is 36 with an accuracy of 0.8240383555604541
The accuracy with n_estimators is 50 with an accuracy of 0.8233219442301334
The accuracy with n_estimators is 100 with an accuracy of 0.8210073845475587


In [204]:
some_model = XGBClassifier(
    n_estimators = 100,
    max_depth=5,
    learning_rate = 0.01,
    reg_alpha = 0.3,
    eval_metric = "rmse",
    scale_pos_weight = 2.1 # this tries to prevent overfitting
)
some_model.fit(X_train, y_train)
predictions = some_model.predict(X_valid)
accuracy = accuracy_score(predictions, y_valid)
print(f"Accuracy is {accuracy}")

Accuracy is 0.8094896946985561


In [202]:
# save the model
path_to_model = "../models/proper_model.json"
some_model.save_model(path_to_model)

This is much better than the previous model, where repayment was a also possible output scenario. By only focusing on the probability of default, we have achieved over 80% accuracy! Let's try calculating the Mean Squared Error next!

In [10]:
import random

arbitrary_predictions = [random.randint(0,1) for _ in range(y_valid.shape[0])]

print(f"Accuracy of randomly generated results is {accuracy_score(arbitrary_predictions, y_valid)}")

Accuracy of randomly generated results is 0.4973547889342004


Just for reference, a randomly generated array of answers would be around 50% accurate at predicting a default. This means, that our model is around 60% better than just randomly guessing. Which is good, but we can make it much better.

In [120]:
hits = 0
total = len(y_valid)
for val1, val2 in zip(predictions, y_valid):
    if val1 == val2:
        hits += 1
print(f"Prediction accuracy is {hits/total}") 

Prediction accuracy is 0.8127962085308057


In [121]:
rand_hits = 0
total = len(y_valid)
for val1, val2 in zip(arbitrary_predictions, y_valid):
    if val1 == val2:
        rand_hits += 1
print(f"Random guessing results in {rand_hits/total} accuracy")

Random guessing results in 0.4973547889342004 accuracy


In [205]:
# make sure that the ML model is not just guessing zeroes
sample_default_rate = sum(y_valid)/len(y_valid)
print(f"The number of defaults compared to the rest of the validation dataset is {sample_default_rate}")
default_rate = sum(predictions)/len(predictions)
print(f"The number of defaults compared to the rest of the prediction data is {default_rate}")

The number of defaults compared to the rest of the validation dataset is 0.18400749476468642
The number of defaults compared to the rest of the data is 0.18141739226275763
