# Regression Model for Loss Given Default (LGD)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error

%config IPCompleter.greedy=True

pd.options.display.float_format = '{:.2f}'.format

import warnings
warnings.filterwarnings("ignore")

In [2]:

# Load the pickled LGD modelling table from the project's work-data folder.
lgd_board_path = '../../02_Data/03_Work/df_board_lgd.pickle'
df = pd.read_pickle(lgd_board_path)


In [3]:
# Separate the regression target (`target_lgd`) from the predictor columns.
y = df['target_lgd']
x = df.drop(columns=['target_lgd'])

## MODEL

In [4]:

# Hold out 30% of the rows for validation.
# The split is seeded so that the grid-search ranking and the validation
# metrics computed further down are reproducible across kernel restarts.
train_x, val_x, train_y, val_y = train_test_split(x, y,
                                                  test_size=0.3,
                                                  random_state=42)


In [5]:
# Single-step pipeline. The Ridge() below is only a placeholder: every grid
# entry swaps in its own estimator through the 'algorithm' parameter.
pipe = Pipeline([('algorithm', Ridge())])

# Regularisation strengths 0.1, 0.2, ..., 1.0 as plain Python floats.
# Built from integers and rounded so that float-step artefacts
# (e.g. 0.30000000000000004) do not leak into cv_results_.
alphas = [round(0.1 * k, 1) for k in range(1, 11)]

# One dict per candidate algorithm; GridSearchCV explores each dict separately.
grid = [
    {'algorithm': [Ridge()],
     'algorithm__alpha': alphas},

    {'algorithm': [Lasso()],
     'algorithm__alpha': alphas},

    # NOTE(review): per the scikit-learn docs, `scoring` on
    # HistGradientBoostingRegressor is the metric used for early stopping only;
    # model selection uses the scoring passed to GridSearchCV. A percentage
    # error can be unstable if the target contains zeros -- confirm it is intended.
    # random_state pins the estimator's internal randomness (the early-stopping
    # validation split and binning subsample) so repeated searches agree.
    {'algorithm': [HistGradientBoostingRegressor(min_samples_leaf=100,
                                                  scoring='neg_mean_absolute_percentage_error',
                                                  random_state=42)],
     'algorithm__learning_rate': [0.01, 0.025, 0.05, 0.1],
     'algorithm__max_iter': [50, 100, 200],
     'algorithm__max_depth': [5, 10, 20],
     'algorithm__l2_regularization': [0, 0.25, 0.5, 0.75, 1]},
]

#### Optimize hyperparameters

In [None]:
# Exhaustive search over every candidate in `grid`, scored by (negated) mean
# absolute error with 3-fold cross-validation, using all available cores.
search_settings = {
    'estimator': pipe,
    'param_grid': grid,
    'cv': 3,
    'scoring': 'neg_mean_absolute_error',
    'verbose': 0,
    'n_jobs': -1,
}
grid_search = GridSearchCV(**search_settings)

# fit() returns the search object itself, so `model` is the fitted search.
grid_search.fit(train_x, train_y)
model = grid_search

In [None]:
# Because the target is very polarized, the model makes large errors on individual cases.
# Its predictions regress towards the mean,
# which makes the model look "good" overall even though it often fails on specific cases.



In [None]:
# Ten best-ranked hyperparameter combinations from the grid search.
cv_table = pd.DataFrame(grid_search.cv_results_)
output = (
    cv_table
    .sort_values(by='rank_test_score')
    .reset_index(drop=True)
    .head(10)
)
output

In [None]:
# Final LGD model, trained on the training split only.
# The hyperparameters are hard-coded; presumably they were copied from the
# top-ranked row of the grid-search results -- re-check them whenever the
# search is re-run.
# NOTE(review): per the scikit-learn docs, `scoring` here only selects the
# early-stopping metric; a percentage error can be unstable if the target
# contains zeros -- confirm it is intended.
model_lgd = HistGradientBoostingRegressor(learning_rate = 0.1,
                                          max_iter = 200,
                                          max_depth = 20,
                                          min_samples_leaf = 100,
                                          scoring = 'neg_mean_absolute_percentage_error',
                                          l2_regularization = 0.25,
                                          # Pin the estimator's internal randomness so
                                          # refitting yields the same model.
                                          random_state = 42)
model_lgd.fit(train_x,train_y)

### Evaluate

In [None]:
# Predict on the validation split and clamp the result to the [0, 1] interval:
# values below 0 become 0 and values above 1 become 1.
raw_pred = model_lgd.predict(val_x)
pred = np.clip(raw_pred, 0, 1)

In [None]:
# Mean absolute error of the clamped predictions on the validation split.
mean_absolute_error(y_true=val_y, y_pred=pred)

### Report

In [None]:
# Put actual and predicted values side by side, indexed like `val_y`.
real_vs_pred = {
    'lgd_real': val_y,
    'lgd_pred': pred,
}
check_validation = pd.DataFrame(real_vs_pred)
check_validation

In [None]:
# Pearson correlation (the pandas default) between actual and predicted values.
check_validation.corr(method='pearson')

In [None]:
# Scatter of predicted against actual values; the trailing ';' hides the Axes repr.
check_validation.plot(kind='scatter', x='lgd_real', y='lgd_pred');

In [None]:
# Overlaid, semi-transparent histograms of both columns; the trailing ';' hides the Axes repr.
check_validation.plot(kind='hist', bins=100, figsize=(10, 6), alpha=0.3);