In [1]:
import pickle

import numpy as np
import pandas as pd

%config IPCompleter.greedy=True

from sklearn.base import clone

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [2]:
# Location of the raw loans file, relative to this notebook.
path = '../../02_Data/01_Originals/loans.csv'

# The first CSV column becomes the row index of the frame.
df = pd.read_csv(filepath_or_buffer = path, index_col = 0)

In [3]:
# Show the column names of the loaded frame as a plain Python list.
df.columns.tolist()

['employment',
 'employment_duration',
 'income',
 'verified_income',
 'rating',
 'dti',
 'housing',
 'num_mortgages',
 'num_credit_lines',
 'pct_cards_75p',
 'pct_revolving_utilization',
 'num_cancellations_12months',
 'num_derogatories',
 'months_since_last_delinquency',
 'loan_id',
 'description',
 'purpose',
 'principal',
 'interest_rate',
 'num_installments',
 'installment_amount',
 'amount_amortized',
 'status',
 'amount_recovered']

In [4]:
# Columns kept for modelling. The order is preserved when the frame is subset with this list.
final_variables = [
    # categorical predictors listed in var_ohe
    'verified_income', 'housing', 'purpose', 'num_installments',
    # ordered categorical predictors listed in var_oe
    'employment_duration', 'rating',
    # numeric predictors listed in var_mms
    'income', 'dti', 'num_credit_lines', 'pct_revolving_utilization',
    'principal', 'interest_rate', 'installment_amount',
    # predictor listed in var_bin
    'num_derogatories',
    # columns used to derive the targets and dropped afterwards
    'status', 'amount_amortized', 'amount_recovered',
]

## Structure of the datasets

In [5]:
# Index labels of the rows whose income is above 300000; they are removed in the next cell.
income_above_cap = df.income > 300000
to_delete = df.loc[income_above_cap].index.values

In [6]:
# Keep only the rows whose index label was not flagged in `to_delete`.
rows_to_keep = ~df.index.isin(to_delete)
df = df[rows_to_keep]

In [7]:
# Restrict the frame to the modelling columns, in the order given by `final_variables`.
df = df[final_variables]

## Pipeline

In [8]:
def data_quality(temp):
    """Return a cleaned copy of ``temp`` with missing values imputed.

    Missing ``employment_duration`` entries are replaced by the string
    ``'unknown'`` and missing values in every numeric column are replaced
    by ``0``. Non-numeric columns other than ``employment_duration`` are
    left as they are.

    Parameters
    ----------
    temp : pandas.DataFrame
        Frame that contains an ``employment_duration`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputations applied; ``temp`` itself is not
        modified.
    """
    # Work on a copy so the caller's frame is never written to. Writing into
    # the argument mutated notebook state behind the reader's back and, when
    # the argument is a slice of another frame, triggers pandas'
    # SettingWithCopyWarning.
    cleaned = temp.copy()

    cleaned['employment_duration'] = cleaned['employment_duration'].fillna('unknown')

    for column in cleaned.select_dtypes('number').columns:
        cleaned[column] = cleaned[column].fillna(0)

    return cleaned

In [9]:
def variable_creation_pd(df):
    """Split ``df`` into predictors and the binary ``target_pd`` column.

    ``target_pd`` is 1 when ``status`` is one of 'Charged Off',
    'Does not meet the credit policy. Status:Charged Off' or 'Default',
    and 0 otherwise. The housing values 'ANY', 'NONE' and 'OTHER' are
    recoded to 'MORTGAGE'; the purposes 'wedding', 'educational' and
    'renewable_energy' are recoded to 'other'. ``status``,
    ``amount_amortized`` and ``amount_recovered`` are dropped.

    The input frame is not modified.

    Returns
    -------
    tuple
        ``(x, y)``: every column except the last one, and the last column
        (the target) as a Series.
    """
    default_statuses = ['Charged Off','Does not meet the credit policy. Status:Charged Off','Default']

    prepared = df.assign(
        target_pd = lambda d: np.where(d.status.isin(default_statuses), 1, 0),
        housing = lambda d: d.housing.replace(['ANY','NONE','OTHER'],'MORTGAGE'),
        purpose = lambda d: d.purpose.replace(['wedding','educational','renewable_energy'],'other'),
    ).drop(columns = ['status','amount_amortized','amount_recovered'])

    # The freshly created target is the last column of the frame.
    predictors = prepared.iloc[:, :-1]
    target = prepared.iloc[:, -1]

    return (predictors, target)

In [10]:
def variable_creation_ead(df):
    """Split ``df`` into predictors and the ``target_ead`` column.

    ``target_ead`` is ``(principal - amount_amortized) / principal``.
    The housing values 'ANY', 'NONE' and 'OTHER' are recoded to
    'MORTGAGE'; the purposes 'wedding', 'educational' and
    'renewable_energy' are recoded to 'other'. ``status``,
    ``amount_amortized``, ``amount_recovered`` and the helper column
    ``remaining`` are dropped.

    Note: no guard is applied for ``principal == 0``, so such rows get a
    NaN or infinite target from the division.

    The input frame is not modified.

    Returns
    -------
    tuple
        ``(x, y)``: every column except the last one, and the last column
        (the target) as a Series.
    """
    prepared = df.assign(
        remaining = lambda d: d.principal - d.amount_amortized,
        target_ead = lambda d: d.remaining / d.principal,
        housing = lambda d: d.housing.replace(['ANY','NONE','OTHER'],'MORTGAGE'),
        purpose = lambda d: d.purpose.replace(['wedding','educational','renewable_energy'],'other'),
    ).drop(columns = ['status','amount_amortized','amount_recovered','remaining'])

    # Once the helper column is dropped the target is the last column.
    predictors = prepared.iloc[:, :-1]
    target = prepared.iloc[:, -1]

    return (predictors, target)

In [11]:
def variable_creation_lgd(df):
    """Split ``df`` into predictors and the ``target_lgd`` column.

    ``target_lgd`` is ``1 - amount_recovered / (principal - amount_amortized)``
    with missing results (e.g. 0 / 0) replaced by 0. The housing values
    'ANY', 'NONE' and 'OTHER' are recoded to 'MORTGAGE'; the purposes
    'wedding', 'educational' and 'renewable_energy' are recoded to
    'other'. ``status``, ``amount_amortized``, ``amount_recovered`` and the
    helper column ``remaining`` are dropped.

    The input frame is not modified.

    Returns
    -------
    tuple
        ``(x, y)``: every column except the last one, and the last column
        (the target) as a Series.
    """
    prepared = df.assign(
        remaining = lambda d: d.principal - d.amount_amortized,
        target_lgd = lambda d: (1 - (d.amount_recovered / d.remaining)).fillna(0),
        housing = lambda d: d.housing.replace(['ANY','NONE','OTHER'],'MORTGAGE'),
        purpose = lambda d: d.purpose.replace(['wedding','educational','renewable_energy'],'other'),
    ).drop(columns = ['status','amount_amortized','amount_recovered','remaining'])

    # Once the helper column is dropped the target is the last column.
    predictors = prepared.iloc[:, :-1]
    target = prepared.iloc[:, -1]

    return (predictors, target)

In [12]:
# Clean the data once and reuse the result. data_quality only fills missing
# values, so the three separate calls produced the same frame each time and
# merely repeated the work. Sharing the cleaned frame is safe because every
# variable_creation_* function builds its own copy before changing anything.
df_clean = data_quality(df)

x_pd, y_pd = variable_creation_pd(df_clean)

x_ead, y_ead = variable_creation_ead(df_clean)

x_lgd, y_lgd = variable_creation_lgd(df_clean)

### Instantiate variable transformation

In [13]:
# ONE HOT ENCODING
# Categories not seen during fit are ignored at transform time; output is a dense array.
var_ohe = ['verified_income', 'housing', 'purpose', 'num_installments']
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)


# ORDINAL ENCODING
var_oe = ['employment_duration', 'rating']

# Explicit category order, lowest code first.
order_emp_dur = [
    'unknown',
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
]
order_rating = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Unseen categories are encoded as 12, a value outside the codes 0-11
# (employment_duration) and 0-6 (rating) produced by the lists above.
oe = OrdinalEncoder(
    categories = [order_emp_dur, order_rating],
    handle_unknown = 'use_encoded_value',
    unknown_value = 12,
)

# BINARIZATION
# Values above the threshold 0 become 1, everything else 0.
var_bin = ['num_derogatories']
bina = Binarizer(threshold = 0)


# MIN-MAX SCALING
var_mms = [
    'income', 'dti', 'num_credit_lines', 'pct_revolving_utilization',
    'principal', 'interest_rate', 'installment_amount',
]
mms = MinMaxScaler()


In [14]:
# create pipe

# Send each group of columns to its transformer; any column not listed in a
# group is passed through unchanged.
ct = make_column_transformer(
    (ohe, var_ohe), (oe, var_oe), (bina, var_bin), (mms, var_mms),
    remainder = 'passthrough',
)

In [15]:
# instantiate models

# Fixed seed so a Restart & Run All reproduces the same fitted models.
# Per the scikit-learn documentation, `random_state` drives the data shuffling
# of the 'saga' solver in LogisticRegression, and in
# HistGradientBoostingRegressor it drives the binning subsample and the
# train/validation split used when early stopping is active.
SEED = 42

model_pd = LogisticRegression(solver = 'saga',
                              n_jobs = -1,
                              C = 0.25,
                              penalty = 'l1',
                              random_state = SEED)

model_ead = HistGradientBoostingRegressor(learning_rate = 0.1,
                                          max_iter = 200,
                                          max_depth = 10,
                                          min_samples_leaf = 100,
                                          scoring = 'neg_mean_absolute_percentage_error',
                                          l2_regularization = 0.75,
                                          random_state = SEED)

model_lgd = HistGradientBoostingRegressor(learning_rate = 0.1,
                                          max_iter = 200,
                                          max_depth = 20,
                                          min_samples_leaf = 100,
                                          scoring = 'neg_mean_absolute_percentage_error',
                                          l2_regularization = 0,
                                          random_state = SEED)

In [16]:
# training pipes

# Give every pipeline its own unfitted copy of the column transformer.
# Passing the same `ct` object to all three pipelines makes them share one
# preprocessing state: each fit() call refits that single object, so the
# pipelines fitted earlier end up with the preprocessing learned from the
# last fit. That only goes unnoticed while all three feature frames are identical.
pipe_training_pd = make_pipeline(clone(ct), model_pd)
pipe_training_ead = make_pipeline(clone(ct), model_ead)
pipe_training_lgd = make_pipeline(clone(ct), model_lgd)


In [17]:
# SAVE the pipes

pipe_training_pd_path  = '../../04_Models/pipe_training_pd.pickle'
pipe_training_ead_path = '../../04_Models/pipe_training_ead.pickle'
pipe_training_lgd_path = '../../04_Models/pipe_training_lgd.pickle'

# Pickle each training pipe to its own file. In notebook order this runs
# before any fit() call, so the saved objects are the unfitted definitions.
training_pipes_to_save = (
    (pipe_training_pd_path, pipe_training_pd),
    (pipe_training_ead_path, pipe_training_ead),
    (pipe_training_lgd_path, pipe_training_lgd),
)

for destination, training_pipe in training_pipes_to_save:
    with open(destination, mode='wb') as file:
        pickle.dump(training_pipe, file)



### Train the pipes

In [18]:
# Fit each pipeline on its own predictors/target pair, in the order pd, ead, lgd.
# Each `pipe_execution_*` name is bound to whatever fit() returns.
pipe_execution_pd  = pipe_training_pd.fit(X = x_pd, y = y_pd)
pipe_execution_ead = pipe_training_ead.fit(X = x_ead, y = y_ead)
pipe_execution_lgd = pipe_training_lgd.fit(X = x_lgd, y = y_lgd)

## Save the trained pipes

In [19]:
path_pipe_execution_pd  = '../../04_Models/pipe_execution_pd.pickle'
path_pipe_execution_ead = '../../04_Models/pipe_execution_ead.pickle'
path_pipe_execution_lgd = '../../04_Models/pipe_execution_lgd.pickle'

# Pickle each fitted pipe to its own file.
execution_pipes_to_save = (
    (path_pipe_execution_pd, pipe_execution_pd),
    (path_pipe_execution_ead, pipe_execution_ead),
    (path_pipe_execution_lgd, pipe_execution_lgd),
)

for destination, execution_pipe in execution_pipes_to_save:
    with open(destination, mode='wb') as file:
        pickle.dump(execution_pipe, file)