In [1]:
import numpy as np
import pandas as pd
import pickle

%config IPCompleter.greedy=True

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [2]:
# Relative location of the original loans CSV (resolved against the working directory).
path = '../../02_Data/01_Originals/loans.csv'

# Load the file, using its first column as the DataFrame index.
df = pd.read_csv(path, index_col = 0)

In [4]:
# Show every column name available in the loaded frame.
df.columns.to_list()

['employment',
 'employment_duration',
 'income',
 'verified_income',
 'rating',
 'dti',
 'housing',
 'num_mortgages',
 'num_credit_lines',
 'pct_cards_75p',
 'pct_revolving_utilization',
 'num_cancellations_12months',
 'num_derogatories',
 'months_since_last_delinquency',
 'loan_id',
 'description',
 'purpose',
 'principal',
 'interest_rate',
 'num_installments',
 'installment_amount',
 'amount_amortized',
 'status',
 'amount_recovered']

In [5]:
# Columns kept for modelling. The order is significant: it becomes the column
# order of df once the frame is subset with this list.
final_variables = [
    'verified_income',
    'housing',
    'purpose',
    'num_mortgages',
    'employment_duration',
    'rating',
    'income',
    'dti',
    'num_credit_lines',
    'pct_revolving_utilization',
    'principal',
    'interest_rate',
    'installment_amount',
    'num_installments',
    # The last three are consumed when building the targets and are dropped
    # from the feature matrices by the variable_creation_* functions.
    'status',
    'amount_amortized',
    'amount_recovered',
]

## Structure of the datasets

In [10]:
# Index labels of the rows whose income exceeds 300000.
to_delete = df.loc[df.income > 300000].index.values

In [12]:
# Keep only the rows whose index label is not listed in to_delete.
df = df[~df.index.isin(to_delete)]

In [13]:
# Restrict the frame to the modelling variables. The explicit .copy() makes df
# an independent frame, so later column assignments on it (data_quality writes
# columns on the frame it receives) do not operate on a derived slice and
# trigger SettingWithCopyWarning.
df = df[final_variables].copy()

## Pipeline

In [15]:
def data_quality(temp):
    """Fill missing values and return the cleaned frame.

    Missing ``employment_duration`` entries become the string ``'unknown'``
    and missing values in every numeric column become ``0``.

    The input frame is left untouched: the cleaning is applied to a copy, so
    calling this function does not silently alter the caller's DataFrame
    (and does not write into a slice of another frame).

    Parameters
    ----------
    temp : pandas.DataFrame
        Frame containing an ``employment_duration`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index and no missing values in
        ``employment_duration`` or in the numeric columns.

    Raises
    ------
    KeyError
        If ``temp`` has no ``employment_duration`` column.
    """
    cleaned = temp.copy()

    cleaned['employment_duration'] = cleaned['employment_duration'].fillna('unknown')

    for column in cleaned.select_dtypes('number').columns:
        cleaned[column] = cleaned[column].fillna(0)

    return cleaned

In [16]:
def variable_creation_pd(df):
    """Build the predictors and the binary target for the PD model.

    Works on a copy of ``df``. The target ``target_pd`` is 1 when ``status``
    is one of the listed charged-off / default statuses and 0 otherwise.
    Housing values 'ANY', 'NONE' and 'OTHER' are recoded to 'MORTGAGE', and
    purposes 'wedding', 'educational' and 'renewable_energy' to 'other'.
    ``status``, ``amount_amortized`` and ``amount_recovered`` are dropped.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds every remaining column except the last
        one and ``y`` is the last column.
    """
    default_statuses = ['Charged Off',
                        'Does not meet the credit policy. Status:Charged Off',
                        'Default']
    merged_housing = ['ANY', 'NONE', 'OTHER']
    merged_purposes = ['wedding', 'educational', 'renewable_energy']

    data = df.copy()

    # The target is appended last; the positional split below relies on that.
    data['target_pd'] = np.where(data.status.isin(default_statuses), 1, 0)
    data['housing'] = data.housing.replace(merged_housing, 'MORTGAGE')
    data['purpose'] = data.purpose.replace(merged_purposes, 'other')

    data = data.drop(columns=['status', 'amount_amortized', 'amount_recovered'])

    predictors = data.iloc[:, :-1]
    target = data.iloc[:, -1]
    return predictors, target

In [17]:
def variable_creation_ead(df):
    """Build the predictors and the target for the EAD model.

    Works on a copy of ``df``. The target ``target_ead`` is
    ``(principal - amount_amortized) / principal``. Housing values 'ANY',
    'NONE' and 'OTHER' are recoded to 'MORTGAGE', and purposes 'wedding',
    'educational' and 'renewable_energy' to 'other'. ``status``,
    ``amount_amortized``, ``amount_recovered`` and the intermediate
    ``remaining`` column are dropped.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds every remaining column except the last
        one and ``y`` is the last column.
    """
    # NOTE(review): rows with principal == 0 produce NaN/inf in the target;
    # confirm whether such rows can reach this function.
    prepared = (
        df.copy()
        .assign(remaining=lambda d: d.principal - d.amount_amortized)
        .assign(target_ead=lambda d: d.remaining / d.principal)
        .assign(housing=lambda d: d.housing.replace(['ANY', 'NONE', 'OTHER'], 'MORTGAGE'))
        .assign(purpose=lambda d: d.purpose.replace(['wedding', 'educational', 'renewable_energy'], 'other'))
        .drop(columns=['status', 'amount_amortized', 'amount_recovered', 'remaining'])
    )

    # The target was appended after every input column, so it sits last.
    predictors, target = prepared.iloc[:, :-1], prepared.iloc[:, -1]
    return predictors, target

In [20]:
def variable_creation_lgd(df):
    """Build the predictors and the target for the LGD model.

    Works on a copy of ``df``. The target ``target_lgd`` is
    ``1 - amount_recovered / (principal - amount_amortized)``, with missing
    results replaced by 0. Housing values 'ANY', 'NONE' and 'OTHER' are
    recoded to 'MORTGAGE', and purposes 'wedding', 'educational' and
    'renewable_energy' to 'other'. ``status``, ``amount_amortized``,
    ``amount_recovered`` and the intermediate ``remaining`` column are dropped.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds every remaining column except the last
        one and ``y`` is the last column.
    """
    # NOTE(review): fillna(0) covers the NaN from a 0/0 division, but a
    # non-zero recovery over a zero remaining balance yields +/-inf, which is
    # not replaced; confirm whether that case can occur.
    prepared = (
        df.copy()
        .assign(remaining=lambda d: d.principal - d.amount_amortized)
        .assign(target_lgd=lambda d: 1 - (d.amount_recovered / d.remaining))
        .assign(target_lgd=lambda d: d['target_lgd'].fillna(0))
        .assign(housing=lambda d: d.housing.replace(['ANY', 'NONE', 'OTHER'], 'MORTGAGE'))
        .assign(purpose=lambda d: d.purpose.replace(['wedding', 'educational', 'renewable_energy'], 'other'))
        .drop(columns=['status', 'amount_amortized', 'amount_recovered', 'remaining'])
    )

    # The target was appended after every input column, so it sits last.
    predictors, target = prepared.iloc[:, :-1], prepared.iloc[:, -1]
    return predictors, target

In [21]:
# Clean the data once and reuse the result for all three targets. Each
# variable_creation_* function starts from its own copy of the frame it is
# given, so sharing one cleaned frame yields the same outputs as cleaning
# the same df three times.
df_quality = data_quality(df)

x_pd, y_pd = variable_creation_pd(df_quality)

x_ead, y_ead = variable_creation_ead(df_quality)

x_lgd, y_lgd = variable_creation_lgd(df_quality)

### Instantiate variable transformation