In [62]:
#ngboost and modelling libraries
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.tree import DecisionTreeRegressor

#data manipulation libraries
import pandas as pd
import numpy as np

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#import functions and constants
from utils import (correlation_heatmap, register_amputation, preprocess_df, 
                    scaling_values_df, plot_target_balance, plot_trace_line, plot_scatter_matrix, 
                    plot_feature_importances, visualize_roc_curve, color_negative_red,clean_outliers,)

from constants import cols

## READING DATAFRAME

In [3]:
df = pd.read_csv("./data/cs-training.csv", usecols =cols)

In [4]:
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
len(df)

150000

## FEATURE ENGINEERING

In [6]:
df = clean_outliers(df.copy())

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142500 entries, 0 to 149999
Data columns (total 11 columns):
SeriousDlqin2yrs                        142500 non-null int64
RevolvingUtilizationOfUnsecuredLines    142500 non-null float64
age                                     142500 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    142500 non-null int64
DebtRatio                               142500 non-null float64
MonthlyIncome                           142500 non-null float64
NumberOfOpenCreditLinesAndLoans         142500 non-null int64
NumberOfTimes90DaysLate                 142500 non-null int64
NumberRealEstateLoansOrLines            142500 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    142500 non-null int64
NumberOfDependents                      142500 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.0 MB


In [8]:
print(df.isnull().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [9]:
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0
mean,0.063586,3.664576,49.202042,0.305782,353.312505,5197.10167,8.332463,0.153502,1.012407,0.134196,0.736786
std,0.244015,139.248834,10.750373,2.780339,1109.832004,5883.519409,4.821058,2.738397,1.072852,2.724658,1.098754
min,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029791,41.0,0.0,0.178625,1500.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.152392,52.0,0.0,0.368924,4356.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.549972,60.0,0.0,0.894324,7375.0,11.0,0.0,2.0,0.0,1.0
max,1.0,13498.0,60.0,98.0,34719.0,208333.0,57.0,98.0,32.0,98.0,10.0


## NGBOOST MODEL

In [68]:
X = df.drop(columns=['SeriousDlqin2yrs']).values
y = df.SeriousDlqin2yrs.values

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
import random
list_of_ids=random.sample(range(0,len(X_train)), int(len(X_train) * 0.2))

In [71]:
df_train = pd.DataFrame(X_train, columns = cols [1:])
df_train

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.444704,48.0,0.0,0.290345,1418.0,3.0,1.0,0.0,0.0,0.0
1,0.293498,50.0,0.0,0.726341,3708.0,9.0,0.0,2.0,0.0,0.0
2,0.031428,60.0,0.0,801.000000,0.0,7.0,0.0,1.0,0.0,0.0
3,0.026946,45.0,0.0,0.215653,3500.0,10.0,0.0,1.0,0.0,1.0
4,0.000000,51.0,0.0,0.694084,8400.0,9.0,0.0,3.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...
113995,1.000000,57.0,0.0,0.063240,4474.0,1.0,0.0,0.0,0.0,0.0
113996,0.041357,41.0,0.0,0.469676,12250.0,7.0,0.0,2.0,0.0,2.0
113997,0.409299,45.0,0.0,0.321954,7000.0,11.0,0.0,2.0,0.0,3.0
113998,0.003483,60.0,0.0,3.000000,0.0,4.0,0.0,0.0,0.0,1.0


In [72]:
for idx in list_of_ids:
    for col in ['RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate']:
        df_train.at[idx, col] = 0

In [73]:
X_train = df_train.values

In [74]:
nm_common =SMOTE(random_state=2019)
X_train_resampled, y_train_resampled = nm_common.fit_sample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_train_resampled)))

Resampled dataset shape Counter({0: 106727, 1: 106727})


In [75]:
df_train = pd.DataFrame(X_train_resampled, columns = cols [1:])
df_train

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.444704,48.000000,0.000000,0.290345,1418.000000,3.000000,1.000000,0.000000,0.000000,0.000000
1,0.293498,50.000000,0.000000,0.726341,3708.000000,9.000000,0.000000,2.000000,0.000000,0.000000
2,0.031428,60.000000,0.000000,801.000000,0.000000,7.000000,0.000000,1.000000,0.000000,0.000000
3,0.026946,45.000000,0.000000,0.215653,3500.000000,10.000000,0.000000,1.000000,0.000000,1.000000
4,0.000000,51.000000,0.000000,0.694084,8400.000000,9.000000,0.000000,3.000000,0.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...
213449,0.555632,58.333102,0.555633,0.506651,10000.000000,3.222530,0.000000,1.111265,1.111265,0.000000
213450,0.872735,39.816900,2.408450,0.836808,6666.000000,11.802817,0.000000,2.605633,0.000000,0.394367
213451,0.695271,55.091518,2.945089,2459.274554,0.000000,9.926786,0.000000,2.000000,0.000000,0.000000
213452,1.000000,50.805753,0.097123,0.016408,4028.776988,0.097123,0.194247,0.000000,0.000000,3.000000


In [76]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
class_weights

array([0.53407292, 7.8372061 ])

In [77]:
import numpy as np
npa = np.asarray([class_weights[0] if x == 0 else class_weights[1] for x in  y_train_resampled], dtype=np.float32)

In [78]:
ngb_clf = NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=None, splitter='best'),col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=50,
              natural_gradient=True, tol=0.0001,
              verbose=True, verbose_eval=0)
ngb_clf.fit(X_train_resampled,y_train_resampled,sample_weight=npa)

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=None, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=50,
              natural_gradient=True,
              random_state=RandomState(MT19

In [79]:
def process_unit_cost(x, rate):
    """
    Process the unit cost for each observation, according to our cost matrix.
    
    Args:
        - x: data to be identified with name column (real, predicted or LoanPrincipal)
        - rate: interest rate
        
    Returns for each case his cost value.
    """
    if (x['predicted'] == 1) & (x['real'] == 0):
        return x['LoanPrincipal']*rate
    elif (x['predicted'] == 0) & (x['real'] == 1):
        return x['LoanPrincipal']
    else:
        return 0
#https://www.ecb.europa.eu/press/pr/stats/mfi/html/ecb.mir1907~a86424a725.en.html 0.0156
#sum(df_aux.apply(lambda x: costo_error(x,0.0075), axis=1))

In [80]:
def cost_score(loan, y_pred, y_true):
    """
    From input data, generates auxiliar dataframe in order to apply process_unit_cost for each row and then summarize that.
    
    Args:
        - loan: data about the requested amount of money
        - y_pred: list of predictions
        - y_true: list of true values
        
    Returns sum of unit costs
    """
    aux_df = pd.DataFrame(data = {'LoanPrincipal': loan, 'predicted': y_pred, 'real':y_true})
    return sum(aux_df.apply(lambda x: process_unit_cost(x,0.0075), axis=1))

In [84]:
df_cost = pd.DataFrame(X_test, columns = cols [1:])
df_cost['LoanPrincipal'] = df_cost['MonthlyIncome'] * 2
df_cost['y_true'] = list(y_test)
df_cost['y_pred'] = list(ngb_clf.predict(X_test))

In [85]:
cost_score(list(df_cost['LoanPrincipal']), list(df_cost['y_pred']), list(df_cost['y_true']))

2664698.6950001135