In [19]:
#ngboost and modelling libraries
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.tree import DecisionTreeRegressor

#data manipulation libraries
import pandas as pd
import numpy as np

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#import functions and constants
from utils import (correlation_heatmap, register_amputation, preprocess_df, 
                    scaling_values_df, plot_target_balance, plot_trace_line, plot_scatter_matrix, 
                    plot_feature_importances, visualize_roc_curve, color_negative_red,clean_outliers,
                  get_sample_weights,)

from constants import cols

## READING DATAFRAME

In [2]:
# Read the training CSV, keeping only the columns named in `cols`.
df = pd.read_csv(
    filepath_or_buffer="./data/cs-training.csv",
    usecols=cols,
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
# Number of rows in the raw frame.
df.shape[0]

150000

## FEATURE ENGINEERING

In [5]:
# Run the project's outlier cleaning on a copy of the frame and rebind `df` to
# the result (the recorded outputs show 150000 rows before and 135000 after).
# NOTE(review): the second positional argument `True` is an opaque flag of
# utils.clean_outliers — confirm its meaning there.
# NOTE(review): this cell overwrites `df`, so re-running it applies the
# cleaning again to already-cleaned data.
df = clean_outliers(df.copy(), True)

In [6]:
# Column dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135000 entries, 0 to 149999
Data columns (total 11 columns):
SeriousDlqin2yrs                        135000 non-null int64
RevolvingUtilizationOfUnsecuredLines    135000 non-null float64
age                                     135000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    135000 non-null int64
DebtRatio                               135000 non-null float64
MonthlyIncome                           135000 non-null float64
NumberOfOpenCreditLinesAndLoans         135000 non-null int64
NumberOfTimes90DaysLate                 135000 non-null int64
NumberRealEstateLoansOrLines            135000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    135000 non-null int64
NumberOfDependents                      135000 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.4 MB


In [7]:
# Per-column count of missing values.
print(df.isna().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [8]:
# Summary statistics of every column after cleaning.
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,135000.0,135000.0,135000.0,135000.0,135000.0,135000.0,135000.0,135000.0,135000.0,135000.0,135000.0
mean,0.060726,0.294336,52.143319,0.280696,0.328392,5089.181052,8.098289,0.134,0.996363,0.11717,0.724215
std,0.238828,0.335301,14.336519,2.515978,0.67352,5747.339847,4.526645,2.472258,1.045003,2.459597,1.084068
min,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.024932,41.0,0.0,0.003842,1251.75,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.132987,52.0,0.0,0.214178,4250.0,7.0,0.0,1.0,0.0,0.0
75%,0.0,0.499763,62.0,0.0,0.412272,7200.0,11.0,0.0,2.0,0.0,1.0
max,1.0,1.0,99.0,98.0,10.0,151855.0,57.0,98.0,32.0,98.0,10.0


## NGBOOST MODEL

In [9]:
# Separate the target column from the features; both are kept as NumPy arrays.
y = df['SeriousDlqin2yrs'].values
X = df.drop('SeriousDlqin2yrs', axis=1).values

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
import random

# Row positions of a random 20% of the training set.
# A dedicated, seeded generator makes the selection reproducible across kernel
# restarts without touching the state of the global `random` module.
_id_rng = random.Random(42)
list_of_ids = _id_rng.sample(range(len(X_train)), int(len(X_train) * 0.2))

In [12]:
# Wrap the training matrix in a labelled DataFrame.
# NOTE(review): `cols[1:]` presumably skips the target name stored in cols[0] —
# confirm against constants.cols.
df_train = pd.DataFrame(data=X_train, columns=cols[1:])
df_train

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.074229,52.0,0.0,0.019753,69000.0,10.0,0.0,1.0,0.0,1.0
1,0.677837,29.0,0.0,0.481669,3545.0,7.0,0.0,1.0,0.0,2.0
2,0.113926,59.0,0.0,0.392699,4300.0,12.0,0.0,1.0,0.0,2.0
3,0.939826,55.0,0.0,0.363687,1800.0,9.0,0.0,0.0,0.0,0.0
4,0.949051,47.0,0.0,0.126517,4283.0,3.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
107995,0.010324,58.0,0.0,0.406999,5800.0,11.0,0.0,2.0,0.0,0.0
107996,0.193164,84.0,0.0,0.910072,833.0,11.0,0.0,0.0,0.0,0.0
107997,0.742393,46.0,0.0,0.509249,10000.0,8.0,1.0,2.0,0.0,2.0
107998,0.098779,41.0,2.0,0.117085,4500.0,6.0,0.0,0.0,0.0,2.0


In [13]:
# Set three features to 0 for every sampled row in a single vectorised
# assignment (replaces a Python double loop of scalar `.at` writes).
# `df_train` is built from a plain array, so its index labels are the row
# positions 0..n-1 that `list_of_ids` was drawn from.
df_train.loc[
    list_of_ids,
    [
        'RevolvingUtilizationOfUnsecuredLines',
        'NumberOfTime30-59DaysPastDueNotWorse',
        'NumberOfTimes90DaysLate',
    ],
] = 0

In [14]:
# Rebind X_train to the array form of df_train so that edits made to the frame
# are carried into the training matrix.
X_train = df_train.values

In [15]:
# Oversample the minority class with SMOTE so both classes have equal counts.
# `fit_resample` replaces the deprecated `fit_sample`, which was removed from
# newer imbalanced-learn releases.
nm_common = SMOTE(random_state=2019)
X_train_resampled, y_train_resampled = nm_common.fit_resample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_train_resampled)))

Resampled dataset shape Counter({0: 101492, 1: 101492})


In [16]:
# Rebuild df_train from the SMOTE-resampled matrix for inspection; this rebinds
# the name to the larger, resampled frame.
df_train = pd.DataFrame(data=X_train_resampled, columns=cols[1:])
df_train

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.074229,52.000000,0.000000,0.019753,69000.000000,10.000000,0.000000,1.000000,0.000000,1.000000
1,0.000000,29.000000,0.000000,0.481669,3545.000000,7.000000,0.000000,1.000000,0.000000,2.000000
2,0.113926,59.000000,0.000000,0.392699,4300.000000,12.000000,0.000000,1.000000,0.000000,2.000000
3,0.939826,55.000000,0.000000,0.363687,1800.000000,9.000000,0.000000,0.000000,0.000000,0.000000
4,0.949051,47.000000,0.000000,0.126517,4283.000000,3.000000,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...
202979,0.705761,42.439555,3.879110,0.000000,0.000000,15.120890,0.000000,2.000000,0.439555,0.000000
202980,0.848636,44.155897,2.688206,0.000000,0.000000,3.155897,0.000000,0.000000,0.844103,0.000000
202981,0.852700,56.952634,0.000000,0.488888,7916.047366,8.094732,0.000000,1.000000,0.952634,0.952634
202982,0.000000,54.369139,0.000000,0.000000,0.000000,7.630861,0.630861,0.630861,1.369139,0.630861


In [17]:
from sklearn.utils import class_weight

# 'balanced' class weights computed on the original (pre-SMOTE) labels.
# `classes` and `y` are passed by keyword: newer scikit-learn releases reject
# them as positional arguments, and keywords also work on older releases.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weights

array([0.53206164, 8.29748002])

In [20]:
# Base learner: a depth-8 regression tree using the Friedman MSE criterion.
# Only the non-default settings are spelled out. The previous argument list was
# pasted from an estimator repr and included `presort='deprecated'` and
# `min_impurity_split=None`, which newer scikit-learn releases no longer accept.
ngb_clf = NGBClassifier(
    Base=DecisionTreeRegressor(criterion='friedman_mse', max_depth=8),
    col_sample=1.0,
    learning_rate=0.01,
    minibatch_frac=1.0,
    n_estimators=50,
    natural_gradient=True,
    tol=0.0001,
    verbose=True,
    verbose_eval=0,
)
# Fit on the SMOTE-resampled data.
# NOTE(review): get_sample_weights is a project helper; presumably it derives
# per-row weights from the original and resampled labels — confirm in utils.
ngb_clf.fit(
    X_train_resampled,
    y_train_resampled,
    sample_weight=get_sample_weights(y_train, y_train_resampled),
)

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=None, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=50,
              natural_gradient=True,
              random_state=RandomState(MT19

In [21]:
def process_unit_cost(x, rate):
    """
    Cost of a single prediction according to the cost matrix.

    Args:
        - x: row/mapping exposing the keys 'real', 'predicted' and 'LoanPrincipal'
        - rate: interest rate applied to the principal of a false positive

    Returns:
        - LoanPrincipal * rate when predicted == 1 and real == 0 (false positive)
        - LoanPrincipal when predicted == 0 and real == 1 (false negative)
        - 0 in every other case
    """
    predicted, real = x['predicted'], x['real']
    if predicted == 1 and real == 0:
        return x['LoanPrincipal'] * rate
    if predicted == 0 and real == 1:
        return x['LoanPrincipal']
    return 0
# Interest-rate reference (0.0156):
# https://www.ecb.europa.eu/press/pr/stats/mfi/html/ecb.mir1907~a86424a725.en.html

In [22]:
def cost_score(loan, y_pred, y_true, rate=0.0075):
    """
    Total misclassification cost of a set of predictions.

    The cost matrix is applied to every observation and the unit costs are summed:
        - false positive (predicted 1, real 0): LoanPrincipal * rate
        - false negative (predicted 0, real 1): LoanPrincipal
        - anything else: 0

    Args:
        - loan: requested amount of money per observation
        - y_pred: list of predictions
        - y_true: list of true values
        - rate: interest rate charged on false positives (default 0.0075,
          the value that was previously hard-coded)

    Returns the sum of unit costs as a float (0.0 for empty input).
    """
    # Building the frame keeps the length validation between the three inputs.
    aux_df = pd.DataFrame(data={'LoanPrincipal': loan, 'predicted': y_pred, 'real': y_true})
    principal = aux_df['LoanPrincipal'].astype(float).values
    false_positive = ((aux_df['predicted'] == 1) & (aux_df['real'] == 0)).values
    false_negative = ((aux_df['predicted'] == 0) & (aux_df['real'] == 1)).values
    # Vectorised cost matrix; rows matching neither mask cost nothing, even when
    # their principal is NaN.
    unit_costs = np.where(
        false_positive,
        principal * rate,
        np.where(false_negative, principal, 0.0),
    )
    return float(unit_costs.sum())

In [23]:
# Test-set frame used for the cost evaluation: the features plus a loan
# principal set to twice the monthly income, the true labels and the model's
# predictions.
df_cost = pd.DataFrame(data=X_test, columns=cols[1:]).assign(
    LoanPrincipal=lambda frame: frame['MonthlyIncome'] * 2,
    y_true=list(y_test),
    y_pred=list(ngb_clf.predict(X_test)),
)

In [24]:
# Total misclassification cost of the model on the test set.
cost_score(
    loan=df_cost['LoanPrincipal'].tolist(),
    y_pred=df_cost['y_pred'].tolist(),
    y_true=df_cost['y_true'].tolist(),
)

2099027.210000043

In [25]:
# ROC evaluation via the project helper; the recorded output reports a TRAIN
# and a TEST ROC AUC.
# Note that the training arrays passed here are X_train / y_train, not the
# SMOTE-resampled arrays the model was fitted on.
visualize_roc_curve(ngb_clf, X_train, y_train, X_test, y_test)

TRAIN: ROC AUC=0.809
TEST: ROC AUC=0.812
