In [127]:
from ngboost import NGBRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')

In [128]:
def correlation_heatmap(df, title, absolute_bounds=True):
    '''Plot an interactive Pearson-correlation heatmap for a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw data. The pairwise Pearson correlation of its columns is
        computed inside this function, so pass the data itself rather than
        a precomputed correlation matrix.
    title : str
        Title shown above the figure.
    absolute_bounds : bool, default True
        When True, the colour scale is pinned to [-1.0, 1.0]; otherwise
        plotly chooses the range from the values being plotted.

    Returns
    -------
    None
        The figure is rendered through plotly's ``iplot``.
    '''
    # Compute the matrix once and take the axis labels from it, so the labels
    # always line up with ``z`` even if corr() leaves some columns of ``df``
    # out of the result.
    corr = df.corr(method='pearson')
    x_labels = list(corr.columns)
    y_labels = list(corr.index)

    heatmap = go.Heatmap(
        # ``.values`` replaces DataFrame.as_matrix(), which was removed in
        # pandas 1.0.
        z=corr.values,
        x=x_labels,
        y=y_labels,
        colorbar=dict(title='Pearson Coefficient'),
        colorscale='Reds',
    )

    layout = go.Layout(title=title)

    if absolute_bounds:
        heatmap['zmax'] = 1.0
        heatmap['zmin'] = -1.0

    fig = go.Figure(data=[heatmap], layout=layout)
    iplot(fig)

In [129]:
def preprocess_df(df):
    '''Impute missing values of two columns of ``df`` in place.

    * ``MonthlyIncome`` gaps are filled with the column mean.
    * ``NumberOfDependents`` gaps are filled with the column's most frequent
      value (the first entry returned by ``mode()``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``MonthlyIncome`` and ``NumberOfDependents``
        columns. It is modified in place; the statistics are computed from
        this same frame.

    Returns
    -------
    None
    '''
    # Assign the filled column back to the frame rather than calling
    # fillna(inplace=True) on the ``df[...]`` selection: that chained in-place
    # form operates on an intermediate object and does not update ``df`` when
    # pandas Copy-on-Write is active.
    income = df['MonthlyIncome']
    df['MonthlyIncome'] = income.fillna(income.mean())

    dependents = df['NumberOfDependents']
    most_common = dependents.mode()
    # mode() is empty when the column has no non-null values; in that case
    # there is nothing to impute with, so the column is left untouched.
    if not most_common.empty:
        df['NumberOfDependents'] = dependents.fillna(most_common.iloc[0])

In [130]:
# Columns to load from the CSV files: the target column first, followed by
# the ten predictor columns.
cols = [
    'SeriousDlqin2yrs',
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

In [131]:
# Load the training and test CSVs, keeping only the columns listed in `cols`.
df_train = pd.read_csv(
    "./data/cs-training.csv",
    usecols=cols,
)
df_test = pd.read_csv(
    "./data/cs-test.csv",
    usecols=cols,
)

In [132]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [133]:
# Count the missing values in each training column before imputation.
print(df_train.isna().sum())

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


In [134]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
df_train.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [135]:
# Fill the missing MonthlyIncome / NumberOfDependents values of both frames.
# preprocess_df modifies the frame it is given, and each frame is imputed
# from its own statistics.
preprocess_df(df_train)
preprocess_df(df_test)

In [136]:
# Check dtypes and non-null counts of the training frame after imputation.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           150000 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      150000 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.6 MB


In [137]:
# Check dtypes and non-null counts of the test frame after imputation.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101503 entries, 0 to 101502
Data columns (total 11 columns):
SeriousDlqin2yrs                        0 non-null float64
RevolvingUtilizationOfUnsecuredLines    101503 non-null float64
age                                     101503 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    101503 non-null int64
DebtRatio                               101503 non-null float64
MonthlyIncome                           101503 non-null float64
NumberOfOpenCreditLinesAndLoans         101503 non-null int64
NumberOfTimes90DaysLate                 101503 non-null int64
NumberRealEstateLoansOrLines            101503 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    101503 non-null int64
NumberOfDependents                      101503 non-null float64
dtypes: float64(5), int64(6)
memory usage: 8.5 MB


In [138]:
# Re-count missing values per training column after imputation.
print(df_train.isna().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [139]:
# Pairwise correlation between the columns of the training frame.
df_train.corr()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
SeriousDlqin2yrs,1.0,-0.001802,-0.115386,0.125587,-0.007602,-0.018002,-0.029669,0.117175,-0.007038,0.102261,0.046869
RevolvingUtilizationOfUnsecuredLines,-0.001802,1.0,-0.005898,-0.001314,0.003961,0.006565,-0.011281,-0.001061,0.006235,-0.001048,0.001193
age,-0.115386,-0.005898,1.0,-0.062995,0.024188,0.032984,0.147705,-0.061005,0.03315,-0.057159,-0.215693
NumberOfTime30-59DaysPastDueNotWorse,0.125587,-0.001314,-0.062995,1.0,-0.006542,-0.007636,-0.055312,0.983603,-0.030565,0.987005,-0.00459
DebtRatio,-0.007602,0.003961,0.024188,-0.006542,1.0,-0.005355,0.049565,-0.00832,0.120046,-0.007533,-0.044476
MonthlyIncome,-0.018002,0.006565,0.032984,-0.007636,-0.005355,1.0,0.082319,-0.009484,0.113823,-0.008259,0.058192
NumberOfOpenCreditLinesAndLoans,-0.029669,-0.011281,0.147705,-0.055312,0.049565,0.082319,1.0,-0.079984,0.433959,-0.071077,0.074026
NumberOfTimes90DaysLate,0.117175,-0.001061,-0.061005,0.983603,-0.00832,-0.009484,-0.079984,1.0,-0.045205,0.992796,-0.011962
NumberRealEstateLoansOrLines,-0.007038,0.006235,0.03315,-0.030565,0.120046,0.113823,0.433959,-0.045205,1.0,-0.039722,0.129399
NumberOfTime60-89DaysPastDueNotWorse,0.102261,-0.001048,-0.057159,0.987005,-0.007533,-0.008259,-0.071077,0.992796,-0.039722,1.0,-0.012678


In [140]:
# Stack the training and test rows into one frame. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
# equivalent and likewise keeps each frame's original row index.
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1.0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0.0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0.0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0.0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0.0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [141]:
# correlation_heatmap() computes the Pearson matrix itself, so it must be
# given the raw frame; passing `cor` would plot the correlation of a
# correlation matrix instead of the correlation of the data.
cor = df.corr()  # kept so the matrix stays available under this name
correlation_heatmap(df, "Matriz de correlaciones", absolute_bounds=True)

In [142]:
# Check dtypes and non-null counts of the combined train + test frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 251503 entries, 0 to 101502
Data columns (total 11 columns):
SeriousDlqin2yrs                        150000 non-null float64
RevolvingUtilizationOfUnsecuredLines    251503 non-null float64
age                                     251503 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    251503 non-null int64
DebtRatio                               251503 non-null float64
MonthlyIncome                           251503 non-null float64
NumberOfOpenCreditLinesAndLoans         251503 non-null int64
NumberOfTimes90DaysLate                 251503 non-null int64
NumberRealEstateLoansOrLines            251503 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    251503 non-null int64
NumberOfDependents                      251503 non-null float64
dtypes: float64(5), int64(6)
memory usage: 23.0 MB


In [143]:
# Separate the target column from the predictors for each frame and convert
# both to plain NumPy arrays.
# NOTE(review): df_test.info() reports 0 non-null values for SeriousDlqin2yrs,
# so Y_test holds no usable labels.
Y_train = df_train['SeriousDlqin2yrs'].values
X_train = df_train.drop('SeriousDlqin2yrs', axis=1).values
Y_test = df_test['SeriousDlqin2yrs'].values
X_test = df_test.drop('SeriousDlqin2yrs', axis=1).values

In [144]:
# Display the training feature matrix (NumPy abbreviates the printed array).
X_train

array([[ 0.76612661, 45.        ,  2.        , ...,  6.        ,
         0.        ,  2.        ],
       [ 0.95715102, 40.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.65818014, 38.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.24604392, 58.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        , 30.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.85028295, 64.        ,  0.        , ...,  2.        ,
         0.        ,  0.        ]])

In [145]:
# Fit an NGBoost regressor with 50 boosting iterations on the training data;
# verbose_eval=10 logs progress every 10th iteration.
# NOTE(review): the target is the 0/1 SeriousDlqin2yrs column but a regressor
# is used — confirm this is intentional. No validation data is passed, which
# is presumably why the log reports val_loss=0.0000.
ngb = NGBRegressor(n_estimators=50, verbose_eval=10).fit(X_train, Y_train)

[iter 0] loss=0.0316 val_loss=0.0000 scale=1.0000 norm=0.8753
[iter 10] loss=-0.0930 val_loss=0.0000 scale=2.0000 norm=1.5333
[iter 20] loss=-0.1528 val_loss=0.0000 scale=2.0000 norm=1.4710
[iter 30] loss=-0.1947 val_loss=0.0000 scale=2.0000 norm=1.4517
[iter 40] loss=-0.2284 val_loss=0.0000 scale=2.0000 norm=1.4503


In [146]:
# Run the fitted model on the test features: `predict` for the predictions
# and `pred_dist` for the predicted distributions (presumably one per row —
# TODO confirm against the NGBoost docs).
Y_preds = ngb.predict(X_test)
Y_dists = ngb.pred_dist(X_test)

In [89]:
# NOTE(review): the saved output of this cell contains NaN and its execution
# count (89) is out of sequence with the cells above, so the output appears
# to predate the imputation step. TODO: re-run after Restart Kernel -> Run All.
X_test

array([[8.85519080e-01, 4.30000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.63295269e-01, 5.70000000e+01, 0.00000000e+00, ...,
        4.00000000e+00, 0.00000000e+00, 2.00000000e+00],
       [4.32750360e-02, 5.90000000e+01, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 2.00000000e+00],
       ...,
       [8.15963730e-02, 7.00000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00,            nan],
       [3.35456547e-01, 5.60000000e+01, 0.00000000e+00, ...,
        2.00000000e+00, 1.00000000e+00, 3.00000000e+00],
       [4.41841663e-01, 2.90000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])