In [52]:
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from sklearn.metrics import roc_auc_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')

In [49]:
def correlation_heatmap(df):
    """
    Plot a correlation heatmap for the entire dataframe
    
    Args:
        - df (DataFrame object): dataframe to be illustrated
    """
    heatmap = go.Heatmap(
        z=df.corr(method='pearson').as_matrix(),
        x=df.columns,
        y=df.columns,
        colorbar=dict(title='Pearson Coefficient'),
        colorscale='Reds',
    )
    
    layout = go.Layout(title="Matriz de correlaciones")
    
    fig = go.Figure(data=[heatmap], layout=layout)
    iplot(fig)

In [3]:
def preprocess_df(df):
    """
    Preprocess df imputing certain columns such as MonthlyIncome and NumberOfDependencies
    
    Args:
        -df (DataFrame object): df to be computed
    """
    df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean(),inplace=True)
    df['NumberOfDependents'].fillna(df['NumberOfDependents'].mode()[0], inplace=True)

In [None]:
def plot_feature_importances(features, clf):
    """
    Show feature importances plot.
    
    Args:
        - features (list of strings): list of name columns
        - clf (XGboost model): XGboost model that was trained
    """
    trace1 = go.Bar(
    y=features,
    x=clf.feature_importances_[0],
    marker=dict(color="cornflowerblue",
            opacity=1),orientation='h'
    )

    data = [trace1]
    layout = go.Layout(
        barmode='group', margin=go.layout.Margin(
            l=120,
            r=50,
            b=100,
            t=100,
            pad=4
        ),title = 'Feature importances',
                  xaxis = dict(title = 'Importance'),
                  yaxis = dict(title = 'Features')
                  )
    fig = dict(data=data, layout=layout)
    iplot(fig)


In [4]:
cols = [
 'SeriousDlqin2yrs',
 'RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']

In [5]:
df = pd.read_csv("./data/cs-training.csv", usecols =cols)

In [6]:
pd.read_csv("./data/cs-test.csv", usecols =cols).SeriousDlqin2yrs.unique()

array([nan])

In [12]:
df.head()
#df_test.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [13]:
print(df.isnull().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [14]:
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.737413
std,0.249746,249.755371,14.771866,4.192781,2037.818523,12880.45,5.145951,4.169304,1.129771,4.155179,1.107021
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3903.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,6600.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,7400.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [15]:
preprocess_df(df)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           150000 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      150000 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.6 MB


In [17]:
print(df.isnull().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [18]:
df.corr()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
SeriousDlqin2yrs,1.0,-0.001802,-0.115386,0.125587,-0.007602,-0.018002,-0.029669,0.117175,-0.007038,0.102261,0.046869
RevolvingUtilizationOfUnsecuredLines,-0.001802,1.0,-0.005898,-0.001314,0.003961,0.006565,-0.011281,-0.001061,0.006235,-0.001048,0.001193
age,-0.115386,-0.005898,1.0,-0.062995,0.024188,0.032984,0.147705,-0.061005,0.03315,-0.057159,-0.215693
NumberOfTime30-59DaysPastDueNotWorse,0.125587,-0.001314,-0.062995,1.0,-0.006542,-0.007636,-0.055312,0.983603,-0.030565,0.987005,-0.00459
DebtRatio,-0.007602,0.003961,0.024188,-0.006542,1.0,-0.005355,0.049565,-0.00832,0.120046,-0.007533,-0.044476
MonthlyIncome,-0.018002,0.006565,0.032984,-0.007636,-0.005355,1.0,0.082319,-0.009484,0.113823,-0.008259,0.058192
NumberOfOpenCreditLinesAndLoans,-0.029669,-0.011281,0.147705,-0.055312,0.049565,0.082319,1.0,-0.079984,0.433959,-0.071077,0.074026
NumberOfTimes90DaysLate,0.117175,-0.001061,-0.061005,0.983603,-0.00832,-0.009484,-0.079984,1.0,-0.045205,0.992796,-0.011962
NumberRealEstateLoansOrLines,-0.007038,0.006235,0.03315,-0.030565,0.120046,0.113823,0.433959,-0.045205,1.0,-0.039722,0.129399
NumberOfTime60-89DaysPastDueNotWorse,0.102261,-0.001048,-0.057159,0.987005,-0.007533,-0.008259,-0.071077,0.992796,-0.039722,1.0,-0.012678


In [51]:
cor=df.corr()
correlation_heatmap(cor)

In [20]:
X = df.drop(columns=['SeriousDlqin2yrs']).values
y = df.SeriousDlqin2yrs.values

In [21]:
len(X)

150000

In [22]:
len(y)

150000

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## NGBOOST MODEL

In [25]:
ngb_clf = NGBClassifier(Dist=Bernoulli)
ngb_clf.fit(X_train, y_train)
preds = ngb_clf.pred_dist(X_test)
#print("ROC:", roc_auc_score(y_test, preds.prob))

[iter 0] loss=0.2468 val_loss=0.0000 scale=0.5000 norm=1.0000
[iter 100] loss=0.1918 val_loss=0.0000 scale=2.0000 norm=3.3921
[iter 200] loss=0.1839 val_loss=0.0000 scale=1.0000 norm=1.7944
[iter 300] loss=0.1816 val_loss=0.0000 scale=0.5000 norm=0.9285
[iter 400] loss=0.1804 val_loss=0.0000 scale=0.5000 norm=0.9458


In [23]:
ngb_clf

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=3,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=None, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, learning_rate=0.01,
              minibatch_frac=1.0, n_estimators=500, natural_gradient=True,
              random_state=RandomState(MT19937) at 0x7F624C0C7E20, tol=0

In [53]:
plot_feature_importances(cols[1:], ngb_clf)

In [27]:
ngb_clf.feature_importances_

array([[3.69645936e-01, 1.59058256e-01, 1.01928653e-01, 3.25576936e-02,
        5.57906951e-02, 9.88551278e-02, 8.00425269e-02, 4.41494293e-02,
        5.77417991e-02, 2.29883171e-04]])

In [54]:
ngb_clf.natural_gradient

True

In [199]:
ngb.get_params

<bound method BaseEstimator.get_params of NGBRegressor(Base=DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort=False, random_state=None,
                                        splitter='best'),
             Dist=<class 'ngboost.distns.normal.Normal'>,
             Score=<class 'ngboost.scores.LogScore'>, learning_rate=0.01,
             minibatch_frac=1.0, n_estimators=50, natural_gradient=True,
             random_state=RandomState(MT19937) at 0x7F29D40D8E20, tol=0.0001,
             verbose=True, verbose_eval=10)>

In [200]:
ngb.base_models

[[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, max_features=None,
                        max_leaf_nodes=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        presort=False, random_state=None, splitter='best'),
  DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, max_features=None,
                        max_leaf_nodes=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        presort=False, random_state=None, splitter='best')],
 [DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, max_features=None,
                        max_leaf_nodes=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=1,
                        min_

In [204]:
ngb.pred_param

<bound method NGBoost.pred_param of NGBRegressor(Base=DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort=False, random_state=None,
                                        splitter='best'),
             Dist=<class 'ngboost.distns.normal.Normal'>,
             Score=<class 'ngboost.scores.LogScore'>, learning_rate=0.01,
             minibatch_frac=1.0, n_estimators=50, natural_gradient=True,
             random_state=RandomState(MT19937) at 0x7F29D40D8E20, tol=0.0001,
             verbose=True, verbose_eval=10)>

In [None]:
ngb_clf.