In [1]:
#ngboost and modelling libraries
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

#data manipulation libraries
import pandas as pd
import numpy as np

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#import functions and constants
from utils import (register_imputation, preprocess_df, 
                    scaling_values_df, clean_outliers, get_sample_weights, 
                  process_unit_cost, cost_score, generate_y_pred_with_custom_threshold, check_counts,)
from plot_utils import (correlation_heatmap, visualize_permutation_feature_importances, 
                        plot_target_balance, plot_trace_line, plot_scatter_matrix, 
                        plot_feature_importances, visualize_roc_curve, color_negative_red, )

from constants import cols, cols_with_missing_indicators

In [2]:
SEED = 2020

In [3]:
df = pd.read_csv("./data/cs-training.csv", usecols =cols)

In [4]:
from sklearn.model_selection import train_test_split

df, df_test = train_test_split(df, test_size=0.2,random_state=42)

In [5]:
df = clean_outliers(df.copy(), True)

In [6]:
X_train = df.drop(columns=['SeriousDlqin2yrs']).values
y_train = df.SeriousDlqin2yrs.values

In [7]:
nm_common =SMOTE(random_state=2019)
X_train_resampled, y_train_resampled = nm_common.fit_sample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_train_resampled)))

Resampled dataset shape Counter({0: 101298, 1: 101298})


In [8]:
df_test = register_imputation(df_test.copy())
X_test = df_test.drop(columns=['SeriousDlqin2yrs']).values
y_test = df_test.SeriousDlqin2yrs.values

In [9]:
from sklearn.metrics import roc_auc_score
df_collector_estimators = pd.DataFrame(columns=  ["estimators", "cost", "count_zero", "count_one", "train_auc", "test_auc"])
k = 0
threshold = 0.2
for estimator in [50,75, 100, 200, 300, 500, 700]:
    print(f"estimators: {estimator}")
    ngb_clf = NGBClassifier(Dist=Bernoulli, verbose=True, n_estimators = estimator,  verbose_eval = 0, random_state = 2020)
    ngb_clf.fit(X_train_resampled, y_train_resampled, sample_weight= get_sample_weights(y_train, y_train_resampled))
    df_aux = pd.DataFrame(X_test, columns=cols_with_missing_indicators)
    df_aux['predicted'] = generate_y_pred_with_custom_threshold(ngb_clf, X_test, 0.2)
    df_aux['real'] = list(y_test)
    df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2
    df_collector_estimators.ix[str(k),'estimators'] = estimator
    df_collector_estimators.ix[str(k),'cost'] = cost_score(df_aux.LoanPrincipal, df_aux.predicted, df_aux.real)
    count_zero, count_one = check_counts(ngb_clf, X_test, threshold)
    df_collector_estimators.ix[str(k),'count_zero'] = count_zero
    df_collector_estimators.ix[str(k),'count_one'] = count_one
    # predict probabilities
    train_probs = ngb_clf.predict_proba(X_train)
    test_probs = ngb_clf.predict_proba(X_test)
    # keep probabilities for the positive outcome only
    train_probs = train_probs[:, 1]
    test_probs = test_probs[:, 1]
    # calculate scores
    train_auc = roc_auc_score(y_train, train_probs)
    test_auc = roc_auc_score(y_test, test_probs)
    df_collector_estimators.ix[str(k),'train_auc'] = train_auc
    df_collector_estimators.ix[str(k),'test_auc'] = test_auc
    k+=1
    del ngb_clf
    

df_collector_lr = pd.DataFrame(columns=  ["lr", "cost", "count_zero", "count_one", "train_auc", "test_auc"])
k = 0
for lr in [0.1, 0.01, 0.001, 0.0001]:
    print(f"learning_rate: {lr}")
    ngb_clf = NGBClassifier(Dist=Bernoulli, verbose=True, learning_rate = lr,  verbose_eval = 0, random_state = 2020)
    ngb_clf.fit(X_train_resampled, y_train_resampled, sample_weight= get_sample_weights(y_train, y_train_resampled))
    df_aux = pd.DataFrame(X_test, columns=cols_with_missing_indicators)
    df_aux['predicted'] = generate_y_pred_with_custom_threshold(ngb_clf, X_test, 0.2)
    df_aux['real'] = list(y_test)
    df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2
    df_collector_lr.ix[str(k),'lr'] = lr
    df_collector_lr.ix[str(k),'cost'] = cost_score(df_aux.LoanPrincipal, df_aux.predicted, df_aux.real)
    count_zero, count_one = check_counts(ngb_clf, X_test, threshold)
    df_collector_lr.ix[str(k),'count_zero'] = count_zero
    df_collector_lr.ix[str(k),'count_one'] = count_one
    # predict probabilities
    train_probs = ngb_clf.predict_proba(X_train)
    test_probs = ngb_clf.predict_proba(X_test)
    # keep probabilities for the positive outcome only
    train_probs = train_probs[:, 1]
    test_probs = test_probs[:, 1]
    # calculate scores
    train_auc = roc_auc_score(y_train, train_probs)
    test_auc = roc_auc_score(y_test, test_probs)
    df_collector_lr.ix[str(k),'train_auc'] = train_auc
    df_collector_lr.ix[str(k),'test_auc'] = test_auc
    k+=1
    del ngb_clf
    
    
df_collector_base = pd.DataFrame(columns=  ["base", "max_depth", "cost", "count_zero", "count_one", "train_auc", "test_auc"])
k = 0
for base in [base1,base2,base3, base4, base5]:
    print(f"Base: {base}")
    ngb_clf = NGBClassifier(Dist=Bernoulli, verbose=True, Base=base,  verbose_eval = 0, random_state = 2020)
    ngb_clf.fit(X_train_resampled, y_train_resampled, sample_weight= get_sample_weights(y_train, y_train_resampled))
    df_aux = pd.DataFrame(X_test, columns=cols_with_missing_indicators)
    df_aux['predicted'] = generate_y_pred_with_custom_threshold(ngb_clf, X_test, 0.2)
    df_aux['real'] = list(y_test)
    df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2
    df_collector_base.ix[str(k),'base'] = base
    df_collector_base.ix[str(k),'max_depth'] = base.max_depth
    df_collector_base.ix[str(k),'cost'] = cost_score(df_aux.LoanPrincipal, df_aux.predicted, df_aux.real)
    count_zero, count_one = check_counts(ngb_clf, X_test, threshold)
    df_collector_base.ix[str(k),'count_zero'] = count_zero
    df_collector_base.ix[str(k),'count_one'] = count_one
    # predict probabilities
    train_probs = ngb_clf.predict_proba(X_train)
    test_probs = ngb_clf.predict_proba(X_test)
    # keep probabilities for the positive outcome only
    train_probs = train_probs[:, 1]
    test_probs = test_probs[:, 1]
    # calculate scores
    train_auc = roc_auc_score(y_train, train_probs)
    test_auc = roc_auc_score(y_test, test_probs)
    df_collector_base.ix[str(k),'train_auc'] = train_auc
    df_collector_base.ix[str(k),'test_auc'] = test_auc
    k+=1
    del ngb_clf

estimators: 50
count_zero 0
count_one 30000
estimators: 75
count_zero 0
count_one 30000
estimators: 100
count_zero 0
count_one 30000
estimators: 200
count_zero 0
count_one 30000
estimators: 300
count_zero 0
count_one 30000
estimators: 500
count_zero 0
count_one 30000
estimators: 700
count_zero 0
count_one 30000
learning_rate: 0.1
count_zero 0
count_one 30000
learning_rate: 0.01
count_zero 0
count_one 30000
learning_rate: 0.001
count_zero 0
count_one 30000
learning_rate: 0.0001
count_zero 0
count_one 30000


NameError: name 'base1' is not defined

In [10]:
base1 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=4, random_state=2020)
base2 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=6, random_state=2020)
base3 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=8, random_state=2020)
base4 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=12, random_state=2020)
base5 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=16, random_state=2020)

In [11]:
df_collector_base = pd.DataFrame(columns=  ["base", "max_depth", "cost", "count_zero", "count_one", "train_auc", "test_auc"])
k = 0
for base in [base1,base2,base3, base4, base5]:
    print(f"Base: {base}")
    ngb_clf = NGBClassifier(Dist=Bernoulli, verbose=True, Base=base,  verbose_eval = 0, random_state = 2020)
    ngb_clf.fit(X_train_resampled, y_train_resampled, sample_weight= get_sample_weights(y_train, y_train_resampled))
    df_aux = pd.DataFrame(X_test, columns=cols_with_missing_indicators)
    df_aux['predicted'] = generate_y_pred_with_custom_threshold(ngb_clf, X_test, 0.2)
    df_aux['real'] = list(y_test)
    df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2
    df_collector_base.ix[str(k),'base'] = base
    df_collector_base.ix[str(k),'max_depth'] = base.max_depth
    df_collector_base.ix[str(k),'cost'] = cost_score(df_aux.LoanPrincipal, df_aux.predicted, df_aux.real)
    count_zero, count_one = check_counts(ngb_clf, X_test, threshold)
    df_collector_base.ix[str(k),'count_zero'] = count_zero
    df_collector_base.ix[str(k),'count_one'] = count_one
    # predict probabilities
    train_probs = ngb_clf.predict_proba(X_train)
    test_probs = ngb_clf.predict_proba(X_test)
    # keep probabilities for the positive outcome only
    train_probs = train_probs[:, 1]
    test_probs = test_probs[:, 1]
    # calculate scores
    train_auc = roc_auc_score(y_train, train_probs)
    test_auc = roc_auc_score(y_test, test_probs)
    df_collector_base.ix[str(k),'train_auc'] = train_auc
    df_collector_base.ix[str(k),'test_auc'] = test_auc
    k+=1
    del ngb_clf

Base: DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=4,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=2020, splitter='best')
count_zero 0
count_one 30000
Base: DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=6,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=2020, splitter='best')
count_zero 1806
count_one 28194
Base: DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=8,
                      max_featu

In [20]:
df_collector_estimators.cost.iloc[0]

2817421.839999991

In [12]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
trace0 = go.Scatter(
        x=df_collector_estimators.estimators,
        y=df_collector_estimators.cost,
        name="cost",
        line=dict(color="rgb(167, 103, 4)", width=4),
    )
data = [trace0]

# Edit the layout
layout = dict(
        title=f"Evolution of performance metrics according to # of estimators",
        xaxis=dict(title="# Estimators"),
        yaxis=dict(title=f"Cost"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [13]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace1 = go.Scatter(
        x=df_collector_estimators.estimators,
        y=df_collector_estimators.train_auc,
        name="train_auc",
        line=dict(color="blue", width=4),
    )

trace2 = go.Scatter(
        x=df_collector_estimators.estimators,
        y=df_collector_estimators.test_auc,
        name="test_auc",
        line=dict(color="gray", width=4),
    )
data = [trace1, trace2]

# Edit the layout
layout = dict(
        title=f"Evolution of performance metrics according to # of estimators",
        xaxis=dict(title="# Estimators"),
        yaxis=dict(title=f"AUC (train/test)"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [21]:
df_collector_lr.cost.iloc[0]

2817421.839999991

In [14]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
trace0 = go.Scatter(
        x=df_collector_lr.lr,
        y=df_collector_lr.cost,
        name="cost",
        line=dict(color="rgb(167, 103, 4)", width=4),
    )
data = [trace0]

# Edit the layout
layout = dict(
        title=f"Evolution of performance metrics according to learning_rate",
        xaxis=dict(title="Learning_rate"),
        yaxis=dict(title=f"Cost"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [15]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace1 = go.Scatter(
        x=df_collector_lr.lr,
        y=df_collector_lr.train_auc,
        name="train_auc",
        line=dict(color="blue", width=4),
    )

trace2 = go.Scatter(
        x=df_collector_lr.lr,
        y=df_collector_lr.test_auc,
        name="test_auc",
        line=dict(color="gray", width=4),
    )
data = [trace1, trace2]

# Edit the layout
layout = dict(
        title=f"Evolution of performance metrics according to learning_rate",
        xaxis=dict(title="learning_rate"),
        yaxis=dict(title=f"AUC (train/test)"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [16]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace0 = go.Scatter(
        x=[df_collector_base['base'].iloc[i].max_depth for i in range(len(df_collector_base['base']))],
        y=df_collector_base.cost,
        name="cost",
        line=dict(color="rgb(167, 103, 4)", width=4),
    )
data = [trace0]

# Edit the layout
layout = dict(
        title=f"Evolution of performance metrics according to the max_depth of Base Learner",
        xaxis=dict(title="max_depth of Base Learner"),
        yaxis=dict(title=f"Cost"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [17]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace1 = go.Scatter(
        x=[df_collector_base['base'].iloc[i].max_depth for i in range(len(df_collector_base['base']))],
        y=df_collector_base.train_auc,
        name="train_auc",
        line=dict(color="blue", width=4),
    )

trace2 = go.Scatter(
        x=[df_collector_base['base'].iloc[i].max_depth for i in range(len(df_collector_base['base']))],
        y=df_collector_base.test_auc,
        name="test_auc",
        line=dict(color="gray", width=4),
    )
data = [trace1, trace2]

# Edit the layout
layout = dict(
        title=f"Evolution of performance metrics according to the max_depth of Base Learner",
        xaxis=dict(title="max_depth of Base Learner"),
        yaxis=dict(title=f"AUC (train/test)"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [23]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace1 = go.Scatter(
        x=df_collector_estimators.estimators,
        y=df_collector_estimators.count_zero,
        name="Responsible (target = 0)",
        line=dict(color="blue", width=4),
    )

trace2 = go.Scatter(
        x=df_collector_estimators.estimators,
        y=df_collector_estimators.count_one,
        name="Delinquent (target = 1)",
        line=dict(color="red", width=4),
    )
data = [trace1, trace2]

# Edit the layout
layout = dict(
        title=f"Evolution of counts according to the max_depth of Base Learner",
        xaxis=dict(title="max_depth of Base Learner"),
        yaxis=dict(title=f"count"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [25]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace1 = go.Scatter(
        x=df_collector_lr.lr,
        y=df_collector_lr.count_zero,
        name="Responsible (target = 0)",
        line=dict(color="blue", width=4),
    )

trace2 = go.Scatter(
        x=df_collector_lr.lr,
        y=df_collector_lr.count_one,
        name="Delinquent (target = 1)",
        line=dict(color="red", width=4),
    )
data = [trace1, trace2]

# Edit the layout
layout = dict(
        title=f"Evolution of counts according to the max_depth of Base Learner",
        xaxis=dict(title="max_depth of Base Learner"),
        yaxis=dict(title=f"count"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [26]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace1 = go.Scatter(
        x=df_collector_base.max_depth,
        y=df_collector_base.count_zero,
        name="Responsible (target = 0)",
        line=dict(color="blue", width=4),
    )

trace2 = go.Scatter(
        x=df_collector_base.max_depth,
        y=df_collector_base.count_one,
        name="Delinquent (target = 1)",
        line=dict(color="red", width=4),
    )
data = [trace1, trace2]

# Edit the layout
layout = dict(
        title=f"Evolution of counts according to the max_depth of Base Learner",
        xaxis=dict(title="max_depth of Base Learner"),
        yaxis=dict(title=f"count"),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [None]:
import seaborn as sns
sns.distplot(array)

In [30]:
df_collector_estimators.columns

Index(['estimators', 'cost', 'count_zero', 'count_one', 'train_auc',
       'test_auc'],
      dtype='object')

In [33]:
df_collector_estimators

Unnamed: 0,estimators,cost,count_zero,count_one,train_auc,test_auc
0,50,2817420.0,0,30000,0.820779,0.828803
1,75,2817420.0,0,30000,0.829241,0.836079
2,100,2817420.0,0,30000,0.82976,0.836266
3,200,2817420.0,0,30000,0.8314,0.837866
4,300,2817420.0,0,30000,0.832269,0.83864
5,500,2817420.0,0,30000,0.832783,0.839488
6,700,2817420.0,0,30000,0.832784,0.839488


In [34]:
df_collector_lr

Unnamed: 0,lr,cost,count_zero,count_one,train_auc,test_auc
0,0.1,2817420.0,0,30000,0.832793,0.839865
1,0.01,2817420.0,0,30000,0.832783,0.839488
2,0.001,2817420.0,0,30000,0.820782,0.828788
3,0.0001,2817420.0,0,30000,0.800084,0.810889


In [36]:
df_collector_base.to_csv("kk_base.csv",index=False)