In [1]:
#ngboost and modelling libraries
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

#data manipulation libraries
import pandas as pd
import numpy as np

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#import functions and constants
from utils import (register_imputation, preprocess_df, 
                    scaling_values_df, clean_outliers, get_sample_weights, 
                  process_unit_cost, cost_score, generate_y_pred_with_custom_threshold, 
                  calculate_cost_score_with_learning, calculate_learning_cost, process_learning_unit_cost, check_counts,)
from plot_utils import (correlation_heatmap, visualize_permutation_feature_importances, 
                        plot_target_balance, plot_trace_line, plot_scatter_matrix, 
                        plot_feature_importances, visualize_roc_curve, color_negative_red, )

from constants import cols, cols_with_missing_indicators

In [2]:
SEED = 2020

In [3]:
df = pd.read_csv("./data/cs-training.csv", usecols =cols)

In [4]:
from sklearn.model_selection import train_test_split

df, df_test = train_test_split(df, test_size=0.2,random_state=42)

In [5]:

def preprocess_extra(df):
    """
    Extra cleanup process of some outliers.
    """
    df.loc[
        (df["RevolvingUtilizationOfUnsecuredLines"] > 1),
        "RevolvingUtilizationOfUnsecuredLines",
    ] = 0
    df.loc[(df["DebtRatio"] > 10), "DebtRatio"] = 0
    df.loc[(df["age"] < 18), "age"] = 0

In [6]:
from sklearn.neighbors import LocalOutlierFactor
def clean_outliers(df, flag_filter=False):
    """ 
    Register imputations, identify outliers with LOF and clean them. Also it process an extra function to clean some outliers.

    Args:
        - df (DataFrame Object): dataframe to be processed
        - flag_filter (boolean): Flag that indicates if the process requres an extra cleanup of outliers.

    Returns dataframe without outliers
    """
    #df = preprocess_df(df.copy())
    df = register_imputation(df.copy())
    local_outlier_factor = LocalOutlierFactor(contamination=0.1)
    is_outlier = local_outlier_factor.fit_predict(df[cols[1:]]) == -1
    data_outlier_excluded = df.loc[~is_outlier, :]
    if flag_filter:
        preprocess_extra(data_outlier_excluded)
    return data_outlier_excluded

In [7]:
df = clean_outliers(df.copy(), True)

In [8]:
cols_f = [c for c in df.columns if 'dummy' not in c.lower()]

df_cleaned=df.copy()[cols_f].reset_index(drop=True)

In [9]:
import random
random.seed(2019)
df = df.reset_index(drop=True)
list_of_ids_train=random.sample(range(0,len(df)), int(len(df) * 0.2))

In [10]:
df['NumberRealEstateLoansOrLines'] = df['NumberRealEstateLoansOrLines'].astype(float)
df['NumberOfOpenCreditLinesAndLoans'] = df['NumberOfOpenCreditLinesAndLoans'].astype(float)

In [11]:
for idx in list_of_ids_train:
    for col in ['MonthlyIncome', 'NumberRealEstateLoansOrLines', 'NumberOfOpenCreditLinesAndLoans']: #en el segundo lugar estaba dummy_MonthlyIncome
        df.at[idx, col] = np.nan

In [12]:
def register_imputation(df):
    """
    Register imputations of certain df
    Args:
        - df (DataFrame): Dataframe to be computed
    Return df with filled values and booleans that indicate if each row was changed
    """
    for c in cols[1:]:
        # df[f"{c}_amputado"] = df[f"{c}"].fillna(0)
        df[f"{c}_imputed"] = df[f"{c}"].fillna(0)
        df[f"dummy_{c}"] = (df[f"{c}"] != df[f"{c}_imputed"]).astype(int)
        del df[f"{c}"]
        df.columns = df.columns.str.replace(f"{c}_imputed", f"{c}")
    return df

In [13]:
df = register_imputation(df.copy())

In [14]:
df

Unnamed: 0,SeriousDlqin2yrs,dummy_RevolvingUtilizationOfUnsecuredLines,dummy_age,dummy_NumberOfTime30-59DaysPastDueNotWorse,dummy_DebtRatio,dummy_MonthlyIncome,dummy_NumberOfOpenCreditLinesAndLoans,dummy_NumberOfTimes90DaysLate,dummy_NumberRealEstateLoansOrLines,dummy_NumberOfTime60-89DaysPastDueNotWorse,...,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0,0,0,0,0,0,0,0,0,...,0.000000,29,0,0.011513,4342.0,5.0,0,0.0,0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.595526,55,0,0.835333,1833.0,11.0,0,1.0,0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.000000,43,0,0.043437,4166.0,2.0,0,0.0,0,4.0
3,0,0,0,0,0,0,0,0,0,0,...,0.000000,35,0,0.133598,5800.0,12.0,0,1.0,0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.442956,61,0,0.658520,7200.0,12.0,0,2.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107995,0,0,0,0,0,0,0,0,0,0,...,0.107445,72,0,0.899275,4000.0,17.0,0,2.0,0,1.0
107996,0,0,0,0,0,1,1,0,1,0,...,0.087697,67,1,0.000000,0.0,0.0,0,0.0,0,1.0
107997,0,0,0,0,0,0,0,0,0,0,...,0.068788,49,0,0.673331,2800.0,8.0,0,2.0,0,0.0
107998,0,0,0,0,0,0,0,0,0,0,...,0.219630,36,0,0.025316,1500.0,2.0,0,0.0,0,0.0


In [15]:
X_train = df.drop(columns=['SeriousDlqin2yrs']).values
y_train = df.SeriousDlqin2yrs.values

In [16]:
df_test = register_imputation(df_test.copy())
X_test = df_test.drop(columns=['SeriousDlqin2yrs']).values
y_test = df_test.SeriousDlqin2yrs.values

In [17]:
nm_common =SMOTE(random_state=2019)
X_train_resampled, y_train_resampled = nm_common.fit_sample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_train_resampled)))

Resampled dataset shape Counter({0: 101298, 1: 101298})


In [66]:
base1 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=4, random_state=2020)
base2 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=6, random_state=2020)
base3 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=8, random_state=2020)
base4 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=12, random_state=2020)
base5 = DecisionTreeRegressor(criterion="friedman_mse", max_depth=16, random_state=2020)

In [68]:
#FIX MEEEEEEEEEEE: REFACTOR PLZZ !!! sss
df_collector = pd.DataFrame(columns=  ["hyperparams", "estimators", "learning_rate", "max_depth", 
                                       "threshold", "cost", "count_zero", "count_one", "train_auc", "test_auc"])
df_collector['hyperparams'] = df_collector['hyperparams'].astype('object')
k = 0
for estimator in [50, 100, 150, 250, 350, 500]:
    for lr in [0.01]:
        for baset in [base2,base3, base4]:
            ngb_clf = NGBClassifier(Dist=Bernoulli,
                                      verbose=True, Base=baset, n_estimators = estimator, 
                                    learning_rate = lr, verbose_eval = 0, random_state = 2020
                                     )
            print(ngb_clf)
            ngb_clf.fit(X_train_resampled, y_train_resampled, sample_weight= get_sample_weights(y_train, y_train_resampled))
            index_th = 0
            for threshold in list(np.arange(0.2, 0.4, 0.05)):
                count_zero, count_one = check_counts(ngb_clf, X_test, threshold)
                if count_zero!=0:
                    df_collector.ix[str(k),'hyperparams'] = ngb_clf
                    df_collector.ix[str(k),'estimators'] = estimator
                    df_collector.ix[str(k),'learning_rate'] = lr
                    df_collector.ix[str(k),'max_depth'] = baset.max_depth
                    threshold = round(threshold,2)
                    print("k: "+str(k))
                    df_aux = pd.DataFrame(X_test, columns=cols_with_missing_indicators)
                    df_aux['predicted'] = generate_y_pred_with_custom_threshold(ngb_clf, X_test, threshold)
                    df_aux['real'] = list(y_test)
                    df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2
                    print(threshold)
                    print("cost "+str(cost_score(df_aux.LoanPrincipal, df_aux.predicted, df_aux.real)))
                    print("th "+str(threshold))
                    df_collector.ix[str(k),'threshold'] = threshold
                    df_collector.ix[str(k),'cost'] = cost_score(df_aux.LoanPrincipal, df_aux.predicted, df_aux.real)
                    df_collector.ix[str(k),'count_zero'] = count_zero
                    df_collector.ix[str(k),'count_one'] = count_one
                    # predict probabilities
                    train_probs = ngb_clf.predict_proba(X_train)
                    test_probs = ngb_clf.predict_proba(X_test)
                    # keep probabilities for the positive outcome only
                    train_probs = train_probs[:, 1]
                    test_probs = test_probs[:, 1]
                    # calculate scores
                    train_auc = roc_auc_score(y_train, train_probs)
                    test_auc = roc_auc_score(y_test, test_probs)
                    df_collector.ix[str(k),'train_auc'] = train_auc
                    df_collector.ix[str(k),'test_auc'] = test_auc
                    k+=1
                    print("sumando k")
                    print("---------------------------------")
            del ngb_clf

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=6,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=2020, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=50,
              natural_gradient=True,
              random_state=RandomState(MT19

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=2020, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=150,
              natural_gradient=True,
              random_state=RandomState(MT1

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=2020, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=350,
              natural_gradient=True,
              random_state=RandomState(MT1

In [70]:
df_collector.sort_values("cost").head(50).to_csv("fix_fix_segundoexp_important_missing.csv", index=False)

In [18]:
df_collector = pd.read_csv("fix_fix_segundoexp_important_missing.csv")

In [19]:
df_collector

Unnamed: 0,hyperparams,estimators,learning_rate,max_depth,threshold,cost,count_zero,count_one,train_auc,test_auc
0,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,100,0.01,8,0.2,2817421.84,1,29999,0.839663,0.673008
1,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,250,0.01,8,0.25,2827097.16,7,29993,0.84193,0.667147
2,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,350,0.01,8,0.2,2827097.16,7,29993,0.842601,0.689239
3,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,350,0.01,8,0.25,2827097.16,7,29993,0.842601,0.689239
4,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,150,0.01,8,0.3,2827097.16,7,29993,0.840642,0.679827
5,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,150,0.01,8,0.25,2827097.16,7,29993,0.840642,0.679827
6,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,150,0.01,8,0.2,2827097.16,7,29993,0.840642,0.679827
7,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,500,0.01,8,0.2,2827097.16,7,29993,0.842776,0.68928
8,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,250,0.01,8,0.2,2827097.16,7,29993,0.84193,0.667147
9,NGBClassifier(Base=DecisionTreeRegressor(ccp_a...,100,0.01,8,0.3,2827097.16,7,29993,0.839663,0.673008


In [19]:
print(df_collector.iloc[0].hyperparams)

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=2020, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=100,
              natural_gradient=True,
              random_state=RandomState(MT1

In [20]:
ngb_clf = NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=2020, splitter='best'),
               col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=100,
              natural_gradient=True, tol=0.0001,
              verbose=True, verbose_eval=0)
ngb_clf.fit(X_train_resampled, y_train_resampled, sample_weight= get_sample_weights(y_train, y_train_resampled))

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=8,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=2020, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=100,
              natural_gradient=True,
              random_state=RandomState(MT1

In [21]:
visualize_roc_curve(ngb_clf, X_train, y_train, X_test, y_test)

TRAIN: ROC AUC=0.840
TEST: ROC AUC=0.673


In [26]:
df_aux = pd.DataFrame(X_test, columns=cols_with_missing_indicators)
df_aux['predicted'] = generate_y_pred_with_custom_threshold(ngb_clf, X_test, 0.2)
df_aux['real'] = list(y_test)
df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2
cost_score(df_aux.LoanPrincipal, df_aux.predicted, df_aux.real)

count_zero 1
count_one 29999


2817421.839999991

In [27]:
probas = [prob[1] for prob in ngb_clf.predict_proba(X_test)]
df_prueba = pd.DataFrame()
df_prueba["y_pred"] = generate_y_pred_with_custom_threshold(ngb_clf, X_test, 0.2)
df_prueba["proba"] = probas


count_zero 1
count_one 29999


In [24]:
import plotly.figure_factory as ff
# Add histogram data
x0 = df_prueba.proba

# Group data together
hist_data = [x0]

group_labels = ['Proba predicted']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2, show_hist=False)

# Add title
fig.update_layout(title_text='Kernel Density Estimate',
              xaxis = dict(title = 'Proba predicted'),
              yaxis = dict(title = 'Probability Density'),
              )