In [1]:
#ngboost and modelling libraries
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

#data manipulation libraries
import pandas as pd
import numpy as np

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#import functions and constants
from utils import (correlation_heatmap, register_amputation, preprocess_df, 
                    scaling_values_df, plot_target_balance, plot_trace_line, plot_scatter_matrix, 
                    plot_feature_importances, visualize_roc_curve, color_negative_red,)

from constants import cols

## READING DATAFRAME

In [2]:
# Load the training CSV, keeping only the columns listed in `cols`.
csv_path = "./data/cs-training.csv"
df = pd.read_csv(csv_path, usecols=cols)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## FEATURE ENGINEERING

In [None]:
# Run the project's preprocessing helper (imported from utils) on the frame.
# NOTE(review): the return value is discarded, so this cell only has an effect
# if `preprocess_df` mutates `df` in place — confirm against utils.py.
preprocess_df(df)

In [None]:
# Summarise the frame's columns, dtypes and non-null counts after preprocessing.
df.info()

In [None]:
# Count the missing values in every column and print the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)

## NGBOOST MODEL

In [None]:
# Separate the target vector from the feature matrix; both are plain NumPy arrays.
target_col = 'SeriousDlqin2yrs'
y = df[target_col].values
X = df.drop(columns=[target_col]).values

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility) and
# report the resulting class counts. Only the training data is resampled.
nm_common = SMOTE(random_state=2019)
# `fit_resample` is the supported imbalanced-learn API; the older `fit_sample`
# alias was deprecated and has since been removed.
X_train_resampled, y_train_resampled = nm_common.fit_resample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_train_resampled)))

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Base learner handed to NGBoost: a regression tree of depth at most 12 using
# the Friedman MSE split criterion.
base = DecisionTreeRegressor(
    max_depth=12,
    criterion="friedman_mse",
)

In [None]:
# Candidate values for the randomized hyper-parameter search. `Base` and `Dist`
# are single-element lists, so they are fixed across all sampled candidates.
parameters = {
    'n_estimators': [20, 50, 100, 200, 500, 1000],
    'learning_rate': [0.001, 0.0001],
    'Base': [base],
    'Dist': [Bernoulli],
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV


def _roc_auc_scorer(estimator, X_val, y_val):
    """Score a fitted classifier by the ROC AUC of its predicted probability for class 1."""
    return roc_auc_score(y_val, estimator.predict_proba(X_val)[:, 1])


# Randomized search over `parameters` with an explicit, higher-is-better metric.
# Without `scoring`, scikit-learn maximises the estimator's own `score` method.
# NOTE(review): NGBoost's `score` appears to return the scoring-rule loss
# (lower is better), which would make the default search pick the worst
# candidate — confirm against the installed ngboost version.
clf = RandomizedSearchCV(NGBClassifier(), parameters, scoring=_roc_auc_scorer, random_state=2020)

In [None]:
# Run the randomized hyper-parameter search.
# NOTE(review): the search is fitted on the original (un-resampled) training
# split, whereas the final model further down is trained on the SMOTE-resampled
# data — confirm this difference is intended.
clf.fit(X_train, y_train)

In [None]:
# Show the best hyper-parameter combination found by the search.
clf.best_params_

In [None]:
from sklearn.utils import class_weight

# 'balanced' class weights for the original (imbalanced) training labels.
# `classes` and `y` are passed by keyword: current scikit-learn releases make
# them keyword-only, and older releases accept the keywords as well.
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=np.unique(y_train),
                                                  y=y_train)
class_weights

In [None]:
# Final NGBoost classifier: Bernoulli output distribution on top of the
# decision-tree base learner, with 20 boosting rounds and a 1e-4 learning rate.
ngb_clf = NGBClassifier(
    Dist=Bernoulli,
    Base=base,
    n_estimators=20,
    learning_rate=0.0001,
    verbose=True,
)

# Train on the SMOTE-resampled training data.
ngb_clf.fit(X_train_resampled, y_train_resampled)

# Predicted distributions for the untouched test split.
preds = ngb_clf.pred_dist(X_test)

In [None]:
# Inspect the feature importances exposed by the fitted NGBoost model.
ngb_clf.feature_importances_

In [None]:
# Plot the fitted model's feature importances, labelled with every entry of
# `cols` except the first.
# NOTE(review): presumably cols[0] is the target column and the remaining
# entries are in the same order as the columns of X — verify against constants.py.
plot_feature_importances(cols[1:], ngb_clf)

In [None]:
# Draw the ROC curve for the fitted classifier on the held-out test split
# (plotting helper imported from utils).
visualize_roc_curve(ngb_clf, X_test, y_test)

In [None]:
# Accuracy on the held-out split, computed from the hard class predictions.
# `ngb_clf.score` is deliberately not used for this.
# NOTE(review): NGBoost's `score` appears to evaluate the model's scoring rule
# (a loss such as the negative log-likelihood) rather than the fraction of
# correct predictions — confirm against the installed ngboost version.
result = accuracy_score(y_test, ngb_clf.predict(X_test))
print("Accuracy: %.2f%%" % (result*100.0))

In [None]:
# ROC AUC on the held-out split. The score is computed from the predicted
# probability of class 1 (column 1 of `predict_proba`) so that the ranking of
# the rows, not just the thresholded label, is evaluated.
roc_auc = roc_auc_score(y_test, ngb_clf.predict_proba(X_test)[:, 1])
# Print the AUC itself; `result` holds the value from the accuracy cell.
print("ROC AUC SCORE: %.2f%%" % (roc_auc*100.0))

In [None]:
class_weights # back when the data was imbalanced, I computed the sklearn metric and got this array

In [None]:
# Weight at index 1, i.e. for the second entry of np.unique(y_train)
# (presumably the positive / default class — verify).
class_weights[1]

## después (ponderado al revés)

In [None]:
# Sensitivity check for the weighted accuracy: sweep the weight given to
# label 0 from 0.0 to 0.9 and give label 1 the complement, so that both
# weighting orders (e.g. 0.3/0.7 and 0.7/0.3) are covered.

# The predictions do not depend on the weights, so compute them once.
train_predictions = ngb_clf.predict(X_train_resampled)
test_predictions = ngb_clf.predict(X_test)

for i in np.arange(0, 1, 0.1):
    # Round away floating-point noise from np.arange (e.g. 0.30000000000000004).
    weight_label_0 = round(float(i), 1)
    # Lookup keyed by label, which avoids key collisions when both weights are 0.5.
    weight_by_label = {0: weight_label_0, 1: round(1 - weight_label_0, 1)}
    print(weight_by_label)
    print(accuracy_score(y_train_resampled, train_predictions, sample_weight=[weight_by_label.get(item, item) for item in y_train_resampled]))
    print(accuracy_score(y_test, test_predictions, sample_weight=[weight_by_label.get(item, item) for item in y_test]))
    print("---------------------------")

## en el orden correcto

In [None]:
# Relative cost of default vs fees, stored as weight -> target label.
# (The class_weights array was turned into a dictionary tied to the target.)
dict_weights = {
    0.3: 0,
    0.7: 1,
}

# Invert the mapping to label -> weight so that the labels in the train and
# test targets can be looked up and replaced by their weights.
rev_subs = dict((label, weight) for weight, label in dict_weights.items())

In [None]:
from sklearn.metrics import accuracy_score

# Weighted accuracy on the resampled training data. Every row is weighted by
# the value `rev_subs` assigns to its label; a label missing from `rev_subs`
# falls back to the label value itself.
train_predictions = ngb_clf.predict(X_train_resampled)
train_sample_weights = [rev_subs.get(label, label) for label in y_train_resampled]
accuracy_score(y_train_resampled, train_predictions, sample_weight=train_sample_weights)

In [None]:
# Weighted accuracy on the held-out test split, using the same label -> weight
# lookup (labels missing from `rev_subs` fall back to the label value itself).
test_predictions = ngb_clf.predict(X_test)
test_sample_weights = [rev_subs.get(label, label) for label in y_test]
accuracy_score(y_test, test_predictions, sample_weight=test_sample_weights)

In [None]:
# ROC AUC on the held-out split, computed from the predicted probability of
# class 1 rather than from thresholded labels: hard labels reduce the ROC
# curve to a single operating point and distort the score.
roc_auc_score(y_test, ngb_clf.predict_proba(X_test)[:, 1])

In [None]:
# Display the fitted classifier's repr.
ngb_clf