In [1]:
#ngboost and modelling libraries
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

#data manipulation libraries
import pandas as pd
import numpy as np

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#import functions and constants
from utils import (correlation_heatmap, register_amputation, preprocess_df, 
                    scaling_values_df, plot_target_balance, plot_trace_line, plot_scatter_matrix, 
                    plot_feature_importances, visualize_roc_curve, color_negative_red,clean_outliers,)

from constants import cols

## READING DATAFRAME

In [2]:
df = pd.read_csv("./data/cs-training.csv", usecols =cols)

In [3]:
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
len(df)

150000

## FEATURE ENGINEERING

In [5]:
df = clean_outliers(df.copy())

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142500 entries, 0 to 149999
Data columns (total 11 columns):
SeriousDlqin2yrs                        142500 non-null int64
RevolvingUtilizationOfUnsecuredLines    142500 non-null float64
age                                     142500 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    142500 non-null int64
DebtRatio                               142500 non-null float64
MonthlyIncome                           142500 non-null float64
NumberOfOpenCreditLinesAndLoans         142500 non-null int64
NumberOfTimes90DaysLate                 142500 non-null int64
NumberRealEstateLoansOrLines            142500 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    142500 non-null int64
NumberOfDependents                      142500 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.0 MB


In [7]:
print(df.isnull().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [8]:
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0,142500.0
mean,0.063593,3.737884,49.205368,0.298884,353.47122,6547.921258,8.334063,0.146688,1.012225,0.127326,0.736821
std,0.244027,145.667594,10.748236,2.657209,1116.403045,5268.659833,4.823229,2.612936,1.072553,2.598402,1.098868
min,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029765,41.0,0.0,0.178669,3963.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.152317,52.0,0.0,0.369005,6666.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.549695,60.0,0.0,0.895552,7375.0,11.0,0.0,2.0,0.0,1.0
max,1.0,13930.0,60.0,98.0,38793.0,208333.0,57.0,98.0,32.0,98.0,10.0


## NGBOOST MODEL

In [9]:
X = df.drop(columns=['SeriousDlqin2yrs']).values
y = df.SeriousDlqin2yrs.values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
nm_common =SMOTE(random_state=2019)
X_train_resampled, y_train_resampled = nm_common.fit_sample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_train_resampled)))

Resampled dataset shape Counter({0: 106781, 1: 106781})


In [12]:
from sklearn.tree import DecisionTreeRegressor

base = DecisionTreeRegressor(criterion="friedman_mse", max_depth=12)

In [13]:
parameters = {'n_estimators':[20,50,100,200,500,1000], 'learning_rate':[0.001,0.0001], 'Base': [base], 'Dist': [Bernoulli]}

In [8]:
from sklearn.model_selection import RandomizedSearchCV
#clf = RandomizedSearchCV(NGBClassifier(), parameters, random_state=2020)

In [9]:
#clf.fit(X_train, y_train)

In [None]:
clf.best_params_

In [137]:
class_weight.compute_class_weight??

In [14]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
class_weights

array([0.53380283, 7.89583045])

In [21]:
import numpy as np
npa = np.asarray([class_weights[0] if x == 0 else class_weights[1] for x in  y_train_resampled], dtype=np.float32)

In [22]:
npa

array([0.5338028, 0.5338028, 0.5338028, ..., 7.8958306, 7.8958306,
       7.8958306], dtype=float32)

In [23]:
ngb_clf = NGBClassifier(Dist=Bernoulli,
                              verbose=True, Base=base, n_estimators = 100, learning_rate = 0.01, verbose_eval = 1,
                        Score = 
                             )

ngb_clf.fit(X_train_resampled, y_train_resampled, sample_weight= npa)
preds = ngb_clf.pred_dist(X_test)

[iter 0] loss=0.6931 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 1] loss=0.6752 val_loss=0.0000 scale=2.0000 norm=3.9535
[iter 2] loss=0.6581 val_loss=0.0000 scale=2.0000 norm=3.9099
[iter 3] loss=0.6415 val_loss=0.0000 scale=2.0000 norm=3.8693
[iter 4] loss=0.6256 val_loss=0.0000 scale=2.0000 norm=3.8307
[iter 5] loss=0.6103 val_loss=0.0000 scale=2.0000 norm=3.7948
[iter 6] loss=0.5955 val_loss=0.0000 scale=2.0000 norm=3.7602
[iter 7] loss=0.5812 val_loss=0.0000 scale=2.0000 norm=3.7275
[iter 8] loss=0.5674 val_loss=0.0000 scale=2.0000 norm=3.6967
[iter 9] loss=0.5541 val_loss=0.0000 scale=2.0000 norm=3.6674
[iter 10] loss=0.5412 val_loss=0.0000 scale=2.0000 norm=3.6396
[iter 11] loss=0.5288 val_loss=0.0000 scale=2.0000 norm=3.6141
[iter 12] loss=0.5168 val_loss=0.0000 scale=2.0000 norm=3.5895
[iter 13] loss=0.5053 val_loss=0.0000 scale=2.0000 norm=3.5667
[iter 14] loss=0.4940 val_loss=0.0000 scale=2.0000 norm=3.5448
[iter 15] loss=0.4831 val_loss=0.0000 scale=4.0000 norm=7.0472
[i

In [24]:
import pickle
from pathlib import Path

file_path = Path("ngbtest.p")

with file_path.open("wb") as f:
    pickle.dump(ngb_clf, f)

In [25]:
with file_path.open("rb") as f:
    ngb_unpickled = pickle.load(f)

In [75]:
y_pred = []
for i in range(len(list(y_predictions))):
    if y_predictions[i][0] > 0.3:
        y_pred.append(1)
    else:
        y_pred.append(0)

In [104]:
balanced_accuracy_score??

In [None]:
def balanced_accuracy_score(y_true, y_pred, sample_weight=None,
                            adjusted=False):
    """Compute the balanced accuracy

    The balanced accuracy in binary and multiclass classification problems to
    deal with imbalanced datasets. It is defined as the average of recall
    obtained on each class.

    The best value is 1 and the worst value is 0 when ``adjusted=False``.

    Read more in the :ref:`User Guide <balanced_accuracy_score>`.

    Parameters
    ----------
    y_true : 1d array-like
        Ground truth (correct) target values.

    y_pred : 1d array-like
        Estimated targets as returned by a classifier.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    adjusted : bool, default=False
        When true, the result is adjusted for chance, so that random
        performance would score 0, and perfect performance scores 1.

    Returns
    -------
    balanced_accuracy : float

    See also
    --------
    recall_score, roc_auc_score

    Notes
    -----
    Some literature promotes alternative definitions of balanced accuracy. Our
    definition is equivalent to :func:`accuracy_score` with class-balanced
    sample weights, and shares desirable properties with the binary case.
    See the :ref:`User Guide <balanced_accuracy_score>`.

    References
    ----------
    .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010).
           The balanced accuracy and its posterior distribution.
           Proceedings of the 20th International Conference on Pattern
           Recognition, 3121-24.
    .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015).
           `Fundamentals of Machine Learning for Predictive Data Analytics:
           Algorithms, Worked Examples, and Case Studies
           <https://mitpress.mit.edu/books/fundamentals-machine-learning-predictive-data-analytics>`_.

    Examples
    --------
    >>> from sklearn.metrics import balanced_accuracy_score
    >>> y_true = [0, 1, 0, 0, 1, 0]
    >>> y_pred = [0, 1, 0, 0, 0, 1]
    >>> balanced_accuracy_score(y_true, y_pred)
    0.625

    """
    C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
    with np.errstate(divide='ignore', invalid='ignore'):
        per_class = np.diag(C) / C.sum(axis=1)
    if np.any(np.isnan(per_class)):
        warnings.warn('y_pred contains classes not in y_true')
        per_class = per_class[~np.isnan(per_class)]
    score = np.mean(per_class)
    if adjusted:
        n_classes = len(per_class)
        chance = 1 / n_classes
        score -= chance
        score /= 1 - chance
    return score

In [30]:
from sklearn.metrics import balanced_accuracy_score
#balanced_accuracy_score(y_train, y_pred, [pesos[0] if x == 0 else pesos[1] for x in  y_pred])
pesos = [0.25,0.75]
## Function to find best decision threshold
def find_best_threshold(clf, x_test, y_test, score): 
    
    scoring = {'Balanced Accuracy': balanced_accuracy_score}    
    
    y_proba = clf.predict_proba(x_test)
    
    scores = {}
    preds = {}
    for i in np.arange(0.1,1,0.1):
        y_pred = np.zeros(shape = (y_proba.shape[0],1))
        for j in range(0, y_proba.shape[0]):
            if y_proba[j,1]>i:
                y_pred[j] = 1
            else:
                y_pred[j] = 0
                
        scores[i] = scoring[score](y_test, y_pred, [pesos[0] if x == 0 else pesos[1] for x in  y_pred])
        preds[i] = y_pred
    
    best_threshold = max(scores, key=scores.get)
    best_score = max(scores.values())
    best_pred = preds[best_threshold]
    
    return best_threshold, best_score, best_pred

In [32]:
find_best_threshold(ngb_clf, X_test, y_test, 'Balanced Accuracy')

(0.8, 0.7599097486533404, array([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]]))

In [37]:
y_predictions = ngb_clf.predict_proba(X_test)
y_pred = []
for i in range(len(list(y_predictions))):
    if y_predictions[i][0] > 0.2:
        y_pred.append(0)
    else:
        y_pred.append(1)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred, [pesos[0] if x == 0 else pesos[1] for x in  y_pred])

0.7599097486533404

In [109]:
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
def visualize_roc_curve(model, X_test, y_test):
    """
    Plot roc curve
    Args:
        - model (NGBClassifier object): NGBoost model
        - X_test (numpy.ndarray): Data without target values
        - y_test (numpy.ndarray): Target values
    """
    y_predictions = model.predict_proba(X_test)
    y_pred = []
    for i in range(len(list(y_predictions))):
        if y_predictions[i][0] > 0.2:
            y_pred.append(0)
        else:
            y_pred.append(1)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    print(len(y_pred))
    print(len(y_predictions))

    print(classification_report(y_test, y_pred))

    trace0 = go.Scatter(
        x=fpr, y=tpr, name="Predictive Model", line=dict(color="blue", width=2)
    )
    trace1 = go.Scatter(
        x0=0,
        x=[0, 1],
        y0=0,
        y=[0, 1],
        name="Random Chance",
        line=dict(color="grey", width=2),
    )

    data = [trace0, trace1]

    # Edit the layout
    layout = dict(title="ROC curve", xaxis=dict(title="FPR"), yaxis=dict(title="TPR"))

    fig = dict(data=data, layout=layout)

    iplot(fig)

In [111]:
visualize_roc_curve(ngb_clf, X_test, y_test)

28500
28500
              precision    recall  f1-score   support

           0       0.96      0.93      0.95     26657
           1       0.31      0.43      0.36      1843

    accuracy                           0.90     28500
   macro avg       0.63      0.68      0.65     28500
weighted avg       0.92      0.90      0.91     28500



In [138]:
def _creditscoring_costmat(income, debt, pi_1, cost_mat_parameters):
    """ Private function to calculate the cost matrix of credit scoring models.
    Parameters
    ----------
    income : array of shape = [n_samples]
        Monthly income of each example
    debt : array of shape = [n_samples]
        Debt ratio each example
    pi_1 : float
        Percentage of positives in the training set
    References
    ----------
    .. [1] A. Correa Bahnsen, D.Aouada, B, Ottersten,
           "Example-Dependent Cost-Sensitive Logistic Regression for Credit Scoring",
           in Proceedings of the International Conference on Machine Learning and Applications,
           , 2014.
    Returns
    -------
    cost_mat : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false negatives,
        true positives and true negatives, for each example.
    """
    def calculate_a(cl_i, int_, n_term):
        """ Private function """
        return cl_i * ((int_ * (1 + int_) ** n_term) / ((1 + int_) ** n_term - 1))

    def calculate_pv(a, int_, n_term):
        """ Private function """
        return a / int_ * (1 - 1 / (1 + int_) ** n_term)

    #Calculate credit line Cl
    def calculate_cl(k, inc_i, cl_max, debt_i, int_r, n_term):
        """ Private function """
        cl_k = k * inc_i
        A = calculate_a(cl_k, int_r, n_term)
        Cl_debt = calculate_pv(inc_i * min(A / inc_i, 1 - debt_i), int_r, n_term)
        return min(cl_k, cl_max, Cl_debt)

    #calculate costs
    def calculate_cost_fn(cl_i, lgd):
        return cl_i * lgd

    def calculate_cost_fp(cl_i, int_r, n_term, int_cf, pi_1, lgd, cl_avg):
        a = calculate_a(cl_i, int_r, n_term)
        pv = calculate_pv(a, int_cf, n_term)
        r = pv - cl_i
        r_avg = calculate_pv(calculate_a(cl_avg, int_r, n_term), int_cf, n_term) - cl_avg
        cost_fp = r - (1 - pi_1) * r_avg + pi_1 * calculate_cost_fn(cl_avg, lgd)
        return max(0, cost_fp)

    v_calculate_cost_fp = np.vectorize(calculate_cost_fp)
    v_calculate_cost_fn = np.vectorize(calculate_cost_fn)

    v_calculate_cl = np.vectorize(calculate_cl)

    # Parameters
    k = cost_mat_parameters['k']
    int_r = cost_mat_parameters['int_r']
    n_term = cost_mat_parameters['n_term']
    int_cf = cost_mat_parameters['int_cf']
    lgd = cost_mat_parameters['lgd']
    cl_max = cost_mat_parameters['cl_max']

    cl = v_calculate_cl(k, income, cl_max, debt, int_r, n_term)
    cl_avg = cl.mean()

    n_samples = income.shape[0]
    cost_mat = np.zeros((n_samples, 4))  #cost_mat[FP,FN,TP,TN]
    cost_mat[:, 0] = v_calculate_cost_fp(cl, int_r, n_term, int_cf, pi_1, lgd, cl_avg)
    cost_mat[:, 1] = v_calculate_cost_fn(cl, lgd)
    cost_mat[:, 2] = 0.0
    cost_mat[:, 3] = 0.0

    return cost_mat

In [141]:
cost_mat_parameters = {'int_r': 0.0479 / 12,
                               'int_cf': 0.0294 / 12,
                               'cl_max': 25000,
                               'n_term': 24,
                               'k': 3,
                               'lgd': .75}
target = df['SeriousDlqin2yrs'].values.astype(np.int)
pi_1 = target.mean()
cost_mat = _creditscoring_costmat(df['MonthlyIncome'].values, df['DebtRatio'].values, pi_1, cost_mat_parameters)

ZeroDivisionError: float division by zero

In [80]:
len(y_pred)

28500

In [82]:
len(y_test)

28500

In [109]:
# data viz libraries
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
def visualize_roc_curve(model, X_test, y_test):
    """
    Plot roc curve
    Args:
        - model (NGBClassifier object): NGBoost model
        - X_test (numpy.ndarray): Data without target values
        - y_test (numpy.ndarray): Target values
    """
    y_predictions = model.predict_proba(X_test)
    y_pred = []
    for i in range(len(list(y_predictions))):
        if y_predictions[i][0] > 0.4:
            y_pred.append(0)
        else:
            y_pred.append(1)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    print(len(y_pred))
    print(len(y_predictions))

    print(classification_report(y_test, y_pred))

    trace0 = go.Scatter(
        x=fpr, y=tpr, name="Predictive Model", line=dict(color="blue", width=2)
    )
    trace1 = go.Scatter(
        x0=0,
        x=[0, 1],
        y0=0,
        y=[0, 1],
        name="Random Chance",
        line=dict(color="grey", width=2),
    )

    data = [trace0, trace1]

    # Edit the layout
    layout = dict(title="ROC curve", xaxis=dict(title="FPR"), yaxis=dict(title="TPR"))

    fig = dict(data=data, layout=layout)

    iplot(fig)

In [110]:
visualize_roc_curve(ngb_clf, X_test, y_test)
#lib cost_sensitive

28500
28500
              precision    recall  f1-score   support

           0       0.98      0.79      0.87     26712
           1       0.18      0.71      0.29      1788

    accuracy                           0.78     28500
   macro avg       0.58      0.75      0.58     28500
weighted avg       0.93      0.78      0.83     28500



In [149]:
y_predictions = ngb_clf.predict_proba(X_test)
y_pred = []
for i in range(len(list(y_predictions))):
    if y_predictions[i][0] > 0.4:
        y_pred.append(0)
    else:
        y_pred.append(1)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred, [pesos[0] if x == 0 else pesos[1] for x in  y_pred])
## ver de meterlo en el modelo en el random search

0.7139181462643649

In [148]:
y_predictions = ngb_clf.predict_proba(X_train)
y_pred = []
for i in range(len(list(y_predictions))):
    if y_predictions[i][0] > 0.4:
        y_pred.append(0)
    else:
        y_pred.append(1)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_train, y_pred, [pesos[0] if x == 0 else pesos[1] for x in  y_pred])

0.7574653534719549

In [134]:
pesos = [0.25,0.75]
#buscar cuanto es la perdida en el mercado..

In [41]:
ngb_clf.feature_importances_

array([[0.21090627, 0.05120032, 0.18537314, 0.0444543 , 0.03808067,
        0.0412483 , 0.11288559, 0.10069324, 0.06469714, 0.15046102]])

In [40]:
plot_feature_importances(cols[1:], ngb_clf)

In [83]:
from costcla.metrics import cost_loss, savings_score
from costcla.models import  CostSensitiveDecisionTreeClassifier

In [91]:
cost_mat_train = np.zeros((len(y_train),4))
#false positives cost 5
cost_mat_train[:,0]=5
#false negatives cost the transaction amount
cost_mat_train[:,1]= pd.DataFrame(X_train, columns = cols[1:])['MonthlyIncome'] * 2
#true positives also cost 5
cost_mat_train[:,2]=5
 
cost_mat_test = np.zeros((len(y_test),4))
cost_mat_test[:,0]=5
cost_mat_test[:,1]=pd.DataFrame(X_test, columns = cols[1:] )['MonthlyIncome'] * 2
cost_mat_test[:,2]=5

In [98]:
"""
from sklearn.metrics import balanced_accuracy_score
>>> y_true = [0, 1, 0, 0, 1, 0]
>>> y_pred = [0, 1, 0, 0, 0, 1]
>>> balanced_accuracy_score(y_true, y_pred)
"""
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred, sample_weight = )

0.729893158141227

In [97]:
data.cost_mat

array([[ 1023.73054104, 18750.        ,     0.        ,     0.        ],
       [  694.27379722,  5849.25      ,     0.        ,     0.        ],
       [  719.67113704,  6843.75      ,     0.        ,     0.        ],
       ...,
       [  865.73457105, 12563.25      ,     0.        ,     0.        ],
       [  873.31929697, 12860.25      ,     0.        ,     0.        ],
       [ 1013.6367265 , 18354.75      ,     0.        ,     0.        ]])

In [78]:
result = ngb_clf.score(X_test, y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 54.12%


In [79]:
roc_auc_score(y_test, ngb_clf.predict(X_test))
print("ROC AUC SCORE: %.2f%%" % (result*100.0))

ROC AUC SCORE: 54.12%


In [72]:
class_weights # cuando tenia los datos desbalanceados, calcule la metrica de sklearn y obtuve este array

array([0.53407292, 7.8372061 ])

In [73]:
class_weights[1]

7.837206104771071

In [74]:
from sklearn.metrics import accuracy_score

## despues (ponderado al reves)

In [77]:
dict_weights = {0.53407292:1,#7.43494424 - mayoritaria
            7.8372061 :0 #0.53604932 - minoritario
               }
rev_subs = { v:k for k,v in dict_weights.items()}
print(dict_weights)
print(accuracy_score(y_train_resampled, ngb_clf.predict(X_train_resampled), sample_weight= [rev_subs.get(item,item)  for item in y_train_resampled]))
print(accuracy_score(y_test, ngb_clf.predict(X_test), sample_weight= [rev_subs.get(item,item)  for item in y_test]))
print("---------------------------")

{0.53407292: 1, 7.8372061: 0}
0.7416702949550207
0.7114588952586269
---------------------------


In [57]:
df_aux=pd.DataFrame(X_test, columns = cols[1:]).replace({'MonthlyIncome': {0: np.mean(df.MonthlyIncome)}})
df_aux.LoanPrincipal = df_aux.MonthlyIncome * 2

In [59]:
df_aux.LoanPrincipal

0         8332.000000
1        13340.442475
2        13340.442475
3         8332.000000
4        13340.442475
             ...     
28495    33332.000000
28496     3600.000000
28497    12600.000000
28498    19500.000000
28499    12000.000000
Name: MonthlyIncome, Length: 28500, dtype: float64

In [None]:
# el tipo 0 pide 8332 de plata (asumo esto). 
#4 opciones:

#1. el tipo 0 no es deudor y yo decido prestarle la plata (yo gano el interés que podría ganar prestandole plata) TP
#2. el tipo 0 no es deudor y yo decido no prestarle plata (gano 0) FP
#3- el tipo 0 es deudor y yo decido prestarle plata (pierdo lo que le presté) 
#4. el tipo 0 es deudor y yo decido no prestarle plata (gano 0)

#problema: no se si es deudor o no deudor

#

In [None]:
#ARMAR LA MATRIZ DE COSTOS EN BASE A ESTO.

#tener como base lo máximo que podría ganar para cada una de las opciones.

#siendo no deudor: gano lo que podria ganar prestandole plata y siendo deudor es 0 (prefiero no prestarle y ganar 0)

#matriz de confusion
#filas: deudor | no deudor
#columnas: presto | no presto

#x: lo que presto (=8332)

# |         | presto |     no presto   |
# |deudor   | - 8332 |        0        | 
# |no deudor|    0   |- interes de 8332|  


#clasificar al vago 0 como deudor cuando no es deudor me hace perder el interes de 8332.
#clasificar al vago 0 como no deudor cuando en realidad es deudor me hace perder 8332.
#
#asumiendo que el interés crece linealmente con el monto del prestamo (chequear).
# clasificar al deudor como no deudor es i veces grave como clasificar al no deudor cuando realidad es deudor (i es menor a 1)
# mi matriz f1 que sea:
# f1 = 2 * prec * rec / ( prec + rec)
# f1 = 

In [None]:
#opcion 1 (más fácil pero menos exacta)
# f beta score tal que beta sea igual a x/i

In [None]:
#opcion 2 (totalmente exacta asumiendo que siempre piden 2*sueldo mensual)
# cada costo lo cuento por separado, si me fijio en cada una de las obs si clasifico como deudor y es no deudor
# el costo va a ser -i * xj (xj es lo que le presto a esa persona)
# si lo clasifico como no deudor y es deudor el costo será xj

#metrica de costo, uso apply 

In [87]:
df_aux=pd.DataFrame(X_test, columns = cols[1:])
df_aux['predicted'] = list(ngb_clf.predict(X_test))
df_aux['real'] = list(y_test)
df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2

def costo_error(x, rate):
    if (x['predicted'] == 1) & (x['real'] == 0):
        return x['LoanPrincipal']*rate
    elif (x['predicted'] == 0) & (x['real'] == 1):
        return x['LoanPrincipal']
    else:
        return 0
sum(df_aux.apply(lambda x: costo_error(x,0.04), axis=1))

In [76]:
df_aux=pd.DataFrame(X_test, columns = cols[1:])
df_aux['predicted'] = list(ngb_clf.predict(X_test))
df_aux['real'] = list(y_test)
df_aux['LoanPrincipal'] = df_aux.MonthlyIncome * 2

In [85]:
df_aux['real']

0        0
1        0
2        0
3        0
4        0
        ..
28495    0
28496    0
28497    0
28498    0
28499    0
Name: real, Length: 28500, dtype: int64

In [84]:
df_aux[['predicted', 'real', 'LoanPrincipal']].shape

(28500, 3)

In [90]:
sum(df_aux.apply(lambda x: costo_error(x,0.04), axis=1))

9427465.11257403

In [91]:
d = {'pred': ngb_clf.predict(X_test), 
     'tasa': [1.2] * len(ngb_clf.predict(X_test)), 
     'monto_credito':  list(df_aux.LoanPrincipal),
      'pago_no_pago': y_test}

df = pd.DataFrame(data=d)

In [92]:
df

Unnamed: 0,pred,tasa,monto_credito,pago_no_pago
0,0,1.2,8332.000000,0
1,0,1.2,13340.442475,0
2,0,1.2,13340.442475,0
3,0,1.2,8332.000000,0
4,1,1.2,13340.442475,0
...,...,...,...,...
28495,0,1.2,33332.000000,0
28496,0,1.2,3600.000000,0
28497,0,1.2,12600.000000,0
28498,0,1.2,19500.000000,0


In [93]:
df_sin_cobrar = df[(df.pred == 0) & (df.pago_no_pago == 1)]

In [94]:
df_sin_cobrar.head()

Unnamed: 0,pred,tasa,monto_credito,pago_no_pago
175,0,1.2,13340.442475,1
196,0,1.2,13340.442475,1
302,0,1.2,4200.0,1
318,0,1.2,5600.0,1
484,0,1.2,12666.0,1


In [95]:
perdida_total = df_sin_cobrar.monto_credito.sum()

In [96]:
df_pagaron = df[(df.pred == 0) & (df.pago_no_pago == 0)]

In [97]:
recupero_total = df_pagaron.monto_credito.sum()

In [98]:
prestado_total = recupero_total + perdida_total

In [101]:
performance = recupero_total * 1.2 / prestado_total

In [102]:
performance

1.1735623783891043

In [None]:
ngb_clf.predict_proba(X_test).min()

In [None]:
ngb_clf.predict_proba??

## en el orden correcto

In [34]:
dict_weights = {0.3: 0, 0.7: 1} # costo relativo de default vs fees
# al array class_weights lo pase a un diccionary asociado al target

#la magia que matchea los y de train y de test con las ponderaciones
rev_subs = { v:k for k,v in dict_weights.items()}

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train_resampled, ngb_clf.predict(X_train_resampled), sample_weight= [rev_subs.get(item,item)  for item in y_train_resampled])

0.9122433873340396

In [36]:
accuracy_score(y_test, ngb_clf.predict(X_test), sample_weight= [rev_subs.get(item,item)  for item in y_test])

0.7189375296809566

In [37]:
roc_auc_score(y_test, ngb_clf.predict(X_test))

0.7403018475973396

In [38]:
ngb_clf

NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=12,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=None, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, learning_rate=0.01,
              minibatch_frac=1.0, n_estimators=100, natural_gradient=True,
              random_state=RandomState(MT19937) at 0x7FEF1203CD10, tol=