In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from functions import *
from sklearn.model_selection import train_test_split
import warnings

In [136]:
socio = pd.read_feather('BF_Dados_Socio_Economicos.feather')
socio = socio.drop_duplicates('identificação')
cont = pd.read_feather('BF_Dados_Contratos.feather')
cont = cont.loc[cont['CONTRATO'] > 40000]
df = cont.join(socio.set_index('identificação'),on = 'IDENTIFICAÇÃO')
df = df.loc[~df['atividade'].isna()]
df['DEFAULT'] = df['ATRASO_DIAS'] > 90
df.loc[df['numero_de_pessoas_na_casa'] > 10,'numero_de_pessoas_na_casa'] = 10
serie = df['numero_de_pessoas_na_casa'].dropna()
df.loc[df['numero_de_pessoas_na_casa'].isna(),'numero_de_pessoas_na_casa'] = np.random.normal(serie.mean(),serie.std(),df['numero_de_pessoas_na_casa'].isna().sum())
df['numero_de_pessoas_na_casa'] = df['numero_de_pessoas_na_casa'].apply(lambda x: round(abs(x)))
df = df.dropna().sample(frac = 1).reset_index(drop = True)
to_drop = ['SITUACAO','QUANTIDADE_PAGAS','ATRASO_DIAS','ATRASO_MAXIMO','RENEGOCIADO','BAIRRO','CONTRATO','IDENTIFICAÇÃO','CEP','melhor_data_vencimento','QUANTIDADE_PARCELAS','VALOR_EMPRESTIMO','PRESTACAO','valor_solicitado','melhor_valor_parcela']
df = df.drop(to_drop,axis = 1)

### Transformação e separação treino-teste

In [137]:
df = numerize_dummie(df)
x_train,x_test,y_train,y_test = train_test_split(df.drop('DEFAULT',axis = 1).values,df['DEFAULT'].values,test_size = 0.25)

### Definição de métrica
F-beta score

In [138]:
from sklearn.metrics import fbeta_score,recall_score,precision_score
from sklearn.metrics import make_scorer

In [139]:
peso_recall = 0.5
metric = lambda real,pred: fbeta_score(real,pred,beta = peso_recall)
scorer = make_scorer(fbeta_score,beta = peso_recall)
# metric = lambda real,pred: ((((real == 1) & (pred == 1)).sum() / (pred == 1).sum() if (pred == 1).sum() != 0 else 0) + recall_score(real,pred)) / 2
# scorer = make_scorer(metric)

In [140]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print('Métricas para previsão de 100% negativo:')
    print('Score:','%.3f' % metric(y_test,np.zeros(y_test.shape)))
    print('Precision:','%.3f' % precision_score(y_test,np.zeros(y_test.shape)))
    print('Recall:','%.3f' % recall_score(y_test,np.zeros(y_test.shape)))
    print('\n')
    print('Métricas para previsão de 100% positivo:')
    print('Score:','%.3f' % metric(y_test,np.ones(y_test.shape)))
    print('Precision:','%.3f' % precision_score(y_test,np.ones(y_test.shape)))
    print('Recall:','%.3f' % recall_score(y_test,np.ones(y_test.shape)))

Métricas para previsão de 100% negativo:
Score: 0.000
Precision: 0.000
Recall: 0.000


Métricas para previsão de 100% positivo:
Score: 0.051
Precision: 0.041
Recall: 1.000


#### Modelo 0: Simple XGBoost

In [141]:
from xgboost import XGBClassifier

In [142]:
xgb_model = XGBClassifier().fit(x_train, y_train)
prediction = xgb_model.predict(x_test)
print('Score:','%.3f' % metric(y_test,prediction))
print('Precision:','%.3f' % precision_score(y_test,prediction))
print('Recall:','%.3f' % recall_score(y_test,prediction))

Score: 0.019
Precision: 0.429
Recall: 0.004


#### Modelo 1: Random Forest

In [143]:
from sklearn.ensemble import RandomForestClassifier

In [144]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train,y_train)
pediction = rf_model.predict(x_test)
print('Score:','%.3f' % metric(y_test,prediction))
print('Precision:','%.3f' % precision_score(y_test,prediction))
print('Recall:','%.3f' % recall_score(y_test,prediction))

Score: 0.019
Precision: 0.429
Recall: 0.004


#### Modelo 2: XGBoost com scale-pos-weight

In [145]:
xgb_model = XGBClassifier(scale_pos_weight = 5).fit(x_train, y_train)
prediction = xgb_model.predict(x_test)
print('Score:','%.3f' % metric(y_test,prediction))
print('Precision:','%.3f' % precision_score(y_test,prediction))
print('Recall:','%.3f' % recall_score(y_test,prediction))

Score: 0.151
Precision: 0.170
Recall: 0.104


#### Modelo 3: SMOTE oversampling

In [146]:
from imblearn.over_sampling import SMOTE

In [147]:
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x_train, y_train)
xgb_model = XGBClassifier().fit(x_smote, y_smote)
prediction = xgb_model.predict(x_test)
print('Score:','%.3f' % metric(y_test,prediction))
print('Precision:','%.3f' % precision_score(y_test,prediction))
print('Recall:','%.3f' % recall_score(y_test,prediction))

Score: 0.068
Precision: 0.181
Recall: 0.020


#### Modelo 4: Tomek Links Undersampling

In [148]:
from imblearn.under_sampling import TomekLinks

In [149]:
tl = TomekLinks(sampling_strategy='majority')
x_tl, y_tl = tl.fit_resample(x_train, y_train)
xgb_model = XGBClassifier().fit(x_tl, y_tl)
prediction = xgb_model.predict(x_test)
print('Score:','%.3f' % metric(y_test,prediction))
print('Precision:','%.3f' % precision_score(y_test,prediction))
print('Recall:','%.3f' % recall_score(y_test,prediction))

Score: 0.024
Precision: 0.286
Recall: 0.005


#### Modelo 5: XGBoost ajustado + SMOTE

In [150]:
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x_train, y_train)
xgb_model = XGBClassifier(scale_pos_weight = 9).fit(x_smote, y_smote)
prediction = xgb_model.predict(x_test)
print('Score:','%.3f' % metric(y_test,prediction))
print('Precision:','%.3f' % precision_score(y_test,prediction))
print('Recall:','%.3f' % recall_score(y_test,prediction))

Score: 0.121
Precision: 0.104
Recall: 0.336


In [151]:
print('Percentual de Falsos Positivos:')
print(100 * ((prediction != y_test) & (y_test == 0)).sum() / (y_test == 0).sum())

Percentual de Falsos Positivos:
12.315544213913237


#### Fine Tuning:

In [152]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [153]:
x,y = df.drop('DEFAULT',axis = 1).values,df['DEFAULT'].values
cv = KFold(n_splits = 4)
param_grid = {'scale_pos_weight':list(range(0,11))}
model = XGBClassifier(eval_metric = scorer)
grid = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs = -1, cv = cv, scoring = scorer,verbose = 3)
grid_result = grid.fit(x,y)

Fitting 4 folds for each of 11 candidates, totalling 44 fits


In [154]:
print('Melhor parâmetro:',grid_result.best_params_)
print('Score:',grid_result.best_score_)

Melhor parâmetro: {'scale_pos_weight': 6}
Score: 0.15922116434772088


#### Modelo Final: Tuned Params

In [155]:
xgb_model = XGBClassifier(scale_pos_weight = grid_result.best_params_['scale_pos_weight'],eval_metric = scorer).fit(x_train, y_train)
prediction = xgb_model.predict(x_test)
print('Score:','%.3f' % metric(y_test,prediction))
print('Precision:','%.3f' % precision_score(y_test,prediction))
print('Recall:','%.3f' % recall_score(y_test,prediction))

Score: 0.129
Precision: 0.135
Recall: 0.110


In [156]:
print('Percentual True Positives:')
print(100 * ((y_test == 1) & (prediction == 1)).sum() / (prediction == 1).sum())

Percentual True Positives:
13.461538461538462


#### Conclusão:
O melhor modelo foi XGBoost com scale_pos_weight = 9 e rebalanceamento de classes com modelo SMOTE

##### Características do modelo:
37% dos futuros inadimplentes são identificados, porém 8% dos futuros adimplentes são erroneamente excluídos