In [1]:
import patsy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from tqdm import tqdm
from scipy.stats import ks_2samp
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
dados = pd.read_csv("C:/Users/Allan/Documents/Material de apoio/Módulo 23 - Combinação de modelos I/credit_scoring.csv", parse_dates = ['data_ref'])
# dados['tempo_emprego'].fillna(-1, inplace=True)
# dados['mau'] = dados2['mau'].map({'VERDADEIRO': 1, 'FALSO':0})
dados.head()

Unnamed: 0,data_ref,id_cliente,sexo,posse_de_veiculo,posse_de_imovel,qtd_filhos,tipo_renda,educacao,estado_civil,tipo_residencia,idade,tempo_emprego,qt_pessoas_residencia,renda,mau
0,2015-01-01,1,F,True,True,0,Assalariado,Secundário,Casado,Casa,49,8.605479,2.0,1916.54,0
1,2015-01-01,2,M,True,False,0,Empresário,Secundário,União,Casa,60,6.953425,2.0,2967.25,0
2,2015-01-01,3,F,True,False,0,Empresário,Secundário,Casado,Casa,28,0.682192,2.0,340.96,0
3,2015-01-01,4,F,False,True,0,Assalariado,Superior completo,Casado,Casa,60,1.879452,2.0,4903.16,0
4,2015-01-01,5,F,False,False,0,Empresário,Secundário,Casado,Casa,47,8.438356,2.0,3012.6,0


In [3]:
df = dados.copy()
# Selecionar meses de 2016 para validação
df_val = df[df['data_ref'] >= datetime(2016,1,1)].copy()

# Selecionar meses de 2015 para treinamento e teste
df = df[df['data_ref'] < datetime(2016,1,1)]

df_train, df_test = train_test_split(df, test_size=0.3, random_state=12)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

print('Quantidade de linhas no treino:    ',df_train.shape[0])
print('Quantidade de linhas no teste:     ',df_test.shape[0])
print('Quantidade de linhas na validação: ',df_val.shape[0])
print('Quantidade total de linhas: ',df.shape[0])

Quantidade de linhas no treino:     42000
Quantidade de linhas no teste:      18000
Quantidade de linhas na validação:  15000
Quantidade total de linhas:  60000


In [4]:
# X has 894 features, but DecisionTreeClassifier is expecting 1067 features as input
print(df_train.mau.value_counts())
print(df_test.mau.value_counts())
print(df_val.mau.value_counts())

0    39073
1     2927
Name: mau, dtype: int64
0    16769
1     1231
Name: mau, dtype: int64
0    13932
1     1068
Name: mau, dtype: int64


In [8]:
equacao = '''mau ~ 
sexo + 
posse_de_veiculo + 
posse_de_imovel +   
tipo_renda +
educacao +
estado_civil +
tipo_residencia +
qtd_filhos +
idade +
qt_pessoas_residencia'''

y_train, X_train = patsy.dmatrices(equacao, data=df_train)
y_test, X_test = patsy.dmatrices(equacao, data=df_test)
y_val, X_val = patsy.dmatrices(equacao, data=df_val)

In [22]:
# Calcular o Gini
def calcula_gini(RESP, PD):
    #AUC
    auc = roc_auc_score(RESP, PD)
    
    #Gini
    gini = 2*auc -1
    return gini


def print_metricas(dados, 
                  PD = 'PD', 
                  CLASSE_PRED = 'classe_predita', 
                  RESP = 'mau'):
    
    #Acuracia
    acc = accuracy_score(dados[RESP], dados[CLASSE_PRED])

    #AUC
    auc = roc_auc_score(dados[RESP], dados[PD])
    
    #Gini
    gini = 2*auc-1
    
    #KS
    ks = ks_2samp(dados.loc[dados[RESP] == 1, PD], 
                  dados.loc[dados[RESP] != 1, PD]).statistic

    print('KS:       {0:.2f}%'.format(ks*100))
    print('AUC:      {0:.2f}%'.format(auc*100))
    print('GINI:     {0:.2f}%'.format(gini*100))
    print('Acurácia: {0:.2f}%\n'.format(acc*100))
    
    return None

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, roc_auc_score,precision_score,recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer


In [9]:
%%time
gb = GradientBoostingClassifier()
parametros = {
    'n_estimators': [100,300,600],
    'min_samples_leaf': [2,10,20],
    'learning_rate':[0.04,0.06,0.1]
}

grid = GridSearchCV(estimator=gb,
                   param_grid=parametros,
                   scoring='roc_auc',
                   verbose=False,
                   cv=2)
grid.fit(X_train,y_train.ravel())

Wall time: 4min 44s


GridSearchCV(cv=2, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.04, 0.06, 0.1],
                         'min_samples_leaf': [2, 10, 20],
                         'n_estimators': [100, 300, 600]},
             scoring='roc_auc', verbose=False)

In [10]:
grid.best_params_

{'learning_rate': 0.04, 'min_samples_leaf': 2, 'n_estimators': 100}

In [12]:
%%time
clf = GradientBoostingClassifier(**grid.best_params_).fit(X_train,y_train.ravel())
clf

Wall time: 3.18 s


GradientBoostingClassifier(learning_rate=0.04, min_samples_leaf=2)

### 2º tunning

In [16]:
%%time
gb = GradientBoostingClassifier()
parametros = {
        'learning_rate':[0.02,0.03,0.04],
    'min_samples_leaf': [8,10,12],
    'n_estimators': [90,100,110]
    

}

grid = GridSearchCV(estimator=gb,
                   param_grid=parametros,
                   scoring='roc_auc',
                   verbose=False,
                   cv=2)
grid.fit(X_train,y_train.ravel())

grid.best_params_
# {'learning_rate': 0.03, 'min_samples_leaf': 12, 'n_estimators': 110}

Wall time: 1min 24s


{'learning_rate': 0.03, 'min_samples_leaf': 12, 'n_estimators': 110}

In [14]:
%%time
clf = GradientBoostingClassifier(**grid.best_params_).fit(X_train,y_train.ravel())
clf

Wall time: 3.6 s


GradientBoostingClassifier(learning_rate=0.03, min_samples_leaf=12,
                           n_estimators=110)

### 3º tunning

In [15]:
%%time
gb = GradientBoostingClassifier()
parametros = {
        'learning_rate':[0.01,0.015,0.02],
    'min_samples_leaf': [6,7,8],
    'n_estimators': [50,100,200]
    

}

grid = GridSearchCV(estimator=gb,
                   param_grid=parametros,
                   scoring='recall',
                   verbose=False,
                   cv=2)
grid.fit(X_train,y_train.ravel())

grid.best_params_

Wall time: 1min 37s


{'learning_rate': 0.01, 'min_samples_leaf': 6, 'n_estimators': 50}

### 4º tunning

In [17]:
%%time
gb = GradientBoostingClassifier()
parametros = {
    'learning_rate':[0.008,0.009,0.01],
    'min_samples_leaf': [8],
    'n_estimators': [100]
    

}

grid = GridSearchCV(estimator=gb,
                   param_grid=parametros,
                   scoring='roc_auc',
                   verbose=False,
                   cv=2)
grid.fit(X_train,y_train.ravel())

grid.best_params_

Wall time: 12.3 s


{'learning_rate': 0.01, 'min_samples_leaf': 8, 'n_estimators': 100}

### Treinando e calculando a performance dos melhores hyperparâmetros

In [18]:
grid.best_params_

{'learning_rate': 0.01, 'min_samples_leaf': 8, 'n_estimators': 100}

In [25]:
%%time
parametros = {
    'learning_rate':0.009,
    'min_samples_leaf': 8,
    'n_estimators': 100}

clf = GradientBoostingClassifier(**grid.best_params_)\
.fit(X_train,y_train.ravel())

df_train['classe_predita'] = clf.predict(X_train)
df_train['PD'] = clf.predict_proba(X_train)[:, 1]

df_test['classe_predita']=clf.predict(X_test)
df_test['PD']=clf.predict_proba(X_test)[:, 1]

df_val['classe_predita']=clf.predict(X_val)
df_val['PD']=clf.predict_proba(X_val)[:, 1]

print('Performance do GBM nos dados de treino')
print_metricas(dados=df_train)

print('Performance do GBM nos dados de teste')
print_metricas(dados=df_test)

print('Performance do GBM nos dados de validação')
print_metricas(dados=df_val)

Performance do GBM nos dados de treino
KS:       14.70%
AUC:      60.40%
GINI:     20.80%
Acurácia: 93.03%

Performance do GBM nos dados de teste
KS:       16.84%
AUC:      60.46%
GINI:     20.93%
Acurácia: 93.16%

Performance do GBM nos dados de validação
KS:       15.67%
AUC:      59.77%
GINI:     19.53%
Acurácia: 92.88%

Wall time: 3.43 s


In [32]:
df_train[(df_train['classe_predita']!=1) & (df_train['mau']!=0)]
print(df_train.classe_predita.value_counts())
print(df_train.mau.value_counts())

0.0    42000
Name: classe_predita, dtype: int64
0    39073
1     2927
Name: mau, dtype: int64


In [35]:
df_test[(df_test['classe_predita']!=1) & (df_test['mau']!=0)]
print(df_test.classe_predita.value_counts())
print(df_test.mau.value_counts())

0.0    18000
Name: classe_predita, dtype: int64
0    16769
1     1231
Name: mau, dtype: int64


In [36]:
df_val[(df_val['classe_predita']!=1) & (df_val['mau']!=0)]
print(df_val.classe_predita.value_counts())
print(df_val.mau.value_counts())

0.0    15000
Name: classe_predita, dtype: int64
0    13932
1     1068
Name: mau, dtype: int64


In [33]:
clf.predict_proba(X_test)[:, 1]

array([0.08189284, 0.06136853, 0.07670068, ..., 0.08189284, 0.05622839,
       0.07577805])