# <font color = "red">Classificação de Risco de Crédito</font>

## MODELAGEM PREDITIVA (CLASSIFICAÇÃO)

<font color = "red">Problema de Negócio:</font> A partir de dados históricos, irei verificar os padrões que levaram os clientes de uma instituição financeira, a adentrarem, ou não, na situação de inadimplência, para posteriormente criar um 

<font color = "red">Meta de Acurácia Final:</font> ACC mínimo de 88%

<font color = "blue">Nesta etapa, realizarei o treinamento dos modelos de classificação, para posterior comparação e teste, a fim de verificar as métricas. 

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importação dos dados 
X = pd.read_csv("C:/Projetos Pessoais/DataScience/analise_Risco_Credito/data/X_pre_processado_5.csv")
y = pd.read_csv("C:/Projetos Pessoais/DataScience/analise_Risco_Credito/data/y_pre_processado_5.csv")

In [4]:
X

Unnamed: 0,Renda_Anual,Anos_no_Emprego,Grau_Emprestimo,Valor,Taxa_Juros,Comprometimento_Renda,Inadimplencia_Historica,Tipo_Moradia_Hipoteca,Tipo_Moradia_Outro,Tipo_Moradia_Próprio,Motivo_Emprestimo_Educação,Motivo_Emprestimo_Empreendimento,Motivo_Emprestimo_Médico,Motivo_Emprestimo_Pessoal,Motivo_Emprestimo_Reformas Domésticas
0,-1.903478,0.196962,1,-1.551342,0.074325,-0.663416,0,0,0,1,1,0,0,0,0
1,-1.891412,-0.699305,0,-1.227123,-1.184422,1.077414,0,0,0,1,0,1,0,0,0
2,-1.887390,0.495717,3,-1.421655,1.207196,0.032916,0,0,0,1,0,1,0,0,0
3,-1.855214,1.093228,1,-1.389233,0.027122,0.032916,0,1,0,0,1,0,0,0,0
4,-1.847975,-1.296815,0,-1.443269,-1.137219,-0.199194,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34937,0.709549,-0.416085,4,2.555442,2.053350,1.309524,0,0,0,0,0,0,0,0,0
34938,-1.533943,-0.699305,3,-0.761657,1.393679,1.015354,1,0,0,0,0,0,0,1,0
34939,0.874144,0.959136,2,-0.120001,0.275724,-0.687167,1,1,0,0,0,0,0,0,0
34940,-1.112140,-0.998060,1,0.318936,-0.121854,1.975202,0,0,0,0,1,0,0,0,0


In [5]:
# Divisão em Dados de Treino e Teste
X_treino, X_teste, y_treino, y_teste = train_test_split(X,
                                                        y,
                                                        test_size = 0.2,
                                                        random_state = 40)

# <font color = 'red'> --------------

# Regressão Logística 

In [19]:
modelo_v1_regressao = LogisticRegression()

In [20]:
modelo_v1_regressao.fit(X_treino, y_treino)

In [21]:
previsoes_v1_LR = modelo_v1_regressao.predict(X_teste)

In [24]:
modelo_v1_metricas = {
 "Modelo": "Regressão Logística",
 "Versão": "1",
 "Detalhes": " - ",
 "Precision": round(precision_score(previsoes_v1_LR, y_teste), 3),
 "Recall": round(recall_score(previsoes_v1_LR, y_teste), 3),
 "F1_Score": round(f1_score(previsoes_v1_LR, y_teste), 3),
 "Acurácia": round(accuracy_score(previsoes_v1_LR, y_teste), 3),
 "AUC": round(roc_auc_score(y_teste, previsoes_v1_LR), 3) #AUC inverte a ordem dos dados -  primeiro real depois previsões
}

modelo_v1_metricas

{'Modelo': 'Regressão Logística',
 'Versão': '1',
 'Detalhes': ' - ',
 'Precision': 0.792,
 'Recall': 0.795,
 'F1_Score': 0.794,
 'Acurácia': 0.792,
 'AUC': 0.792}

# <font color = 'red'> --------------

# Decision Tree com Grid Search

In [51]:
# Parametros para grid
params_decision_tree = {
 "criterion": ["gini", "entropy"],
 "max_depth": [3, None],
 "min_samples_leaf": [1, 3, 10],
 "min_samples_split": [2, 3],
 "max_depth": [1,3,10]
}

In [52]:
clf_decision_tree = DecisionTreeClassifier()

In [53]:
# Grid Search
modelo_v2_decisionTree = GridSearchCV(clf_decision_tree,
                                      params_decision_tree,
                                      scoring='accuracy',
                                      cv = 5)

In [54]:
# Treinando 
modelo_v2_decisionTree.fit(X_treino, y_treino)

In [55]:
modelo_v2_decisionTree.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [56]:
previsoes_v2_DT = modelo_v2_decisionTree.predict(X_teste)

In [57]:
modelo_v2_metricas = {
 "Modelo": "Decision Tree GS",
 "Versão": "2",
 "Detalhes": "gini",
 "Precision": round(precision_score(previsoes_v2_DT, y_teste), 3),
 "Recall": round(recall_score(previsoes_v2_DT, y_teste), 3),
 "F1_Score": round(f1_score(previsoes_v2_DT, y_teste), 3),
 "Acurácia": round(accuracy_score(previsoes_v2_DT, y_teste), 3),
 "AUC": round(roc_auc_score(y_teste, previsoes_v2_DT), 3) #AUC inverte a ordem dos dados -  primeiro real depois previsões
}
modelo_v2_metricas

{'Modelo': 'Decision Tree GS',
 'Versão': '2',
 'Detalhes': 'gini',
 'Precision': 0.706,
 'Recall': 0.972,
 'F1_Score': 0.818,
 'Acurácia': 0.841,
 'AUC': 0.842}

# <font color = 'red'> --------------

# Random Forest com GS

In [50]:
??RandomForestClassifier

[1;31mInit signature:[0m
[0mRandomForestClassifier[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'gini'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'sqrt'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mN

In [58]:
clf_random_forest = RandomForestClassifier(n_estimators = 100)

In [62]:
# Parametros
params_random_forest = {
    'max_depth': [None, 2, 10, 20],  # Profundidade máxima de cada árvore
    'min_samples_split': [2, 5, 10],  # Mínimo de amostras necessárias para dividir um nó
    'min_samples_leaf': [1, 2, 4],    # Mínimo de amostras necessárias em um nó folha
    'max_features': ['auto', 'sqrt'], # Número de características a serem consideradas para o melhor split
    'criterion': ['gini', 'entropy']
}

In [63]:
# Grid Search
modelo_v3_randomForest = GridSearchCV(clf_random_forest,
                                      params_random_forest,
                                      cv = 5)

In [67]:
modelo_v3_randomForest.fit(X_treino, y_treino)

KeyboardInterrupt: 

In [None]:
# <font color = 'red'> --------------

# Random Forest com GS