# 📊 Análise Exploratória e Modelagem de Crédito com Scikit-Learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score


## 📥 Carregamento dos dados

In [None]:
# Column labels applied to both files, in positional order.
colunas = ["ESCT", "NDEP", "RENDA", "TIPOR", "VBEM", "NPARC", "VPARC",
           "TEL", "IDADE", "RESMS", "ENTRADA", "CLASSE"]


def _ler_base(caminho):
    """Read a tab-separated file without a header row and label its columns."""
    base = pd.read_csv(caminho, sep="\t", header=None)
    base.columns = colunas
    return base


df_train = _ler_base("data/credtrain.txt")
df_test = _ler_base("data/credtest.txt")


## 📊 Análise Exploratória Inicial

In [None]:
# Structural overview of the training frame.
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would emit a spurious "None" line afterwards.
df_train.info()

# Descriptive statistics of the columns.
print(df_train.describe())

# Relative frequency of each target class, expressed as a percentage.
print("\nDistribuição da variável CLASSE (%):")
print(df_train['CLASSE'].value_counts(normalize=True) * 100)


In [None]:
# Columns plotted as continuous variables and as discrete categories.
numericas = ["RENDA", "VBEM", "NPARC", "VPARC", "IDADE", "RESMS", "ENTRADA"]
categoricas = ["ESCT", "NDEP", "TIPOR", "TEL"]

# --- Histograms (with KDE overlay) of each numeric column ---
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(14, 12))
axs = axs.flatten()
for ax, col in zip(axs, numericas):
    sns.histplot(df_train[col], bins=30, kde=True, ax=ax, color='steelblue')
    ax.set_title(f"Distribuição de {col}")
# The grid has more cells than there are columns to plot; hide the leftovers
# so the figure does not show an empty framed panel.
for ax in axs[len(numericas):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

# --- Count plots of each categorical column ---
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
axs = axs.flatten()
for ax, col in zip(axs, categoricas):
    sns.countplot(x=df_train[col], ax=ax, palette="Set2")
    ax.set_title(f"Frequência de {col}")
for ax in axs[len(categoricas):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

# --- Box plots of each numeric column, split by the target CLASSE ---
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(14, 14))
axs = axs.flatten()
for ax, col in zip(axs, numericas):
    sns.boxplot(x="CLASSE", y=col, data=df_train, ax=ax, palette="pastel")
    ax.set_title(f"{col} por CLASSE")
for ax in axs[len(numericas):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()


## ⚙️ Pré-processamento

In [None]:
# Separate the predictors from the target column CLASSE in both splits.
X_train = df_train.drop("CLASSE", axis=1)
y_train = df_train["CLASSE"]
X_test = df_test.drop("CLASSE", axis=1)
y_test = df_test["CLASSE"]

# Columns routed to the one-hot encoder and to the scaler, respectively.
colunas_categoricas = ["ESCT", "NDEP", "TIPOR", "TEL"]
colunas_numericas = ["RENDA", "VBEM", "NPARC", "VPARC", "IDADE", "RESMS", "ENTRADA"]

# Numeric columns are min-max scaled; categorical columns are one-hot encoded
# with the first level of each dropped.
# handle_unknown="ignore" keeps transform() from raising when the test split
# contains a category that was absent from the training split: such a value
# is encoded as all zeros (i.e. like the dropped reference level) instead.
# NOTE: combining drop= with handle_unknown="ignore" needs scikit-learn >= 1.0.
preprocessador = ColumnTransformer(transformers=[
    ("num", MinMaxScaler(), colunas_numericas),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), colunas_categoricas)
])



In [7]:
# Fit the column transformer on the training predictors and keep the
# transformed feature matrix. The pipelines built later in this file call
# fit() on this same `preprocessador` object again as their first step.
X_train_transformado = preprocessador.fit_transform(X_train)

## ⚙️ Modelagem com Pipeline

In [None]:

# Candidate classifiers, keyed by the label shown in the results table.
modelos = {
    "Regressão Logística": LogisticRegression(solver="liblinear"),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Result column -> scoring function, each called as funcao(y_test, y_pred).
metricas = {
    "Acurácia": accuracy_score,
    "Precisão": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

# Fit every candidate on the training split (preprocessing + classifier in a
# single pipeline) and score its predictions on the test split.
resultados = []
for nome, modelo in modelos.items():
    etapas = [("preprocessador", preprocessador), ("classificador", modelo)]
    pipe = Pipeline(steps=etapas)
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    linha = {"Modelo": nome}
    for rotulo, funcao in metricas.items():
        linha[rotulo] = funcao(y_test, y_pred)
    resultados.append(linha)

# One row per model, best F1 first.
df_resultados = pd.DataFrame(resultados)
df_resultados = df_resultados.sort_values(by="F1-Score", ascending=False)
print(df_resultados)

## 📉 Matrizes de Confusão dos Modelos

In [None]:
# Tick labels handed to the confusion-matrix display.
rotulos_classe = ["Inadimplente (0)", "Adimplente (1)"]

# For each candidate: rebuild the pipeline, fit it on the training split,
# predict the test split and draw the resulting confusion matrix.
for nome, modelo in modelos.items():
    etapas = [("preprocessador", preprocessador), ("classificador", modelo)]
    pipe = Pipeline(steps=etapas)
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rotulos_classe)
    disp.plot(cmap="Blues")

    plt.title(f"Matriz de Confusão - {nome}")
    plt.grid(False)
    plt.show()
