
# MVP: Previsão de Risco de Doenças Cardíacas - Versão Final

Notebook completo cobrindo:
1. Pré-processamento e engenharia de atributos
2. Modelagem e comparação de algoritmos
3. Avaliação de métricas e gráficos
4. SHAP para interpretabilidade
5. Monitoramento pós-deploy
6. Exportação de relatório HTML e PDF


In [None]:

# Bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, precision_score, recall_score,
                             confusion_matrix, RocCurveDisplay)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import shap
from scipy.stats import ks_2samp
import nbformat
from nbconvert import HTMLExporter
import pdfkit
import os


In [None]:

# Data loading
# Read the raw table and drop the HeartDisease_FamilyHistory column, which is
# excluded from modelling in this notebook. Chaining (instead of inplace=True)
# keeps the cell idempotent on re-run.
df = pd.read_csv("dados_cardiacos.csv").drop(columns=['HeartDisease_FamilyHistory'])

# Encode the label as 0/1; any value other than 'No'/'Yes' that is not already
# integer-like makes astype(int) fail loudly instead of being silently kept.
df['target'] = df['target'].replace({'No':0,'Yes':1}).astype(int)

X = df.drop(columns='target')
y = df['target']

# Derive the column groups from the feature frame itself, so the label never has
# to be removed afterwards. The previous version selected only int64/float64 and
# then dropped 'target', which raises KeyError on platforms where astype(int)
# produces int32. include='number' covers every integer/float width.
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Train/test split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:

# Preprocessing pipeline
# Numeric columns are standardised; categorical columns are one-hot encoded into
# a dense array, and categories unseen during fit are encoded as all zeros.
numeric_transformer = StandardScaler()
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the `sparse` keyword.
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)


In [None]:

# Model comparison
# Each candidate is wrapped in the shared preprocessing pipeline and scored with
# 5-fold cross-validated ROC-AUC on the training split only.
model_candidates = dict(
    Dummy=DummyClassifier(strategy="most_frequent"),
    LogReg=LogisticRegression(max_iter=500, random_state=42),
    KNN=KNeighborsClassifier(),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(random_state=42),
)

# Track the (unfitted) pipeline with the highest mean score seen so far.
best_model, best_score = None, 0

for candidate_name, estimator in model_candidates.items():
    candidate_pipeline = Pipeline(
        steps=[('preprocessor', preprocessor), ('model', estimator)]
    )
    fold_aucs = cross_val_score(
        candidate_pipeline, X_train, y_train, scoring='roc_auc', cv=5
    )
    auc_mean = fold_aucs.mean()
    print(f"{candidate_name} - ROC-AUC CV: {auc_mean:.4f}")

    # Strict improvement only: ties keep the earlier candidate.
    if not auc_mean > best_score:
        continue
    best_model, best_score = candidate_pipeline, auc_mean


In [None]:

# Hyperparameter tuning (the grid below is written for RandomForest)
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [5, 10, None]
}

# The winner of the comparison is not guaranteed to be a RandomForest, and
# GridSearchCV rejects parameters the estimator does not have. Keep only the
# grid entries the selected pipeline actually exposes.
supported_params = best_model.get_params()
applicable_grid = {
    param: values for param, values in param_grid.items() if param in supported_params
}

if applicable_grid:
    grid_search = GridSearchCV(best_model, applicable_grid, cv=5, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    melhor_modelo = grid_search.best_estimator_
else:
    # Nothing to tune for this estimator: fit it as-is so that the following
    # cells still receive a fitted pipeline in `melhor_modelo`.
    print("Nenhum hiperparâmetro da grade se aplica ao modelo selecionado; usando os valores padrão.")
    melhor_modelo = best_model.fit(X_train, y_train)


In [None]:

# Model evaluation on the held-out test split
y_pred = melhor_modelo.predict(X_test)
y_prob = melhor_modelo.predict_proba(X_test)[:,1]  # probability of the positive class

metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "f1_score": f1_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, y_prob),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred)
}
print("Métricas do modelo:", metrics)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Matriz de Confusão")
plt.show()

# ROC curve
RocCurveDisplay.from_estimator(melhor_modelo, X_test, y_test)
plt.show()

# Fitted pipeline steps, looked up once and reused below.
fitted_preprocessor = melhor_modelo.named_steps['preprocessor']
fitted_model = melhor_modelo.named_steps['model']

# Column names after preprocessing: numeric columns first, then the one-hot
# columns, matching the transformer order of the ColumnTransformer.
# Built unconditionally because the SHAP plot needs them even when the model
# has no feature_importances_ attribute.
onehot_names = (
    list(fitted_preprocessor.transformers_[1][1].get_feature_names_out(categorical_cols))
    if categorical_cols else []
)
feature_names = list(fitted_preprocessor.transformers_[0][2]) + onehot_names

# Feature importance (only for estimators that expose it, e.g. tree ensembles)
if hasattr(fitted_model, 'feature_importances_'):
    importances = fitted_model.feature_importances_
    feat_imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values(by='importance', ascending=False)
    sns.barplot(x='importance', y='feature', data=feat_imp_df.head(10))
    plt.title("Top 10 Features")
    plt.show()

# SHAP values
# Transform each split once instead of re-running the preprocessor per call.
X_train_transformed = fitted_preprocessor.transform(X_train)
X_test_transformed = fitted_preprocessor.transform(X_test)

explainer = shap.Explainer(fitted_model, X_train_transformed)
shap_values = explainer(X_test_transformed)
# Some explainers (notably for tree-based classifiers) return one slice of SHAP
# values per class, i.e. a 3-D array; the summary plot expects 2-D values, so
# keep the slice for the positive class (index 1).
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]
shap.summary_plot(shap_values, features=X_test_transformed, feature_names=feature_names)


In [None]:

# Monitoramento Pós-Deploy
def monitor_model(model, X_new, y_new, baseline_metrics=None, alert_threshold=0.1):
    """Score a fitted binary classifier on a new labelled batch and flag metric shifts.

    Parameters
    ----------
    model : estimator exposing ``predict`` and ``predict_proba``.
    X_new : features of the new batch, in the format ``model`` was trained on.
    y_new : true labels of the new batch.
    baseline_metrics : dict or None
        Reference values keyed by metric name ("accuracy", "f1_score",
        "roc_auc", "precision", "recall"). Metrics missing from the baseline
        are skipped. When None or empty, no alerts are produced.
    alert_threshold : float
        Absolute change versus the baseline above which an alert is raised.

    Returns
    -------
    (metrics, alerts) : tuple of dict
        ``metrics`` holds the five scores computed on the new batch;
        ``alerts`` maps each metric that moved by more than
        ``alert_threshold`` to a message stating the direction and size
        of the change.
    """
    y_pred = model.predict(X_new)
    y_prob = model.predict_proba(X_new)[:,1]  # probability of the positive class

    metrics = {
        "accuracy": accuracy_score(y_new, y_pred),
        "f1_score": f1_score(y_new, y_pred),
        "roc_auc": roc_auc_score(y_new, y_prob),
        "precision": precision_score(y_new, y_pred),
        "recall": recall_score(y_new, y_pred)
    }

    alerts = {}
    if baseline_metrics:
        for key, value in metrics.items():
            # A partial baseline is allowed; only compare what it contains.
            if key not in baseline_metrics:
                continue
            delta = value - baseline_metrics[key]
            if abs(delta) > alert_threshold:
                # Report the real direction: the message used to say "queda"
                # (drop) even when the metric had gone up.
                direction = "queda" if delta < 0 else "aumento"
                alerts[key] = f"Alerta: {direction} de {key} em {abs(delta):.2f}"

    return metrics, alerts

def monitor_data_drift(X_train, X_new, alpha=0.05):
    """Flag, per column, whether new data is distributed differently from training data.

    Numeric columns are compared with the two-sample Kolmogorov-Smirnov test.
    Every other column (e.g. string categories) is compared with a chi-square
    test on the category counts, because the KS test is only meaningful for
    ordered numeric values. Missing values are ignored in both samples.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Reference data; its columns define which columns are checked.
    X_new : pandas.DataFrame
        New data; must contain every column of ``X_train``.
    alpha : float
        Significance level; a p-value below it is reported as drift.

    Returns
    -------
    dict
        Column name -> bool, True when drift was detected.
    """
    from scipy.stats import chi2_contingency  # local import: only needed here

    drift_report = {}
    for col in X_train.columns:
        reference = X_train[col].dropna()
        current = X_new[col].dropna()

        # With no observations on one side there is nothing to test; treat a
        # column that is populated in only one of the two frames as drifted.
        if reference.empty or current.empty:
            drift_report[col] = reference.empty != current.empty
            continue

        both_numeric = (pd.api.types.is_numeric_dtype(reference)
                        and pd.api.types.is_numeric_dtype(current))
        if both_numeric:
            _, p_value = ks_2samp(reference, current)
        else:
            # Categories x {reference, current} count table; a category absent
            # from one sample gets a zero count.
            counts = pd.concat(
                [reference.astype(str).value_counts(), current.astype(str).value_counts()],
                axis=1,
            ).fillna(0)
            _, p_value, _, _ = chi2_contingency(counts.to_numpy())

        drift_report[col] = bool(p_value < alpha)
    return drift_report


In [None]:

# HTML and PDF report export
# NOTE: this converts the notebook file as currently saved on disk, so save the
# notebook before running this cell if the latest outputs should be included.
notebook_filename = "MVP_Previsao_Risco_Cardiaco.ipynb"

# HTML
# Notebook files are UTF-8 JSON; be explicit so the read does not depend on the
# platform's default encoding.
with open(notebook_filename, encoding="utf-8") as f:
    nb_node = nbformat.read(f, as_version=4)

html_exporter = HTMLExporter()
html_exporter.exclude_input = False  # keep the code cells in the report
body, resources = html_exporter.from_notebook_node(nb_node)

html_filename = "MVP_Previsao_Risco_Cardiaco.html"
with open(html_filename, "w", encoding="utf-8") as f:
    f.write(body)
print(f"Relatório HTML gerado: {html_filename}")

# PDF (requires the external wkhtmltopdf binary)
pdf_filename = "MVP_Previsao_Risco_Cardiaco.pdf"
try:
    pdfkit.from_file(html_filename, pdf_filename)
except OSError as exc:
    # pdfkit reports a missing or failing wkhtmltopdf as OSError/IOError; the
    # HTML report above is already written, so report the problem and go on.
    print(f"Relatório PDF não gerado (wkhtmltopdf indisponível ou com erro): {exc}")
else:
    print(f"Relatório PDF gerado: {pdf_filename}")
