# MVP: Previsão de Risco de Doenças Cardíacas

## 1. Escopo, Objetivo e Definição do Problema
- Objetivo: prever risco de doenças cardíacas (classificação binária: sim/não)
- Hipóteses iniciais: idade, pressão arterial, colesterol e histórico familiar são os fatores que mais influenciam o risco.
- Restrições: dataset público, sem dados clínicos sensíveis.

## 2. Reprodutibilidade e Ambiente

In [None]:
import numpy as np
import random
import torch
# Seed the three global random number generators (Python's `random`, NumPy
# and PyTorch) with the same fixed value so each starts from a known state.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(42)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## 3. Dados: Carga, Entendimento e Qualidade

In [None]:
# Load the dataset from the relative path 'heart.csv'.
df = pd.read_csv('heart.csv')  # Adjust the path as needed

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Summary statistics of the columns.
print(df.describe())

# Number of missing values per column.
print(df.isnull().sum())

# Distribution of the 'target' column (class balance).
sns.countplot(x='target', data=df)
plt.show()

## 4. Definição do Target, Variáveis e Divisão dos Dados

In [None]:
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns='target')

# Group the feature columns by dtype for the preprocessing step.
# NOTE(review): only int64/float64 and object columns are selected here; a
# column of any other dtype (e.g. bool, category, int32) lands in neither
# list -- confirm that matches the dataset in use.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.to_list()
categorical_cols = X.select_dtypes(include=['object']).columns.to_list()

# Hold out 20% of the rows for testing, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## 5. Tratamento de Dados e Pipeline de Pré-processamento

In [None]:
# Numeric columns are standardised; categorical columns are one-hot encoded,
# with handle_unknown='ignore' so unseen categories do not raise at transform.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
column_steps = [
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

## 6. Baseline e Modelos Candidatos

In [None]:
# Reference point: a dummy classifier using the 'most_frequent' strategy,
# placed behind the same preprocessing as the real candidates.
baseline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DummyClassifier(strategy='most_frequent')),
    ]
)
baseline.fit(X_train, y_train)
y_pred_base = baseline.predict(X_test)
print("Accuracy baseline:", accuracy_score(y_test, y_pred_base))

# Untuned candidate models, keyed by the label used in the printed report.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(random_state=42),
}

# Fit every candidate on the training split and report accuracy and F1.
# NOTE(review): the candidates are scored on X_test, the same split used for
# the final evaluation -- consider comparing them with cross-validation on
# the training data instead.
for name, model in models.items():
    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', model),
        ]
    )
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.3f}, F1: {f1_score(y_test, y_pred):.3f}")

## 7. Validação e Otimização de Hiperparâmetros

In [None]:
# Hyperparameter grid for the random forest. The 'classifier__' prefix
# addresses the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5],
}

# Preprocessing + random forest; this is the estimator the search tunes.
rf_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Search the grid with 5-fold cross-validation on the training split,
# scoring each combination by F1.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train, y_train)
print("Melhores parâmetros:", grid_search.best_params_)

## 8. Avaliação Final, Análise de Erros e Limitações

In [None]:
# Take the best pipeline found by the grid search and score it on the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC AUC.
y_score = best_model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_score))

# Confusion matrix rendered as an annotated heatmap with integer counts.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.show()

# ROC curve of the selected model on the test split.
RocCurveDisplay.from_estimator(best_model, X_test, y_test)
plt.show()

## 9. Engenharia de Atributos (Opcional)
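# Exemplo: criar bins de idade (include_lowest=True para que idade == 29 entre no primeiro bin em vez de virar NaN)
# df['age_bin'] = pd.cut(df['age'], bins=[29,40,50,60,70,80], labels=False, include_lowest=True)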