In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

CSV_PATH = "/Users/augusto/Library/Mobile Documents/com~apple~CloudDocs/git/avalensurance-bia/data/full_warehouse_merged.csv"

df = pd.read_csv(CSV_PATH)
df = df.dropna(subset=["annual_medical_cost"]).copy() # Garantir que o alvo não tem NaN


# Remover IDs (muito importante!)
id_cols = ["person_id", "cost_id", "policy_id", "record_id", "visit_id"]
df = df.drop(columns=id_cols)

y = df["annual_medical_cost"]
X = df.drop(columns=["annual_medical_cost"])


# === One-hot encoding APENAS nas colunas categóricas reais ===
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
print("Colunas categóricas detectadas:", cat_cols)

X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

print("Shape final de X:", X.shape)

# === Remover linhas com NaN nas features, caso algo tenha sobrado ===
mask = X.notna().all(axis=1)
X = X[mask]
y = y[mask]

print("NaNs em X:", X.isna().sum().sum())
print("NaNs em y:", y.isna().sum())

# === Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Treino:", X_train.shape, y_train.shape)
print("Teste :", X_test.shape, y_test.shape)

# Modelo leve e rápido  
model = HistGradientBoostingRegressor(
    max_depth=6,       # limita profundidade da árvore
    max_iter=200,      # número de boosting rounds
    learning_rate=0.05,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10
)

# Treinar
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("\n======= RESULTADOS =======")
print(f"MSE : {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²  : {r2:.4f}")

Colunas categóricas detectadas: ['plan_type', 'network_tier', 'sex', 'region', 'urban_rural', 'education', 'marital_status', 'employment_status', 'smoker', 'alcohol_freq']
Shape final de X: (53009, 73)
NaNs em X: 0
NaNs em y: 0
Treino: (11924, 73) (11924,)
Teste : (2982, 73) (2982,)

MSE : 15651.27
RMSE: 125.11
R²  : 0.9905
