In [2]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# === Paths ===
# Training table and the table to score; both are relative paths, so they are
# resolved against the current working directory.
df_path, test_path = "full_warehouse_merged.csv", "teste.csv"

# === Load data ===
df = pd.read_csv(filepath_or_buffer=df_path)
df_test = pd.read_csv(filepath_or_buffer=test_path)

# === Keep only rows with a usable cost ===
# Missing costs are dropped first. Rows whose cost is infinite or <= -1 are
# dropped as well: log1p is -inf at -1 and NaN below it, so those rows would
# yield a non-finite target that cannot be used as a regression label.
# Rows the log transform can handle are left untouched.
df = df.dropna(subset=["annual_medical_cost"]).copy()
valid_cost = np.isfinite(df["annual_medical_cost"]) & (df["annual_medical_cost"] > -1)
df = df[valid_cost].copy()

# === Drop ID columns (not informative for the model) ===
# errors="ignore" skips any ID column that is absent from this table.
id_cols = ["person_id", "cost_id", "policy_id", "record_id", "visit_id"]
df = df.drop(columns=id_cols, errors="ignore")

# === Build the target column with log1p ===
df["annual_medical_cost_log"] = np.log1p(df["annual_medical_cost"])

# === Split the features into numerical and categorical ===
# The raw and the log-transformed target are numeric, so they are removed from
# the numerical feature list explicitly.
# NOTE(review): columns that are neither object nor numeric (e.g. bool,
# category, datetime) end up in neither list and so are not routed to any
# transformer - confirm that this is intended.
categorical = df.select_dtypes(include="object").columns.tolist()
numerical = df.select_dtypes(include=np.number).drop(columns=["annual_medical_cost", "annual_medical_cost_log"]).columns.tolist()

# === Preprocessing ===
# Numerical features: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories not seen during fit are ignored
# instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each list of column names to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical),
        ("cat", categorical_transformer, categorical),
    ]
)

# === Model ===
# Gradient-boosted regressor; the hyperparameters are gathered in one mapping
# and unpacked into the constructor.
xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.03,
    "max_depth": 10,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
model = XGBRegressor(**xgb_params)

# Preprocessing followed by the regressor, fitted and applied as one unit.
pipeline = Pipeline(steps=[("preproc", preprocessor), ("model", model)])

# === Training ===
# The label is the log-transformed cost; the features are every remaining
# column except the raw and the log-transformed target.
y_train = df["annual_medical_cost_log"]
X_train = df.drop(["annual_medical_cost", "annual_medical_cost_log"], axis="columns")
pipeline.fit(X_train, y_train)

# === Test set ===
# Keep the identifiers for the submission file; when the test table has no
# person_id column, fall back to positional ids 0..n-1.
if "person_id" in df_test.columns:
    ids = df_test["person_id"].copy()
else:
    ids = np.arange(len(df_test))

# Align the test features with the training layout: remove the identifier,
# add every training column missing from the test table as all-NaN, discard
# columns the model was not trained on, and use the training column order.
X_test = (
    df_test
    .drop(columns=["person_id"], errors="ignore")
    .reindex(columns=X_train.columns)
)

# === Prediction ===
# The pipeline was fitted on log1p(cost), so expm1 maps predictions back to
# the original scale.
y_pred_log = pipeline.predict(X_test)
# A log-scale prediction below zero would turn into a negative cost after
# expm1; clamp at 0 so the submission never contains a negative amount.
# NOTE(review): assumes costs are non-negative amounts - confirm.
y_pred = np.maximum(np.expm1(y_pred_log), 0.0)

# === Submission ===
# One row per test record: its identifier and the predicted annual cost.
submission = pd.DataFrame({
    "person_id": ids,
    "annual_medical_cost": y_pred,
})
submission.to_csv("submission.csv", index=False)

print("✅ Submissão salva com sucesso em 'submission.csv'")


✅ Submissão salva com sucesso em 'submission.csv'
