In [None]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
import scipy.sparse as sp

# === Paths (adjust if needed) ===
df_path = "onlyfulldata_sem_outliers_cols_especificas.csv"
test_path = "teste.csv"

# === Load data ===
# Read the training table first, then the test table.
df, df_test = (pd.read_csv(csv_path) for csv_path in (df_path, test_path))

# === Drop records without a cost ===
# Rows with a missing target cannot be used to fit the regressor.
df = df.loc[df["annual_medical_cost"].notna()].copy()

# === Inspect problematic values in annual_medical_cost ===
print("Resumo annual_medical_cost — count/min/max/mean:")
print(df["annual_medical_cost"].describe())

# Values at or below -1 are the ones log1p cannot handle: log1p(-1) is -inf
# and anything lower yields NaN. Report them before any filtering happens.
bad_mask = df["annual_medical_cost"].le(-1)
if not bad_mask.any():
    print("\nNenhuma linha com annual_medical_cost <= -1 encontrada.")
else:
    print(f"\nEncontradas {bad_mask.sum()} linhas com annual_medical_cost <= -1 (problemáticas para log1p). Exemplo:")
    print(df.loc[bad_mask, ["annual_medical_cost"]].head(10))

# --- Treatment choice: every negative cost is discarded, zeros are kept
# (log1p(0) == 0 is valid). Change this line to apply a different policy.
df = df.loc[df["annual_medical_cost"].ge(0)].copy()
print("\nApós filtro (mantendo annual_medical_cost >= 0):", df.shape)

# === Drop ID columns (they carry no signal for the model) ===
id_cols = ["person_id", "cost_id", "policy_id", "record_id", "visit_id"]
# errors="ignore" skips any identifier that is not present in this dataset.
df = df.drop(columns=id_cols, errors="ignore")

# === Build the log1p target ===
# Only costs >= 0 remain at this point, so log1p is finite for every row.
df = df.assign(annual_medical_cost_log=np.log1p(df["annual_medical_cost"]))

# === Normalise dtypes that would fall outside both feature groups ===
def _needs_float_cast(dtype):
    """Return True for dtypes that must be cast to float64 before grouping.

    Covers plain ``bool`` columns and pandas extension numeric/boolean dtypes
    (Int8..Int64, UInt8..UInt64, Float32/Float64, boolean). Categorical dtypes
    are excluded even when their categories are boolean.
    """
    if isinstance(dtype, pd.CategoricalDtype):
        return False
    if pd.api.types.is_bool_dtype(dtype):
        return True
    return (
        pd.api.types.is_extension_array_dtype(dtype)
        and pd.api.types.is_numeric_dtype(dtype)
    )


# The cast runs BEFORE the feature lists are built. A plain bool column matches
# neither `include="object"` nor `include=np.number`, so without the cast it
# would be left out of both lists and never reach the model.
nullable_cols = [c for c in df.columns if _needs_float_cast(df[c].dtype)]
if nullable_cols:
    print("Convertendo colunas 'nullable' para float:", nullable_cols)
    # Apply the same cast to the test frame so both frames agree on dtype.
    for frame in (df, df_test):
        for c in nullable_cols:
            if c in frame.columns:
                frame[c] = pd.to_numeric(frame[c], errors="coerce").astype(float)

# === Split features into numeric and categorical groups ===
# Both target columns (raw and log) are numeric and must not become features.
categorical = df.select_dtypes(include="object").columns.tolist()
numerical = df.select_dtypes(include=np.number).drop(columns=["annual_medical_cost", "annual_medical_cost_log"], errors="ignore").columns.tolist()

print("Numéricas:", len(numerical), "— Exemplo:", numerical[:10])
print("Categóricas:", len(categorical), "— Exemplo:", categorical[:10])

# === Preprocessing ===
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ]
)

# Route each column group to its branch; columns in neither list are not
# passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical),
        ("cat", categorical_transformer, categorical),
    ],
    sparse_threshold=0.3,
)

# === Model (XGBoost) ===
# Hyper-parameters are collected in one mapping so they can be inspected or
# logged separately from the estimator instance.
xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.03,
    "max_depth": 10,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": 1,
}
model = XGBRegressor(**xgb_params)

def _cast_to_float32(X):
    """Return ``X`` converted to float32 via its own ``astype`` method."""
    return X.astype(np.float32)


# Force float32 so no object/nullable dtype reaches XGBoost. A named function
# is used instead of a lambda because a FunctionTransformer wrapping a lambda
# cannot be pickled, which would prevent saving the fitted pipeline.
to_float32 = FunctionTransformer(_cast_to_float32, validate=False)

# Full chain: preprocess -> cast to float32 -> gradient-boosted regressor.
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("to_float32", to_float32),
    ("model", model)
])

# === Training (pre-fit checks) ===
# Features are everything except the raw and log-transformed target.
X_train = df.drop(["annual_medical_cost", "annual_medical_cost_log"], axis="columns").copy()
y_train = df.loc[:, "annual_medical_cost_log"].copy()

print("\n>>> Checando dtypes originais (train):")
print(X_train.dtypes.value_counts())
print("\nNaNs em y_train (deve ser 0):", y_train.isna().sum())
print("min/max y_train:", y_train.min(), y_train.max())

# Debug aid: run the preprocessor on its own and describe what it emits.
# NOTE: this fits the same object that the pipeline fits again below.
preproc = preprocessor.fit(X_train)
X_train_trans = preproc.transform(X_train)
print("\nSaída do preprocessor:", type(X_train_trans))
if not sp.issparse(X_train_trans):
    print("Numpy array — shape:", X_train_trans.shape, "dtype:", X_train_trans.dtype)
    print("contains NaN:", np.isnan(X_train_trans).any(), "contains inf:", np.isinf(X_train_trans).any())
else:
    print("Sparse matrix — shape:", X_train_trans.shape, "nnz:", X_train_trans.nnz)

# Fit the complete pipeline (preprocessing + cast + model).
print("\nTreinando pipeline completo...")
pipeline.fit(X_train, y_train)
print("✅ pipeline.fit() completado com sucesso")

# === Test / prepare X_test ===
X_test = df_test.copy()
# Keep the identifiers for the submission and remove them from the features;
# fall back to a positional index when the test file has no person_id.
if "person_id" in X_test.columns:
    ids = X_test.pop("person_id")
else:
    ids = np.arange(len(X_test))

# Align X_test with the training columns: reindex creates any missing column
# filled with NaN, discards extras and applies the training column order.
X_test = X_test.reindex(columns=X_train.columns)

# Force basic dtypes in X_test so they match what the preprocessor was fit on.
for c in numerical:
    if c not in X_test.columns:
        continue
    X_test[c] = pd.to_numeric(X_test[c], errors="coerce")
for c in categorical:
    if c not in X_test.columns:
        continue
    X_test[c] = X_test[c].astype(object)

print("\nDtypes X_test (resumo):")
print(X_test.dtypes.value_counts())

# === Prediction ===
print("\nRealizando predições...")
y_pred_log = pipeline.predict(X_test)
# Invert the log1p transform used for the training target. The regressor's
# output is unconstrained, so a slightly negative log-prediction is possible
# and expm1 would turn it into a negative cost; clip those to 0.
y_pred = np.clip(np.expm1(y_pred_log), 0, None)

# === Submission ===
# One row per test record: identifier plus predicted cost on the original scale.
submission = pd.DataFrame({
    "person_id": ids,
    "annual_medical_cost": y_pred
})
submission.to_csv("submission.csv", index=False)
print("✅ Submissão salva com sucesso em 'submission.csv'")
