# Notebook: Gerar 4 submissions (log1p target)

Este notebook treina 4 modelos diferentes (RandomForest, GradientBoosting, XGBoost, LightGBM) aplicando `log1p` no target, faz pré-processamento automático (imputação e one-hot) e gera 4 arquivos `submission_*.csv`. Ajuste `CSV_PATH` se necessário.

In [5]:
import pandas as pd
import numpy as np
import os

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import xgboost as xgb
print("xgboost:", xgb.__version__)
print("pandas:  ", pd.__version__)

import lightgbm as lgb

print("Imports completos.")

xgboost: 3.1.1
pandas:   2.3.3
Imports completos.


In [6]:
# === Ajuste para seus paths (JA VEM PRONTO) ===
CSV_PATH = "/Users/augusto/Library/Mobile Documents/com~apple~CloudDocs/git/avalensurance-bia/data"

df_path = f"{CSV_PATH}/full_warehouse_merged.csv"
test_path = f"{CSV_PATH}/teste.csv"

print("Carregando arquivos:")
print("  treino ->", df_path)
print("  teste  ->", test_path)

df = pd.read_csv(df_path)
df_test = pd.read_csv(test_path)

print("Shapes:")
print("  Treino:", df.shape)
print("  Teste :", df_test.shape)

df.head()

Carregando arquivos:
  treino -> /Users/augusto/Library/Mobile Documents/com~apple~CloudDocs/git/avalensurance-bia/data/full_warehouse_merged.csv
  teste  -> /Users/augusto/Library/Mobile Documents/com~apple~CloudDocs/git/avalensurance-bia/data/teste.csv
Shapes:
  Treino: (64800, 59)
  Teste : (7200, 58)


Unnamed: 0,person_id,annual_medical_cost,annual_premium,monthly_premium,claims_count,avg_claim_amount,total_claims_paid,hypertension,diabetes,asthma,...,proc_imaging_count,proc_surgery_count,proc_physio_count,proc_consult_count,proc_lab_count,had_major_procedure,cost_id,policy_id,record_id,visit_id
0,1,839.85,290.63,24.22,1.0,410.76,410.76,0,0,0,...,2.0,0,1.0,0.0,0.0,0,cost_id_31150,policy_id_00033,record_id_36092,visit_id_40611
1,2,709.72,278.13,23.18,1.0,269.0,269.0,0,0,0,...,0.0,0,0.0,0.0,0.0,0,cost_id_61371,policy_id_00119,record_id_45363,visit_id_48818
2,3,1589.27,372.57,31.05,4.0,270.0,1080.0,1,0,0,...,0.0,0,1.0,0.0,0.0,0,cost_id_63662,policy_id_00176,record_id_64734,visit_id_01246
3,4,3246.81,677.54,56.46,0.0,0.0,0.0,0,0,0,...,0.0,0,0.0,1.0,2.0,0,cost_id_17436,policy_id_00112,record_id_43770,visit_id_47786
4,6,1462.55,350.4,29.2,3.0,219.58,658.74,1,0,0,...,0.0,0,1.0,0.0,0.0,0,cost_id_59155,policy_id_00017,record_id_61337,visit_id_35905


In [7]:
# === Preparação + log1p no target ===

# Garantir que a coluna alvo é numérica
df["annual_medical_cost"] = pd.to_numeric(df["annual_medical_cost"], errors="coerce")

# Remover registros com target inválido
df = df.dropna(subset=["annual_medical_cost"]).copy()

# Remover a coluna _income_outlier_flag se existir (treino e teste)
if "_income_outlier_flag" in df.columns:
    df = df.drop(columns=["_income_outlier_flag"])

if "_income_outlier_flag" in df_test.columns:
    df_test = df_test.drop(columns=["_income_outlier_flag"])

# Aplicar log1p no target
y = np.log1p(df["annual_medical_cost"])

# X = tudo exceto o target
X = df.drop(columns=["annual_medical_cost"])
X_test = df_test.copy()

# Separar tipos de colunas
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print(f"Numéricas: {len(num_cols)} | Categóricas: {len(cat_cols)}")
print("Exemplos de colunas numéricas:", num_cols[:10])
print("Exemplos de colunas categóricas:", cat_cols[:10])

# Pré-processamento
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

print("Pré-processador criado.")


Numéricas: 43 | Categóricas: 14
Exemplos de colunas numéricas: ['person_id', 'annual_premium', 'monthly_premium', 'claims_count', 'avg_claim_amount', 'total_claims_paid', 'hypertension', 'diabetes', 'asthma', 'copd']
Exemplos de colunas categóricas: ['plan_type', 'network_tier', 'sex', 'region', 'urban_rural', 'education', 'marital_status', 'employment_status', 'smoker', 'alcohol_freq']
Pré-processador criado.


In [8]:
def train_and_export(model, name, X=X, y=y, X_test=df_test, preprocessor=preprocessor, out_dir='.'):
    pipe = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    print(f"\nTreinando modelo: {name} ...")
    pipe.fit(X, y)

    print("Gerando previsões...")
    preds = pipe.predict(X_test)

    # desfaz log1p
    preds = np.expm1(preds)

    # proteção contra negativos / NaN
    preds = np.clip(preds, 0, None)
    preds = np.nan_to_num(preds, nan=0)

    # ⚠️ Garanta que person_id exista no df_test
    if "person_id" not in X_test.columns:
        raise KeyError("A coluna 'person_id' não existe no arquivo de teste!")

    # gerar dataframe final conforme Kaggle espera
    submission = pd.DataFrame({
        "person_id": X_test["person_id"].values,
        "annual_medical_cost": preds
    })

    out_path = os.path.join(out_dir, f"submission_{name}.csv")
    submission.to_csv(out_path, index=False)

    print(f"✔ Arquivo salvo -> {out_path}")
    return out_path


In [9]:
# Substitua apenas o bloco que chama train_and_export para o XGBoost por este
try:
    xgb_model = xgb.XGBRegressor(
        n_estimators=1200,
        learning_rate=0.02,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist",
        n_jobs=-1,
        enable_categorical=False   # <-- correção aqui
    )
    train_and_export(xgb_model, "xgb_fixed")
except Exception as e:
    print("Falhou ao treinar XGBoost com correção. Erro:", e)





Treinando modelo: xgb_fixed ...
Gerando previsões...
✔ Arquivo salvo -> ./submission_xgb_fixed.csv


In [10]:
# 1 — Random Forest
train_and_export(
    RandomForestRegressor(
        n_estimators=600,
        max_depth=15,
        n_jobs=-1,
        random_state=42
    ),
    "rf"
)


Treinando modelo: rf ...
Gerando previsões...
✔ Arquivo salvo -> ./submission_rf.csv


'./submission_rf.csv'

In [11]:
# 2 — Gradient Boosting
train_and_export(
    GradientBoostingRegressor(
        learning_rate=0.03,
        n_estimators=800,
        max_depth=4,
        subsample=0.9,
        random_state=42
    ),
    "gb"
)


Treinando modelo: gb ...
Gerando previsões...
✔ Arquivo salvo -> ./submission_gb.csv


'./submission_gb.csv'

In [12]:
 #4 — LightGBM
train_and_export(
    lgb.LGBMRegressor(
        n_estimators=1500,
        learning_rate=0.015,
        max_depth=-1,
        num_leaves=40,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    ),
    "lgbm"
)

print("\nTodos os modelos treinados e arquivos gerados (se possível).")


Treinando modelo: lgbm ...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3383
[LightGBM] [Info] Number of data points in the train set: 53009, number of used features: 310
[LightGBM] [Info] Start training from score 7.488851
Gerando previsões...




✔ Arquivo salvo -> ./submission_lgbm.csv

Todos os modelos treinados e arquivos gerados (se possível).


## Observações

- Se faltar alguma dependência, instale com `pip install xgboost lightgbm`.
- Ajuste `CSV_PATH` se seus arquivos estiverem em outro local.
- Se quiser que eu gere também um stacking ou faça tuning com Optuna, me avisa.