In [8]:
!pip install catboost --quiet
!pip install xgboost --quiet

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# =========================================================
# 1. LOAD DATA
# =========================================================
train = pd.read_csv("/kaggle/input/smoker-status-prediction-biosignal/train.csv")
test  = pd.read_csv("/kaggle/input/smoker-status-prediction-biosignal/test.csv")

y = train["smoking"]
X = train.drop(columns=["id", "smoking"])
X_test = test.drop(columns=["id"])

# =========================================================
# 2. FEATURE ENGINEERING 
# =========================================================

def add_features(df):
    df["BMI"] = df["weight(kg)"] / (df["height(cm)"]/100)**2
    df["pulse_pressure"] = df["systolic"] - df["relaxation"]
    df["ldl_hdl_ratio"] = df["LDL"] / df["HDL"]
    df["chol_hdl_ratio"] = df["Cholesterol"] / df["HDL"]
    df["trig_hdl_ratio"] = df["triglyceride"] / df["HDL"]
    df["liver_ratio"] = df["AST"] / df["ALT"]
    df["eyesight_ratio"] = df["eyesight(left)"] / df["eyesight(right)"]
    df["hearing_ratio"] = df["hearing(left)"] / df["hearing(right)"]
    df["blood_health_score"] = df["HDL"] - df["LDL"] - df["Cholesterol"]
    df["liver_stress"] = df["Gtp"] + df["ALT"] + df["AST"]
    df["blood_pressure_score"] = df["systolic"] + df["relaxation"]
    return df

X = add_features(X)
X_test = add_features(X_test)

# =========================================================
# 3. MODEL DEFINITIONS (NO EARLY STOPPING)
# =========================================================

lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.03,
    "num_leaves": 40,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "seed": 42
}

xgb_model = XGBClassifier(
    n_estimators=1500,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=6,
    eval_metric="auc",
    tree_method="hist"
)

cat_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    verbose=0
)

# =========================================================
# 4. STRATIFIED KFOLD + 3-MODEL ENSEMBLE
# =========================================================

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(train))
oof_xgb = np.zeros(len(train))
oof_cat = np.zeros(len(train))

pred_lgb = np.zeros(len(test))
pred_xgb = np.zeros(len(test))
pred_cat = np.zeros(len(test))

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]
    
    # LightGBM
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_val   = lgb.Dataset(X_val, y_val)

    lgb_model = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=1200,  # no early stopping
        valid_sets=[lgb_val],
    )

    oof_lgb[val_idx] = lgb_model.predict(X_val)
    pred_lgb += lgb_model.predict(X_test) / skf.n_splits

    # XGBoost
    xgb_model.fit(X_tr, y_tr)
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    pred_xgb += xgb_model.predict_proba(X_test)[:, 1] / skf.n_splits

    # CatBoost
    cat_model.fit(X_tr, y_tr)
    oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
    pred_cat += cat_model.predict_proba(X_test)[:, 1] / skf.n_splits

    print(f"Fold {fold} AUCs:",
          roc_auc_score(y_val, oof_lgb[val_idx]),
          roc_auc_score(y_val, oof_xgb[val_idx]),
          roc_auc_score(y_val, oof_cat[val_idx])
         )

# =========================================================
# 5. FINAL ENSEMBLE (weighted)
# =========================================================

# weights tuned for medical datasets
final_oof = (
    0.40 * oof_lgb +
    0.35 * oof_cat +
    0.25 * oof_xgb
)

final_preds = (
    0.40 * pred_lgb +
    0.35 * pred_cat +
    0.25 * pred_xgb
)

print("ENSEMBLE AUC:", roc_auc_score(y, final_oof))

# =========================================================
# 6. SUBMISSION
# =========================================================
submission = pd.DataFrame({
    "id": test["id"],
    "smoking": final_preds
})
submission.to_csv("submission.csv", index=False)

print("submission.csv saved!")

[LightGBM] [Info] Number of positive: 4393, number of negative: 7607
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3159
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.366083 -> initscore=-0.549057
[LightGBM] [Info] Start training from score -0.549057
Fold 0 AUCs: 0.875323987805853 0.8772534354075413 0.8797558298658961
[LightGBM] [Info] Number of positive: 4393, number of negative: 7607
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3183
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.366083 -> initscore=-0.549057
[