# Import

In [1]:
import os
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb

# Suppress compiler warnings for clean output
warnings.filterwarnings("ignore", message="1 warning generated")

# Load Data

In [2]:
DATA_PATH = "/kaggle/input/playground-series-s5e12"

train_df = pd.read_csv(f"{DATA_PATH}/train.csv")
test_df  = pd.read_csv(f"{DATA_PATH}/test.csv")

TARGET = "diagnosed_diabetes"
ID_COL = "id"

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (700000, 26)
Test shape: (300000, 25)


# Feature Setup

In [3]:
# Separate features and target
X = train_df.drop(columns=[TARGET, ID_COL])
y = train_df[TARGET].values
X_test = test_df.drop(columns=[ID_COL])

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
print("Categorical columns:", cat_cols)

Categorical columns: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


# Prepare XGBoost Features (Label Encode Categoricals)

In [4]:
X_xgb = X.copy()
X_test_xgb = X_test.copy()

for col in cat_cols:
    le = LabelEncoder()
    X_xgb[col] = le.fit_transform(X_xgb[col])
    X_test_xgb[col] = le.transform(X_test_xgb[col])

# LightGBM Categorical Setup

In [5]:
# LightGBM can handle native category types
X_lgb = X.copy()
X_test_lgb = X_test.copy()
for col in cat_cols:
    X_lgb[col] = X_lgb[col].astype("category")
    X_test_lgb[col] = X_test_lgb[col].astype("category")

# Cross-Validation Setup

In [6]:
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# OOF predictions
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))

# Test predictions
test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))

# Model Parameters

In [7]:
# LightGBM (GPU)
lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.02,
    "num_leaves": 128,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "device": "gpu",
    "gpu_use_dp": False,
    "verbosity": -1,
    "seed": 42
}

# XGBoost (CPU)
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.02,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "predictor": "auto",
    "seed": 42
}

# Training Loop (5-Fold CV)

In [8]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n===== Fold {fold} =====")

    # Split data
    X_tr_lgb, X_val_lgb = X_lgb.iloc[train_idx], X_lgb.iloc[val_idx]
    X_tr_xgb, X_val_xgb = X_xgb.iloc[train_idx], X_xgb.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # ---------- LightGBM ----------
    lgb_train = lgb.Dataset(X_tr_lgb, y_tr, categorical_feature=cat_cols)
    lgb_val   = lgb.Dataset(X_val_lgb, y_val, categorical_feature=cat_cols, reference=lgb_train)

    lgb_model = lgb.train(
        params=lgb_params,
        train_set=lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_val],
        callbacks=[lgb.early_stopping(300)]
    )

    oof_lgb[val_idx] = lgb_model.predict(X_val_lgb, num_iteration=lgb_model.best_iteration)
    test_lgb += lgb_model.predict(X_test_lgb, num_iteration=lgb_model.best_iteration) / N_SPLITS

    # ---------- XGBoost ----------
    dtrain = xgb.DMatrix(X_tr_xgb, label=y_tr)
    dval   = xgb.DMatrix(X_val_xgb, label=y_val)
    dtest  = xgb.DMatrix(X_test_xgb)

    xgb_model = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=10000,
        evals=[(dval, "val")],
        early_stopping_rounds=300,
        verbose_eval=False
    )

    oof_xgb[val_idx] = xgb_model.predict(dval, iteration_range=(0, xgb_model.best_iteration))
    test_xgb += xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration)) / N_SPLITS


===== Fold 1 =====




Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1417]	valid_0's auc: 0.727381


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()



===== Fold 2 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1913]	valid_0's auc: 0.725762


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()



===== Fold 3 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1327]	valid_0's auc: 0.726693


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()



===== Fold 4 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1404]	valid_0's auc: 0.727237


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()



===== Fold 5 =====
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1630]	valid_0's auc: 0.72767


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


# CV Scores

In [9]:
print("LightGBM CV AUC:", roc_auc_score(y, oof_lgb))
print("XGBoost CV AUC:", roc_auc_score(y, oof_xgb))

# Ensemble
oof_ensemble = 0.5 * oof_lgb + 0.5 * oof_xgb
test_ensemble = 0.5 * test_lgb + 0.5 * test_xgb
print("Ensemble CV AUC:", roc_auc_score(y, oof_ensemble))

LightGBM CV AUC: 0.7269449712703255
XGBoost CV AUC: 0.726081522933125
Ensemble CV AUC: 0.7272625762253748


# Submission

In [10]:
submission = pd.read_csv("/kaggle/input/playground-series-s5e12/sample_submission.csv")
submission[TARGET] = test_ensemble
submission.to_csv("ensemble_lgb_xgb_submission.csv", index=False)

submission.head()

Unnamed: 0,id,diagnosed_diabetes
0,700000,0.486903
1,700001,0.682798
2,700002,0.765158
3,700003,0.377983
4,700004,0.9237


In [11]:
from sklearn.metrics import accuracy_score

# Convert probabilities to 0/1 using 0.5 threshold
pred_labels = (oof_ensemble >= 0.5).astype(int)

accuracy = accuracy_score(y, pred_labels)
print("CV Accuracy:", accuracy)

from sklearn.metrics import r2_score

r2 = r2_score(y, oof_ensemble)
print("CV R²:", r2)

from sklearn.metrics import mean_squared_error
import numpy as np

# Compute MSE first
mse = mean_squared_error(y, oof_ensemble)

# Take square root manually to get RMSE
rmse = np.sqrt(mse)
print("CV RMSE:", rmse)


CV Accuracy: 0.6848442857142857
CV R²: 0.1482314627213629
CV RMSE: 0.44720654190649883


In [12]:
# end