In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
sample = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')


In [3]:
#  FOR PLAYGROUND SERIES S5E12 

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
import lightgbm as lgb

# ---------------- CONFIG ----------------
DATA_DIR   = "/kaggle/input/playground-series-s5e12"
ID_COL     = "id"
TARGET_COL = "diagnosed_diabetes"

# ---------------- LOAD DATA ----------------
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test  = pd.read_csv(f"{DATA_DIR}/test.csv")

print("Train shape:", train.shape)
print("Test shape :", test.shape)

# ---------------- BASIC CHECKS ----------------
if ID_COL not in train.columns:
    raise ValueError(f"{ID_COL} not found in train")

if TARGET_COL not in train.columns:
    raise ValueError(f"{TARGET_COL} not found in train")

# ---------------- SPLIT ----------------
train_ids = train[ID_COL].copy()
test_ids  = test[ID_COL].copy()

X = train.drop(columns=[ID_COL, TARGET_COL]).copy()
y = train[TARGET_COL].astype(int).copy()
X_test = test.drop(columns=[ID_COL]).copy()

print("Initial feature count:", X.shape[1])

# ---------------- ONE-HOT ENCODING ----------------
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("Categorical columns:", cat_cols)

all_data = pd.concat([X, X_test], axis=0, ignore_index=True)
all_data = pd.get_dummies(all_data, columns=cat_cols, drop_first=False)

X_encoded = all_data.iloc[:len(X)].copy()
X_test_encoded = all_data.iloc[len(X):].copy()

print("Encoded feature count:", X_encoded.shape[1])

# ---------------- SCALING ----------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# ---------------- CV TRAINING ----------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_pred  = np.zeros(len(X_scaled))
test_pred = np.zeros(len(X_test_scaled))
fold_auc  = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_scaled, y), 1):
    X_tr, X_val = X_scaled[tr_idx], X_scaled[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # XGBoost
    xgb_model = xgb.XGBClassifier(
        n_estimators=600,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="auc",
        tree_method="hist",
        random_state=42,
        n_jobs=-1
    )
    xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    # LightGBM
    lgb_model = lgb.LGBMClassifier(
        n_estimators=600,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        num_leaves=31,
        objective="binary",
        random_state=42,
        n_jobs=-1
    )
    lgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric="auc")

    val_pred = (
        0.5 * xgb_model.predict_proba(X_val)[:, 1] +
        0.5 * lgb_model.predict_proba(X_val)[:, 1]
    )

    oof_pred[val_idx] = val_pred
    auc = roc_auc_score(y_val, val_pred)
    fold_auc.append(auc)

    print(f"Fold {fold} AUC: {auc:.5f}")

    test_pred += (
        0.5 * xgb_model.predict_proba(X_test_scaled)[:, 1] +
        0.5 * lgb_model.predict_proba(X_test_scaled)[:, 1]
    ) / skf.n_splits

print("\nOOF AUC mean:", np.mean(fold_auc), "std:", np.std(fold_auc))

# ---------------- SUBMISSION ----------------
submission = pd.DataFrame({
    ID_COL: test_ids,
    TARGET_COL: test_pred
})

submission.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(submission.head())


Train shape: (700000, 26)
Test shape : (300000, 25)
Initial feature count: 24
Categorical columns: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
Encoded feature count: 42
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
Fold 1 AUC: 0.72511
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070650 seconds.
You can set `force_ro