In [1]:
# ============================================================
# #AutoML #AutoGluon #TabularData #BinaryClassification
# #Kaggle #Ensembling #Stacking #ModelSelection #MachineLearning
# ============================================================

# If running on Kaggle: you can keep this install line
!pip -q install -U "autogluon.tabular>=1.1.0"

import os, glob
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

# -------------------------
# Columns (as you specified)
# -------------------------
# Row identifier and label column names.
ID_COL = "id"
TARGET = "diagnosed_diabetes"

# Predictors that are parsed as numbers.
NUMERIC = [
    "age",
    "alcohol_consumption_per_week",
    "physical_activity_minutes_per_week",
    "diet_score",
    "sleep_hours_per_day",
    "screen_time_hours_per_day",
    "bmi",
    "waist_to_hip_ratio",
    "systolic_bp",
    "diastolic_bp",
    "heart_rate",
    "cholesterol_total",
    "hdl_cholesterol",
    "ldl_cholesterol",
    "triglycerides",
]

# Predictors that are handled as categoricals.
CATEGORICAL = [
    "gender",
    "ethnicity",
    "education_level",
    "income_level",
    "smoking_status",
    "employment_status",
    "family_history_diabetes",
    "hypertension_history",
    "cardiovascular_history",
]

# Complete model input: numeric columns first, then the categorical ones.
FEATURES = NUMERIC + CATEGORICAL

# -------------------------
# Helper: auto-find train/test
# -------------------------
def find_csv_by_name(root="/kaggle/input", name="train.csv"):
    """Locate a file called ``name`` anywhere below ``root``.

    The directory tree is searched recursively. If more than one file
    matches, the lexicographically smallest path is returned so that the
    result is reproducible across runs and filesystems.

    Args:
        root: Directory in which the recursive search starts.
        name: File name to look for.

    Returns:
        The matching path as a string, or ``None`` when nothing matches.
    """
    hits = glob.glob(os.path.join(root, "**", name), recursive=True)
    # glob gives no ordering guarantee, so taking hits[0] would be arbitrary.
    return min(hits, default=None)

# Preferred location of the competition files. Edit these two paths if your
# dataset is mounted elsewhere or uses different file names.
train_path = "/kaggle/input/playground-series-s5e12/train.csv"
test_path  = "/kaggle/input/playground-series-s5e12/test.csv"

# Fall back to a recursive search only when the preferred path is absent, so
# the discovery helper is actually used instead of being overwritten.
if not os.path.isfile(train_path):
    train_path = find_csv_by_name(name="train.csv")
if not os.path.isfile(test_path):
    test_path = find_csv_by_name(name="test.csv")

print("train_path:", train_path)
print("test_path :", test_path)

# Fail early with a readable message rather than inside pd.read_csv(None).
if train_path is None or test_path is None:
    raise FileNotFoundError(
        "Could not locate train.csv / test.csv; set train_path and test_path manually."
    )

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

# -------------------------
# Basic sanity checks
# -------------------------
def _absent_columns(frame, required):
    """Return the entries of ``required`` that are not columns of ``frame``, in order."""
    return [col for col in required if col not in frame.columns]


# The training frame must provide the id, the label and every feature.
needed_train = [ID_COL, TARGET, *FEATURES]
missing_train = _absent_columns(train, needed_train)
if len(missing_train) > 0:
    raise ValueError(f"Missing columns in train: {missing_train}")

# The test frame needs the same columns except the label.
needed_test = [ID_COL, *FEATURES]
missing_test = _absent_columns(test, needed_test)
if len(missing_test) > 0:
    raise ValueError(f"Missing columns in test: {missing_test}")

# -------------------------
# Type casting (helps AutoML)
# -------------------------
def cast_types(df: pd.DataFrame, categorical=None, numeric=None) -> pd.DataFrame:
    """Return a copy of ``df`` with feature columns cast to model-friendly dtypes.

    Args:
        df: Input frame; it is not modified.
        categorical: Column names to cast to pandas ``category`` dtype.
            Defaults to the module-level ``CATEGORICAL`` list.
        numeric: Column names to parse with ``pd.to_numeric``. Defaults to
            the module-level ``NUMERIC`` list.

    Returns:
        A new DataFrame with the requested columns converted. Values in
        numeric columns that cannot be parsed become NaN.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    cat_cols = CATEGORICAL if categorical is None else list(categorical)
    num_cols = NUMERIC if numeric is None else list(numeric)

    out = df.copy()

    for col in cat_cols:
        out[col] = out[col].astype("category")

    # to_numeric keeps integer columns as integers; only unparseable
    # entries are coerced to NaN (which makes that column float).
    for col in num_cols:
        out[col] = pd.to_numeric(out[col], errors="coerce")

    return out

train = cast_types(train)
test  = cast_types(test)

# Parse the target as a number. Rows whose label is missing or unparseable
# are dropped instead of being filled with 0, which would silently invent
# negative-class examples.
target_numeric = pd.to_numeric(train[TARGET], errors="coerce")
missing_target = target_numeric.isna()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum())} train rows with missing/invalid {TARGET}")
    train = train.loc[~missing_target].copy()
    target_numeric = target_numeric.loc[~missing_target]
train[TARGET] = target_numeric.astype(int)

# Use only required columns (prevents leakage from extra cols)
train_ml = train[[ID_COL, TARGET] + FEATURES].copy()
test_ml  = test[[ID_COL] + FEATURES].copy()

# -------------------------
# AutoGluon Tabular AutoML
# -------------------------
# Notes:
# - If metric for the competition is unknown, ROC AUC is a solid default.
# - If leaderboard uses logloss, you can switch eval_metric="log_loss".
# Predictor configuration: label column, validation metric, output folder
# for the fitted models, and logging level.
predictor_settings = {
    "label": TARGET,
    "eval_metric": "roc_auc",
    "path": "ag_diabetes_autml",
    "verbosity": 2,
}
predictor = TabularPredictor(**predictor_settings)

# The id column is dropped so it is not used as a feature.
fit_frame = train_ml.drop(columns=[ID_COL])

# "best_quality" trains a bagged + stacked ensemble (slower, usually strongest).
# The time budget is one hour, expressed in seconds; adjust as needed.
predictor.fit(
    train_data=fit_frame,
    presets="best_quality",
    time_limit=3600,
    num_bag_folds=5,
    num_stack_levels=1,
)

# -------------------------
# Predict probabilities
# -------------------------
proba = predictor.predict_proba(test_ml.drop(columns=[ID_COL]))

# For binary classification predict_proba returns one column per class label.
# The submission needs the probability of the positive class only.
if isinstance(proba, pd.DataFrame):
    # Ask the predictor which label it treats as positive instead of guessing
    # from the column names.
    positive_label = predictor.positive_class
    if positive_label not in proba.columns:
        # Fallback heuristic: prefer label 1, otherwise the largest label.
        positive_label = 1 if 1 in proba.columns else sorted(proba.columns)[-1]
    pred = proba[positive_label].to_numpy()
else:
    # Already a Series/array of probabilities.
    pred = np.asarray(proba)

# -------------------------
# Save submission
# -------------------------
# Assemble the submission: one row per test id with its predicted value.
submission_columns = {
    ID_COL: test_ml[ID_COL].values,
    TARGET: pred,
}
sub = pd.DataFrame(submission_columns)
sub.to_csv("submission.csv", index=False)

print("✅ saved submission.csv")
print(sub.head(5))

# Optional: print the first rows of the model leaderboard.
lb = predictor.leaderboard(silent=True)
print("\n=== AutoGluon Leaderboard (top) ===")
print(lb.iloc[:15])

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.9/98.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.4/74.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
datasets 4.4.1 requires pyarrow>=21.0.0, but you have pyarrow 20.0.0 which is incompatible.
cudf-cu12 25.6.0 requir

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sat Sep 27 10:16:09 UTC 2025
CPU Count:          4
Pytorch Version:    2.8.0+cu126
CUDA Version:       CUDA is not available
Memory Avail:       29.60 GB / 31.35 GB (94.4%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of 

[36m(_ray_fit pid=463)[0m [1000]	valid_set's binary_logloss: 0.595547
[36m(_ray_fit pid=465)[0m [2000]	valid_set's binary_logloss: 0.597063[32m [repeated 4x across cluster][0m


[36m(_ray_fit pid=466)[0m 	Ran out of time, early stopping on iteration 2199. Best iteration is:
[36m(_ray_fit pid=466)[0m 	[2199]	valid_set's binary_logloss: 0.594616
[36m(_ray_fit pid=463)[0m [2025-12-28 18:52:11,527 E 463 566] core_worker_process.cc:837: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=463)[0m 	Ran out of time, early stopping on iteration 2201. Best iteration is:[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=463)[0m 	[2201]	valid_set's binary_logloss: 0.594443[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=593)[0m [2025-12-28 18:57:20,025 E 593 616] core_worker_process.cc:837: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize th

[36m(_ray_fit pid=593)[0m [1000]	valid_set's binary_logloss: 0.597954[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=593)[0m [2000]	valid_set's binary_logloss: 0.597245
[36m(_ray_fit pid=593)[0m [3000]	valid_set's binary_logloss: 0.596723


[36m(_ray_fit pid=593)[0m 	Ran out of time, early stopping on iteration 3828. Best iteration is:
[36m(_ray_fit pid=593)[0m 	[3694]	valid_set's binary_logloss: 0.596514
[36m(_dystack pid=176)[0m 	0.707	 = Validation score   (roc_auc)
[36m(_dystack pid=176)[0m 	544.51s	 = Training   runtime
[36m(_dystack pid=176)[0m 	275.35s	 = Validation runtime
[36m(_dystack pid=176)[0m Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 271.30s of remaining time.
[36m(_dystack pid=176)[0m 	Fitting 1 model on all data | Fitting with cpus=4, gpus=0, mem=0.0/27.9 GB
[36m(_dystack pid=176)[0m 	Ensemble Weights: {'LightGBMXT_BAG_L1': 1.0}
[36m(_dystack pid=176)[0m 	0.707	 = Validation score   (roc_auc)
[36m(_dystack pid=176)[0m 	0.2s	 = Training   runtime
[36m(_dystack pid=176)[0m 	0.17s	 = Validation runtime
[36m(_dystack pid=176)[0m Fitting 108 L2 models, fit_strategy="sequential" ...
[36m(_dystack pid=176)[0m Fitting model: LightGBMXT_BAG_L2 ... Train

✅ saved submission.csv
       id  diagnosed_diabetes
0  700000            0.483897
1  700001            0.720819
2  700002            0.765071
3  700003            0.391099
4  700004            0.950468

=== AutoGluon Leaderboard (top) ===
                 model  score_val eval_metric  pred_time_val     fit_time  \
0      LightGBM_BAG_L2   0.724857     roc_auc    1593.596610  1934.247614   
1  WeightedEnsemble_L3   0.724857     roc_auc    1593.767296  1951.695249   
2    LightGBMXT_BAG_L2   0.709858     roc_auc    1498.018157  1676.500010   
3  WeightedEnsemble_L2   0.709831     roc_auc    1479.563917  1590.555925   
4    LightGBMXT_BAG_L1   0.709816     roc_auc    1478.778527  1561.190172   
5      LightGBM_BAG_L1   0.676950     roc_auc       0.612761    20.649909   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0              114.205322         352.407533            2       True   
1                0.170686          17.447635            3       True   
2   