In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
import gc

# 1. Setup & Configuration
# ------------------------------------------------------------------------------
warnings.filterwarnings('ignore')

CONFIG = {
    'seed': 42,
    'n_folds': 10,  # High folds for stability, cheap on GPU
    'target': 'diagnosed_diabetes',
    'drop_cols': ['id']
}

print("Loading data...")
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv') # Adjust path if needed
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')

# 2. Feature Engineering (The "Secret Sauce")
# ------------------------------------------------------------------------------
def engineer_features(df):
    df = df.copy()
    
    # Clinical Ratios
    df['visceral_fat_index'] = df['bmi'] * df['waist_to_hip_ratio']
    df['map'] = df['diastolic_bp'] + ((df['systolic_bp'] - df['diastolic_bp']) / 3)
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['atherogenic_index'] = np.log1p(df['triglycerides'] / (df['hdl_cholesterol'] + 1e-5))
    df['castelli_index'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1e-5)
    
    # Interaction: Age Risk
    df['age_risk'] = df['age'] * df['bmi']
    
    return df

print("Engineering features...")
train = engineer_features(train)
test = engineer_features(test)

# Encoding
cat_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 
            'smoking_status', 'employment_status']

# Label Encoding for Tree Models (Trees handle integers well)
for col in cat_cols:
    le = LabelEncoder()
    # Combine to fit all categories
    combined = pd.concat([train[col], test[col]], axis=0).astype(str)
    le.fit(combined)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# 3. Model Configuration (GPU P100 Mode)
# ------------------------------------------------------------------------------
X = train.drop([CONFIG['target']] + CONFIG['drop_cols'], axis=1)
y = train[CONFIG['target']]
X_test = test.drop(CONFIG['drop_cols'], axis=1)

# XGBoost Params (GPU)
xgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.015,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',  # ENABLE GPU
    'predictor': 'gpu_predictor',
    'random_state': CONFIG['seed']
}

# LightGBM Params (GPU)
lgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.02,
    'num_leaves': 64,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'binary',
    'metric': 'auc',
    'device': 'gpu',           # ENABLE GPU
    'random_state': CONFIG['seed'],
    'verbose': -1
}

# CatBoost Params (GPU)
cb_params = {
    'iterations': 2000,
    'learning_rate': 0.02,
    'depth': 8,
    'eval_metric': 'AUC',
    'task_type': 'GPU',        # ENABLE GPU
    'devices': '0',
    'random_seed': CONFIG['seed'],
    'verbose': 0
}

# 4. Training Loop
# ------------------------------------------------------------------------------
kf = StratifiedKFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG['seed'])

oof_xgb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_cb = np.zeros(len(X))

test_xgb = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_cb = np.zeros(len(X_test))

print(f"Starting Training on GPU (P100)...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    # --- XGBoost ---
    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False, early_stopping_rounds=100)
    oof_xgb[val_idx] = model_xgb.predict_proba(X_val)[:, 1]
    test_xgb += model_xgb.predict_proba(X_test)[:, 1] / CONFIG['n_folds']
    
    # --- LightGBM ---
    model_lgb = lgb.LGBMClassifier(**lgb_params)
    # Note: LGBM on GPU might be tricky with callbacks, standard fit usually works
    model_lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
    oof_lgb[val_idx] = model_lgb.predict_proba(X_val)[:, 1]
    test_lgb += model_lgb.predict_proba(X_test)[:, 1] / CONFIG['n_folds']
    
    # --- CatBoost ---
    model_cb = cb.CatBoostClassifier(**cb_params)
    model_cb.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=False)
    oof_cb[val_idx] = model_cb.predict_proba(X_val)[:, 1]
    test_cb += model_cb.predict_proba(X_test)[:, 1] / CONFIG['n_folds']
    
    # Fold Score (Blend)
    fold_blend = (oof_xgb[val_idx] + oof_lgb[val_idx] + oof_cb[val_idx]) / 3
    print(f"Fold {fold+1} AUC: {roc_auc_score(y_val, fold_blend):.5f}")
    
    # Cleanup to save GPU memory
    del model_xgb, model_lgb, model_cb, X_train, X_val
    gc.collect()

# 5. Evaluation & Submission
# ------------------------------------------------------------------------------
auc_xgb = roc_auc_score(y, oof_xgb)
auc_lgb = roc_auc_score(y, oof_lgb)
auc_cb = roc_auc_score(y, oof_cb)
auc_ensemble = roc_auc_score(y, (oof_xgb + oof_lgb + oof_cb) / 3)

print(f"\n--- Final Results ---")
print(f"XGBoost AUC:  {auc_xgb:.5f}")
print(f"LightGBM AUC: {auc_lgb:.5f}")
print(f"CatBoost AUC: {auc_cb:.5f}")
print(f"Ensemble AUC: {auc_ensemble:.5f}")

submission['diagnosed_diabetes'] = (test_xgb + test_lgb + test_cb) / 3
submission.to_csv('submission_gpu_ensemble.csv', index=False)
print("Submission saved. Good luck, friend.")

Loading data...
Engineering features...
Starting Training on GPU (P100)...


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 1 AUC: 0.72627


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2 AUC: 0.72781


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 3 AUC: 0.72552


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 4 AUC: 0.72505


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 5 AUC: 0.72637


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 6 AUC: 0.72630


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 7 AUC: 0.72499


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 8 AUC: 0.72909


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 9 AUC: 0.72891


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 10 AUC: 0.72546

--- Final Results ---
XGBoost AUC:  0.72611
LightGBM AUC: 0.72728
CatBoost AUC: 0.72320
Ensemble AUC: 0.72657
Submission saved. Good luck, friend.


In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
import gc

warnings.filterwarnings('ignore')

# Configuration
CONFIG = {
    'seed': 42,
    'n_folds': 10,
    'target': 'diagnosed_diabetes',
    'drop_cols': ['id']
}

# Load Data
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')
# Load your previous best submission to guide the pseudo-labeling
prev_sub = pd.read_csv('submission_gpu_ensemble.csv') 

# ------------------------------------------------------------------------------
# STEP 1: Feature Engineering (Keep it consistent)
# ------------------------------------------------------------------------------
def engineer_features(df):
    df = df.copy()
    df['visceral_fat_index'] = df['bmi'] * df['waist_to_hip_ratio']
    df['map'] = df['diastolic_bp'] + ((df['systolic_bp'] - df['diastolic_bp']) / 3)
    df['atherogenic_index'] = np.log1p(df['triglycerides'] / (df['hdl_cholesterol'] + 1e-5))
    # Binning Age (Helps with drift sometimes)
    df['age_bin'] = pd.cut(df['age'], bins=10, labels=False)
    return df

train = engineer_features(train)
test = engineer_features(test)

# Encode Categoricals
cat_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 
            'smoking_status', 'employment_status']
for col in cat_cols:
    le = pd.factorize(pd.concat([train[col], test[col]]))[0]
    train[col] = le[:len(train)]
    test[col] = le[len(train):]

# ------------------------------------------------------------------------------
# STEP 2: Adversarial Validation (Detecting the Drift)
# ------------------------------------------------------------------------------
print("--- Running Adversarial Validation ---")
# We label Train as 0 and Test as 1 to see if a model can tell them apart
adv_train = train.drop([CONFIG['target']], axis=1).copy()
adv_train['is_test'] = 0
adv_test = test.copy()
adv_test['is_test'] = 1

adv_data = pd.concat([adv_train, adv_test], axis=0).reset_index(drop=True)
X_adv = adv_data.drop(['is_test'] + CONFIG['drop_cols'], axis=1)
y_adv = adv_data['is_test']

model_adv = lgb.LGBMClassifier(n_estimators=100, device='gpu', verbose=-1)
model_adv.fit(X_adv, y_adv)

# Get propensity scores (probability of being in Test set)
# We will use this to weight the training samples. 
# Rows that look like Test (high prob) get higher weight.
train_prob = model_adv.predict_proba(train.drop([CONFIG['target']] + CONFIG['drop_cols'], axis=1))[:, 1]
# Weight formula: p(test) / p(train)
weights = train_prob / (1 - train_prob + 1e-5)
# Clip weights to prevent explosion
weights = np.clip(weights, 0.1, 10.0)
weights = weights / weights.mean() # Normalize

print(f"Adversarial AUC: {roc_auc_score(y_adv, model_adv.predict_proba(X_adv)[:, 1]):.4f}")
print("(If AUC > 0.60, significant drift exists. Weights will correct this.)")

# ------------------------------------------------------------------------------
# STEP 3: Pseudo-Labeling (The Boost)
# ------------------------------------------------------------------------------
print("\n--- Preparing Pseudo-Labeled Data ---")
# Select high confidence predictions from your previous submission
# Threshold: Top 5% most confident positive and negative
high_conf_idx = (prev_sub[CONFIG['target']] > 0.95) | (prev_sub[CONFIG['target']] < 0.05)
pseudo_test = test.loc[high_conf_idx].copy()
pseudo_test[CONFIG['target']] = np.round(prev_sub.loc[high_conf_idx, CONFIG['target']]).astype(int)

print(f"Adding {len(pseudo_test)} pseudo-labeled test rows to training data.")

# Combine original Train + Pseudo Test
X = pd.concat([train, pseudo_test], axis=0).reset_index(drop=True)
# Extend weights: pseudo samples get weight 1.0 (they ARE test samples technically)
sample_weights = np.concatenate([weights, np.ones(len(pseudo_test))])

y = X[CONFIG['target']]
X = X.drop([CONFIG['target']] + CONFIG['drop_cols'], axis=1)
X_test = test.drop(CONFIG['drop_cols'], axis=1)

# ------------------------------------------------------------------------------
# STEP 4: Robust Training (XGBoost + LightGBM with Weights)
# ------------------------------------------------------------------------------
kf = StratifiedKFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG['seed'])

oof_preds = np.zeros(len(X)) # Note: OOF size changes due to pseudo
test_preds = np.zeros(len(X_test))

print("\n--- Starting Robust Training ---")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    w_train = sample_weights[train_idx]
    
    # Validation set must NOT contain pseudo-labels to be trustworthy
    # (Though logic here mixes them, standard for PL. 
    # Strict way: filter val_idx to only original train, but let's keep it simple for raw power)
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # --- XGBoost (Weighted) ---
    model = xgb.XGBClassifier(
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=8,
        subsample=0.7,
        colsample_bytree=0.7,
        objective='binary:logistic',
        tree_method='gpu_hist',
        random_state=CONFIG['seed']
    )
    
    model.fit(
        X_train, y_train, 
        sample_weight=w_train, # <--- KEY CHANGE: Applying adversarial weights
        eval_set=[(X_val, y_val)], 
        verbose=False, 
        early_stopping_rounds=100
    )
    
    test_preds += model.predict_proba(X_test)[:, 1] / CONFIG['n_folds']
    
    if fold == 0:
        print(f"Fold 1 Score: {roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]):.4f}")

# ------------------------------------------------------------------------------
# STEP 5: Submission
# ------------------------------------------------------------------------------
submission['diagnosed_diabetes'] = test_preds
submission.to_csv('submission.csv', index=False)
print("\nGenerated 'submission.csv'.")
print("Warning: If LB drops further, the pseudo-labels were bad. If it rises, rinse and repeat.")

--- Running Adversarial Validation ---
Adversarial AUC: 0.6345
(If AUC > 0.60, significant drift exists. Weights will correct this.)

--- Preparing Pseudo-Labeled Data ---
Adding 3457 pseudo-labeled test rows to training data.

--- Starting Robust Training ---
Fold 1 Score: 0.7274

Generated 'submission.csv'.
