# In [None]:
# ====================================================
# Setup
# ====================================================
import os
import gc
import random
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import early_stopping, log_evaluation

# Seed everything for reproducibility
def seed_everything(seed=42):
    """Seed the global random number generators used by this script.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Note: ``PYTHONHASHSEED`` is read by the interpreter at start-up, so
    setting it here only influences Python processes launched afterwards
    (e.g. worker subprocesses), not string hashing in the current process.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once at import time so every later step starts from a known state.
seed_everything(42)

# ====================================================
# Load Data
# ====================================================
# Column names of the prediction target and the row identifier.
TARGET = "y"
ID = "id"

# Raw competition files: labelled training rows, unlabelled test rows and
# the submission template.
train = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")
sub = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

# Feature matrices exclude the identifier (and, for train, the target).
X = train.drop(columns=[TARGET, ID])
y = train[TARGET]
X_test = test.drop(columns=[ID])

# ====================================================
# Preprocessing
# ====================================================
# Identify categorical and numerical features
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = X.select_dtypes(exclude=['object', 'category']).columns.tolist()

# Target Encoding for categoricals.
#
# Fitting the encoder on all training rows and then transforming those same
# rows puts each row's own label into its features, which inflates every
# out-of-fold score computed later. Training rows are therefore cross-fitted:
# each row is encoded by an encoder that never saw that row's target. The
# test set is encoded by `encoder`, fitted on the full training data.
# This greatly reduces (but does not mathematically eliminate) target
# leakage into the downstream CV estimates.
encoder = TargetEncoder(cols=cat_features)
if cat_features:
    raw_cats = X[cat_features].copy()
    encoded_train = np.empty((len(X), len(cat_features)), dtype=float)
    # Same splitter settings as the model CV below, so the split is identical.
    te_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fit_idx, enc_idx in te_splitter.split(raw_cats, y):
        fold_encoder = TargetEncoder(cols=cat_features)
        fold_encoder.fit(raw_cats.iloc[fit_idx], y.iloc[fit_idx])
        encoded_train[enc_idx] = (
            fold_encoder.transform(raw_cats.iloc[enc_idx])[cat_features].to_numpy()
        )

    encoder.fit(raw_cats, y)
    X_test[cat_features] = encoder.transform(X_test[cat_features])
    X[cat_features] = encoded_train

# Standard scaling for numericals (statistics come from the training set only;
# no target information is involved, so this step does not leak labels).
scaler = StandardScaler()
if num_features:
    X[num_features] = scaler.fit_transform(X[num_features])
    X_test[num_features] = scaler.transform(X_test[num_features])

# ====================================================
# Model Training Functions
# ====================================================
def train_lightgbm(X, y, X_test, folds=5):
    """Train LightGBM with stratified K-fold CV.

    Args:
        X: Training feature DataFrame (rows are selected positionally).
        y: Binary target Series aligned row-for-row with ``X``.
        X_test: Test feature DataFrame with the same columns as ``X``.
        folds: Number of stratified folds.

    Returns:
        Tuple ``(oof, preds)``. ``oof`` holds the out-of-fold positive-class
        probability for every training row; ``preds`` is the test-set
        positive-class probability averaged over the ``folds`` fold models.
    """
    oof = np.zeros(len(X))
    preds = np.zeros(len(X_test))
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    for trn_idx, val_idx in skf.split(X, y):
        X_tr, y_tr = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(
            objective='binary',
            # Evaluate AUC only. Without this, LightGBM also tracks the
            # objective's default metric (log loss) and the early-stopping
            # callback reacts to whichever metric stalls first, so training
            # could stop on log loss instead of the AUC being reported.
            metric='auc',
            learning_rate=0.05,
            num_leaves=31,
            # scikit-learn-API names for feature_fraction / bagging_fraction /
            # bagging_freq; same behaviour, no alias warnings.
            colsample_bytree=0.8,
            subsample=0.8,
            subsample_freq=5,
            random_state=42,
            n_estimators=5000,  # upper bound; early stopping picks the real count
        )

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(200)]
        )

        oof[val_idx] = model.predict_proba(X_val)[:, 1]
        # Each fold model contributes an equal share of the test prediction.
        preds += model.predict_proba(X_test)[:, 1] / folds

    score = roc_auc_score(y, oof)
    print(f"LightGBM CV AUC: {score:.5f}")
    return oof, preds

def train_xgboost(X, y, X_test, folds=5):
    """Train XGBoost with stratified K-fold CV.

    Args:
        X: Training feature DataFrame (rows are selected positionally).
        y: Binary target Series aligned row-for-row with ``X``.
        X_test: Test feature DataFrame with the same columns as ``X``.
        folds: Number of stratified folds.

    Returns:
        Tuple ``(oof, preds)``. ``oof`` holds the out-of-fold positive-class
        probability for every training row; ``preds`` is the test-set
        positive-class probability averaged over the ``folds`` fold models.
    """
    oof = np.zeros(len(X))
    preds = np.zeros(len(X_test))
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    for trn_idx, val_idx in skf.split(X, y):
        X_tr, y_tr = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = xgb.XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method='hist',
            random_state=42,
            n_estimators=5000,  # upper bound; early stopping picks the real count
            # Must be set on the estimator: the fit() keyword was deprecated
            # in XGBoost 1.6 and removed in 2.0, where it raises TypeError.
            early_stopping_rounds=100,
        )

        model.fit(X_tr, y_tr,
                  eval_set=[(X_val, y_val)],
                  verbose=200)

        oof[val_idx] = model.predict_proba(X_val)[:, 1]
        # Each fold model contributes an equal share of the test prediction.
        preds += model.predict_proba(X_test)[:, 1] / folds

    score = roc_auc_score(y, oof)
    print(f"XGBoost CV AUC: {score:.5f}")
    return oof, preds

def train_catboost(X, y, X_test, folds=5):
    """Train CatBoost with stratified K-fold CV.

    Args:
        X: Training feature DataFrame (rows are selected positionally).
        y: Binary target Series aligned row-for-row with ``X``.
        X_test: Test feature DataFrame with the same columns as ``X``.
        folds: Number of stratified folds.

    Returns:
        Tuple ``(oof, preds)``. ``oof`` holds the out-of-fold positive-class
        probability for every training row; ``preds`` is the test-set
        positive-class probability averaged over the ``folds`` fold models.
    """
    oof, preds = np.zeros(len(X)), np.zeros(len(X_test))
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # Identical hyper-parameters for every fold model.
    cb_params = dict(
        iterations=5000,
        learning_rate=0.05,
        depth=6,
        eval_metric='AUC',
        random_seed=42,
        verbose=200,
        early_stopping_rounds=100,
        use_best_model=True,
    )

    for fit_rows, holdout_rows in splitter.split(X, y):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        fold_model = CatBoostClassifier(**cb_params)
        fold_model.fit(X_fit, y_fit, eval_set=(X_holdout, y_holdout))

        # Column 1 of predict_proba is the positive-class probability.
        oof[holdout_rows] = fold_model.predict_proba(X_holdout)[:, 1]
        preds += fold_model.predict_proba(X_test)[:, 1] / folds

    score = roc_auc_score(y, oof)
    print(f"CatBoost CV AUC: {score:.5f}")
    return oof, preds

# ====================================================
# Train Models
# ====================================================
# Fit each model family in turn; every call returns (OOF preds, test preds).
lgb_oof, lgb_preds = train_lightgbm(X, y, X_test)
xgb_oof, xgb_preds = train_xgboost(X, y, X_test)
cat_oof, cat_preds = train_catboost(X, y, X_test)

# ====================================================
# Blending
# ====================================================
def _equal_weight_blend(*parts):
    """Return the unweighted mean of the given prediction arrays."""
    total = parts[0]
    for part in parts[1:]:
        total = total + part
    return total / len(parts)


oof_blend = _equal_weight_blend(lgb_oof, xgb_oof, cat_oof)
test_preds = _equal_weight_blend(lgb_preds, xgb_preds, cat_preds)

# Score the blend on out-of-fold predictions before writing anything out.
final_score = roc_auc_score(y, oof_blend)
print(f"Blended CV AUC: {final_score:.5f}")

# ====================================================
# Submission
# ====================================================
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv — confirm against the competition files.
sub[TARGET] = test_preds
sub.to_csv("submission.csv", index=False)
print("submission.csv saved!")
