# Libraries
---

In [1]:
import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

from xgboost import XGBClassifier

import warnings
warnings.simplefilter('ignore')

# Datasets
---

In [2]:
# Directory containing the input CSV files (kept under the name `url`
# because later cells may refer to it).
url = "../input/tabular-playground-series-sep-2021/"

train = pd.read_csv(url + "train.csv")
test = pd.read_csv(url + "test.csv")
submission = pd.read_csv(url + "sample_solution.csv")

# Feature columns are selected by prefix rather than by substring: a plain
# `'f' in col` test would also pick up any helper column that merely contains
# the letter "f" somewhere in its name.
features = [col for col in test.columns if col.startswith('f')]
assert features, "no feature columns (names starting with 'f') found in test.csv"

# Label column used for training and for the out-of-fold score.
target = train['claim']

Idea taken from "TPS Sep 2021 xgb early stopping rounds".

In [6]:
# Preprocessing applied to every feature column: fill missing values with the
# column median, then rescale with RobustScaler.
preprocessing_steps = [
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)

# The pipeline is fitted on the training features only; the test features are
# transformed with the statistics learned from train.
train_features_scaled = pipeline.fit_transform(train[features])
train[features] = train_features_scaled

test_features_scaled = pipeline.transform(test[features])
test[features] = test_features_scaled

# XGBClassifier
---

In [7]:
# Keyword arguments forwarded to XGBClassifier(**xgb_params).
xgb_params = dict(
    # Learning task and evaluation metric.
    objective='binary:logistic',
    eval_metric='auc',
    # GPU execution settings.
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    # Boosting schedule: a large round budget with a small learning rate.
    n_estimators=30000,
    learning_rate=0.005,
    # Tree shape and split constraints.
    max_depth=4,
    min_child_weight=378,
    gamma=0.25,
    # Row / column subsampling.
    subsample=0.63,
    colsample_bytree=0.77,
    colsample_bylevel=0.87,
    # `lambda` is a reserved word in Python, so the weight penalties are
    # supplied through an unpacked mapping instead of keyword syntax.
    **{'lambda': 0.05, 'alpha': 10},
)


In [10]:
# Stratified K-fold training of XGBClassifier.
#
# Produces:
#   xgb_oof         - out-of-fold predicted probabilities, one per training row
#   xgb_pred        - test-set predicted probabilities averaged over all folds
#   xgb_importances - one row per (fold, feature) with the fitted model's
#                     feature importance
N_SPLITS = 5
SEED = 42  # fixed so the fold assignment is identical on every run

xgb_oof = np.zeros(train.shape[0])
xgb_pred = np.zeros(test.shape[0])
# Per-fold importance frames are collected in a list and concatenated once
# after the loop; DataFrame.append was removed in pandas 2.0 and growing a
# frame inside a loop is quadratic.
fold_importances = []

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The test matrix does not depend on the fold, so it is built once.
X_test = test[features]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=target)):
    print(f">>> fold {fold} >>>")
    X_train = train[features].iloc[trn_idx]
    y_train = target.iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = target.iloc[val_idx]

    start = time.time()
    model = XGBClassifier(**xgb_params)
    # The held-out fold is used as the evaluation set that drives early stopping.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=1000,
        early_stopping_rounds=2000
    )

    # Record this fold's importances. Assigning a scalar to a column of an
    # empty DataFrame yields an empty frame, so the frame is built from the
    # feature list and the scalar fold id is broadcast across its rows.
    fold_importances.append(pd.DataFrame({
        'feature': features,
        'importance': model.feature_importances_,
        'fold': fold,
    }))

    # Last column of predict_proba, i.e. the probability of the positive
    # class for a binary target.
    xgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    # Each fold contributes an equal share to the averaged test prediction.
    xgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, xgb_oof[val_idx])
    print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

xgb_importances = pd.concat(fold_importances, ignore_index=True)

print(f"oof xgb roc = {roc_auc_score(target, xgb_oof)}")

# Submission
---

In [1]:
# Write the fold-averaged test predictions to the submission file.
# `target` is the label Series itself; indexing `submission` with it would
# treat the Series' values as column labels instead of addressing the label
# column. Its name ('claim') is the column the predictions belong in.
submission[target.name] = xgb_pred
submission.to_csv("submission.csv", index=False)