In [6]:
# Author: Aaron Isom
# Kaggle Playground-Series-S5e8 - Binary Classification with a Bank Dataset
# Voting Classifier using CatBoost, LGBM, and XGBoost

from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [8]:
# Load data
train = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')
original = pd.read_csv('/kaggle/input/bank-marketing-dataset-full/bank-full.csv', delimiter=";")
submission = pd.read_csv('/kaggle/input/playground-series-s5e8/sample_submission.csv')

original['y'] = original['y'].replace({'yes': 1, 'no': 0})

train = pd.concat([train, original], axis=0, ignore_index=True)

# Features for training (drop id and target)
X = train.drop(['id', 'y'], axis=1)
y = train['y']

# Features for test set (drop only id)
X_test = test.drop(['id'], axis=1)

# Encode object and category columns
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.fit_transform(X_test[col].astype(str))

In [9]:
# Create three voting classifiers - CatBoost, LGBM, and XGBoost
clf1 = LGBMClassifier(n_estimators=10000, objective='binary', random_state=42, device='gpu', metric='binary_logloss')
clf2 = XGBClassifier(n_estimators=10000, objective='binary:logistic', eval_metric='auc', random_state=42, device='cuda', n_jobs=-1, 
                          enable_categorical=True, tree_method='hist')
clf3 = CatBoostClassifier(n_estimators=10000, loss_function='Logloss', eval_metric='AUC', random_state=42, task_type='GPU')

voting_clf = VotingClassifier(estimators=[('lgbm', clf1), ('xgb', clf2), ('cat', clf3)], voting='soft')
voting_clf.fit(X, y)

preds = voting_clf.predict_proba(X_test)[:, 1]

print("VotingClassifier ROC AUC Score:", roc_auc_score(y, preds))

[LightGBM] [Info] Number of positive: 95777, number of negative: 699434
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1001
[LightGBM] [Info] Number of data points in the train set: 795211, number of used features: 16
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (9.10 MB) transferred to GPU in 0.010459 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120442 -> initscore=-1.988249
[LightGBM] [Info] Start training from score -1.988249


[LightGBM] [Fatal] Check failed: (best_split_info.right_count) > (0) at /tmp/lightgbm/LightGBM/lightgbm-python/src/treelearner/serial_tree_learner.cpp, line 856 .



LightGBMError: Check failed: (best_split_info.right_count) > (0) at /tmp/lightgbm/LightGBM/lightgbm-python/src/treelearner/serial_tree_learner.cpp, line 856 .


In [None]:
# Final submission
submission['y'] = preds
submission.to_csv('submission.csv', index=False)
display(submission)
print('Submission file saved.')