In [1]:
# Author: Aaron Isom
# Kaggle Playground-Series-S5e8 - Binary Classification with a Bank Dataset
# Voting Classifier using CatBoost, LGBM, and XGBoost

from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load data
train = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')
original = pd.read_csv('/kaggle/input/bank-marketing-dataset-full/bank-full.csv', delimiter=";")
submission = pd.read_csv('/kaggle/input/playground-series-s5e8/sample_submission.csv')

original['y'] = original['y'].replace({'yes': 1, 'no': 0})

train = pd.concat([train, original], axis=0, ignore_index=True)

# Features for training (drop id and target)
X = train.drop(['id', 'y'], axis=1)
y = train['y']

# Features for test set (drop only id)
X_test = test.drop(['id'], axis=1)

# Encode object and category columns
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.fit_transform(X_test[col].astype(str))

In [None]:
# Create three voting classifiers - CatBoost, LGBM, and XGBoost
clf1 = LGBMClassifier(n_estimators=1000, objective='binary', random_state=42, metric='binary_logloss', verbose=-1)
clf2 = XGBClassifier(n_estimators=1000, objective='binary:logistic', eval_metric='auc', random_state=42, n_jobs=-1, 
                          enable_categorical=True, tree_method='hist', verbose=-1)
clf3 = CatBoostClassifier(n_estimators=1000, loss_function='Logloss', eval_metric='AUC', random_state=42, verbose=-1)

voting_clf = VotingClassifier(estimators=[('lgbm', clf1), ('xgb', clf2), ('cat', clf3)], voting='soft')
voting_clf.fit(X, y)

preds = voting_clf.predict_proba(X_test)[:, 1]

print("VotingClassifier ROC AUC Score:", roc_auc_score(y, preds))

[LightGBM] [Info] Number of positive: 95777, number of negative: 699434
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061276 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1001
[LightGBM] [Info] Number of data points in the train set: 795211, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120442 -> initscore=-1.988249
[LightGBM] [Info] Start training from score -1.988249
Learning rate set to 0.021599
0:	total: 172ms	remaining: 28m 37s
1:	total: 263ms	remaining: 21m 56s
2:	total: 349ms	remaining: 19m 21s
3:	total: 438ms	remaining: 18m 14s
4:	total: 527ms	remaining: 17m 34s
5:	total: 612ms	remaining: 16m 58s
6:	total: 704ms	remaining: 16m 44s
7:	total: 798ms	remaining: 16m 36s
8:	total: 886ms	remaining: 16m 23s
9:	total: 1.03s	remaining: 17m 8s
10:	total: 1.12s	remaining: 16m 59s
11:	total: 1.21s	remainin

In [None]:
# Final submission
submission['y'] = preds
submission.to_csv('submission.csv', index=False)
display(submission)
print('Submission file saved.')