<a href="https://www.kaggle.com/code/aaronisomaisom3/s5e8-voting-classifier?scriptVersionId=254816916" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Author: Aaron Isom
# Kaggle Playground-Series-S5e8 - Binary Classification with a Bank Dataset
# Voting Classifier using CatBoost, LGBM, and XGBoost

from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load data
train = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')
original = pd.read_csv('/kaggle/input/bank-marketing-dataset-full/bank-full.csv', delimiter=";")
submission = pd.read_csv('/kaggle/input/playground-series-s5e8/sample_submission.csv')

original['y'] = original['y'].replace({'yes': 1, 'no': 0})

train = pd.concat([train, original], axis=0, ignore_index=True)

# Features for training (drop id and target)
X = train.drop(['id', 'y'], axis=1)
y = train['y']

# Features for test set (drop only id)
X_test = test.drop(['id'], axis=1)

# Encode object and category columns
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.fit_transform(X_test[col].astype(str))

In [3]:
# Create three voting classifiers - CatBoost, LGBM, and XGBoost
clf1 = LGBMClassifier(n_estimators=5000, learning_rate=0.05, max_depth=8, objective='binary', max_bin=512,
                      random_state=42, metric='binary_logloss', verbose=-1)
clf2 = XGBClassifier(n_estimators=5000, objective='binary:logistic', eval_metric='auc', random_state=42, n_jobs=-1, 
                          enable_categorical=True, tree_method='hist', verbose=-1)
clf3 = CatBoostClassifier(n_estimators=5000, loss_function='Logloss', eval_metric='AUC', random_state=42, verbose=0)

voting_clf = VotingClassifier(estimators=[('lgbm', clf1), ('xgb', clf2), ('cat', clf3)], voting='soft')
voting_clf.fit(X, y)

scores = cross_val_score(voting_clf, X, y, cv=5, scoring='roc_auc')
print("Voting CV=5 ROC AUC:", scores.mean())

preds = voting_clf.predict_proba(X_test)[:, 1]

Voting CV=5 ROC AUC: 0.966969144450408


In [4]:
# Final submission
submission['y'] = preds
submission.to_csv('submission.csv', index=False)
display(submission)
print('Submission file saved.')

Unnamed: 0,id,y
0,750000,0.002510
1,750001,0.055011
2,750002,0.000106
3,750003,0.000049
4,750004,0.016105
...,...,...
249995,999995,0.000080
249996,999996,0.050251
249997,999997,0.811159
249998,999998,0.000374


Submission file saved.
