# Binary Classification with a Bank Dataset - XGBoost

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import optuna


  machar = _get_machar(dtype)
  from .autonotebook import tqdm as notebook_tqdm


## Reading from files

In [2]:
data = pd.read_csv('./playground-series-s5e8/train.csv')
other_data = pd.read_csv('./playground-series-s5e8/bank-full.csv')
X_test = pd.read_csv('./playground-series-s5e8/test.csv')
data = pd.concat([data, other_data])
data = data.drop(columns=["id"])
X_test = X_test.drop(columns=["id"])

y = data['y']
features = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
cat_features = ['job', 'marital', 'education', 'default', 'housing','loan','contact','month','poutcome']
X = data[features]

print(y.value_counts(normalize=True))

y
0    0.879558
1    0.120442
Name: proportion, dtype: float64


## Optuna - Catboost

In [3]:
def objective_catb(trial):
    params = {
        'random_state': 42,
        'iterations': trial.suggest_int('iterations', 2000, 5000),  # number of trees
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 8),  # tree depth
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),  # L2 regularization
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),  # randomness in splits
        'border_count': trial.suggest_int('border_count', 32, 255),  # number of splits for numeric features
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),  # feature sampling
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'rsm': trial.suggest_float('rsm', 0.5, 1.0)

    }
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        scale_pos_weight = len(y_train) / (2 * sum(y_train))

        model = CatBoostClassifier(**params, eval_metric='AUC', loss_function='Logloss', verbose=0, class_weights=[1, scale_pos_weight])
        model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
        preds = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, preds)
        auc_scores.append(auc)
    mean_auc = np.mean(auc_scores)
    print(f"~~~~~~~~Trial {trial.number} mean AUC: {mean_auc:.4f}\n~~~~~~")
    return mean_auc

## Catboost

In [4]:
best_params = {'iterations': 2312, 'learning_rate': 0.06641334484629591, 'depth': 11, 'l2_leaf_reg': 4.258468780350662, 'random_strength': 8.767951892289316, 'border_count': 160, 'grow_policy': 'Lossguide', 'colsample_bylevel': 0.5842739001189716, 'bootstrap_type': 'MVS'}

auc_scores = []
oof_preds_catb = np.zeros(len(X))
test_preds_catb = np.zeros(len(X_test))

kf = StratifiedKFold(n_splits=10 , shuffle=True, random_state=42)


for fold, (train_index, val_index) in enumerate(kf.split(X, y)):

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    scale_pos_weight = len(y_train) / (2 * sum(y_train))
    
    model = CatBoostClassifier(**best_params, eval_metric='AUC', loss_function='Logloss', verbose=0, class_weights=[1, scale_pos_weight])
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100,cat_features=cat_features)

    val_preds = model.predict_proba(X_val)[:, 1] 
    oof_preds_catb[val_index] = val_preds
    
    auc = roc_auc_score(y_val, val_preds)
    auc_scores.append(auc)
    print(f"Fold {fold + 1} AUC: {auc:.4f}")

    test_preds_catb += model.predict_proba(X_test)[:, 1]

test_preds_catb /= kf.n_splits

# study = optuna.create_study(direction='maximize')
print(f"Average AUC across folds: {np.mean(auc_scores):.4f}")
# study.optimize(objective_catb, n_trials=10)

# # Show best result
# print("Best AUC:", study.best_value)
# print("Best parameters:", study.best_params)
0.9656

Fold 1 AUC: 0.9666
Fold 2 AUC: 0.9660
Fold 3 AUC: 0.9649
Fold 4 AUC: 0.9651
Fold 5 AUC: 0.9650
Fold 6 AUC: 0.9649
Fold 7 AUC: 0.9658
Fold 8 AUC: 0.9658
Fold 9 AUC: 0.9665
Fold 10 AUC: 0.9652
Average AUC across folds: 0.9656


0.9656

## Label Encoding

In [5]:
X.education = X.education.map({'primary': 0, 'secondary': 1, 'tertiary': 2})
X_test.education = X_test.education.map({'primary': 0, 'secondary': 1, 'tertiary': 2})

X.default = X.default.map({'no': 0, 'yes': 1})
X_test.default = X_test.default.map({'no': 0, 'yes': 1})

X.housing = X.housing.map({'no': 0, 'yes': 1})
X_test.housing = X_test.housing.map({'no': 0, 'yes': 1})

X.loan = X.loan.map({'no': 0, 'yes': 1})
X_test.loan = X_test.loan.map({'no': 0, 'yes': 1})

X.contact = X.contact.map({'cellular': 0, 'telephone': 1})
X_test.contact = X_test.contact.map({'cellular': 0, 'telephone': 1})

X.month = X.month.map({'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12})
X['day_of_year'] = (X['month'] - 1) * 31 + X['day']
X['day_sin'] = np.sin(2 * np.pi * X.day_of_year / 372)
X['day_cos'] = np.cos(2 * np.pi * X.day_of_year / 372)
X['month_sin'] = np.sin(2 * np.pi * X.month / 12)
X['month_cos'] = np.cos(2 * np.pi * X.month / 12)

X_test.month = X_test.month.map({'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12})
X_test['day_of_year'] = (X_test['month'] - 1) * 31 + X_test['day']
X_test['day_sin'] = np.sin(2 * np.pi * X_test.day_of_year / 372)
X_test['day_cos'] = np.cos(2 * np.pi * X_test.day_of_year / 372)
X_test['month_sin'] = np.sin(2 * np.pi * X_test.month / 12)
X_test['month_cos'] = np.cos(2 * np.pi * X_test.month / 12)

X.poutcome = X.poutcome.map({'failure': -1, 'other': 0, 'success': 1})
X_test.poutcome = X_test.poutcome.map({'failure': -1, 'other': 0, 'success': 1})

X = X.drop(['day', 'month', 'day_of_year'], axis=1)
X_test = X_test.drop(['day', 'month', 'day_of_year'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.education = X.education.map({'primary': 0, 'secondary': 1, 'tertiary': 2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.default = X.default.map({'no': 0, 'yes': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.housing = X.housing.map({'no': 0, 'yes': 1})
A value is trying to be set on a c

## Data Split

In [6]:
#X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

## Imputing

In [7]:
X = X.replace('unknown', np.nan)
X_test = X_test.replace('unknown', np.nan)

X_columns = X.columns
X_index = X.index
X_test_columns = X_test.columns
X_test_index = X_test.index

#Imputing missing values
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X_columns, index=X_index)
X_test = pd.DataFrame(imputer.fit_transform(X_test), columns=X_test_columns, index=X_test_index)

## One-Hot Encoding

In [8]:
#Converting categorical columns to string
object_cols = ['job', 'marital']
X[object_cols] = X[object_cols].astype(str)
X_test[object_cols] = X_test[object_cols].astype(str)

#One-hot encoding
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(X[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.fit_transform(X_test[object_cols]))

#Matching indexes
OH_cols.index = X.index
OH_cols_test.index = X_test.index


#Droping original object columns and concatenating
num_X = X.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

X = pd.concat([num_X, OH_cols], axis=1)
X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

#Changing dtype of columns
for col in X.columns:
    X[col] = pd.to_numeric(X[col])
    X_test[col] = pd.to_numeric(X_test[col])

print(X.dtypes)
print(X_test.dtypes)

age            int64
education    float64
default        int64
balance        int64
housing        int64
loan           int64
contact      float64
duration       int64
campaign       int64
pdays          int64
previous       int64
poutcome     float64
day_sin      float64
day_cos      float64
month_sin    float64
month_cos    float64
0            float64
1            float64
2            float64
3            float64
4            float64
5            float64
6            float64
7            float64
8            float64
9            float64
10           float64
11           float64
12           float64
13           float64
dtype: object
age            int64
education    float64
default        int64
balance        int64
housing        int64
loan           int64
contact      float64
duration       int64
campaign       int64
pdays          int64
previous       int64
poutcome     float64
day_sin      float64
day_cos      float64
month_sin    float64
month_cos    float64
0            float64

## Interaction

In [None]:
age_ed_product = X['age']*X['education']
X = pd.concat([X, age_ed_product], axis=1)

TypeError: concat() takes 1 positional argument but 2 were given

## Saving new training data to a file

In [9]:
#X.to_csv('X_train.csv')

## Optuna - XGBoost

In [10]:
def objective_xgb(trial):
    params = {
        'eval_metric': 'auc',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 6.5, 8),
        'n_jobs': -1,
        'random_state': 42,
    }

    print(f"Trial {trial.number} parameters: {params}")
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, val_index in kf.split(X):

        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        scale_pos_weight = len(y_train) / (2 * sum(y_train))

        model = XGBClassifier(**params, use_label_encoder=False, verbosity=0, class_weights=[1, scale_pos_weight])
        model.fit(X_train, y_train)

        preds = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, preds)
        auc_scores.append(auc)
    mean_auc = np.mean(auc_scores)
    print(f"Trial {trial.number} mean AUC: {mean_auc:.4f}\n")
    return mean_auc

## XGBoost

In [11]:
best_params = {
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'auc',
    'lambda': 4.789789675776273, 
    'alpha': 0.08471537263019147, 
    'gamma': 5.823912998954795, 
    'colsample_bytree': 0.6462228625505225, 
    'subsample': 0.6209806695110034, 
    'learning_rate': 0.08080462403564155, 
    'n_estimators': 898, 
    'max_depth': 12, 
    'min_child_weight': 6, 
    'scale_pos_weight': 7.3
}

auc_scores = []
oof_preds_xgb = np.zeros(len(X))
test_preds_xgb = np.zeros(len(X_test))

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    scale_pos_weight = len(y_train) / (2 * sum(y_train))

    model = XGBClassifier(**best_params, class_weights=[1, scale_pos_weight])
    model.fit(X_train, y_train)

    val_preds = model.predict_proba(X_val)[:, 1] 
    oof_preds_xgb[val_index] = val_preds
    
    auc = roc_auc_score(y_val, val_preds)
    auc_scores.append(auc)
    print(f"Fold {fold + 1} AUC: {auc:.4f}")

    test_preds_xgb += model.predict_proba(X_test)[:, 1]

test_preds_xgb /= kf.n_splits

print(f"Average AUC across folds: {np.mean(auc_scores):.4f}")
#study = optuna.create_study(direction='maximize')
#study.optimize(objective_xgb, n_trials=30)

# Show best result
#print("Best AUC:", study.best_value)
#print("Best parameters:", study.best_params)


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1 AUC: 0.9659


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2 AUC: 0.9652


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3 AUC: 0.9648


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4 AUC: 0.9646


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 5 AUC: 0.9644


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 6 AUC: 0.9639


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 7 AUC: 0.9651


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 8 AUC: 0.9653


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 9 AUC: 0.9659


Parameters: { "class_weights" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 10 AUC: 0.9643
Average AUC across folds: 0.9649


## Optuna - LightGBM

In [12]:
def objective_lgbm(trial):
    params = {
        'metric': 'auc',
        'random_state': 42,
        'n_estimators': 2000,
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves' : trial.suggest_int('num_leaves', 20, 255),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        scale_pos_weight = len(y_train) / (2 * sum(y_train))

        model = LGBMClassifier(**params, early_stopping_round=100, verbosity=-1, class_weights=[1, scale_pos_weight])
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="auc")
        preds = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, preds)
        auc_scores.append(auc)
    mean_auc = np.mean(auc_scores)
    print(f"~~~~~~~~Trial {trial.number} mean AUC: {mean_auc:.4f}\n~~~~~~")
    return mean_auc

## LightGBM

In [13]:
best_params = {'metric': 'auc', 'random_state': 42, 'n_estimators': 2000,'reg_alpha': 1.167185634134842, 'reg_lambda': 8.79073465010749, 'colsample_bytree': 0.5005629861380371, 'subsample': 0.6822575660406408, 'learning_rate': 0.06493690919467418, 'max_depth': 11, 'num_leaves': 188, 'min_child_samples': 88, 'min_data_per_groups': 1}


auc_scores = []
oof_preds_lgbm = np.zeros(len(X))
test_preds_lgbm = np.zeros(len(X_test))

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    scale_pos_weight = len(y_train) / (2 * sum(y_train))
    
    model = LGBMClassifier(**best_params, early_stopping_round=100, verbosity=-1, class_weights=[1, scale_pos_weight])
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="auc")

    val_preds = model.predict_proba(X_val)[:, 1] 
    oof_preds_lgbm[val_index] = val_preds
    
    auc = roc_auc_score(y_val, val_preds)
    auc_scores.append(auc)
    print(f"Fold {fold + 1} AUC: {auc:.4f}")

    test_preds_lgbm += model.predict_proba(X_test)[:, 1]

test_preds_lgbm /= kf.n_splits

print(f"Average AUC across folds: {np.mean(auc_scores):.4f}")
#study = optuna.create_study(direction='maximize')
#study.optimize(objective_lgbm, n_trials=30)

# Show best result
#print("Best AUC:", study.best_value)
#print("Best parameters:", study.best_params)



Fold 1 AUC: 0.9672
Fold 2 AUC: 0.9668
Fold 3 AUC: 0.9662
Fold 4 AUC: 0.9663
Fold 5 AUC: 0.9659
Fold 6 AUC: 0.9654
Fold 7 AUC: 0.9666
Fold 8 AUC: 0.9671
Fold 9 AUC: 0.9670
Fold 10 AUC: 0.9654
Average AUC across folds: 0.9664


## Ensembling - Stacking

In [14]:
#all_test_preds = pd.DataFrame()
#all_test_preds["xgboost"] = test_preds_xgb
#all_test_preds["lgbm"] = test_preds_lgbm
#all_train_preds = pd.DataFrame()
#all_train_preds["xgboost"] = oof_preds_xgb
#all_train_preds["lgbm"] = oof_preds_lgbm
#all_train_preds["y"] = y

In [15]:
train_stack = np.column_stack((oof_preds_xgb,oof_preds_lgbm,oof_preds_catb)) #train predictions of xgboost and lightgbm
test_stack = np.column_stack((test_preds_xgb,test_preds_lgbm,test_preds_catb)) #test predictions of xgboost and lightgbm

final_model = LogisticRegression()

final_model.fit(train_stack, y)

final_predictions = final_model.predict_proba(test_stack)[:,1]

## Test

In [16]:
final_predictions = pd.Series(final_predictions)
final_predictions.index = range(750000, 750000 + len(final_predictions))

print(final_predictions)
final_predictions.to_csv('submission.csv')

750000    0.007479
750001    0.032272
750002    0.006847
750003    0.006827
750004    0.009461
            ...   
999995    0.006840
999996    0.025673
999997    0.717756
999998    0.006975
999999    0.033641
Length: 250000, dtype: float64
