In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from hyperopt.pyll import scope
import ydata_profiling as ydp



In [2]:
#global variables

TARGET = 'Exited'

SEED = 42

FOLDS = 5

VER = 1
EVALS = 100
FILEPATH = '../data/'

In [3]:
test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [4]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

DataFrame Information:
______________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


None



DataFrame Head:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0




DataFrame Tail:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.0,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.0,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.0,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.0,1,0.0,1.0,71173.03,0
165033,165033,15732798,Ulyanov,850,France,Male,31.0,1,0.0,1,1.0,0.0,61581.79,1




DataFrame Description:
______________________


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,165034.0,82516.5,47641.3565,0.0,41258.25,82516.5,123774.8,165033.0
CustomerId,165034.0,15692010.0,71397.816791,15565701.0,15633141.0,15690169.0,15756820.0,15815690.0
CreditScore,165034.0,656.4544,80.10334,350.0,597.0,659.0,710.0,850.0
Age,165034.0,38.12589,8.867205,18.0,32.0,37.0,42.0,92.0
Tenure,165034.0,5.020353,2.806159,0.0,3.0,5.0,7.0,10.0
Balance,165034.0,55478.09,62817.663278,0.0,0.0,0.0,119939.5,250898.09
NumOfProducts,165034.0,1.554455,0.547154,1.0,1.0,2.0,2.0,4.0
HasCrCard,165034.0,0.7539537,0.430707,0.0,1.0,1.0,1.0,1.0
IsActiveMember,165034.0,0.4977702,0.499997,0.0,0.0,0.0,1.0,1.0
EstimatedSalary,165034.0,112574.8,50292.865585,11.58,74637.57,117948.0,155152.5,199992.48




Number of Null Values:
______________________


id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64



Number of Duplicated Rows:
______________________


0



Number of Unique Values:
______________________


id                 165034
CustomerId          23221
Surname              2797
CreditScore           457
Geography               3
Gender                  2
Age                    71
Tenure                 11
Balance             30075
NumOfProducts           4
HasCrCard               2
IsActiveMember          2
EstimatedSalary     55298
Exited                  2
dtype: int64



DataFrame Shape:
______________________
Rows: 165034, Columns: 14


DataFrame Columns:
______________________


Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [5]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [6]:
def preprocess_data(df, cat_features, num_features, scaler):
    
    
    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features
    df[num_features] = scaler.fit_transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname','CustomerId'], axis=1, errors='ignore')
    df = df.dropna()
    df = df.drop_duplicates()

    return df

In [7]:
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    "Initial",
]
num_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "EstimatedSalary",
    "Uniqueness",
    "Vowels",
    "Consonants",
    "Length",
]


In [8]:
# Initialize StandardScaler
scaler = StandardScaler()

# Assuming 'train' and 'test' are your DataFrame names
train_df = preprocess_data(train, cat_features, num_features, scaler)

# Split the training data
X_train = train_df.drop(['Exited', 'id'], axis=1) 
y_train = train_df['Exited']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED)

In [9]:
# CatBoost hyperparameter space
cat_space = {
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 1)),
    'depth': scope.int(hp.quniform('depth', 4, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', np.log10(0.01), np.log10(0.3)),
    'random_strength': hp.uniform('random_strength', 1e-9, 10),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.0),
    'border_count': scope.int(hp.quniform('border_count', 1, 255, 1)),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 1, 10),
    'scale_pos_weight': float(np.sum(y_train == 0)) / np.sum(y_train == 1),
    'random_seed': SEED, 
    'eval_metric': 'AUC',
    'thread_count': -1,
}

def cat_objective(cat_space):
    model = CatBoostClassifier(**cat_space)

    # Implement cross-validation
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr, verbose=False)
        y_pred_prob = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred_prob)
        auc_scores.append(auc_score)

    average_auc_score = np.mean(auc_scores)

    return {'loss': -average_auc_score, 'status': STATUS_OK}

cat_trials = Trials()
cat_best_hyperparams = fmin(fn=cat_objective,  
                            space=cat_space,  
                            algo=tpe.suggest,
                            max_evals=EVALS,
                            trials=cat_trials)

print("The best hyperparameters for CatBoost are: ", "\n")
print(cat_best_hyperparams)

print("The best auc score for CatBoost is: ", "\n")
print(-cat_trials.best_trial['result']['loss'])


100%|██████████| 100/100 [3:41:16<00:00, 132.76s/trial, best loss: -0.8914323653906182] 
The best hyperparameters for CatBoost are:  

{'bagging_temperature': 0.1231886764685493, 'border_count': 202.0, 'depth': 5.0, 'iterations': 616.0, 'l2_leaf_reg': 1126.9402495846957, 'learning_rate': 0.16911347576117253, 'random_strength': 2.8198259268410744}
The best auc score for CatBoost is:  

0.8914323653906182


In [10]:

# LightGBM hyperparameter space
lgb_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 2, 10, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 10, 150, 1)),
    'learning_rate': hp.loguniform('learning_rate', np.log10(0.01), np.log10(0.3)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'scale_pos_weight': float(np.sum(y_train == 0)) / np.sum(y_train == 1),
    'random_state': SEED,
    'verbose': -1,
    'nthreads': -1,
}

def lgb_objective(lgb_space):
    model = lgb.LGBMClassifier(**lgb_space)

    # Implement cross-validation
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        y_pred_prob = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred_prob)
        auc_scores.append(auc_score)

    average_auc_score = np.mean(auc_scores)

    return {'loss': -average_auc_score, 'status': STATUS_OK}

lgb_trials = Trials()
lgb_best_hyperparams = fmin(fn=lgb_objective,
                            space=lgb_space,  
                            algo=tpe.suggest,
                            max_evals=EVALS,
                            trials=lgb_trials)

print("The best hyperparameters for LightGBM are: ", "\n")
print(lgb_best_hyperparams)

print("The best auc score for LightGBM is: ", "\n")
print(-lgb_trials.best_trial['result']['loss'])



100%|██████████| 100/100 [07:29<00:00,  4.49s/trial, best loss: -0.8911942782953309]
The best hyperparameters for LightGBM are:  

{'colsample_bytree': 0.5081958528597559, 'learning_rate': 0.15232371892973343, 'max_depth': 3.0, 'n_estimators': 376.0, 'num_leaves': 109.0, 'reg_alpha': 0.7065951745451389, 'reg_lambda': 0.9138730989541799, 'subsample': 0.8593589507171163}
The best auc score for LightGBM is:  

0.8911942782953309


In [11]:
# Define the hyperparameter space for XGBoost
xgb_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 2)),
    'max_depth': scope.int(hp.quniform('max_depth', 2, 8, 1)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'reg_alpha': scope.int(hp.quniform('reg_alpha', 0, 10, 1)),
    'reg_lambda': hp.uniform('reg_lambda', 1, 10),
    'gamma': hp.loguniform('gamma', -10, 10),
    'learning_rate': hp.loguniform('learning_rate', np.log10(0.1), np.log10(0.25)),
    'random_state': SEED,
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'nthread': -1,
}

# Objective function for hyperparameter tuning
def xgb_objective(space):
    # Compute the scale_pos_weight
    ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

    model = xgb.XGBClassifier(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'],
        gamma=space['gamma'],
        learning_rate=space['learning_rate'],
        scale_pos_weight=ratio,
        random_state=SEED,
        nthread=-1,
    )

    # Implement cross-validation
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        y_pred_prob = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred_prob)
        auc_scores.append(auc_score)

    average_auc_score = np.mean(auc_scores)

    return {'loss': -average_auc_score, 'status': STATUS_OK}

# Set up trials
xgb_trials = Trials()

# Run the hyperparameter search using fmin
xgb_best_hyperparams = fmin(fn=xgb_objective,
                        space=xgb_space,
                        algo=tpe.suggest,
                        max_evals=EVALS,  
                        trials=xgb_trials)

print("The best hyperparameters for XGBoost are: ", "\n")
print(xgb_best_hyperparams)

print("The best auc score for XGBoost is: ", "\n")

100%|██████████| 100/100 [17:16<00:00, 10.37s/trial, best loss: -0.8906249427783338]
The best hyperparameters for XGBoost are:  

{'colsample_bytree': 0.8602647436899948, 'gamma': 7.901674511313632e-05, 'learning_rate': 0.40765452396565677, 'max_depth': 2.0, 'min_child_weight': 0.22748393083315713, 'n_estimators': 660.0, 'reg_alpha': 5.0, 'reg_lambda': 9.70770080831118, 'subsample': 0.9999371192592543}
The best auc score for XGBoost is:  



In [20]:
from sklearn.linear_model import LogisticRegression
# Preprocess the test data
test_df = preprocess_data(test, cat_features, num_features, scaler)
X_test = test_df.drop(['id'], axis=1)

# Train the XGBoost model
xgb_best_hyperparams['max_depth'] = int(xgb_best_hyperparams['max_depth'])
xgb_model = xgb.XGBClassifier(**xgb_best_hyperparams)
xgb_model.fit(X_train, y_train)

# Train CatBoost model
cat_model = CatBoostClassifier(**cat_best_hyperparams)
cat_model.fit(X_train, y_train)

# Train LightGBM model
lgb_best_hyperparams['n_estimators'] = int(lgb_best_hyperparams['n_estimators'])
lgb_best_hyperparams['num_leaves'] = int(lgb_best_hyperparams['num_leaves'])
lgb_best_hyperparams['max_depth'] = int(lgb_best_hyperparams['max_depth'])
lgb_model = lgb.LGBMClassifier(**lgb_best_hyperparams)
lgb_model.fit(X_train, y_train)

# Ensemble predictions using stacking
xgb_pred_val = xgb_model.predict_proba(X_val)[:, 1]
cat_pred_val = cat_model.predict_proba(X_val)[:, 1]
lgb_pred_val = lgb_model.predict_proba(X_val)[:, 1]

# Stack predictions to create new meta-features for the validation set
stacked_val_predictions = np.column_stack((xgb_pred_val, cat_pred_val, lgb_pred_val))

# Train meta-model on stacked predictions
meta_model = LogisticRegression()
meta_model.fit(stacked_val_predictions, y_val)

# Get predictions for the test set
xgb_pred_test = xgb_model.predict_proba(X_test)[:, 1]
cat_pred_test = cat_model.predict_proba(X_test)[:, 1]
lgb_pred_test = lgb_model.predict_proba(X_test)[:, 1]

# Stack test predictions to create meta-features for the test set
stacked_test_predictions = np.column_stack((xgb_pred_test, cat_pred_test, lgb_pred_test))

# Final predictions on test set using the meta-model
ensemble_test_pred = meta_model.predict_proba(stacked_test_predictions)[:, 1]

# Evaluate ensemble model using stacking
ensemble_auc_score = roc_auc_score(y_val, meta_model.predict_proba(stacked_val_predictions)[:, 1])
print(f'Stacking Ensemble ROC AUC Score: {ensemble_auc_score}')

# Commented out: Simple averaging ensemble and its evaluation
# ensemble_pred = (xgb_pred_val + cat_pred_val + lgb_pred_val) / 3
# ensemble_auc_score = roc_auc_score(y_val, ensemble_pred)
# print(f'Average Ensemble ROC AUC Score: {ensemble_auc_score}')

# Calculate predictions for the validation dataset using stacking
ensemble_pred_binary = np.where(meta_model.predict_proba(stacked_val_predictions)[:, 1] > 0.5, 1, 0)

# Create the confusion matrix for stacking ensemble
confusion_mat = confusion_matrix(y_val, ensemble_pred_binary)
print("Confusion Matrix for Stacking Ensemble:")
print(confusion_mat)

# Commented out: Confusion matrix for averaging ensemble
# ensemble_pred_binary_avg = np.where(ensemble_pred > 0.5, 1, 0)
# confusion_mat_avg = confusion_matrix(y_val, ensemble_pred_binary_avg)
# print("Confusion Matrix for Average Ensemble:")
# print(confusion_mat_avg)


0:	learn: 0.5645430	total: 30.5ms	remaining: 18.8s
1:	learn: 0.4906664	total: 46.9ms	remaining: 14.4s
2:	learn: 0.4412535	total: 64.8ms	remaining: 13.2s
3:	learn: 0.4098956	total: 81.9ms	remaining: 12.5s
4:	learn: 0.3855566	total: 99ms	remaining: 12.1s
5:	learn: 0.3687144	total: 116ms	remaining: 11.8s
6:	learn: 0.3606600	total: 133ms	remaining: 11.6s
7:	learn: 0.3558702	total: 153ms	remaining: 11.6s
8:	learn: 0.3494181	total: 171ms	remaining: 11.5s
9:	learn: 0.3438233	total: 189ms	remaining: 11.5s
10:	learn: 0.3408428	total: 216ms	remaining: 11.9s
11:	learn: 0.3393950	total: 247ms	remaining: 12.4s
12:	learn: 0.3381894	total: 264ms	remaining: 12.3s
13:	learn: 0.3353182	total: 282ms	remaining: 12.1s
14:	learn: 0.3339253	total: 300ms	remaining: 12s
15:	learn: 0.3328453	total: 327ms	remaining: 12.3s
16:	learn: 0.3319291	total: 346ms	remaining: 12.2s
17:	learn: 0.3312234	total: 363ms	remaining: 12.1s
18:	learn: 0.3302697	total: 380ms	remaining: 11.9s
19:	learn: 0.3299288	total: 397ms	remain

In [21]:


# Get validation predictions
xgb_pred_val = xgb_model.predict_proba(X_val)[:, 1]
cat_pred_val = cat_model.predict_proba(X_val)[:, 1]
lgb_pred_val = lgb_model.predict_proba(X_val)[:, 1]

# Calculate AUC scores for each model as weights
xgb_weight = roc_auc_score(y_val, xgb_pred_val)
cat_weight = roc_auc_score(y_val, cat_pred_val)
lgb_weight = roc_auc_score(y_val, lgb_pred_val)

# Normalize weights
total_weight = xgb_weight + cat_weight + lgb_weight
xgb_weight /= total_weight
cat_weight /= total_weight
lgb_weight /= total_weight

# Apply weighted averaging
weighted_ensemble_pred = (xgb_pred_val * xgb_weight +
                          cat_pred_val * cat_weight +
                          lgb_pred_val * lgb_weight)

# Evaluate weighted ensemble model
weighted_ensemble_auc_score = roc_auc_score(y_val, weighted_ensemble_pred)
print(f'Weighted Averaging Ensemble ROC AUC Score: {weighted_ensemble_auc_score}')

# Apply weighted averaging to test predictions
xgb_pred_test = xgb_model.predict_proba(X_test)[:, 1]
cat_pred_test = cat_model.predict_proba(X_test)[:, 1]
lgb_pred_test = lgb_model.predict_proba(X_test)[:, 1]

weighted_ensemble_test_pred = (xgb_pred_test * xgb_weight +
                               cat_pred_test * cat_weight +
                               lgb_pred_test * lgb_weight)

# Calculate predictions for the validation dataset using weighted averaging
weighted_ensemble_pred_binary = np.where(weighted_ensemble_pred > 0.5, 1, 0)

# Create the confusion matrix for weighted averaging ensemble
confusion_mat_weighted = confusion_matrix(y_val, weighted_ensemble_pred_binary)
print("Confusion Matrix for Weighted Averaging Ensemble:")
print(confusion_mat_weighted)


Weighted Averaging Ensemble ROC AUC Score: 0.89259250897428
Confusion Matrix for Weighted Averaging Ensemble:
[[37233  1900]
 [ 4647  5731]]


In [22]:
from sklearn.ensemble import VotingClassifier

# Create a list of tuples with model name and model object
classifiers = [
    ('xgb', xgb.XGBClassifier(**xgb_best_hyperparams)),
    ('cat', CatBoostClassifier(**cat_best_hyperparams)),
    ('lgb', lgb.LGBMClassifier(**lgb_best_hyperparams))
]

# Create the voting classifier, specify type of voting
voting_clf = VotingClassifier(estimators=classifiers, voting='soft')

# Fit the voting classifier on the training data
voting_clf.fit(X_train, y_train)

# Evaluate the model
voting_pred = voting_clf.predict_proba(X_val)[:, 1]
voting_auc_score = roc_auc_score(y_val, voting_pred)
print(f'Voting Classifier ROC AUC Score: {voting_auc_score}')

# Predict on test data
voting_test_pred = voting_clf.predict_proba(X_test)[:, 1]

# Calculate predictions for the validation dataset using the voting classifier
voting_pred_binary = np.where(voting_pred > 0.5, 1, 0)

# Create the confusion matrix for the voting classifier
confusion_mat_voting = confusion_matrix(y_val, voting_pred_binary)
print("Confusion Matrix for Voting Classifier:")
print(confusion_mat_voting)


0:	learn: 0.5645430	total: 18.6ms	remaining: 11.4s
1:	learn: 0.4906664	total: 34.5ms	remaining: 10.6s
2:	learn: 0.4412535	total: 52ms	remaining: 10.6s
3:	learn: 0.4098956	total: 69.5ms	remaining: 10.6s
4:	learn: 0.3855566	total: 87.5ms	remaining: 10.7s
5:	learn: 0.3687144	total: 105ms	remaining: 10.7s
6:	learn: 0.3606600	total: 122ms	remaining: 10.6s
7:	learn: 0.3558702	total: 138ms	remaining: 10.5s
8:	learn: 0.3494181	total: 154ms	remaining: 10.4s
9:	learn: 0.3438233	total: 171ms	remaining: 10.4s
10:	learn: 0.3408428	total: 189ms	remaining: 10.4s
11:	learn: 0.3393950	total: 209ms	remaining: 10.5s
12:	learn: 0.3381894	total: 226ms	remaining: 10.5s
13:	learn: 0.3353182	total: 243ms	remaining: 10.5s
14:	learn: 0.3339253	total: 260ms	remaining: 10.4s
15:	learn: 0.3328453	total: 278ms	remaining: 10.4s
16:	learn: 0.3319291	total: 296ms	remaining: 10.4s
17:	learn: 0.3312234	total: 314ms	remaining: 10.4s
18:	learn: 0.3302697	total: 332ms	remaining: 10.4s
19:	learn: 0.3299288	total: 350ms	rema

In [18]:
test_pred_prob = ensemble_test_pred 
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.043852
1,165035,0.882886
2,165036,0.026335
3,165037,0.248632
4,165038,0.391011
