In [None]:
from IPython.display import display, Markdown
import numpy as np
def set_frame_style(df, caption="", font_size ='20px' ):
    """Return a styled view of ``df`` with a random colour gradient and a caption.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to present; its ``.style`` accessor is used.
    caption : str
        Text shown as the table caption.
    font_size : str
        CSS ``font-size`` value applied to the caption.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styler with a background gradient, the caption and bold brown caption styling.
    """
    palettes = ['Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
                'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
                'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn']
    # randint's upper bound is exclusive: draw from the full index range so the
    # first and last palettes can be selected too.
    cmap = palettes[np.random.randint(0, len(palettes))]
    caption_style = {
        'selector': 'caption',
        'props': [
            ('color', 'Brown'),
            ('font-size', font_size),
            ('font-weight', 'bold'),
        ],
    }
    return (
        df.style
        .background_gradient(cmap=cmap)
        .set_caption(caption)
        .set_table_styles([caption_style])
    )

<center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:280%;text-align:center;border-radius: 9px 5px;padding : 9px">Bank Customer Churn Prediction</p>   </span></font></center> 

 <img src="https://emyrael.github.io/assets/img/churn.png" height="1000" width="1000" style="object-fit: cover;">

<div class="anchor" id="top" style=" margin-right: auto; margin-left: auto; padding: 10px; font-size : 15px; background-color: #DFFFFE; border-radius: 2px; font-color :  #581845  ; border: 2px solid #581845;"> Customer Churn : Customer churn is the percentage of customers who stop buying a business's products or services over a certain period of time. It's also known as customer attrition, customer turnover, or customer defection.</div>

# <center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Table of contents </p>   </span></font></center> 

# Table of Contents

<div class="anchor" id="top" style="
    margin-right: auto; 
    margin-left: auto;
    padding: 10px;
   font-size : 15px;
    background-color: #DFFFFE;
    border-radius: 2px;
    font-color :  #581845  ;        
    border: 2px solid #581845;">
 
- [1. About the dataset](#1)
    - [1.1 Dataset Description](#1.1)
- [2. Importing the data](#2)
    - [2.1 Columns creation](#2.1)
- [3. Exploratory Data Analysis](#3)
    - [3.1 Distribution of Features](#3.1)
    - [3.2 Insights](#3.2)
- [4. Preprocessing](#4)
    - [4.1 RobustScaler](#4.1)  
    - [4.2 One Hot Encoding](#4.2) 
- [5. Model Training](#5)
    - [5.1 XGB Classifier](#5.1)
    - [5.2 LGBM Classifier](#5.2)
    - [5.3 Neural Network](#5.3)
    - [5.4 CatBoost Classifier](#5.4)
    - [5.5 Hyperparameter Optimization](#5.5)
    - [5.6 Ensemble (XGB, LGBM, CatBoost)](#5.6)
- [6. Model Inference](#6)
    - [6.1 Feature Importance](#6.1) 


# <a id="1"></a><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">About the dataset </p>   </span></font></center> 

# About the dataset



# <a id="1.1"></a><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:150%;text-align:center;border-radius:5px 5px;padding : 5px">Dataset Description </p>   </span></font></center> 

## Dataset Description

<div class="anchor" id="top" style="
    margin-right: auto; 
    margin-left: auto;
    padding: 10px;
   font-size : 120%;
    background-color: #DFFFFE;
    border-radius: 2px;
    font-color :  #581845  ;        
    border: 2px solid #581845;">

* Customer ID: A unique identifier for each customer
* Surname: The customer's surname or last name
* Credit Score: A numerical value representing the customer's credit score
* Geography: The country where the customer resides (France, Spain or Germany)
* Gender: The customer's gender (Male or Female)
* Age: The customer's age.
* Tenure: The number of years the customer has been with the bank
* Balance: The customer's account balance
* NumOfProducts: The number of bank products the customer uses (e.g., savings account, credit card)
* HasCrCard: Whether the customer has a credit card (1 = yes, 0 = no)
* IsActiveMember: Whether the customer is an active member (1 = yes, 0 = no)
* EstimatedSalary: The estimated salary of the customer
* Exited: Whether the customer has churned (1 = yes, 0 = no)



# <a id="2"></a><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Importing the data </p>   </span></font></center> 

#  💽 Importing the data

In [None]:
import pandas as pd

# Synthetic competition training data; its 'id' column is dropped right away.
syn_df = pd.read_csv('/kaggle/input/bank-customer-churn-prediction-spark4ai/train.csv')
syn_df = syn_df.drop('id', axis=1)
# Original dataset, loaded alongside the synthetic one.
original_df = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')
test_df = pd.read_csv('/kaggle/input/bank-customer-churn-prediction-spark4ai/test.csv')
# Keep the test ids aside: they are needed again when the submission is built.
ids = test_df['id']
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
syn_df.info()

In [None]:
# Preview the first rows of the raw test set.
test_df.head()

In [None]:
# Number of distinct ids in the test set.
ids.nunique()

In [None]:
# Styled preview of the first rows of the synthetic training data.
set_frame_style(syn_df.head(),'Training Data')

In [None]:
# Cardinality of the Surname column in the synthetic training data.
print(f' Total Number of Unique Surnames : {syn_df.Surname.nunique()}')

In [None]:
# Cardinality of the Geography column in the synthetic training data.
print(f' Total Number of Unique Countries : {syn_df.Geography.nunique()}')

In [None]:
# 'RowNumber' exists only in the original dataset; remove it before previewing.
# Note: re-running this cell without reloading raises, because the column is already gone.
original_df = original_df.drop(columns=['RowNumber'])
set_frame_style(original_df.head())

In [None]:
# DataFrame.value_counts() on the boolean isna() mask counts each distinct
# row-wise missingness pattern (one output row per pattern); it does not give
# per-column null totals.
display(pd.DataFrame(syn_df.isna().value_counts()))

In [None]:
from prettytable import PrettyTable

# Tabulate how many distinct values each column of the synthetic data holds.
table = PrettyTable()
table.field_names = ["Features","Unique Values"]
for column in syn_df.columns:
    distinct_count = syn_df[str(column)].nunique()
    table.add_row([column, str(distinct_count)])
print('Unique values in synthetically generated dataset : \n')
print(table)

## `Geography`, `Gender`, `Tenure`, `NumOfProducts`, `HasCrCard` and `IsActiveMember` are the categorical features

In [None]:
from prettytable import PrettyTable


# Same cardinality table as for the synthetic data, computed on the original dataset.
table = PrettyTable()
table.field_names = ["Features","Unique Values"]
for column in original_df.columns:
    distinct_count = original_df[str(column)].nunique()
    table.add_row([column, str(distinct_count)])
print('Unique values in original dataset : \n')
print(table)

 > ### Both datasets show similar cardinalities. This is expected, since the synthetic data was generated from the original one, but it is good practice to check before merging the two dataframes.

In [None]:
# Stack the synthetic and original rows into a single training frame.
df = pd.concat([syn_df, original_df], axis=0)

# Drop every row that has a missing value in any column.
df = df.dropna()
# Shuffle with a fixed seed so that Restart & Run All reproduces the same row
# order (and therefore the same CV folds); the other random steps in this
# notebook already use seed 42.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.info()

# <a id="3"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Feature Engineering </p>   </span></font></center> 
    

## Feature Engineering

#### Age Categories : 

In [None]:
def age_tr(df) : 
    """Add an ``Age_Category`` column that buckets ``Age`` into five bands.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Age`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the categorical ``Age_Category`` column added.
    """
    # pd.cut builds right-closed intervals by default, so the first bin would be
    # (18, 30] and an age of exactly 18 would become NaN. include_lowest=True
    # closes the first interval on the left so 18-year-olds land in '18-30'.
    df['Age_Category'] = pd.cut(
        df['Age'],
        bins=[18, 30, 40, 50, 60, 100],
        labels=['18-30', '30-40', '40-50', '50-60', '60+'],
        include_lowest=True,
    )
    return df

# Add the age bucket to the training, test and original frames alike.
df = age_tr(df)
test_df = age_tr(test_df)
original_df = age_tr(original_df)

#### Credit Score Ranges:

In [None]:
def cred_score_tr(df) : 
    """Add a ``Credit_Score_Range`` column that buckets ``CreditScore``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``CreditScore`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the categorical ``Credit_Score_Range`` column added.
    """
    # The bins are right-closed: (0, 300], (300, 600], ..., (800, 900].
    # The last interval covers 800-900, so it is labelled accordingly
    # (it was previously mislabelled '900+').
    df['Credit_Score_Range'] = pd.cut(
        df['CreditScore'],
        bins=[0, 300, 600, 700, 800, 900],
        labels=['0-300', '300-600', '600-700', '700-800', '800-900'],
    )
    return df
# Add the credit-score bucket to the training, test and original frames alike.
df = cred_score_tr(df)
test_df = cred_score_tr(test_df)
original_df = cred_score_tr(original_df)

#### Account Balance to Salary Ratio

In [None]:
def acc_sal_tr(df):
    """Add ``Balance_Salary_Ratio`` = ``Balance`` / ``EstimatedSalary``.

    The frame is modified in place and also returned.
    """
    ratio = df['Balance'].div(df['EstimatedSalary'])
    df['Balance_Salary_Ratio'] = ratio
    return df

# Add the balance-to-salary ratio to the training, test and original frames alike.
df = acc_sal_tr(df)
test_df = acc_sal_tr(test_df)
original_df = acc_sal_tr(original_df)

#### Geography and Gender Interaction

In [None]:
def geo_gender_tr(df) : 
    """Add ``Geo_Gender``: ``Geography`` and ``Gender`` joined by an underscore.

    The frame is modified in place and also returned.
    """
    combined = df['Geography'].str.cat(df['Gender'], sep='_')
    df['Geo_Gender'] = combined
    return df

# Add the geography/gender interaction to the training, test and original frames alike.
df = geo_gender_tr(df)
test_df = geo_gender_tr(test_df)
original_df = geo_gender_tr(original_df)

#### Total Products Used

In [None]:
def total_pr_tr(df) :
    """Add ``Total_Products_Used`` = ``NumOfProducts`` + ``HasCrCard``.

    The frame is modified in place and also returned.
    """
    total = df['NumOfProducts'].add(df['HasCrCard'])
    df['Total_Products_Used'] = total
    return df

# Add the product total to the training, test and original frames alike.
df = total_pr_tr(df)
test_df = total_pr_tr(test_df)
original_df = total_pr_tr(original_df)

#### Gender and Total Products Interaction

In [None]:
def tp_gender_tr(df) : 
    """Add ``Tp_Gender``: ``Total_Products_Used`` (as text) followed by ``Gender``.

    The frame is modified in place and also returned. ``Total_Products_Used``
    must already exist on ``df``.
    """
    products_as_text = df['Total_Products_Used'].astype('str')
    df['Tp_Gender'] = products_as_text.str.cat(df['Gender'])
    return df

# Add the products/gender interaction to the training, test and original frames alike.
df = tp_gender_tr(df)
test_df = tp_gender_tr(test_df)
original_df = tp_gender_tr(original_df)


#### TF-IDF for Encoding Surnames : Credit [@arunklenin](https://www.kaggle.com/code/arunklenin/ps4e1-advanced-feature-engineering-ensemble?scriptVersionId=157561661&cellId=30)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD


def tf_idf(train, test,original, column,n,p, random_state=42):
    """Append TF-IDF + truncated-SVD features of a text column to three frames.

    The vectorizer and the SVD are fitted on ``train`` only and then applied to
    ``test`` and ``original``.

    Parameters
    ----------
    train, test, original : pandas.DataFrame
        Frames that all contain ``column``.
    column : str
        Name of the text column to encode.
    n : int
        ``max_features`` passed to ``TfidfVectorizer``.
    p : int
        Number of SVD components kept.
    random_state : int, optional
        Seed for ``TruncatedSVD`` so the components are reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, original)``, each with ``p`` extra columns named
        ``"<column>_tfidf_<i>"``.
    """
    vectorizer = TfidfVectorizer(max_features=n)
    svd = TruncatedSVD(p, random_state=random_state)

    train_components = svd.fit_transform(vectorizer.fit_transform(train[column]))
    test_components = svd.transform(vectorizer.transform(test[column]))
    original_components = svd.transform(vectorizer.transform(original[column]))

    cols = [column + "_tfidf_" + str(i) for i in range(train_components.shape[1])]

    def _attach(frame, components):
        # Reuse the frame's own index: concat along columns aligns on the index,
        # so a fresh RangeIndex would misalign rows (and introduce NaN) whenever
        # the input frame does not carry a default 0..n-1 index.
        features = pd.DataFrame(components, columns=cols, index=frame.index)
        return pd.concat([frame, features], axis="columns")

    train = _attach(train, train_components)
    test = _attach(test, test_components)
    original = _attach(original, original_components)

    return train, test , original

# Encode Surname: TF-IDF vocabulary capped at 1000 terms, reduced to 5 SVD components.
df,test_df, original_df=tf_idf(df,test_df,original_df, "Surname",1000,5)

In [None]:
# Column/dtype summary of the training frame after feature engineering.
df.info()

In [None]:
# Column names of the training frame after feature engineering.
df.columns

# <a id="3"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">EDA </p>   </span></font></center> 
    

# 📊 Exploratory Data Analysis

In [None]:
# Column names of the synthetic training data as loaded (after dropping 'id').
syn_df.columns

In [None]:
from prettytable import PrettyTable

# Cardinality table for the original dataset, now including the engineered columns.
table = PrettyTable()
table.field_names = ["Features","Unique Values"]
for column in original_df.columns:
    distinct_count = original_df[str(column)].nunique()
    table.add_row([column, str(distinct_count)])
print('Unique values in original dataset : \n')
print(table)

In [None]:
# Continuous features: these are scaled later, everything else is one-hot encoded.
numeric_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 'Balance_Salary_Ratio', 'Tenure',
                'Surname_tfidf_0', 'Surname_tfidf_1', 'Surname_tfidf_2', 'Surname_tfidf_3', 'Surname_tfidf_4']
# The original dataset uses exactly the same continuous columns.
original_cols = list(numeric_cols)

test_to_scale = test_df[numeric_cols]
train_to_scale = df[numeric_cols]
train_to_scale_original = original_df[original_cols]

# A notebook only renders the value of a cell's last expression, so the styled
# preview has to come last (it was previously followed by an assignment and
# therefore never shown).
set_frame_style(train_to_scale.head(), 'Features with continuous values')

## <a id="3.1"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Distributions of Features </p>   </span></font></center> 

# 📈 Feature Distribution

In [None]:
# from plotly.subplots import make_subplots
# import random
# import plotly.graph_objects as go
# columns = list(train_to_scale_original.columns)
# ultra_light_colors = [
# "#F0F8FF", "#F6F6F6", "#F0FFF0",  "#FAFAD2",  "#FFE4E1",  "#FFF5EE", "#F5FFFA",  "#F0FFFF","#FFFAF0",  "#F8F8FF"   
# ]
# fig = make_subplots(rows=len(columns), cols=2)
# count = 0
# for row in range(int(len(columns))) : 
#     random_col = f"RGB({random.randint(100, 255)}, {random.randint(100, 255)}, {random.randint(150, 255)})"
#     fig.add_trace(go.Violin(y=train_to_scale[numeric_cols][columns[count]], x0 = columns[count], box_visible=True, line_color='black',
#                                meanline_visible=True, fillcolor=random_col, opacity=0.6,), row=row + 1, col= 1)
#     fig.add_trace(go.Violin(y= train_to_scale_original[columns[count]],x0 = columns[count], box_visible=True, line_color='black',
#                                meanline_visible=True, fillcolor=random_col, opacity=0.6,), row=row + 1, col= 2)
 
    
#     count +=1


# fig.update_layout(height=1000, width=800, title_text="Feature Distribution in Synthetic (Left) vs Original Dataset (Right)",showlegend=False,paper_bgcolor= '#F5F5F5')
# fig.show()


In [None]:
# Number of training rows per value of the target column Exited.
set_frame_style(pd.DataFrame(df.Exited.value_counts()), 'Data points per class', '15px')

In [None]:
# classes = list(df.Exited.unique())

# ultra_light_colors = [
# "#F0F8FF", "#F6F6F6", "#F0FFF0",  "#FAFAD2",  "#FFE4E1",  "#FFF5EE", "#F5FFFA",  "#F0FFFF","#FFFAF0",  "#F8F8FF"   
# ]
# def col_per_class(col):
#     fig = go.Figure()
#     for clas in classes : 
#         fig.add_trace(go.Violin(y = df[col][df['Exited']== clas],   box_visible=True,
#                             meanline_visible=True , x = df['Exited'][df['Exited'] == clas], name = str(clas) ))
#         fig.update_layout(title = f'Distribution for {col} for each class', plot_bgcolor = ultra_light_colors[np.random.randint(1,10)],paper_bgcolor= '#F5F5F5', height=400,  
#         width=1000 )
#     return fig
# for i in df : 
#     fig = col_per_class(i)
#     fig.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

correlation_matrix = train_to_scale.corr()

# A correlation matrix is symmetric, so mask the redundant upper triangle.
# k=1 keeps the diagonal visible. The title promised a lower-triangle plot,
# but no mask was being applied.
upper_triangle = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, mask=upper_triangle, annot=True, cmap='Blues', fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix (Lower Triangle)')
plt.show()

# <a id="4"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Preprocessing</p>   </span></font></center> 

# ⚙️ Preprocessing

In [None]:
# Styled preview of the training frame before scaling and encoding.
set_frame_style(df.head())

# <a id="4.1"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:200%;text-align:center;border-radius:5px 5px;padding : 2px">Robust Scaler</p>   </span></font></center> 
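> ### By default, RobustScaler centres each feature on its median and scales it by the interquartile range (IQR), which makes it less sensitive to outliers. Note: the code cell below currently applies `StandardScaler` (mean and standard deviation), not `RobustScaler`.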
    

# Robustscaler

In [None]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the section heading says RobustScaler, but StandardScaler is what
# is actually applied here - confirm which one is intended.
# The scaler is fitted on the training features only and reused for the test set.
sc = StandardScaler()
sc.fit(train_to_scale)

train_values = sc.transform(train_to_scale)
test_values = sc.transform(test_to_scale)

# Wrap the arrays back into DataFrames with the original column names.
scaled_train = pd.DataFrame(train_values, columns=train_to_scale.columns)
scaled_test = pd.DataFrame(test_values, columns=test_to_scale.columns)



In [None]:
# Styled preview of the scaled training features.
set_frame_style(scaled_train.head())

In [None]:
# Whatever is left after removing the scaled numeric columns, the identifiers and
# the target is what gets one-hot encoded.
train_ohe = df.drop(columns=numeric_cols)
train_to_ohe = train_ohe.drop(columns=['CustomerId', 'Exited', 'Surname'])

test_ohe = test_df.drop(columns=numeric_cols)
test_to_ohe = test_ohe.drop(columns=['id', 'CustomerId', 'Surname'])

set_frame_style(train_to_ohe.head())

In [None]:
# Styled preview of the test columns that will be one-hot encoded.
set_frame_style(test_to_ohe.head())

In [None]:
# Column/dtype summary of the test frame after feature engineering.
test_df.info()

## <a id="4.2"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:200%;text-align:center;border-radius:5px 5px;padding : 2px">One-hot encoding the categorical columns</p>   </span></font></center> 

# One-hot Encoding

In [None]:
# One-hot encode every remaining column, emitting 0/1 integers directly
# (dtype=int) instead of booleans that then need a replace() pass.
ohe_train = pd.get_dummies(train_to_ohe, columns=train_to_ohe.columns, dtype=int)
ohe_test = pd.get_dummies(test_to_ohe, columns=train_to_ohe.columns, dtype=int)
# get_dummies only creates columns for levels present in the frame it is given.
# Align the test columns to the training columns: levels missing from the test
# set become all-zero columns, and levels unseen in training are dropped, so the
# models see the same features at fit and predict time.
ohe_test = ohe_test.reindex(columns=ohe_train.columns, fill_value=0)
set_frame_style(ohe_train.head())

In [None]:
# Gender is encoded by two dummy columns; drop 'Gender_Male' from both frames.
ohe_train = ohe_train.drop(columns=['Gender_Male'])
ohe_test = ohe_test.drop(columns=['Gender_Male'])

## Frequency Encoding

In [None]:
def freq_enc(df) :
    """Replace ``Surname`` with the relative frequency of each surname in ``df``.

    Frequencies come from ``value_counts(normalize=True)`` on the frame itself.
    The frame is modified in place and also returned.
    """
    surnames = df['Surname']
    df['Surname'] = surnames.map(surnames.value_counts(normalize=True))
    return df
# Each frame is encoded with its own surname frequencies; the test set is not
# mapped with the frequencies learned from the training data.
# NOTE(review): confirm that encoding train and test independently is intended.
df = freq_enc(df)
test_df = freq_enc(test_df)

## <a><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:200%;text-align:center;border-radius:5px 5px;padding : 2px">Join the scaled and one-hot encoded columns</p>   </span></font></center> 

In [None]:
# Final feature matrices: one-hot block, scaled numeric block and the
# frequency-encoded surname, placed side by side.
train_parts = [ohe_train, scaled_train, df['Surname']]
test_parts = [ohe_test, scaled_test, test_df['Surname']]

train_df_1 = pd.concat(train_parts, axis='columns')
# test_df is overwritten here with the model-ready test matrix.
test_df = pd.concat(test_parts, axis='columns')

set_frame_style(train_df_1.head())

In [None]:
# Column/dtype summary of the assembled training matrix.
train_df_1.info()

In [None]:
# (rows, columns) of the assembled training matrix.
train_df_1.shape

In [None]:
from imblearn.under_sampling import TomekLinks
    
def tomek_links (X,y) : 
    """Under-sample ``(X, y)`` with imbalanced-learn's ``TomekLinks``.

    A Tomek link is a pair of samples from opposite classes that are very close
    to each other in feature space. Removing samples that take part in such
    links is a way of cleaning the class boundary of an imbalanced dataset.

    Parameters
    ----------
    X : feature matrix
    y : target labels

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    sampler = TomekLinks(sampling_strategy='auto')
    return sampler.fit_resample(X, y)


## <a id="5"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Model Training</p>   </span></font></center> 

# 🤖 Training

## <a id="5.1"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">XGBClassifier</p>   </span></font></center> 

In [None]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import  confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Features and target used by every model below.
X = train_df_1
y = df['Exited']

# Label encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# X, y_encoded = tomek_links(X, y_encoded)


# XGBoost

In [None]:
from sklearn.metrics import roc_auc_score


#XGBoost parameters
xgb_params = {'max_depth': 8,
 'min_child_weight': 9, 
 'learning_rate': 0.015784217705381666,
 'n_estimators': 928, 
 'subsample': 0.8311128887439883,
 'colsample_bytree': 0.3454344427319984,
 'random_state': 42}

# number of folds
n_splits = 10

# Stratified folds keep the class ratio of the target in every split.
stratkf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold ROC-AUC scores.
cv_results = []

# stratified k-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(stratkf.split(X, y_encoded)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # A fresh model per fold so no state leaks between folds.
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_train)

    # ROC-AUC ranks samples by score, so it must be given the predicted
    # probability of the positive class. Passing hard 0/1 labels from predict()
    # collapses the ROC curve to a single operating point and understates AUC.
    y_val_pred_prob = xgb_model.predict_proba(X_val)[:, 1]
    # F1 is a threshold metric and is computed from the hard class labels.
    y_pred = xgb_model.predict(X_val)

    f1 = f1_score(y_val, y_pred, average='weighted')

    # Evaluating the model
    roc_auc = roc_auc_score(y_val, y_val_pred_prob)
    print(f'Fold {fold + 1}, AUC Score on Validation Set: {roc_auc}')
    print(f'Fold {fold + 1}, F1 Score on Validation Set: {f1}')
    print('-'*70)

    # results
    cv_results.append(roc_auc)

# average cross-validation result
average_cv_result = sum(cv_results) / n_splits
print(f'\nAverage AUC-score across {n_splits} folds: {average_cv_result}')


## <a id="5.2"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">LGBM Classifier</p>   </span></font></center> 

# LightGBM

In [None]:
import lightgbm as lgb
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier


lgbm_params_1 = {
    'min_child_samples': 12, 
    'learning_rate': 0.02849773542504347,
    'n_estimators': 410, 
    'subsample': 0.31556535683131615, 
    'colsample_bytree': 0.9691366837955018, 
    'reg_alpha': 0.7109904579556621,
    'reg_lambda': 0.8470259353827624,
    # NOTE(review): 'gpu' requires a GPU-enabled LightGBM build and runtime.
    'device': 'gpu',
    'verbosity': 0
}
# folds
n_splits = 10

# Stratified folds keep the class ratio of the target in every split.
stratkf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold ROC-AUC scores.
cv_results = []

for fold, (train_idx, val_idx) in enumerate(stratkf.split(X, y_encoded)):

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    lgbm_model = LGBMClassifier(**lgbm_params_1)
    lgbm_model.fit(X_train, y_train)

    # predict_proba returns one column per class; column 1 is the positive class.
    y_val_pred_prob = lgbm_model.predict_proba(X_val)
    y_pred = lgbm_model.predict(X_val)

    f1 = f1_score(y_val, y_pred, average='weighted')

    # Evaluating the model
    logloss = log_loss(y_val, y_val_pred_prob)
    # ROC-AUC needs continuous scores: use the positive-class probability, not
    # the hard 0/1 labels, which collapse the ROC curve to a single point.
    roc_auc = roc_auc_score(y_val, y_val_pred_prob[:, 1])
    print(f'Fold {fold + 1}, AUC-Score on Validation Set: {roc_auc}')
    print(f'Fold {fold + 1}, F1 Score on Validation Set: {f1}')
    print('-'*70)

    cv_results.append(roc_auc)
average_cv_result = sum(cv_results) / n_splits
print(f'\nAverage AUC-SCORE across {n_splits} folds: {average_cv_result}')

In [None]:
# Show which encoded integer the label encoder assigned to each original class.
original_classes = label_encoder.classes_
encoded_classes = label_encoder.transform(original_classes)
class_mapping = {label: code for label, code in zip(original_classes, encoded_classes)}
print("Class Mapping:")
print(class_mapping)

## <a id="5.4"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Catboost Classifier</p>   </span></font></center> 

# CatBoost

In [None]:
from catboost import CatBoostClassifier
catboost_params = {
    'iterations': 848, 
    'depth': 28,
    'min_data_in_leaf': 5,
    'learning_rate': 0.027876808218320774,
    'grow_policy': 'Lossguide',
    'bootstrap_type': 'Bernoulli',
    'eval_metric': 'AUC',  
}

n_splits = 10

# Stratified folds keep the class ratio of the target in every split.
stratkf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold log-loss values (averaged and reported after the loop).
cv_results = []

for fold, (train_idx, val_idx) in enumerate(stratkf.split(X, y_encoded)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    cat_model = CatBoostClassifier(**catboost_params,
                                   random_state=42, verbose=0)
    cat_model.fit(X_train, y_train)

    # predict_proba returns one column per class; column 1 is the positive class.
    y_val_pred_prob = cat_model.predict_proba(X_val)
    y_pred = cat_model.predict(X_val)

    f1 = f1_score(y_val, y_pred, average='weighted')

    # Evaluating the model
    logloss = log_loss(y_val, y_val_pred_prob)
    # ROC-AUC needs continuous scores: use the positive-class probability, not
    # the hard 0/1 labels, which collapse the ROC curve to a single point.
    roc_auc = roc_auc_score(y_val, y_val_pred_prob[:, 1])
    print(f'Fold {fold + 1}, AUC- score on Validation Set: {roc_auc}')
    print(f'Fold {fold + 1}, F1 Score on Validation Set: {f1}')
    print('-'*70)

    cv_results.append(logloss)

average_cv_result = sum(cv_results) / n_splits
print(f'\nAverage Logarithmic Loss across {n_splits} folds: {average_cv_result}')


## <a id="5.6"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Ensemble of XGB, CATBOOST and LGBM</p>   </span></font></center> 

# Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier



# Soft-voting ensemble: the class probabilities of the three models are averaged
# with the weights below (LightGBM, XGBoost, CatBoost).
base_models = [('lgb', lgbm_model), ('xgb', xgb_model), ('CB', cat_model)]
model_weights = [0.3, 0.3, 0.400]

Ensemble = VotingClassifier(estimators=base_models, voting='soft', weights=model_weights)
Ensemble.fit(X, y_encoded)


## <a id="6"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color:#581845;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Model Inference</p>   </span></font></center> 

<div class="anchor" id="top" style="
    margin-right: auto; 
    margin-left: auto;
    padding: 10px;
    background-color: #FFF7FD;
    border-radius: 2px;
    font-size : 15px;       
    font-color : #581845;                                    
    border: 2px solid #581845;">
     
- The ensemble performs best. We need to maximise the f1_score and minimise the log_loss for the ensemble.

In [None]:
# import optuna

# def objective(trial):
#     lgb_weight = trial.suggest_int('lgb_weight', 0, 50)
#     xgb_weight = trial.suggest_int('xgb_weight', 0, 100 - lgb_weight)
    
#     cb_weight = 100- lgb_weight - xgb_weight

#     weights = [lgb_weight/100, xgb_weight/100, cb_weight/100]
    
#     ensemble = VotingClassifier(estimators=[('lgb', lgbm_model), ('xgb', xgb_model), ('CB', cat_model)],
#                                 voting='soft',
#                                 weights=weights)
    
#     ensemble.fit(X, y_encoded)
#     y_pred = ensemble.predict(test_x)
    
#     return f1_score(test_y, y_pred, average='weighted')


# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=10)

# # Get the best parameters
# best_params = study.best_params
# best_weights = [best_params['lgb_weight'], best_params['xgb_weight'], 100- best_params['lgb_weight'] - best_params['xgb_weight']]

# print("Best Weights:", best_weights)


In [None]:
# from sklearn.ensemble import VotingClassifier


# # lgb_1 = LGBMClassifier(**lgbm_params )
# # xgb_1 = XGBClassifier(**xgb_params )
# # cb_1 = CatBoostClassifier(**catboost_params, random_state=42)
# Ensemble = VotingClassifier(estimators = [('lgb', lgbm_model), ('xgb', xgb_model), ('CB', cat_model)], 
#                             voting='soft',
#                             weights = best_weights   #Adjust weighting since XGB performs better in local environment
#                             )
# Ensemble.fit(X, y_encoded)


# Model Inference

## <a id="6.1"><center><font size = 4><span style="color:#F5F5E6"> <p style="background-color: #005F5D;font-family:courier;color:#FFFFFF;font-size:250%;text-align:center;border-radius:5px 5px;padding : 2px">Feature Importance</p>   </span></font></center> 

# Feature Importance

In [None]:
import matplotlib.pyplot as plt
# Feature importances come from xgb_model, i.e. the model object left behind by
# the XGBoost cross-validation loop.
feature_importances = xgb_model.feature_importances_
feature_names = train_df_1.columns

# Pair every feature with its score and rank from most to least important.
sorted_feature_importance = sorted(
    zip(feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_feature_names = tuple(name for name, score in sorted_feature_importance)
sorted_importance_scores = tuple(score for name, score in sorted_feature_importance)

plt.figure(figsize=(10, 10))
plt.barh(sorted_feature_names, sorted_importance_scores)
plt.xlabel("Feature Importance")
plt.ylabel("Feature Name")
plt.title("Feature Importance")
plt.show()

In [None]:
# Column/dtype summary of the model-ready test matrix; compare with the training matrix.
test_df.info()

In [None]:
# Column/dtype summary of the training matrix; compare with the test matrix.
train_df_1.info()

# Final Submission

In [None]:
# Class probabilities for the test set, one column per class.
class_probabilities = Ensemble.predict_proba(test_df)
y_pred = pd.DataFrame(class_probabilities)
y_pred = y_pred.set_axis(['Non-Exited', 'Exited'], axis='columns')
y_pred.head()

In [None]:
# Start from a copy of the predictions so that adding the id column does not
# also modify y_pred (a plain assignment would only create an alias).
submission_df = y_pred.copy()
# Assign the ids by position: predictions are in test-set row order, and a
# positional assignment cannot silently produce NaN through index misalignment.
submission_df['id'] = ids.to_numpy()

In [None]:
# Ensure ids are integers and keep only the positive-class probability.
submission_df['id'] = submission_df['id'].astype(int)
submission_df = submission_df.drop(columns='Non-Exited')
submission_df.head()

### Uncomment this to submit to the competition

In [None]:
# submission_df.to_csv('submission.csv', index= False)