<a href="https://colab.research.google.com/github/aliiamrr/Customer-Churn-Prediction-for-Bank-Dataset/blob/main/Bank_Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from scipy.stats import uniform
from sklearn.utils import resample
from sklearn import linear_model
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score



# Data Preprocessing

## Basic Preprocessing


In [None]:
# READING THE DATA SET
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15585961.0,Hs?,684.0,France,Male,41.0,10.0,0.0,2.0,1.0,1.0,173948.4,1.0
1,1,15643378.0,Bellucci,807.0,France,Male,32.0,2.0,0.0,2.0,1.0,0.0,144532.85,0.0
2,2,15651022.0,O'Donnell,553.0,Germany,Male,53.0,9.0,102278.52,1.0,1.0,0.0,158816.03,1.0
3,3,15676521.0,Chiang,587.0,France,Female,34.0,6.0,0.0,1.0,1.0,0.0,167984.72,1.0
4,4,15772650.0,Kambinachi,732.0,Germany,Female,30.0,5.0,135070.92,1.0,1.0,1.0,116097.26,0.0


In [None]:
# ANALYZING BASIC INFORMATION ABOUT THE DATA SET
print(train.info())
print(train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               15000 non-null  int64  
 1   CustomerId       15000 non-null  float64
 2   Surname          15000 non-null  object 
 3   CreditScore      15000 non-null  float64
 4   Geography        15000 non-null  object 
 5   Gender           15000 non-null  object 
 6   Age              15000 non-null  float64
 7   Tenure           15000 non-null  float64
 8   Balance          15000 non-null  float64
 9   NumOfProducts    15000 non-null  float64
 10  HasCrCard        15000 non-null  float64
 11  IsActiveMember   15000 non-null  float64
 12  EstimatedSalary  15000 non-null  float64
 13  Exited           15000 non-null  float64
dtypes: float64(10), int64(1), object(3)
memory usage: 1.6+ MB
None
                 id    CustomerId   CreditScore           Age        Tenure  \
count  1

In [None]:

# DROPPING REDUNDANT FEATURES
train = train.drop('id', axis=1)
train = train.drop('CustomerId', axis=1)
train = train.drop('Surname', axis=1)

# USING ONE HOT ENCODING FOR 'Geography' and 'Gender' COLUMNS
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(train[['Geography']])
train = pd.concat([train, pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out(['Geography']))], axis=1)
train.drop('Geography', axis=1, inplace=True)

encoded = encoder.fit_transform(train[['Gender']])
train = pd.concat([train, pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(['Gender']))], axis=1)
train.drop('Gender', axis=1, inplace=True)

# STANDARDIZING NUMERICAL VALUES

# Avoid scaling the target variable (Exited) as well as the encoded values
cols_to_scale = train.columns.drop(['Exited', 'Geography_France', 'Geography_Spain', 'Geography_Germany', 'Gender_Female', 'Gender_Male'])

# Standardize only the selected columns
scaler = StandardScaler()
train[cols_to_scale] = scaler.fit_transform(train[cols_to_scale])

train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.336369,0.378507,1.778022,-0.708144,0.789827,0.529021,1.030326,0.356277,1.0,1.0,0.0,0.0,0.0,1.0
1,2.01451,-0.706662,-1.101133,-0.708144,0.789827,0.529021,-0.970566,0.165733,0.0,1.0,0.0,0.0,0.0,1.0
2,-1.45092,1.825398,1.418128,1.010527,-1.106971,0.529021,-0.970566,0.258255,1.0,0.0,1.0,0.0,0.0,1.0
3,-0.987044,-0.465513,0.338445,-0.708144,-1.106971,0.529021,-0.970566,0.317646,1.0,1.0,0.0,0.0,1.0,0.0
4,0.991253,-0.94781,-0.02145,1.561565,-1.106971,0.529021,1.030326,-0.018463,0.0,0.0,1.0,0.0,1.0,0.0


## Resampling the minority class

In [None]:
# start by rebalancing the 'exited' class

# Separate the majority and minority classes
majority_class = train[train['Exited'] == 0]
minority_class = train[train['Exited'] == 1]

# Oversample the minority class
minority_oversampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combine the majority class with the oversampled minority class
data_balanced = pd.concat([majority_class, minority_oversampled])

# Check the new class distribution after oversampling
df_balanced_distribution = data_balanced['Exited'].value_counts()
print("\nNew Class Distribution After Oversampling:\n", df_balanced_distribution)


New Class Distribution After Oversampling:
 Exited
0.0    11985
1.0    11985
Name: count, dtype: int64


## Splitting the data

In [None]:
x = data_balanced.drop('Exited', axis=1)
y = data_balanced['Exited']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Training the models

## Random Forest Classifier

In [None]:
#BASIC

rf = RandomForestClassifier(random_state=42)

rf.fit(x_train, y_train)


y_pred_rf = rf.predict_proba(x_test)[:,1]
print("ROC AUC score: ",roc_auc_score(y_test, y_pred_rf))


y_pred_rf_class = rf.predict(x_test)
f1 = f1_score(y_test, y_pred_rf_class)
print("F1 score: ", f1)

ROC AUC score:  0.9947382967721409
F1 score:  0.9604679306171844


In [None]:
#TESTING HYPER PARAMETERS

# Define parameter grid
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

# Initialize model
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
rf_search = RandomizedSearchCV(rf, rf_param_grid, n_iter=10, cv=5, random_state=42, n_jobs=-1)

# Fit the model
rf_search.fit(x_train, y_train)

# Best hyperparameters
print("Hyper Parameters:", rf_search.best_params_)

# Evaluate the model
y_pred_rf = rf_search.predict_proba(x_test)[:, 1]
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)
print("Random Forest ROC AUC:", roc_auc_rf)
f1 = f1_score(y_test, rf_search.predict(x_test))
print("F1 score: ", f1)



Hyper Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 30, 'bootstrap': False}
Random Forest ROC AUC: 0.9932285256840353
F1 score:  0.9607803292013818


## Logistic Regression Model

In [None]:
#BASIC

logr = linear_model.LogisticRegression()
logr.fit(x_train, y_train)

y_pred_logr = logr.predict_proba(x_test)[:,1]
print("ROC AUC score: ",roc_auc_score(y_test, y_pred_logr))

y_pred_logr_class = logr.predict(x_test)
f1 = f1_score(y_test, y_pred_logr_class)
print("F1 score: ", f1)

ROC AUC score:  0.8826049203447592
F1 score:  0.8198389147944044


In [None]:
#TESTING HYPER PARAMETERS

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
logreg_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l2','l1'],
}

# Initialize model
logreg = LogisticRegression(random_state=42)

# Initialize RandomizedSearchCV
logreg_search = RandomizedSearchCV(logreg, logreg_param_grid, n_iter=10, cv=5, random_state=42, n_jobs=-1)

# Fit the model
logreg_search.fit(x_train, y_train)

# Best hyperparameters
print("Hyper Parameters:", logreg_search.best_params_)

# Evaluate the model
y_pred_logreg = logreg_search.predict_proba(x_test)[:, 1]
roc_auc_logreg = roc_auc_score(y_test, y_pred_logreg)
print("ROC AUC:", roc_auc_logreg)

f1 = f1_score(y_test, logreg_search.predict(x_test))
print("F1 score: ", f1)




Hyper Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 1}
ROC AUC: 0.882615885308567
F1 score:  0.8198389147944044


## Desicison Trees

In [None]:
#BASIC

decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(x_train, y_train)

y_pred_dt = decision_tree.predict_proba(x_test)[:,1]
print("ROC AUC score: ",roc_auc_score(y_test, y_pred_dt))

y_pred_dt_class = decision_tree.predict(x_test)
f1 = f1_score(y_test, y_pred_dt_class)
print("F1 score: ", f1)

ROC AUC score:  0.944628760264642
F1 score:  0.9468405215646941


In [None]:
#TESING HYPER PARAMETERS

# Define parameter grid
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
}

# Initialize model
dt = DecisionTreeClassifier(random_state=42)

# Initialize RandomizedSearchCV
dt_search = RandomizedSearchCV(dt, dt_param_grid, n_iter=10, cv=5, random_state=42, n_jobs=-1)

# Fit the model
dt_search.fit(x_train, y_train)

# Best hyperparameters
print("Hyper Parameters:", dt_search.best_params_)

# Evaluate the model
y_pred_dt = dt_search.predict_proba(x_test)[:, 1]
roc_auc_dt = roc_auc_score(y_test, y_pred_dt)
print("ROC AUC:", roc_auc_dt)

f1 = f1_score(y_test, dt_search.predict(x_test))
print("F1 score: ", f1)


Hyper Parameters: {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}
ROC AUC: 0.9456032496671786
F1 score:  0.9268887083671812


# Boosted Trees

## Gradient Boost

In [None]:
# BASIC

gb_clf = GradientBoostingClassifier(random_state=42)

gb_clf.fit(x_train, y_train)

y_pred_gb = gb_clf.predict_proba(x_test)[:, 1]
print("ROC AUC score: ", roc_auc_score(y_test, y_pred_gb))

y_pred_gb_class = gb_clf.predict(x_test)
f1 = f1_score(y_test, y_pred_gb_class)
print("F1 score: ", f1)

ROC AUC score:  0.9462903003756109
F1 score:  0.876802637000412


### Hyperparameter Tuning

In [None]:
#TESTING HYPER PARAMETERS

param_grid = {
    'n_estimators': [500,350],
    'learning_rate': [0.1],
    'max_depth': [8],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

model = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [None]:
print( "Hyper Parameters: ",grid_search.best_params_)
best_grad = grid_search.best_estimator_

y_pred_grad = best_grad.predict_proba(x_test)[:, 1]
print("ROC AUC score: ", roc_auc_score(y_test, y_pred_grad))

f1 = f1_score(y_test, best_grad.predict(x_test))
print("F1 score: ", f1)

Hyper Parameters:  {'learning_rate': 0.1, 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500, 'subsample': 0.8}
ROC AUC score:  0.9913035653711125
F1 score:  0.9638505280259951


## Cat Boost

###Installing CatBoost

In [None]:
!pip install catboost
from catboost import CatBoostClassifier, CatBoostRegressor, Pool



### New Train - Test Split for the Cat Boost Model


In [None]:
# READING THE ORIGNAL DATA SET
train_cat = pd.read_csv('train.csv')

In [None]:
# DROPPING REDUNDANT FEATURES
train_cat = train_cat.drop('id', axis=1)
train_cat = train_cat.drop('CustomerId', axis=1)
train_cat = train_cat.drop('Surname', axis=1)

# STANDARDIZING THE NUMERICAL FEATURES
cols_to_scale = train_cat.columns.drop(['Exited', 'Geography','Gender'])
scaler = StandardScaler()
train_cat[cols_to_scale] = scaler.fit_transform(train[cols_to_scale])

# PERFORMING TRAIN-TEST SPLIT
x = train_cat.drop('Exited', axis=1)
y = train_cat['Exited']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
#BASIC

categorical_features = ['Geography', 'Gender']
cat_clf = CatBoostClassifier(random_state=42, verbose=False, cat_features=categorical_features)
cat_clf.fit(x_train, y_train)

print("ROC AUC score: ",roc_auc_score(y_test, cat_clf.predict_proba(x_test)[:,1]))
y_pred_cat_class = cat_clf.predict(x_test)
f1 = f1_score(y_test, y_pred_cat_class)
print("F1 score: ", f1)

ROC AUC score:  0.9390025932879686
F1 score:  0.7305282005371531


In [None]:
# ANALYZING THE FEATURE IMPORTANCE FOR THE CAT BOOST MODEL
feature_importances = cat_clf.get_feature_importance()
important_features = pd.Series(feature_importances, index=x_train.columns).sort_values(ascending=False)
print(important_features)

NumOfProducts      27.956209
Age                22.731564
Balance             9.852119
EstimatedSalary     8.486234
CreditScore         8.217856
Geography           6.760567
IsActiveMember      6.244571
Tenure              4.182957
Gender              4.134333
HasCrCard           1.433591
dtype: float64


###Hyper Parameter Tuning

In [None]:
#TESTING HYPER PARAMETERS

# Define the parameter grid
param_grid = {
    'iterations': [500],
    'learning_rate': [0.2,0.4],
    'depth': [10,12],
    'l2_leaf_reg': [0.5],
    'random_strength': [1],
}

# Create a CatBoost model
model = CatBoostClassifier(random_state=42, verbose=False)

# Create a GridSearchCV object
grid_search = GridSearchCV(CatBoostClassifier(random_state=42, verbose=False, cat_features=categorical_features_indices), param_grid, cv=3, scoring='roc_auc')

# Fit the GridSearchCV object to the data
grid_search.fit(x_train, y_train)

# Print the best parameters and score
print(grid_search.best_params_)
print(grid_search.best_score_)

# Use the best model
best_cat_model = grid_search.best_estimator_

best_parameters = grid_search.best_params_
print("Hyper Parameters: ",best_parameters)

preds = best_cat_model.predict_proba(x_test)[:,1]
print("ROC AUC: ",roc_auc_score(y_test, preds))

f1 = f1_score(y_test, best_cat_model.predict(x_test))
print("F1 score: ", f1)



{'depth': 10, 'iterations': 500, 'l2_leaf_reg': 0.5, 'learning_rate': 0.2, 'random_strength': 1}
0.9196458839997348
Hyper Parameters:  {'depth': 10, 'iterations': 500, 'l2_leaf_reg': 0.5, 'learning_rate': 0.2, 'random_strength': 1}
ROC AUC:  0.9254070782404482
F1 score:  0.7163120567375887


# Making prediction

In [None]:
# READING THE TEST DATA
test = pd.read_csv('test.csv')
test2 = test.copy()

# PRE PROCESSING THE TEST DATA

# Dropping redundant features
test = test.drop('id', axis=1)
test = test.drop('CustomerId', axis=1)
test = test.drop('Surname', axis=1)

# Encoding Categorical features
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(test[['Geography']])
test = pd.concat([test, pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out(['Geography']))], axis=1)
test.drop('Geography', axis=1, inplace=True)
encoded = encoder.fit_transform(test[['Gender']])
test = pd.concat([test, pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(['Gender']))], axis=1)
test.drop('Gender', axis=1, inplace=True)

# Standardizing the numerical features
cols_to_scale = test.columns.drop(['Geography_France', 'Geography_Spain', 'Geography_Germany', 'Gender_Female', 'Gender_Male'])
scaler = StandardScaler()
test[cols_to_scale] = scaler.fit_transform(test[cols_to_scale])

# CHOOSING THE MODEL
test['Exited'] = best_cat_model.predict_proba(test)[:,1]

# FORMATTING THE SUBMISSION DATA SET
submission = pd.concat([test2['id'], test['Exited']], axis=1)
submission.to_csv('submission.csv', index=False)