# Binary Classification with a Bank Churn Dataset
***

In [1]:
# Importing all the dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

### Exploratory Data Analysis
***

In [4]:
train_data

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [5]:
train_data['Exited'].unique()

array([0, 1])

In [6]:
sample

Unnamed: 0,id,Exited
0,165034,0.5
1,165035,0.5
2,165036,0.5
3,165037,0.5
4,165038,0.5
...,...,...
110018,275052,0.5
110019,275053,0.5
110020,275054,0.5
110021,275055,0.5


In [7]:
# Remove CustomerID and Surname as it does not add value to our data
for ds in [train_data, test_data]:
    ds.drop(['CustomerId', 'Surname', "id"], axis=1, inplace=True)

In [8]:
test_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,586,France,Female,23.0,2,0.00,2,0.0,1.0,160976.75
1,683,France,Female,46.0,2,0.00,1,1.0,0.0,72549.27
2,656,France,Female,34.0,7,0.00,2,1.0,0.0,138882.09
3,681,France,Male,36.0,8,0.00,1,1.0,0.0,113931.57
4,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...
110018,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62
110019,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68
110020,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38
110021,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58


In [9]:
train_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [10]:
y = train_data['Exited']
x = train_data.drop(['Exited'], axis=1)

In [11]:
x.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83


In [12]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Exited, dtype: int64

In [13]:
label_encoder = LabelEncoder()
for cat in ['Geography','Gender']:
    x[cat] = label_encoder.fit_transform(x[cat])
x.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,668,0,1,33.0,3,0.0,2,1.0,0.0,181449.97
1,627,0,1,33.0,1,0.0,2,1.0,1.0,49503.5
2,678,0,1,40.0,10,0.0,2,1.0,0.0,184866.69
3,581,0,1,34.0,2,148882.54,1,1.0,1.0,84560.88
4,716,2,1,33.0,5,0.0,2,1.0,1.0,15068.83


In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, stratify=y)

In [27]:
cols = x_train
features = np.where(cols.dtypes != np.float64)[0]
train_pool = Pool(x_train, y_train, cat_features=features)
test_pool = Pool(x_test, y_test, cat_features=features)

In [34]:
model = CatBoostClassifier(eval_metric="AUC", learning_rate=0.03, iterations=1940)
model.fit(train_pool, eval_set=test_pool, verbose=500)
y_pred = model.predict_proba(x_test)[:,1]
auc = roc_auc_score(y_test, y_pred)
print("AUC is: ",auc)

0:	test: 0.8729404	best: 0.8729404 (0)	total: 53.1ms	remaining: 1m 43s
500:	test: 0.8897273	best: 0.8897273 (500)	total: 35.5s	remaining: 1m 42s
1000:	test: 0.8901859	best: 0.8901975 (983)	total: 1m 13s	remaining: 1m 8s
1500:	test: 0.8903832	best: 0.8903875 (1496)	total: 1m 46s	remaining: 31.2s
1939:	test: 0.8905012	best: 0.8905012 (1939)	total: 2m 14s	remaining: 0us

bestTest = 0.8905012259
bestIteration = 1939

AUC is:  0.8905012258512688


In [64]:
test_id = pd.read_csv('./test.csv')
submission_df = pd.DataFrame({'id':test_id['id'], 'Exited':model.predict_proba(test_data)[:,1]})
submission_df.head(5)

Unnamed: 0,id,Exited
0,165034,0.024825
1,165035,0.616963
2,165036,0.041072
3,165037,0.248989
4,165038,0.413466


In [65]:
submission_df.to_csv('submission.csv', index=False)