<a href="https://www.kaggle.com/code/andrewbremner/bankchurn-s4e1-sklearn?scriptVersionId=157601773" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.utils import make_grid
import time

from sklearn.metrics import confusion_matrix, log_loss, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


# Explore Data

In [2]:
df = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')

In [3]:
# No missing data
df.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [4]:
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [5]:
df['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [6]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [7]:
def process_data_dummies(df):
    df = df.drop(['CustomerId','Surname'], axis=1)
    df = pd.get_dummies(df, drop_first=True)
    return df

In [8]:
df_clean = process_data_dummies(df)
df_clean

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,0,668,33.0,3,0.00,2,1.0,0.0,181449.97,0,False,False,True
1,1,627,33.0,1,0.00,2,1.0,1.0,49503.50,0,False,False,True
2,2,678,40.0,10,0.00,2,1.0,0.0,184866.69,0,False,False,True
3,3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,False,False,True
4,4,716,33.0,5,0.00,2,1.0,1.0,15068.83,0,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,667,33.0,2,0.00,1,1.0,1.0,131834.75,0,False,True,False
165030,165030,792,35.0,3,0.00,1,0.0,0.0,131834.45,0,False,False,True
165031,165031,565,31.0,5,0.00,1,1.0,1.0,127429.56,0,False,False,True
165032,165032,554,30.0,7,161533.00,1,0.0,1.0,71173.03,0,False,True,False


In [9]:
df_clean.corr()['Exited']

id                   0.002512
CreditScore         -0.027383
Age                  0.340768
Tenure              -0.019565
Balance              0.129743
NumOfProducts       -0.214554
HasCrCard           -0.022141
IsActiveMember      -0.210237
EstimatedSalary      0.018827
Exited               1.000000
Geography_Germany    0.211054
Geography_Spain     -0.051175
Gender_Male         -0.146442
Name: Exited, dtype: float64

# Test Models on split train data

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [11]:
X = df_clean.drop('Exited',axis=1)
y = df_clean['Exited']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=814)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier 
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

In [14]:
rfc = RandomForestClassifier(n_estimators=100, class_weight="balanced")
t1 = time.time()
rfc.fit(X_train_scaled, y_train)
total = time.time() - t1
print(f'RFC took {total} s')
rfc_pred = rfc.predict_proba(X_test_scaled)

RFC took 29.205689668655396 s


In [15]:
roc_auc_score(y_test, rfc_pred[:,1])

0.8777939151692029

In [16]:
hgbc = HistGradientBoostingClassifier(class_weight="balanced")
t1 = time.time()
hgbc.fit(X_train_scaled, y_train)
total = time.time() - t1
print(f'HGBC took {total} s')
hbgc_pred = hgbc.predict_proba(X_test_scaled)

HGBC took 1.4499611854553223 s


In [17]:
roc_auc_score(y_test, hbgc_pred[:,1])

0.8897787207227863

In [18]:
lrc = LogisticRegressionCV(class_weight="balanced")
t1 = time.time()
lrc.fit(X_train_scaled, y_train)
total = time.time() - t1
print(f'LogReg took {total} s')
lrc_pred = lrc.predict_proba(X_test_scaled)

LogReg took 2.106045961380005 s


In [19]:
roc_auc_score(y_test, lrc_pred[:,1])

0.8191787386468399

# Retrain with all data with best model

In [20]:
X.head()

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,0,668,33.0,3,0.0,2,1.0,0.0,181449.97,False,False,True
1,1,627,33.0,1,0.0,2,1.0,1.0,49503.5,False,False,True
2,2,678,40.0,10,0.0,2,1.0,0.0,184866.69,False,False,True
3,3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,False,False,True
4,4,716,33.0,5,0.0,2,1.0,1.0,15068.83,False,True,True


In [21]:
scaler.fit(X)
X_scaled = scaler.transform(X)
X_test_clean = process_data_dummies(df_test)
X_test_clean_scaled = scaler.transform(X_test_clean)

In [22]:
# Final Train
hgbc_final = HistGradientBoostingClassifier(class_weight="balanced")
t1 = time.time()
hgbc_final.fit(X_scaled, y)
total = time.time() - t1
print(f'Final HGBC took {total} s')

Final HGBC took 1.9371869564056396 s


In [23]:
X_test_clean.head()

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,165034,586,23.0,2,0.0,2,0.0,1.0,160976.75,False,False,False
1,165035,683,46.0,2,0.0,1,1.0,0.0,72549.27,False,False,False
2,165036,656,34.0,7,0.0,2,1.0,0.0,138882.09,False,False,False
3,165037,681,36.0,8,0.0,1,1.0,0.0,113931.57,False,False,True
4,165038,752,38.0,10,121263.62,1,1.0,0.0,139431.0,True,False,True


In [24]:
final_preds = hgbc_final.predict_proba(X_test_clean.values)
final_preds[:,1]

array([0.62644852, 0.11717503, 0.69268163, ..., 0.69268163, 0.02576085,
       0.21020874])

In [25]:
preds_df = pd.DataFrame(final_preds[:,1].round(1),columns=['Exited'])
preds_df

Unnamed: 0,Exited
0,0.6
1,0.1
2,0.7
3,0.1
4,0.3
...,...
110018,0.1
110019,0.1
110020,0.7
110021,0.0


In [26]:
type(df['id'])

pandas.core.series.Series

In [27]:
output_df = pd.concat([X_test_clean['id'],preds_df],axis=1)
output_df

Unnamed: 0,id,Exited
0,165034,0.6
1,165035,0.1
2,165036,0.7
3,165037,0.1
4,165038,0.3
...,...,...
110018,275052,0.1
110019,275053,0.1
110020,275054,0.7
110021,275055,0.0


In [28]:
output_df.to_csv('/kaggle/working/submission.csv', index=False)