<a href="https://www.kaggle.com/code/andrewbremner/bankchurn-s4e1-sklearn-models-vs-pytorch?scriptVersionId=158356019" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.utils import make_grid
import time

from sklearn.metrics import confusion_matrix, log_loss, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


# Explore Data

In [2]:
df = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')

In [3]:
# No missing data
df.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [4]:
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [5]:
df['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [6]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [7]:
def process_data_dummies(df):
    df = df.drop(['CustomerId','Surname'], axis=1)
    df = pd.get_dummies(df, drop_first=True)
    return df

In [8]:
df_clean = process_data_dummies(df)
df_clean

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,0,668,33.0,3,0.00,2,1.0,0.0,181449.97,0,False,False,True
1,1,627,33.0,1,0.00,2,1.0,1.0,49503.50,0,False,False,True
2,2,678,40.0,10,0.00,2,1.0,0.0,184866.69,0,False,False,True
3,3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,False,False,True
4,4,716,33.0,5,0.00,2,1.0,1.0,15068.83,0,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,667,33.0,2,0.00,1,1.0,1.0,131834.75,0,False,True,False
165030,165030,792,35.0,3,0.00,1,0.0,0.0,131834.45,0,False,False,True
165031,165031,565,31.0,5,0.00,1,1.0,1.0,127429.56,0,False,False,True
165032,165032,554,30.0,7,161533.00,1,0.0,1.0,71173.03,0,False,True,False


In [9]:
df_clean.corr()['Exited']

id                   0.002512
CreditScore         -0.027383
Age                  0.340768
Tenure              -0.019565
Balance              0.129743
NumOfProducts       -0.214554
HasCrCard           -0.022141
IsActiveMember      -0.210237
EstimatedSalary      0.018827
Exited               1.000000
Geography_Germany    0.211054
Geography_Spain     -0.051175
Gender_Male         -0.146442
Name: Exited, dtype: float64

### Add a step to change 'Exited' to catagory 

In [10]:
df_clean['Exited'] = df_clean['Exited'].astype('category')

# Test Models on split train data

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [12]:
ids = df_clean['id']
X = df_clean.drop(['Exited','id'],axis=1)
y = df_clean['Exited']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=814)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier 
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

In [15]:
# rfc = RandomForestClassifier(n_estimators=100, class_weight="balanced")
# t1 = time.time()
# rfc.fit(X_train_scaled, y_train)
# total = time.time() - t1
# print(f'RFC took {total} s')
# rfc_pred = rfc.predict_proba(X_test_scaled)

In [16]:
# roc_auc_score(y_test, rfc_pred[:,1])

In [17]:
hgbc = HistGradientBoostingClassifier(class_weight="balanced")
t1 = time.time()
hgbc.fit(X_train_scaled, y_train)
total = time.time() - t1
print(f'HGBC took {total} s')
hbgc_pred = hgbc.predict_proba(X_test_scaled)

HGBC took 1.5608630180358887 s


In [18]:
roc_auc_score(y_test, hbgc_pred[:,1])

0.8901537811102461

In [19]:
# lrc = LogisticRegressionCV(class_weight="balanced")
# t1 = time.time()
# lrc.fit(X_train_scaled, y_train)
# total = time.time() - t1
# print(f'LogReg took {total} s')
# lrc_pred = lrc.predict_proba(X_test_scaled)

In [20]:
# roc_auc_score(y_test, lrc_pred[:,1])

# Retrain with all data with best model

In [21]:
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,False,False,True
1,627,33.0,1,0.0,2,1.0,1.0,49503.5,False,False,True
2,678,40.0,10,0.0,2,1.0,0.0,184866.69,False,False,True
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,False,False,True
4,716,33.0,5,0.0,2,1.0,1.0,15068.83,False,True,True


In [22]:
final_scaler = StandardScaler()
final_scaler.fit(X)
X_scaled = final_scaler.transform(X)
X_Final_test_clean = process_data_dummies(df_test)

test_ids = X_Final_test_clean['id']
X_Final_test_clean=X_Final_test_clean.drop(['id'],axis=1)
X_Final_test_clean_scaled = final_scaler.transform(X_Final_test_clean)

In [23]:
# Final Train
hgbc_final = HistGradientBoostingClassifier(class_weight="balanced")
t1 = time.time()
hgbc_final.fit(X_scaled, y)
total = time.time() - t1
print(f'Final HGBC took {total} s')

Final HGBC took 1.9850430488586426 s


In [24]:
X_Final_test_clean.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,586,23.0,2,0.0,2,0.0,1.0,160976.75,False,False,False
1,683,46.0,2,0.0,1,1.0,0.0,72549.27,False,False,False
2,656,34.0,7,0.0,2,1.0,0.0,138882.09,False,False,False
3,681,36.0,8,0.0,1,1.0,0.0,113931.57,False,False,True
4,752,38.0,10,121263.62,1,1.0,0.0,139431.0,True,False,True


In [25]:
final_preds = hgbc_final.predict_proba(X_Final_test_clean_scaled)
final_preds[:,1]

array([0.12222746, 0.94977606, 0.07805849, ..., 0.07009426, 0.3765203 ,
       0.45381539])

In [26]:
preds_df = pd.DataFrame(final_preds[:,1].round(1),columns=['Exited'])
preds_df

Unnamed: 0,Exited
0,0.1
1,0.9
2,0.1
3,0.6
4,0.7
...,...
110018,0.1
110019,0.3
110020,0.1
110021,0.4


In [27]:
type(df['id'])

pandas.core.series.Series

In [28]:
output_df = pd.concat([test_ids,preds_df],axis=1)
output_df

Unnamed: 0,id,Exited
0,165034,0.1
1,165035,0.9
2,165036,0.1
3,165037,0.6
4,165038,0.7
...,...,...
110018,275052,0.1
110019,275053,0.3
110020,275054,0.1
110021,275055,0.4


In [29]:
#output_df.to_csv('/kaggle/working/submission.csv', index=False)

# Pytorch Version
### All data is continuous

In [30]:
# trainloader = DataLoader(X_train, batch_size=100, shuffle=True)
# testloader = DataLoader(X_test, batch_size=100, shuffle=False)

In [31]:
# Convert the scaled data to Tensor
conts_train = torch.tensor(X_train_scaled, dtype=torch.float)
conts_test = torch.tensor(X_test_scaled, dtype=torch.float)
y_train = torch.LongTensor(y_train.values)
y_test = torch.LongTensor(y_test.values)

In [32]:
y_train

tensor([0, 1, 1,  ..., 0, 0, 0])

In [33]:
class Model(nn.Module):
    def __init__(self, n_cont, out_sz, layers, p=0.5):
#     def __init__(self, in_features=11, h1=21, h2=15, out_features=2):
        super().__init__()
        # Set up the embedding, dropout, and batch normalization layer attributes
#         self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        # Assign a variable to hold a list of layers
        layerlist = []
        for i in layers:
            layerlist.append(nn.Linear(n_cont,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_cont = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
        # Convert the list of layers into an attribute
        self.layers = nn.Sequential(*layerlist)
            
        
    def forward(self, x_cont):
#         embeddings = []
#         for i,e in enumerate(self.embeds):
#             embeddings.append(e(x_cat[:,i]))
#         x = torch.cat(embeddings, 1)
        # Perform an initial dropout on the embeddings
#         x = self.emb_drop(x)
        
        # Normalize the incoming continuous data
#         x_cont = self.bn_cont(x_cont)
#         x = torch.cat([x, x_cont], 1)
        # Set up model layers
        x = self.layers(x_cont)
    
        return x

In [34]:
# def ROC_loss(output, target):
#     return Tensor.roc_auc_score(output, target)

In [35]:
weights = torch.tensor([df['Exited'].value_counts()[0]/sum(df['Exited'].value_counts()),
                        df['Exited'].value_counts()[1]/sum(df['Exited'].value_counts())], dtype=torch.float)

In [36]:
model = Model(n_cont=conts_train.shape[1],
              out_sz=2, 
              layers=[21,15], 
              p = 0.5)

criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [37]:
model

Model(
  (emb_drop): Dropout(p=0.5, inplace=False)
  (bn_cont): BatchNorm1d(11, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=11, out_features=21, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(21, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=21, out_features=15, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=15, out_features=2, bias=True)
  )
)

In [38]:
epochs = 201
losses = []

for i in range(epochs):
    i+=1
    y_pred = model.forward(conts_train)
    loss = criterion(y_pred, y_train)
    losses.append(loss)
    
    # a neat trick to save screen space:
    if i%20 == 1:
        print(f'epoch: {i:2}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

epoch:  1  loss: 0.98861635
epoch: 21  loss: 0.45825261
epoch: 41  loss: 0.24293374
epoch: 61  loss: 0.20789695
epoch: 81  loss: 0.19856714
epoch: 101  loss: 0.19505404
epoch: 121  loss: 0.19259396
epoch: 141  loss: 0.19053498
epoch: 161  loss: 0.18931663
epoch: 181  loss: 0.18845282
epoch: 201  loss: 0.18677405


In [39]:
model.eval()
with torch.no_grad():
    y_val = model.forward(conts_test)
    loss = criterion(y_val, y_test)
print(f'{loss:.8f}')

0.16944125


In [40]:
predictions = [y_val[i].argmax().item() for i in range(len(y_val))] 

In [41]:
probs = torch.sigmoid(y_val)

In [42]:
probs[:,1].numpy()

array([0.29829702, 0.08827241, 0.07432546, ..., 0.29876179, 0.40768498,
       0.05446461], dtype=float32)

In [43]:
roc_auc_score(y_test, predictions)

0.5644150775020405

In [44]:
# CODE HERE
rows = len(y_test)
correct = 0
for i in range(rows):
    if predictions[i] == y_test[i]:
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')


26822 out of 33007 = 81.26% correct


In [45]:
# Repeat from part 1
final_scaler = StandardScaler()
final_scaler.fit(X)
X_clean = process_data_dummies(df_test)
final_ids = X_clean['id']
X_clean = X_clean.drop('id', axis=1)
X_scaled = final_scaler.transform(X_clean)

X_final_tensor = torch.tensor(X_scaled, dtype=torch.float)

In [46]:
model.eval()
with torch.no_grad():
     y_final_vals = model.forward(X_final_tensor)

In [47]:
final_probs = torch.sigmoid(y_final_vals)

In [48]:
preds_df = pd.DataFrame(final_probs[:,1].numpy().round(1),columns=['Exited'])
preds_df

Unnamed: 0,Exited
0,0.1
1,0.5
2,0.1
3,0.2
4,0.3
...,...
110018,0.1
110019,0.1
110020,0.1
110021,0.2


In [49]:
nn_output_df = pd.concat([final_ids,preds_df],axis=1)
nn_output_df

Unnamed: 0,id,Exited
0,165034,0.1
1,165035,0.5
2,165036,0.1
3,165037,0.2
4,165038,0.3
...,...,...
110018,275052,0.1
110019,275053,0.1
110020,275054,0.1
110021,275055,0.2


In [50]:
nn_output_df.to_csv('/kaggle/working/submission.csv', index=False)