In [152]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
import torch.optim as optim
import time
from tqdm import tqdm_notebook as tqdm_loop

## Preparing the data

In [30]:
# Load the raw spreadsheet; header=1 takes the column labels from the
# second row of the sheet (row index 1).
df = pd.read_excel(
    io="data/default of credit card clients.xls",
    header=1,
)

In [31]:
# Display the loaded frame (the notebook renders a truncated head/tail preview).
df

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [32]:
# Count how many rows fall into each class of the target column.
df.loc[:, "default payment next month"].value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

In [38]:
# List the column labels; their positions drive the iloc slices in the cells below.
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [42]:
# Preview the demographic columns (SEX through AGE, i.e. positions 2-5).
df.loc[:, "SEX":"AGE"]

Unnamed: 0,SEX,EDUCATION,MARRIAGE,AGE
0,2,2,1,24
1,2,2,2,26
2,2,2,2,34
3,2,2,1,37
4,1,2,1,57
...,...,...,...,...
29995,1,3,1,39
29996,1,3,2,43
29997,1,2,2,37
29998,1,3,1,41


In [55]:
# One-hot encode the categorical columns at positions 2-4 (SEX, EDUCATION, MARRIAGE).
# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and removed in 1.4, so passing it breaks on current releases,
# while this form works on both old and new versions.
ohe = OneHotEncoder()
df_sex_ed_mar_encoded = ohe.fit_transform(df.iloc[:, 2:5]).toarray()

In [57]:
# Wrap the encoded array in a DataFrame with one readable label per one-hot
# column (13 labels: 2 sex*, 7 ed*, 4 mar*).
# Fixed: the input used to be `df_sex_ed_mar_age_encoded`, a name that no cell
# defines, so this cell raised NameError on a fresh kernel.
df_sex_ed_mar_encoded = pd.DataFrame(
    df_sex_ed_mar_encoded,
    columns=[
        "sex1", "sex2",
        "ed0", "ed1", "ed2", "ed3", "ed4", "ed5", "ed6",
        "mar0", "mar1", "mar2", "mar3",
    ],
)

In [62]:
# One-hot encode the repayment-status columns at positions 6-11 (PAY_0 ... PAY_6).
# Default (sparse) encoder output is densified with .toarray() instead of using
# the `sparse=` keyword, which scikit-learn renamed to `sparse_output=` in 1.2
# and removed in 1.4.
ohe = OneHotEncoder()
df_pay_0_6_encoded = ohe.fit_transform(df.iloc[:, 6:12]).toarray()

In [75]:
# Turn the encoded array into a DataFrame (columns get default integer labels).
df_pay_0_6_encoded = pd.DataFrame(data=df_pay_0_6_encoded)

In [78]:
# Prefix every column label with "pay_" so the one-hot columns are recognisable.
df_pay_0_6_encoded.columns = df_pay_0_6_encoded.columns.map(
    lambda label: "pay_" + str(label)
)

In [92]:
# Reassemble the table column-wise: the first two raw columns, the one-hot
# SEX/EDUCATION/MARRIAGE block, the column at position 5, the one-hot PAY_*
# block, and finally every raw column from position 12 onwards.
df_encoded = pd.concat(
    [
        df.iloc[:, :2],
        df_sex_ed_mar_encoded,
        df.iloc[:, 5:6],
        df_pay_0_6_encoded,
        df.iloc[:, 12:],
    ],
    axis="columns",
)

In [93]:
# Keep the target column aside before it is removed from the feature table.
y_values = df_encoded.loc[:, "default payment next month"]

In [94]:
# Remove the row identifier and the target so only model features remain.
df_encoded.drop(columns=["ID", "default payment next month"], inplace=True)

In [95]:
# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on the full table before the train/test
# split performed further down, so test rows influence the scaling — consider
# fitting on the training part only.
min_max = MinMaxScaler()
min_max.fit(df_encoded)
df_normalized = min_max.transform(df_encoded)

In [96]:
# Put the scaled array back into a DataFrame carrying the feature names.
df_normalized = pd.DataFrame(
    data=df_normalized,
    columns=df_encoded.columns,
)

In [99]:
# Peek at the first five scaled rows.
df_normalized.head(n=5)

Unnamed: 0,LIMIT_BAL,sex1,sex2,ed0,ed1,ed2,ed3,ed4,ed5,ed6,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,0.010101,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.086723,0.160138,0.080648,0.260979,0.0,0.000409,0.0,0.0,0.0,0.0
1,0.111111,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.087817,0.16322,0.084074,0.263485,0.0,0.000594,0.001116,0.00161,0.0,0.003783
2,0.080808,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.093789,0.173637,0.09547,0.272928,0.001738,0.000891,0.001116,0.00161,0.002345,0.009458
3,0.040404,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.113407,0.186809,0.109363,0.283685,0.00229,0.001199,0.001339,0.001771,0.002506,0.001892
4,0.040404,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.10602,0.179863,0.099633,0.275681,0.00229,0.021779,0.01116,0.014493,0.001615,0.001284


## Training a model

In [171]:
class model_credit(nn.Module):
    """Feed-forward binary classifier that returns one raw logit per input row.

    The network is a stack of ``num_layers`` blocks (Linear -> ReLU -> Dropout)
    whose widths halve at every step, starting at ``first_hidden_layer_dim``,
    followed by a single-unit linear prediction layer. ``forward`` applies no
    sigmoid.

    Args:
        input_size: number of input features per row.
        num_layers: number of hidden blocks.
        dropout: dropout rate between fully connected layers.
        first_hidden_layer_dim: width of the first hidden layer; block ``i``
            has ``first_hidden_layer_dim // 2**i`` units. Defaults to 512.

    Raises:
        ValueError: if the halving would produce a hidden layer with no units.
    """

    # The header used to contain `first hidden_layer_dim` (a syntax error) while
    # the body hard-coded 512. The width is now a real, trailing keyword
    # parameter so positional calls of the form (input_size, num_layers, dropout)
    # keep working.
    def __init__(self, input_size, num_layers, dropout, first_hidden_layer_dim=512):
        super().__init__()
        self.dropout = dropout

        MLP_modules = []
        for i in range(num_layers):
            width = first_hidden_layer_dim // (2 ** i)
            if width < 1:
                raise ValueError(
                    "hidden layer {} would have no units "
                    "(first_hidden_layer_dim={}, num_layers={})".format(
                        i, first_hidden_layer_dim, num_layers
                    )
                )
            MLP_modules.append(nn.Linear(input_size, width))
            MLP_modules.append(nn.ReLU())
            MLP_modules.append(nn.Dropout(p=self.dropout))
            # The next block consumes what this one produced.
            input_size = width
        self.MLP_layers = nn.Sequential(*MLP_modules)

        # With num_layers == 0 the prediction layer reads the raw features.
        self.predict_layer = nn.Linear(input_size, 1)

        self._init_weight_()

    def _init_weight_(self):
        """Initialise the weights: Xavier for hidden layers, Kaiming for the output, zero biases."""
        for m in self.MLP_layers:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.predict_layer.weight,
                                 a=1, nonlinearity='sigmoid')

        for m in self.modules():
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return a flat tensor of logits, one per row of ``x``."""
        output_MLP = self.MLP_layers(x)
        prediction = self.predict_layer(output_MLP)
        return prediction.view(-1)

In [123]:
# Hold out 20% of the rows for testing, stratified on the target so both parts
# keep the same class ratio. random_state pins the shuffle so the held-out set
# is the same on every run (it used to change on each execution).
x_tr, x_te, y_tr, y_te = train_test_split(
    df_normalized,
    y_values,
    test_size=0.2,
    stratify=y_values,
    random_state=42,
)

In [129]:
# Training split: features as float32 tensors, targets with their original dtype.
features_tensor_tr = torch.tensor(x_tr.to_numpy(), dtype=torch.float)
target_tensor_tr = torch.tensor(y_tr.to_numpy())
# Test split, converted the same way.
features_tensor_te = torch.tensor(x_te.to_numpy(), dtype=torch.float)
target_tensor_te = torch.tensor(y_te.to_numpy())

In [130]:
# Pair each feature row with its target so the loaders can yield (features, target) batches.
train_dataset, test_dataset = (
    data_utils.TensorDataset(features_tensor_tr, target_tensor_tr),
    data_utils.TensorDataset(features_tensor_te, target_tensor_te),
)

In [136]:
# Batches of 128 rows; only the training loader shuffles and uses worker processes.
train_loader = data_utils.DataLoader(
    dataset=train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
)
test_loader = data_utils.DataLoader(
    dataset=test_dataset,
    batch_size=128,
    shuffle=False,
)

In [161]:
def accuracy(predicted_logits, reference):
    """Compute the ratio of correctly predicted labels.

    Args:
        predicted_logits: tensor of raw (pre-sigmoid) scores, one per sample.
        reference: tensor of reference labels with the same number of elements.

    Returns:
        The fraction of samples whose thresholded prediction (sigmoid > 0.5)
        equals the reference label, as a Python float; ``nan`` when the batch
        is empty.
    """
    # Threshold in one tensor operation instead of a Python loop over elements,
    # which forced one device synchronisation per sample on CUDA.
    pred_labels = (torch.sigmoid(predicted_logits.detach()) > 0.5).cpu()
    correct_predictions = pred_labels == reference.detach().cpu()
    total = correct_predictions.numel()
    if total == 0:
        return float("nan")
    # Integer count / integer size keeps the ratio exact in double precision.
    return correct_predictions.sum().item() / total

In [172]:
### training
# Use the GPU when one is available; build a model with 4 hidden blocks and no
# dropout, trained on raw logits with BCE-with-logits and Adam.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model_credit(df_normalized.shape[1], 4, 0.0)
model.to(device)
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(10):
    model.train() # Enable dropout (if have).
    start_time = time.time()
    for features, target in train_loader:
        features = features.to(device)
        # BCEWithLogitsLoss needs float targets; the loader yields integer labels.
        target = target.float().to(device)

        optimizer.zero_grad()
        prediction = model(features)
        loss = loss_function(prediction, target)
        loss.backward()
        optimizer.step()

    # Evaluate on the held-out set after every epoch, without gradient tracking.
    model.eval()
    test_accs = []
    weights_for_avg = []
    with torch.no_grad():
        for batch_x, batch_y in tqdm_loop(test_loader):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            prediction = model(batch_x)
            acc = accuracy(prediction, batch_y)
            test_accs.append(acc)
            weights_for_avg.append(len(batch_x))
    # The reported time covers both the training pass and the evaluation pass.
    elapsed_time = time.time() - start_time
    print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
            time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
    # Weight each batch accuracy by its size: the last batch can be smaller, so
    # a plain mean (as used before, with the weights collected but ignored)
    # does not equal the accuracy over the whole test set.
    print("test acc: {:.3f}".format(np.average(test_accs, weights=weights_for_avg)))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_x, batch_y in tqdm_loop(test_loader):


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 000 is: 00: 00: 00
test acc: 0.814


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 001 is: 00: 00: 00
test acc: 0.822


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 002 is: 00: 00: 00
test acc: 0.821


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 003 is: 00: 00: 00
test acc: 0.823


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 004 is: 00: 00: 00
test acc: 0.822


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 005 is: 00: 00: 00
test acc: 0.822


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 006 is: 00: 00: 00
test acc: 0.820


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 007 is: 00: 00: 00
test acc: 0.820


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 008 is: 00: 00: 00
test acc: 0.822


  0%|          | 0/47 [00:00<?, ?it/s]

The time elapse of epoch 009 is: 00: 00: 00
test acc: 0.820


In [175]:
# Show the module tree of the trained network.
model

model_credit(
  (MLP_layers): Sequential(
    (0): Linear(in_features=91, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.0, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.0, inplace=False)
    (6): Linear(in_features=256, out_features=128, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.0, inplace=False)
    (9): Linear(in_features=128, out_features=64, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.0, inplace=False)
  )
  (predict_layer): Linear(in_features=64, out_features=1, bias=True)
)

tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0], device='cuda:0')