In [1]:
import pandas as pd
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
import os
import gensim
import time
import matplotlib.pyplot as plt

In [2]:
import torch
torch.cuda.empty_cache()

In [3]:
CONSTANTS = {
    'label': 'default payment next month',
    'path': 'ppppreprocessed (1).csv' , #
    'sequence_features': ['PAY_', 'BILL_AMT', 'PAY_AMT'],
    'non_sequence_features': ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE'],
    'length': 6,
    'batch_size': 64,
}

In [4]:
class Dataset_seq(Dataset):
  def __init__(self, path):
    self.data = pd.read_csv(path)
    self.label = CONSTANTS['label']
    self.features = list(self.data.columns)
    self.features.remove(self.label)
  
  def __getitem__(self, index):
    ex = self.data.iloc[index]
    label = ex[self.label]
    features = ex[self.features]

    non_sequential_features = list(features[CONSTANTS['non_sequence_features']].values)

    all_features = []

    for i in range(1, 1 + CONSTANTS['length']):
      seq_i = []
      for base_feature in CONSTANTS['sequence_features']:
        seq_i.append(features[f'{base_feature}{i}'])
      
      seq_i += non_sequential_features
      all_features.append(seq_i)
    
    all_features = np.array(all_features)
    all_features = all_features.astype(np.double)
    all_features = torch.from_numpy(all_features)

    all_features = all_features.type(torch.float)

    return torch.flatten(all_features), torch.tensor(label, dtype=torch.long)
  
  def __len__(self):
    return self.data.shape[0]

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data = Dataset_seq(CONSTANTS['path'])

In [7]:
# Training 
from collections import Counter
batch_sz = CONSTANTS['batch_size']

data_size = len(data)
print(len(data))
print(int(len(data)*0.80), int(len(data)*0.20), int(len(data)*0.80) + int(len(data)*0.20))
train_set, valid_set = random_split(data, [int(len(data)*0.80)+1, int(len(data)*0.20)])

train_loader = DataLoader(train_set, batch_size=batch_sz, shuffle=True)
test_loader = DataLoader(valid_set, batch_size=batch_sz, shuffle=True)

20758
16606 4151 20757


# Model

In [8]:
d = pd.read_csv(CONSTANTS['path']).drop(['default payment next month', 'ID'], axis=1)
unique_tokens = pd.unique(d[d.columns].values.ravel('K'))

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


In [9]:
def metrics(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Calculate F1 score. Can work with gpu tensors
    
    The original implmentation is written by Michal Haltuf on Kaggle.
    
    Returns
    -------
    torch.Tensor
        `ndim` == 1. 0 <= val <= 1
    
    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    
    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2
    
    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)
        
    
    tp = (y_true * y_pred).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum().to(torch.float32)
    
    epsilon = 1e-7
    
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    
    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1.requires_grad = is_training
    
    return torch.round(precision, decimals=3), torch.round(recall, decimals=3), torch.round(f1, decimals=3)

In [10]:
import torch
import torch.nn as nn

class ClassificationTransformer(nn.Module):
  def __init__(self, input_dim, output_dim, hidden_dim, n_layers, n_heads, dropout, max_sequence_length, device):
    super().__init__()

    self.embedding = nn.Embedding(input_dim, hidden_dim)
    self.encoder_layers = nn.TransformerEncoderLayer(hidden_dim, n_heads, hidden_dim, dropout)
    self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, n_layers)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.positional_encoding = self.create_positional_encoding(max_sequence_length, hidden_dim, device)
    self.to(device)

  def create_positional_encoding(self, max_sequence_length, hidden_dim, device):
    positional_encoding = torch.zeros(max_sequence_length, hidden_dim, device=device)
    alpha = 1 / (10000 ** (torch.arange(0, hidden_dim, 2, dtype=torch.float, device=device) / hidden_dim))
    positional_encoding[:, 0::2] = torch.sin(torch.arange(0, max_sequence_length, device=device).unsqueeze(1) * alpha)
    positional_encoding[:, 1::2] = torch.cos(torch.arange(0, max_sequence_length, device=device).unsqueeze(1) * alpha)
    return positional_encoding

  def add_positional_encoding(self, x):
    batch_size, sequence_length = x.shape[:2]
    x = x.to(self.positional_encoding.device)
    positional_encoding = self.positional_encoding[:sequence_length]
    positional_encoding = positional_encoding.repeat(batch_size, 1, 1)
    return x + positional_encoding

  def forward(self, x):
    x = x.to(self.positional_encoding.device)
    x = self.add_positional_encoding(self.embedding(x))
    x = self.transformer_encoder(x)
    x = x.mean(dim=1)
    return self.fc(x)

model = ClassificationTransformer(len(unique_tokens), 2, hidden_dim=320, n_layers=5, n_heads=8, dropout=0.2, max_sequence_length=100, device=device).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999), lr=0.1, weight_decay=.0001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
epochs = 10

In [14]:
#train
train_total_correct = []
train_total_loss = []
train_precision = []
train_recall = []
train_f1 = []
val_total_correct = []
val_total_loss = []
val_precision = []
val_recall = []
val_f1 = []

for epoch in range(epochs):
  model.train()
  precision, recall, f1 = 0, 0, 0
  preds, labelss = [], []
  total_loss = 0
  total_correct = 0
  for inputs, labels in train_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(inputs.to(torch.int64))
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    total_correct += (outputs.argmax(dim=1) == labels).sum().item()

    _, pred = torch.max(outputs, 1)
    preds.extend([g.item() for g in list(pred.detach().cpu())])
    labelss.extend(labels.tolist())
  
  lr_scheduler.step()
  precision, recall, f1 = metrics(torch.tensor(preds), torch.tensor(labelss))

  train_total_correct.append(total_correct / len(train_loader))
  train_total_loss.append(total_loss / len(train_loader))
  train_precision.append(precision)
  train_recall.append(recall)
  train_f1.append(f1)

  print(f"""TRAIN --> Epoch {epoch+1}: loss = {total_loss / len(train_loader):.4f}, accuracy = {total_correct / len(train_loader):.4f}, precision = {precision:.4f},, recall = {recall:.4f}, f1 = {f1:.4f}""")

  with torch.no_grad():
    precision, recall, f1 = 0, 0, 0
    preds, labelss = [], []
    model.eval()
    total_loss = 0
    total_correct = 0
    for inputs, labels in test_loader:
      inputs, labels = inputs.to(device), labels.to(device)
      outputs = model(inputs.to(torch.int64))
      total_loss += criterion(outputs, labels).item()
      total_correct += (outputs.argmax(dim=1) == labels).sum().item()

      _, pred = torch.max(outputs, 1)
      preds.extend(list(pred.detach().cpu()))
      labelss.extend(labels.tolist())
    
    precision, recall, f1 = metrics(torch.tensor(preds), torch.tensor(labelss))
    val_total_correct.append(total_correct / len(test_loader))
    val_total_loss.append(total_loss / len(test_loader))
    val_precision.append(precision)
    val_recall.append(recall)
    val_f1.append(f1)

  print(f"""VAL --> Epoch {epoch+1}: loss = {total_loss / len(test_loader):.4f}, accuracy = {total_correct / len(test_loader):.4f}, precision = {precision:.4f},, recall = {recall:.4f}, f1 = {f1:.4f}""")


TRAIN --> Epoch 1, loss = 0.4470, accuracy = 0.7765, precision = 0.7577, recall = 0.8780, f1 = 0.8022
VAL --> Epoch 1, loss = 0.4682, accuracy = 0.7861, precision = 0.7576, recall = 0.8568, f1 = 0.8077
TRAIN --> Epoch 2, loss = 0.4492, accuracy = 0.7861, precision = 0.7594, recall = 0.8796, f1 = 0.8019
VAL --> Epoch 2, loss = 0.4681, accuracy = 0.7811, precision = 0.7586, recall = 0.8663, f1 = 0.8047
TRAIN --> Epoch 3, loss = 0.4499, accuracy = 0.7955, precision = 0.7584, recall = 0.8780, f1 = 0.8045
VAL --> Epoch 3, loss = 0.4541, accuracy = 0.7916, precision = 0.7592, recall = 0.8680, f1 = 0.8154
TRAIN --> Epoch 4, loss = 0.4497, accuracy = 0.7918, precision = 0.7620, recall = 0.8827, f1 = 0.8115
VAL --> Epoch 4, loss = 0.4417, accuracy = 0.7985, precision = 0.7622, recall = 0.8764, f1 = 0.8272
TRAIN --> Epoch 5, loss = 0.4352, accuracy = 0.8117, precision = 0.7785, recall = 0.8951, f1 = 0.8272
VAL --> Epoch 5, loss = 0.4343, accuracy = 0.8142, precision = 0.7675, recall = 0.8830, f1

In [None]:
import matplotlib.pyplot as plt

plt.semilogy(np.array(range(len(train_total_loss))), train_total_loss)
plt.semilogy(np.array(range(len(val_total_loss))), val_total_loss)
plt.legend(('Train', 'Test',))
plt.ylabel('Training loss')
plt.xlabel('Epoch')
plt.show()

In [None]:
plt.semilogy(np.array(range(len(train_total_correct))), train_total_correct)
plt.semilogy(np.array(range(len(val_total_correct))), val_total_correct)
plt.legend(('Train', 'Test',))
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()

In [None]:
plt.semilogy(np.array(range(len(train_f1))), train_f1)
plt.semilogy(np.array(range(len(val_f1))), val_f1)
plt.legend(('Train', 'Test',))
plt.ylabel('F1')
plt.xlabel('Epoch')
plt.show()