In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

In [2]:
device = torch.device('cuda') if torch. cuda. is_available() else torch.device('cpu')

In [3]:
TrainUrl = "https://scale-static-assets.s3-us-west-2.amazonaws.com/ml-interview/expand/train.txt"

In [4]:
data = pd.read_csv(TrainUrl, sep='=', header=None, names = ["Input", "Output"])

In [5]:
print(data)

                     Input              Output
0         (7-3*z)*(-5*z-9)      15*z**2-8*z-63
1                  -9*s**2             -9*s**2
2            (2-2*n)*(n-1)       -2*n**2+4*n-2
3                     x**2                x**2
4             (4-x)*(x-23)       -x**2+27*x-92
...                    ...                 ...
999995   (2*k+14)*(7*k-10)    14*k**2+78*k-140
999996      (t-7)*(3*t+28)      3*t**2+7*t-196
999997   (10-2*i)*(6*i-10)   -12*i**2+80*i-100
999998  (-5*k-31)*(8*k+12)  -40*k**2-308*k-372
999999   (-8*i-13)*(3*i+4)    -24*i**2-71*i-52

[1000000 rows x 2 columns]


In [6]:
# input sequence as list of strings format
data_list_input = data['Input'].values.tolist()
# output sequence as list of strings format
data_list_output = data['Output'].values.tolist()

In [7]:
# size of the dataset
size = data.shape[0]

In [8]:
# string to index mapping
stoi = {}
# index to string mapping
itos = {}

In [9]:
# Padding token
stoi['<PAD>'] = 0
itos[0] = '<PAD>'
# Out of vocabulary token
stoi['<OOV>'] = 1
itos[1] = '<OOV>'
# Start of sentence token
stoi['<SOS>'] = 2
itos[2] = '<SOS>'
# End of sentence token
stoi['<EOS>'] = 3
itos[3] = '<EOS>'

In [10]:
# traverse data and build vocabulary
for i in range(0, size):
  
  for j in range(0, len(data_list_input[i])):
    x = stoi.get(data_list_input[i][j], -1)
    index = len(stoi.keys())
    if (x == -1):
      stoi[data_list_input[i][j]] = index
      itos[index] = data_list_input[i][j]

  for j in range(0, len(data_list_output[i])):
    x = stoi.get(data_list_output[i][j], -1)
    index = len(stoi.keys())
    if (x == -1):
      stoi[data_list_output[i][j]] = index
      itos[index] = data_list_output[i][j]


In [11]:
# Analyze vocabulary
print(stoi.keys())

dict_keys(['<PAD>', '<OOV>', '<SOS>', '<EOS>', '(', '7', '-', '3', '*', 'z', ')', '5', '9', '1', '2', '8', '6', 's', 'n', '+', '4', 'x', 'c', '0', 'k', 'o', 'j', 'h', 'y', 'i', 't', 'a'])


In [12]:
# input sequence is append with <EOS> in the end
# output sequence is appended with <SOS> in start and <EOS> in the end
# both input and output sequences are appended with <PAD> to get the same length which is max_seq_length

In [13]:
# Rounding off max seq len to 32 after including <SOS>, <EOS> and <PAD> 
max_seq_len = 32
embedding_dimension = 256
b_size = 64

In [14]:
# Convert input data from string to integer format
# with the help of stoi vocabulary built previously

data_list_input_index = []
data_list_input_padding = []
data_list_output_index = []
data_list_output_padding = []

for i in range(0, len(data_list_input)):

  x = []
  x_pad = []
  
  for j in range(0, len(data_list_input[i])):
    x.append(stoi[data_list_input[i][j]])
    x_pad.append(False)

  # Append <EOS> token in the end
  x.append(stoi['<EOS>'])
  x_pad.append(False)

  while(len(x)<max_seq_len):
    x.append(stoi['<PAD>'])
    x_pad.append(True)

  data_list_input_index.append(x)
  data_list_input_padding.append(x_pad)

  y = []
  y_pad = []

  # Append <SOS> token in the end
  y.append(stoi['<SOS>'])
  y_pad.append(False)

  for j in range(0, len(data_list_output[i])):
    y.append(stoi[data_list_output[i][j]])
    y_pad.append(False)

  # Append <EOS> token in the end
  y.append(stoi['<EOS>'])
  y_pad.append(False)
  while(len(y)<max_seq_len):
    y.append(stoi['<PAD>'])
    y_pad.append(True)
  data_list_output_index.append(y)
  data_list_output_padding.append(y_pad)
  

In [15]:
# Define custom dataset from PyTorch Dataset class
class MyDataset (Dataset):

  def __init__(self, data_list_input_index, data_list_output_index, data_list_input_padding, data_list_output_padding):
    # Lists of lists
    self.X = data_list_input_index 
    self.Y = data_list_output_index
    self.X_pad_mask = data_list_input_padding
    self.Y_pad_mask = data_list_output_padding

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return torch.tensor(self.X[idx], dtype=torch.int), torch.tensor(self.Y[idx], dtype=torch.int) , torch.tensor(self.X_pad_mask[idx], dtype=torch.bool), torch.tensor(self.Y_pad_mask[idx], dtype=torch.bool)

In [16]:
# Train, Validation and Test data sets
# 60/20/20 split of the original data
train_data = MyDataset(data_list_input_index[0:(int)(0.6*size)], data_list_output_index[0:(int)(0.6*size)], data_list_input_padding[0:(int)(0.6*size)], data_list_output_padding[0:(int)(0.6*size)])
val_data = MyDataset(data_list_input_index[(int)(0.6*size):(int)(0.8*size)], data_list_output_index[(int)(0.6*size):(int)(0.8*size)], data_list_input_padding[(int)(0.6*size):(int)(0.8*size)], data_list_output_padding[(int)(0.6*size):(int)(0.8*size)])
test_data = MyDataset(data_list_input_index[(int)(0.8*size):], data_list_output_index[(int)(0.8*size):], data_list_input_padding[(int)(0.8*size):], data_list_output_padding[(int)(0.8*size):])

In [17]:
# Train, Validation and Test data loaders
train_loader = DataLoader(train_data, batch_size = b_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size = b_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size = b_size, shuffle=False)

In [18]:
# Positional Embeddings - in accordance with Attention Is All You Need paper
pos_embeddings = np.zeros((max_seq_len, embedding_dimension), dtype=float)

for i in range(0, max_seq_len):
  for j in range(0, embedding_dimension):
    omega = 1 / (10000**((j - (j%2))/embedding_dimension))
    if (j % 2 == 0):
      pos_embeddings[i][j] = math.sin(i*omega)
    else:
      pos_embeddings[i][j] = math.cos(i*omega)

In [19]:
pos_embeddings = ((torch.from_numpy(pos_embeddings)).float()).to(device)

In [20]:
# Embeddings layer - used for both transformer encode and decoder
class Embed_Model(nn.Module):

  def __init__(self, number_embeddings, embedding_dimension, padding_index, pos_embeddings):
        
    super(Embed_Model, self).__init__()

    self.number_embeddings = number_embeddings
    self.embedding_dimension = embedding_dimension
    self.padding_index = padding_index
    self.pos_embeddings = pos_embeddings

    self.embedding_layer = nn.Embedding(num_embeddings=self.number_embeddings, embedding_dim=self.embedding_dimension, padding_idx=self.padding_index)

  def forward(self, x):
    # Input dimension - (batch size, seq length)
    # Sequence length is set as max_seq_length. Padding tokens are added in the end
    output = self.embedding_layer(x) + self.pos_embeddings
    # Output dimension - (batch size, seq length, embedding dim)
    # Add the positional embeddings and return the output
    return output

In [21]:
# Complete Model
# Contains both embedding layer and transformer model (as implemented by PyTorch)

class Model(nn.Module):

  def __init__(self, number_embeddings, embedding_dimension, feedforward_dimension, padding_index, num_heads, encoder_number_layers, decoder_number_layers, pos_embeddings):
        
    super(Model, self).__init__()

    self.embeddings = Embed_Model(number_embeddings, embedding_dimension, padding_index, pos_embeddings)
    self.transformer = nn.Transformer(d_model=embedding_dimension, nhead=num_heads, num_encoder_layers=encoder_number_layers, 
                                      num_decoder_layers=decoder_number_layers, dim_feedforward=feedforward_dimension, batch_first=True)
    self.fc = nn.Linear(embedding_dimension, number_embeddings)

# src: (N, S, E) if batch_first=True.

# tgt: (N, T, E) if batch_first=True.

# tgt_mask: lower triangular matrix marked with False - (T, T)
# [False, True]
# [False, False]

# src_key_padding_mask: mark true for padded tokens - (N, S)

# tgt_key_padding_mask: mark true for padded tokens - (N, T)

# memory_key_padding_mask: kept same as source key padding mask - (N, S)

  def forward(self, src, tgt, tgt_mask, src_key_padding_mask, tgt_key_padding_mask):
    
    src_embedding = self.embeddings(src)
    tgt_embedding = self.embeddings(tgt)
    output = self.transformer(src_embedding, tgt_embedding, tgt_mask=tgt_mask, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=src_key_padding_mask)
    output = self.fc(output)
    output = F.softmax(output, dim=2)
    return output

In [22]:
# All the hyperparameters (no of encoders, no of decoders, feed forward dimension, no attention heads) 
# were reduced by half to make the number of trainable parameters approximately 5 Million
number_embeddings = len(stoi.keys())
embedding_dimension = 256
padding_index = 0
num_heads = 4
encoder_number_layers = 3
decoder_number_layers = 3
feedforward_dimension = 1024

neural_net = Model(number_embeddings, embedding_dimension, feedforward_dimension, padding_index, num_heads, encoder_number_layers, decoder_number_layers, pos_embeddings)

In [23]:
neural_net = neural_net.to(device)

In [24]:
# SGD optimizer
optm = optim.SGD(neural_net.parameters(), lr = 0.1, momentum=0.9)
# Exponential Decay Scheduler
scheduler1 = torch.optim.lr_scheduler.ExponentialLR(optm, gamma=0.9)

In [25]:
# To prevent decoder from cheating (so it doesn't attend to future positions)
# Lower Triangular Matrix marked as false
tgt_mask_tensor = torch.zeros((max_seq_len, max_seq_len), dtype=torch.bool)

for i in range(0, max_seq_len):
  for j in range(0, max_seq_len):

    if (j<=i):
      tgt_mask_tensor[i][j] = False

    else:
      tgt_mask_tensor[i][j] = True


In [26]:
# Model Training with Early stopping as the Regularizer
# However I did not see increase in validation loss for first 20 epochs
# and hence we didn't have to use early stopping

epochs = 20

for i in range(0, epochs):

  print("*************** Epoch: ", i+1, " ********************")

  loss_t = 0

  for data in train_loader:

    optm.zero_grad()

    src = data[0].to(device)
    tgt = data[1].to(device)
    src_mask_padding = data[2].to(device)
    tgt_mask_padding = data[3].to(device)
    tgt_mask_tensor_input = tgt_mask_tensor.to(device)

    output = neural_net(src, tgt, tgt_mask_tensor_input, src_mask_padding, tgt_mask_padding)

    # shifted output
    tgt = torch.roll(tgt, -1, 1)
    tgt = tgt.view(-1)
    # mask <pad> tokens and <sos> as these won't contribute to
    # the cross entropy loss
    mask = torch.logical_not(torch.logical_or(tgt == 0, tgt == 2))
    mask_f = mask.float()

    output = output.view(-1, number_embeddings)
    ll = -torch.log(output[range(output.shape[0]), tgt.long()])
    ll = (ll * mask_f).masked_select(mask).mean()
    ll.backward()

    torch.nn.utils.clip_grad_norm_(neural_net.parameters(), 0.5)
    optm.step()

    loss_t = loss_t + (ll*b_size)

  loss_t = loss_t/600000
  
  loss_v = 0
  with torch.no_grad():
    for data in val_loader:
      
      src = data[0].to(device)
      tgt = data[1].to(device)
      src_mask_padding = data[2].to(device)
      tgt_mask_padding = data[3].to(device)
      tgt_mask_tensor_input = tgt_mask_tensor.to(device)

      output = neural_net(src, tgt, tgt_mask_tensor_input, src_mask_padding, tgt_mask_padding)

      # shifted output
      tgt = torch.roll(tgt, -1, 1)
      tgt = tgt.view(-1)
      # mask <pad> tokens and <sos> as these won't contribute to
      # the cross entropy loss
      mask = torch.logical_not(torch.logical_or(tgt == 0, tgt == 2))
      mask_f = mask.float()

      output = output.view(-1, number_embeddings)
      ll = -torch.log(output[range(output.shape[0]), tgt.long()])
      ll = (ll * mask_f).masked_select(mask).mean()

      loss_v = loss_v + (ll*b_size)

  loss_v = loss_v/200000

  print("Training Loss = "+str(loss_t)+"; Val Loss = "+str(loss_v))
  scheduler1.step()


*************** Epoch:  1  ********************
Training Loss = tensor(0.2963, device='cuda:0', grad_fn=<DivBackward0>); Val Loss = tensor(0.1274, device='cuda:0')
*************** Epoch:  2  ********************
Training Loss = tensor(0.0993, device='cuda:0', grad_fn=<DivBackward0>); Val Loss = tensor(0.0831, device='cuda:0')
*************** Epoch:  3  ********************
Training Loss = tensor(0.0643, device='cuda:0', grad_fn=<DivBackward0>); Val Loss = tensor(0.0534, device='cuda:0')
*************** Epoch:  4  ********************
Training Loss = tensor(0.0463, device='cuda:0', grad_fn=<DivBackward0>); Val Loss = tensor(0.0424, device='cuda:0')
*************** Epoch:  5  ********************
Training Loss = tensor(0.0370, device='cuda:0', grad_fn=<DivBackward0>); Val Loss = tensor(0.0340, device='cuda:0')
*************** Epoch:  6  ********************
Training Loss = tensor(0.0308, device='cuda:0', grad_fn=<DivBackward0>); Val Loss = tensor(0.0299, device='cuda:0')
*************** 

In [27]:
model_path = "/content/drive/My Drive/model2.pt"
torch.save(neural_net, model_path)

In [28]:
neural_net_loaded = Model(number_embeddings, embedding_dimension, feedforward_dimension, padding_index, num_heads, encoder_number_layers, decoder_number_layers, pos_embeddings)
neural_net_loaded = torch.load("/content/drive/My Drive/model2.pt")
neural_net_loaded.eval()
neural_net_loaded = neural_net_loaded.to(device)

In [29]:
# Check Train and Validation accuracy
# Different from Test accuracy since we know the correct output
# Hence accuracy check is done with teacher forcing
def pred_accuracy(data_loader, neural_net):

  correct = 0
  total = 0

  with torch.no_grad():
    for data in data_loader:

      src = data[0].to(device)
      tgt = data[1].to(device)
      src_mask_padding = data[2].to(device)
      tgt_mask_padding = data[3].to(device)
      tgt_mask_tensor_input = tgt_mask_tensor.to(device)

      # shape [batch, seq, num_embeddings]
      output = neural_net(src, tgt, tgt_mask_tensor_input, src_mask_padding, tgt_mask_padding)

      # shifted output
      tgt = torch.roll(tgt, -1, 1)
      # mask <pad> tokens and <sos> as these won't contribute to our accuracy prediction
      mask = torch.logical_not(torch.logical_or(tgt == 0, tgt == 2))

      # resulting preductions is [batch, seq]
      predictions = torch.argmax(output, 2)

      for i in range(0, src.shape[0]):
        tensor1 = tgt[i].masked_select(mask[i])
        tensor2 = predictions[i].masked_select(mask[i])
        total += 1
        if (torch.equal(tensor1, tensor2)):
          correct += 1
      
  return (correct/total)


In [30]:
print(pred_accuracy(train_loader, neural_net_loaded))

0.9926766666666667


In [31]:
print(pred_accuracy(val_loader, neural_net_loaded))

0.99127


In [32]:
# Check Test set accuracy
def test_pred_accuracy(data_loader, neural_net, batch_size=b_size):

  correct = 0
  total = 0

  with torch.no_grad():
    for data in data_loader:

      src = data[0].to(device)
      tgt = data[1].to(device)
      src_mask_padding = data[2].to(device)
      tgt_mask_padding = data[3].to(device)
      tgt_mask_tensor_input = tgt_mask_tensor.to(device)

      # shape [1, seq, num_embeddings]
      # need to decode token by token

      # start with a <SOS> token which corresponds with 2
      input_tgt = (torch.zeros((batch_size, max_seq_len)).long()).to(device) 
      input_tgt[:, 0] = stoi['<SOS>']
      output_tgt = torch.zeros((batch_size, max_seq_len)).long() 

      for i in range(0, max_seq_len):
        output = neural_net(src, input_tgt, tgt_mask_tensor_input, src_mask_padding, tgt_mask_padding)
        output_tgt = torch.argmax(output, 2)
        if (i+1<max_seq_len):
          input_tgt[:, i+1] = output_tgt[:, i]

      # shifted output
      tgt = torch.roll(tgt, -1, 1)
      # mask <pad> tokens and <sos> as these won't contribute to our accuracy prediction
      mask = torch.logical_not(torch.logical_or(tgt == 0, tgt == 2))

      for i in range(0, src.shape[0]):
        tensor1 = tgt[i].masked_select(mask[i])
        tensor2 = output_tgt[i].masked_select(mask[i])
        total += 1
        if (torch.equal(tensor1, tensor2)):
          correct += 1
      
  return (correct/total)


In [33]:
print(test_pred_accuracy(test_loader, neural_net_loaded))

0.99145


# **Model Analysis**
Type of layers, no of trainable parameters etc.

In [34]:
print(neural_net)

Model(
  (embeddings): Embed_Model(
    (embedding_layer): Embedding(32, 256, padding_idx=0)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1024, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, 

In [35]:
# Source: https://stackoverflow.com/questions/49201236/check-the-total-number-of-parameters-in-a-pytorch-model
trainable_params = sum(parameter.numel() for parameter in neural_net.parameters() if parameter.requires_grad)
print(trainable_params)

5547040


In [36]:
# Source: https://stackoverflow.com/questions/49201236/check-the-total-number-of-parameters-in-a-pytorch-model
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(neural_net)

+-------------------------------------------------------------+------------+
|                           Modules                           | Parameters |
+-------------------------------------------------------------+------------+
|              embeddings.embedding_layer.weight              |    8192    |
|    transformer.encoder.layers.0.self_attn.in_proj_weight    |   196608   |
|     transformer.encoder.layers.0.self_attn.in_proj_bias     |    768     |
|    transformer.encoder.layers.0.self_attn.out_proj.weight   |   65536    |
|     transformer.encoder.layers.0.self_attn.out_proj.bias    |    256     |
|         transformer.encoder.layers.0.linear1.weight         |   262144   |
|          transformer.encoder.layers.0.linear1.bias          |    1024    |
|         transformer.encoder.layers.0.linear2.weight         |   262144   |
|          transformer.encoder.layers.0.linear2.bias          |    256     |
|          transformer.encoder.layers.0.norm1.weight          |    256     |

5547040