In [26]:
%load_ext autoreload
%autoreload 2

#from transformers import AutoTokenizer, DataCollatorWithPadding
import torch
from Decoder import Decoder
import numpy as np
import torch.nn as nn
from CCustomTokenizer import CCustomTokenizer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# CUDA assertions
# CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to launch kernels synchronously, so a
# device-side assert is reported at the call that triggered it rather than at a later line.
# NOTE(review): presumably this must be set before the first CUDA call in the process to
# take effect - confirm this cell always runs before any GPU work.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [28]:
# Show the working directory: the relative data path below is resolved against it
import os
print(os.getcwd())
# Create the tokenizer, handing it the sample-sentences file
customTokenizer = CCustomTokenizer("../../data/SampleSentencesCorrected.txt")
print(f"Number of tokens: {customTokenizer.getVocabSize()}")

c:\ChaitanyaBelwal\ACTIVE\Development\GitHub\Python\MachineLearning\ReferenceCode\NLP\Transformers\TransformersCustom\Decoder
Number of tokens: 43


In [29]:
# Model hyper-parameters handed to the Decoder constructor
dimEmbeddings = 64 # embedding width (d_model)
# NOTE(review): the max token id, not getVocabSize(), is used as the embedding-table size.
# If token ids start at 0 the table needs max id + 1 rows - confirm against CCustomTokenizer.
VocabSize = customTokenizer.getMaxTokenId() # Since the embedding layer is index based, use the max id
maxLen = customTokenizer.getMaxLen()
attentionKeysSize = 16 # size of q, k and v. Attention output size = noOfHeads*attentionKeysSize
noOfHeads = 4
noOfTransformerBlocks = 2

Define the Decoder and set its dimensions

In [35]:
# Instantiate the decoder from the hyper-parameters chosen above
model = Decoder(
    vocab_size=VocabSize,
    max_len=maxLen,
    d_k=attentionKeysSize,
    d_model=dimEmbeddings,
    n_heads=noOfHeads,
    n_layers=noOfTransformerBlocks,
    dropout_prob=0.1,
)


In [None]:
# Report the parameter count through the model's own helper.
# NOTE(review): presumably equivalent to
#   sum(p.numel() for p in model.parameters() if p.requires_grad)
# - confirm in Decoder.getParamCount.
print("# Trainable model parameters:", model.getParamCount())

In [32]:
# Report whether CUDA is available, then pin the run to the CPU regardless.
print ("CUDA:",torch.cuda.is_available())
# The automatic GPU/CPU selection is kept in the trailing comment for reference.
device = "cpu" #torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# As the last expression of the cell, this also displays the module structure.
model.to(device)

CUDA: True
cpu


Decoder(
  (embedding): Embedding(43, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, el

Inference Routine

In [33]:
def getInferTokenIds(model, input):
    """Run one forward pass and return the highest-scoring token index per position.

    The model is switched to eval mode (and left there) and gradients are disabled
    for the forward pass.

    Args:
        model: torch module called as model(input); its output is treated as
            logits with the vocabulary on the last axis.
        input: tensor of token ids, e.g. shape [1, T].

    Returns:
        The arg-max indices as a Python list, with a leading batch axis of size 1
        squeezed away (a larger batch yields a nested list).
    """
    model.eval()

    # Follow the model's own device instead of relying on a notebook-level `device`
    # variable: the function then works for whichever model it is handed.
    firstParam = next(model.parameters(), None)
    if firstParam is not None:
        input = input.to(firstParam.device)

    with torch.no_grad():
        output = model(input)
    # output holds one score (logit) per vocabulary entry for every position
    print("Output shape:", output.shape) # torch.Size([1, 12, 43])

    # get the index of the highest logit for each token position
    predictionIdxs = torch.argmax(output, dim=-1)
    print("Prediction Idxs shape:", predictionIdxs.shape) # torch.Size([1, 12])

    # Convert to list
    predictionIds = predictionIdxs.squeeze(0).tolist()
    return predictionIds
     
def getDecodedSentence(inputTokens):
    """Decode a sequence of token ids into text with the notebook's tokenizer."""
    decodedSentence = customTokenizer.decodeTokenizedSentence(inputTokens)
    return decodedSentence

def runInferenceTillEnd(model, startTokens):
    """Encode `startTokens`, run the model over them once and decode the prediction.

    Despite the name there is no token-by-token generation loop here: the encoded
    prompt goes through a single forward pass (via getInferTokenIds) and the
    arg-max id at every position is decoded into text.
    """
    # encodeTokenizedSentence will add start and end tokens
    encodedIds = customTokenizer.encodeTokenizedSentence(startTokens)
    batch = torch.tensor(encodedIds)
    batch = batch.unsqueeze(0)  # add the batch axis expected by the model
    batch = batch.to(device)
    return getDecodedSentence(getInferTokenIds(model, batch))


In [34]:
def testInfer_1():
    """Smoke-test inference: feed the prompt "the" to the current model and print the result."""
    # All are lower case
    startTokens = "the"
    decoded = runInferenceTillEnd(model, startTokens)
    print(" ".join((startTokens, decoded)))

testInfer_1()

Output shape: torch.Size([1, 12, 43])
Prediction Idxs shape: torch.Size([1, 12])
the what for what high high high high high high high high high


In [None]:
# Build the training tensor from every row the tokenizer provides
trainData = customTokenizer.getAllTrainingRows()
trainDataTensor = torch.tensor(trainData)
print(f"Train data shape: {trainDataTensor.shape}")
# Shape is [154, 12]: 154 samples with 12 tokens

Main training loop

In [None]:
from datetime import datetime

def train(model, criterion, optimizer, epochs):
    """Train `model` for next-token prediction, one optimizer step per training row.

    Shape legend used in the comments below:
      N - batch size (always 1 here)
      T - sequence length (number of tokens in a sentence)
      V - vocab size

    Reads the notebook globals `trainDataTensor`, `device` and `customTokenizer`.

    Args:
        model: module called as model(x) on an N x T batch; its output is used as N x T x V.
        criterion: loss called as criterion(outputs as N x V x T, targets as N x T).
        optimizer: optimizer whose step() updates the model parameters.
        epochs: number of full passes over `trainDataTensor`.

    Returns:
        numpy array of length `epochs` holding the mean training loss of each epoch.
    """
    train_losses = np.zeros(epochs)

    # Inputs and their shifted targets are identical in every epoch, so they are
    # built once here instead of once per sample per epoch.
    inputs = trainDataTensor.to(device)

    # Shift targets backwards (shifts=-1 moves every row one position to the left)
    # Original: <CLS> The cat sat on the mat <SEP>
    # Becomes:  The cat sat on the mat <SEP> <PAD>
    # torch.roll returns a new tensor, so `inputs` is not modified by the write below.
    targets = torch.roll(inputs, shifts=-1, dims=1)
    # The roll wraps the first token around to the end; overwrite it with PAD so that
    # a criterion configured to ignore PAD does not score that position.
    targets[:, -1] = customTokenizer.getPadTokenId()

    rowsTrain = inputs.shape[0]

    for it in range(epochs):
        model.train()
        t0 = datetime.now()
        sample_losses = []

        # Go through each sample and update the parameters after every one, like SGD
        for i in range(rowsTrain):
            # Slicing with i:i+1 keeps the batch axis, giving 1 x T tensors
            x_t = inputs[i:i + 1]
            y_t = targets[i:i + 1]

            # zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass: outputs are N x T x V, but the loss expects N x V x T
            outputs = model(x_t)
            loss = criterion(outputs.transpose(2, 1), y_t)

            # Backward and optimize
            loss.backward()
            optimizer.step()  # update the parameters
            sample_losses.append(loss.item())

        # Mean loss over all samples of this epoch
        train_loss = np.mean(sample_losses)
        train_losses[it] = train_loss

        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')
    return train_losses

In [None]:
# Loss: cross-entropy that skips every position whose target is the PAD token
criterion = nn.CrossEntropyLoss(ignore_index=customTokenizer.getPadTokenId())
# Optimizer: Adam over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Run 100 training epochs and keep the per-epoch mean losses
train_losses = train(model, criterion, optimizer, epochs=100)

Epoch 1/100, Train Loss: 2.0687, Duration: 0:06:49.859084
Epoch 2/100, Train Loss: 2.0356, Duration: 0:00:01.170999
Epoch 3/100, Train Loss: 2.0491, Duration: 0:00:01.158245
Epoch 4/100, Train Loss: 2.0516, Duration: 0:00:01.147887
Epoch 5/100, Train Loss: 2.0496, Duration: 0:00:01.204300
Epoch 6/100, Train Loss: 2.0350, Duration: 0:00:01.148643
Epoch 7/100, Train Loss: 2.0538, Duration: 0:00:01.087868
Epoch 8/100, Train Loss: 2.0529, Duration: 0:00:01.125780
Epoch 9/100, Train Loss: 2.0523, Duration: 0:00:01.132392
Epoch 10/100, Train Loss: 2.0489, Duration: 0:00:01.060945
Epoch 11/100, Train Loss: 2.0463, Duration: 0:00:01.082432
Epoch 12/100, Train Loss: 2.0333, Duration: 0:00:01.154179
Epoch 13/100, Train Loss: 2.0403, Duration: 0:00:01.118678
Epoch 14/100, Train Loss: 2.0498, Duration: 0:00:01.069520
Epoch 15/100, Train Loss: 2.0374, Duration: 0:00:01.095247
Epoch 16/100, Train Loss: 2.0452, Duration: 0:00:01.115371
Epoch 17/100, Train Loss: 2.0287, Duration: 0:00:01.143713
Epoch 

The model has been trained; the following sections deal with model inference and metrics.

In [None]:
# Repeat the earlier smoke test, now that training has run, to compare the generated text
testInfer_1()