In [52]:
%load_ext autoreload
%autoreload 2

#from transformers import AutoTokenizer, DataCollatorWithPadding
import torch
from Decoder import Decoder
import numpy as np
import torch.nn as nn
from CCustomTokenizer import CCustomTokenizer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
# CUDA assertions
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [55]:
# print current path
import os
print(os.getcwd())
# Load the tokenizer
customTokenizer = CCustomTokenizer("../../data/SampleSentencesCorrected.txt")
print("Number of tokens:",customTokenizer.getVocabSize())

c:\ChaitanyaBelwal\ACTIVE\Development\GitHub\Python\MachineLearning\ReferenceCode\NLP\Transformers\TransformersCustom\Decoder
Number of tokens: 43


In [56]:
# Specify the dimension
dimEmbeddings = 64 # 64 embeddinds
VocabSize = customTokenizer.getMaxTokenId() # Since the embedding layer is index based used the idx
maxLen = customTokenizer.getMaxLen()
attentionKeysSize = 16 # size of q,k and v. Attention output size = noOfHeads*attentionKeysSize
noOfHeads = 4
noOfTransformerBlocks = 2

Define the Decoder, set the specific dimensions

In [57]:
model = Decoder(vocab_size = VocabSize,
                 max_len= maxLen, 
                 d_k = attentionKeysSize, 
                 d_model = dimEmbeddings, 
                 n_heads = noOfHeads, 
                 n_layers = noOfTransformerBlocks,
                 dropout_prob = 0.0) # 0.1


In [58]:
#paramCount = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("# Trainable model parameters:", model.getParamCount())

# Trainable model parameters: 238891


In [59]:
print ("CUDA:",torch.cuda.is_available())
device = "cpu" #torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

CUDA: True
cpu


Decoder(
  (embedding): Embedding(43, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.0, inplace=False)
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, el

Inference Routine

In [75]:
def getInferTokenIds(model, input):
    model.eval()
    with torch.no_grad():
        input = input.to(device)
        outputs = model(input)
    # logits will contain probabilities for each token    
    #print("Outputs shape:", outputs.shape) # torch.Size([1, 12, 43])

    # output contains the logits
    # get the index for the highest logits for each token
    #predictionIdxs = torch.argmax(output, axis=-1)
    predictionId = torch.argmax(outputs[:, -1, :], axis=-1)
    #print("Prediction Id shape:", predictionId.shape) # torch.Size([1, 12])
    
    # Convert to list
    #predictionIds = predictionIdxs.squeeze(0).tolist()
    # Get token ids from idx
    #predTokenIds = customTokenizer.getTokenIdsForIdxs(predictionIdxs)
    return predictionId # return as a tensor
     
def getDecodedSentence(tensorInputTokens):
    # Convert to list
    inputTokenIds = tensorInputTokens.squeeze(0).tolist()
    return customTokenizer.decode(inputTokenIds)

def runInference(model, prompt):
    tokenizedPrompt = customTokenizer.encode(prompt) # will add start and end tokens
    # Remove the SEP Token at the end
    inputTokenIds = tokenizedPrompt[:-1] # Mask is not being considered at this time
    tensorInputTokenIds = torch.tensor(inputTokenIds).unsqueeze(0).to(device)
    len = 2
    while(len < customTokenizer.getMaxLen()):
        len += 1
        predTokenId = getInferTokenIds(model, tensorInputTokenIds)
        tensorInputTokenIds = torch.hstack((tensorInputTokenIds, predTokenId.view(1, 1)))
        if predTokenId == customTokenizer.sepTokenId:
            break
    return getDecodedSentence(tensorInputTokenIds)

'''
prompt = "it's a"
tokenized_prompt = tokenizer(prompt, return_tensors='pt')
# prepare inputs + get rid of SEP token at the end
input_ids = tokenized_prompt['input_ids'][:, :-1].to(device)
mask = tokenized_prompt['attention_mask'][:, :-1].to(device)
for _ in range(20):
  outputs = model(input_ids, mask)
  input_ids = torch.hstack((input_ids, prediction_id.view(1, 1)))
  mask = torch.ones_like(input_ids)
  if prediction_id == tokenizer.sep_token_id:
    break
  tokenizer.decode(input_ids[0])
'''

'\nprompt = "it\'s a"\ntokenized_prompt = tokenizer(prompt, return_tensors=\'pt\')\n# prepare inputs + get rid of SEP token at the end\ninput_ids = tokenized_prompt[\'input_ids\'][:, :-1].to(device)\nmask = tokenized_prompt[\'attention_mask\'][:, :-1].to(device)\nfor _ in range(20):\n  outputs = model(input_ids, mask)\n  input_ids = torch.hstack((input_ids, prediction_id.view(1, 1)))\n  mask = torch.ones_like(input_ids)\n  if prediction_id == tokenizer.sep_token_id:\n    break\n  tokenizer.decode(input_ids[0])\n'

In [76]:
def testInfer_1():
    # Check inference with current model
    prompt = "Romi"
    print(f"Prompt: {prompt}")
    print(f"Response: {print(runInference(model,prompt))}") # All are lower case

testInfer_1()

Prompt: Romi
<CLS> romi leaped jump in inside not a inside not a inside
Response: None


In [77]:
# Prepare the training data
trainData = customTokenizer.getAllTrainingRows()
trainDataTensor = torch.tensor(trainData)
print("Train data shape:", trainDataTensor.shape)
# Shape is [154, 12]: 154 samples with 12 tokens

Train data shape: torch.Size([154, 12])


Main training loop

In [79]:
from datetime import datetime

# A function to encapsulate the training loop
# N - batch size 
# T - sequence length (number of tokens in a sentence)
# V - vocab size
def train(model, criterion, optimizer, epochs):
  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    # Go through each sample in the training dataset
    # update the model parameters after each sample like SGD
    # each row of trainingDataTensor
    rowsTrain = trainDataTensor.shape[0]
    for i in range(rowsTrain):
      #print(f"{i}/{rowsTrain}")
      x_t = trainDataTensor[i].unsqueeze(0).to(device)
     
      # zero the parameter gradients
      optimizer.zero_grad()

      # shift targets backwards
      # Original: <CLS> The cat sat on the mat <SEP>
      # Becomes: The cat sat on the mat <SEP> <PAD>
      targets = x_t.clone().detach()
      # shifts = -1, will shift the target to left by 1
      targets = torch.roll(targets, shifts=-1, dims=1)
      # PAD token is ignored in the loss so set last token to PAD
      targets[:, -1] = customTokenizer.getPadTokenId()

      # Forward pass
      outputs = model(x_t)
      # outputs are N x T x V
      # but PyTorch expects N x V x T
      # print("outputs:", outputs)
      # print("targets:", targets)
      transposedOutputs = outputs.transpose(2, 1)
      loss = criterion(transposedOutputs, targets)
      # N, T, V = outputs.shape
      # loss = criterion(outputs.view(N * T, V), targets.view(N * T))
        
      # Backward and optimize
      loss.backward()
      optimizer.step() # update the parameters
      train_loss.append(loss.item())

    # Get train loss and test loss
    train_loss = np.mean(train_loss)

    # Save losses
    train_losses[it] = train_loss
    
    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')
  return train_losses

In [80]:
# Set Optim and criterion
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index= customTokenizer.getPadTokenId())
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

In [81]:
# Start the training loop
train_losses = train(
    model, criterion, optimizer, epochs=100)

Epoch 1/100, Train Loss: 1.8032, Duration: 0:00:01.103063
Epoch 2/100, Train Loss: 1.3070, Duration: 0:00:01.183914
Epoch 3/100, Train Loss: 1.2177, Duration: 0:00:01.125852
Epoch 4/100, Train Loss: 1.1614, Duration: 0:00:01.157971
Epoch 5/100, Train Loss: 1.1198, Duration: 0:00:01.184624
Epoch 6/100, Train Loss: 1.0821, Duration: 0:00:01.381633
Epoch 7/100, Train Loss: 1.0744, Duration: 0:00:01.140858
Epoch 8/100, Train Loss: 1.0481, Duration: 0:00:01.096581
Epoch 9/100, Train Loss: 1.0520, Duration: 0:00:01.185402
Epoch 10/100, Train Loss: 1.0595, Duration: 0:00:01.219122
Epoch 11/100, Train Loss: 1.0205, Duration: 0:00:01.165521
Epoch 12/100, Train Loss: 1.0198, Duration: 0:00:01.170201
Epoch 13/100, Train Loss: 1.0042, Duration: 0:00:01.102082
Epoch 14/100, Train Loss: 0.9871, Duration: 0:00:01.152140
Epoch 15/100, Train Loss: 0.9752, Duration: 0:00:01.169379
Epoch 16/100, Train Loss: 0.9714, Duration: 0:00:01.134976
Epoch 17/100, Train Loss: 0.9714, Duration: 0:00:01.140230
Epoch 

Model has been trained, following sections will deal with model inference and metrics

In [82]:
testInfer_1()

Prompt: Romi
<CLS> romi is a cat <SEP>
Response: None
