In [1]:
%load_ext autoreload
%autoreload 2

#from transformers import AutoTokenizer, DataCollatorWithPadding
import torch
from Decoder import Decoder
import numpy as np
from CCustomTokenizer import CCustomTokenizer

In [2]:
# print current path
import os
print(os.getcwd())
# Load the tokenizer
tokenizer = CCustomTokenizer("../../data/SampleSentencesCorrected.txt")
print("Number of tokens:",tokenizer.getVocabSize())

c:\ChaitanyaBelwal\ACTIVE\Development\GitHub\Python\MachineLearning\ReferenceCode\NLP\Transformers\TransformersCustom\Decoder
Number of tokens: 43


In [3]:
# Specify the dimension
dimEmbeddings = 64 # 64 embeddinds
VocabSize = tokenizer.getVocabSize()
maxLen = tokenizer.getMaxLen()
attentionKeySize = 16
noOfHeads = 4
noOfTransformerBlocks = 2

Define the Decorder, set the specific dimensions

In [4]:
model = Decoder(vocab_size = VocabSize,
                 max_len=maxLen, 
                 d_k = attentionKeySize, 
                 d_model = dimEmbeddings, 
                 n_heads = noOfHeads, 
                 n_layers = noOfTransformerBlocks,
                 dropout_prob = 0.1)

paramCount = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("# Traininable model parameters:", paramCount)

# Traininable model parameters: 105643


In [None]:
x = np.random.randint(0, VocabSize, size=(8, 512)) 
x_t = torch.tensor(x)

# Pass the x through the model
y_t = model(x_t) # _t is for tensor
print("y Shape:", y_t.shape)

# Shape is (8, 512, 20_000) which is the batch size, sequence length, and vocab size

In [5]:
# Prepare the training data
trainData = tokenizer.getAllTrainingRows()
trainDataTensor = torch.tensor(trainData)
print("Train data shape:", trainDataTensor.shape)


ValueError: expected sequence of length 10 at dim 1 (got 12)

In [None]:
print ("CUDA:",torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

Main training loop

In [None]:
from datetime import datetime

# A function to encapsulate the training loop
# N - batch size 
# T - sequence length (number of tokens in a sentence)
# V - vocab size
def train(model, criterion, optimizer, train_loader, epochs):
  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    for batch in train_loader:
      # move data to GPU
      batch = {k: v.to(device) for k, v in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # shift targets backwards
      # Original: <CLS> The cat sat on the mat <SEP>
      # Becomes: The cat sat on the mat <SEP> <PAD>
      targets = batch['input_ids'].clone().detach()
      targets = torch.roll(targets, shifts=-1, dims=1)
      # PAD token is ignored in the loss so set last token to PAD
      targets[:, -1] = tokenizer.pad_token_id

      # Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      # outputs are N x T x V
      # but PyTorch expects N x V x T
      # print("outputs:", outputs)
      # print("targets:", targets)
      loss = criterion(outputs.transpose(2, 1), targets)
      # N, T, V = outputs.shape
      # loss = criterion(outputs.view(N * T, V), targets.view(N * T))
        
      # Backward and optimize
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())

    # Get train loss and test loss
    train_loss = np.mean(train_loss)

    # Save losses
    train_losses[it] = train_loss
    
    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')
  
  return train_losses