In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
import torch
from Decoder import Decoder
import numpy as np
import torch.nn as nn

Test the Decoder with random data and check the shape of its output tensor

In [None]:
# Smoke test: build a Decoder and run random token ids through it.
model = Decoder(20_000, 1024, 16, 64, 4, 2, 0.1)

# Random integer ids drawn from [0, 20_000); 20,000 is the vocab size.
# (The underscore in 20_000 is only a digit separator for easier reading.)
x = np.random.randint(low=0, high=20_000, size=(8, 512))
x_t = torch.tensor(x)  # the _t suffix marks a torch tensor

# Forward pass of x_t through the model (no attention mask is supplied here)
y_t = model(x_t)
print("y Shape:", y_t.shape)

# Expected shape is (8, 512, 20_000): batch size, sequence length, vocab size

In [None]:
# Pretrained tokenizer used to turn the raw sentences into token ids
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from datasets import load_dataset

# Same dataset as before (GLUE / SST-2); the labels are simply not used here
raw_datasets = load_dataset(path="glue", name="sst2")

In [None]:
def tokenize_fn(batch):
  """Run the notebook's tokenizer over the 'sentence' entries of `batch`, with truncation enabled."""
  sentences = batch['sentence']
  return tokenizer(sentences, truncation=True)

# Tokenize the dataset in batches, then drop the columns that are not
# passed on to the model (raw text, row index and label).
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True).remove_columns(
    ["sentence", "idx", "label"])

# Collator that pads the samples of a batch, configured from the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

# Shuffled loader over the training split; batches are assembled by data_collator.
train_loader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=1,  # number of samples per batch (32 is the other value noted for this setting)
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
# Peek at a single batch to see what the loader yields.
# Each key (input_ids, attention_mask) maps to its tensor of values.
for batch in train_loader:
  for k in batch.keys():
    v = batch[k]
    print("k:", k, "v:", v)
    print("k:", k, "v.shape:", v.shape)
  break  # one batch is enough for inspection

In [None]:
# Set autoreload
%reload_ext autoreload
%autoreload 2


import torch
from datetime import datetime

# Decoder sized to the tokenizer.
# max_len must cover the longest sequence the tokenizer can emit. tokenize_fn
# truncates to tokenizer.model_max_length *including* the special tokens,
# whereas max_len_single_sentence is that limit minus the special tokens, so
# using it would leave the model too short for a maximally long input.
model = Decoder(
    vocab_size=tokenizer.vocab_size,
    max_len=tokenizer.model_max_length,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1,
)

In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
print("CUDA:", cuda_available)
device = torch.device("cuda:0") if cuda_available else torch.device("cpu")
print(device)
model.to(device)

Main training loop

In [None]:
from datetime import datetime

def train(model, criterion, optimizer, train_loader, epochs,
          device=None, ignore_index=None):
  """Train `model` for next-token prediction and return the mean loss per epoch.

  Shape legend used in the comments below:
    N - batch size
    T - sequence length (number of tokens in a sentence)
    V - vocab size

  Args:
    model: module called as model(input_ids, attention_mask); its output is
      treated as N x T x V logits.
    criterion: loss called as criterion(logits, targets) with logits laid out
      as N x V x T and targets as N x T.
    optimizer: optimizer whose zero_grad()/step() are called once per batch.
    train_loader: iterable of dict-like batches holding tensors under the
      keys 'input_ids' and 'attention_mask'.
    epochs: number of passes over train_loader.
    device: device every batch tensor is moved to. Defaults to the device of
      the model's first parameter (CPU if the model has no parameters), so
      the function no longer depends on a notebook-level `device` variable.
    ignore_index: target id written into the last position of every row so
      the token that wraps around during the shift is not scored. Defaults
      to criterion.ignore_index, so it always matches what the loss skips.

  Returns:
    np.ndarray of length `epochs` holding the mean batch loss of each epoch
    (nan for an epoch in which train_loader produced no batches).

  Raises:
    ValueError: if ignore_index is not given and the criterion does not
      expose an `ignore_index` attribute.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  if ignore_index is None:
    ignore_index = getattr(criterion, "ignore_index", None)
    if ignore_index is None:
      raise ValueError(
        "ignore_index was not given and criterion has no ignore_index attribute")

  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    batch_losses = []
    for batch in train_loader:
      # move data to the target device
      batch = {k: v.to(device) for k, v in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # shift targets backwards by one position
      # Original: <CLS> The cat sat on the mat <SEP>
      # Becomes: The cat sat on the mat <SEP> <ignored>
      # (clone first so the model input is never touched by the in-place write)
      targets = torch.roll(batch['input_ids'].clone(), shifts=-1, dims=1)
      # the roll wraps the first token around to the end; mask it out of the loss
      targets[:, -1] = ignore_index

      # Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      # outputs are N x T x V but the loss expects N x V x T
      loss = criterion(outputs.transpose(2, 1), targets)

      # Backward and optimize
      loss.backward()
      optimizer.step()
      batch_losses.append(loss.item())

    # Mean train loss of this epoch; avoid np.mean([]) warning on an empty loader
    train_loss = np.mean(batch_losses) if batch_losses else float('nan')

    # Save losses
    train_losses[it] = train_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')

  return train_losses

In [None]:
# Optimizer and loss for training:
# Adam with its default settings over all model parameters, and a
# cross-entropy loss that skips targets equal to the PAD token id.
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
# Run the training loop for 15 epochs and keep the per-epoch mean losses
train_losses = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    epochs=15,
)