In [1]:
from transformers import AutoTokenizer, DataCollatorWithPadding
import torch
from Decoder import Decoder
import numpy as np
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


Smoke-test the Decoder: feed it random token ids and check the shape of the output tensor

In [2]:
# Smoke test: build a Decoder with a 20,000-token vocabulary and push one
# random batch through it to confirm the output shape.
model = Decoder(20_000, 1024, 16, 64, 4, 2, 0.1)

# Random token ids in [0, 20_000); the underscore in 20_000 is only a digit
# separator for easier reading. 8 sequences of 512 tokens each.
x = np.random.randint(low=0, high=20_000, size=(8, 512))

# The _t suffix marks torch tensors (as opposed to the NumPy array above).
x_t = torch.tensor(x)
y_t = model(x_t)

# Expected: (8, 512, 20_000) = (batch size, sequence length, vocab size)
print("y Shape:", y_t.shape)

y Shape: torch.Size([8, 512, 20000])


In [3]:
# Pretrained tokenizer used to turn raw sentences into token ids.
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)



In [4]:
from datasets import load_dataset

# GLUE / SST-2 is reused here purely as a source of sentences;
# its sentiment labels are dropped later and never used.
raw_datasets = load_dataset(
    "glue",
    "sst2",
)

In [6]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' entries of a batch with truncation enabled."""
  sentences = batch['sentence']
  encoded = tokenizer(sentences, truncation=True)
  return encoded

# Padding collator built from the same tokenizer; used by the DataLoader.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split in batched mode, then drop the columns that are not
# fed to the model (raw text, example index and the unused label).
tokenized_datasets = (
    raw_datasets
    .map(tokenize_fn, batched=True)
    .remove_columns(["sentence", "idx", "label"])
)

In [7]:
from torch.utils.data import DataLoader

# Shuffled loader over the training split; batches are assembled by the
# padding collator.
train_loader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=1,  # samples per batch (32 is the commented-out alternative)
    shuffle=True,
    collate_fn=data_collator,
)

In [8]:
# Peek at the first batch only, to see what the loader yields.
# Each key k is a field name (input_ids, attention_mask); v is its tensor.
for batch in train_loader:
  for k in batch.keys():
    v = batch[k]
    print("k:", k, "v:", v)
    print("k:", k, "v.shape:", v.shape)
  break  # one batch is enough

k: input_ids v: tensor([[  101,  7678, 22556,  1106,  1103,  6976,  5945,  1105, 27668,   102]])
k: input_ids v.shape: torch.Size([1, 10])
k: attention_mask v: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
k: attention_mask v.shape: torch.Size([1, 10])


In [9]:
# Enable IPython's autoreload extension so that edits to imported project
# modules (such as the Decoder module) are picked up without restarting
# the kernel.
%reload_ext autoreload
%autoreload 2


import torch
from datetime import datetime

# Small decoder-only transformer sized to the tokenizer.
#
# max_len has to cover the longest tokenized input *including* the special
# tokens ([CLS]/[SEP]) that the tokenizer adds: truncation=True caps
# sequences at tokenizer.model_max_length. max_len_single_sentence is that
# limit minus the special tokens, so using it would leave the positional
# table too short for a maximum-length input.
model = Decoder(
    vocab_size=tokenizer.vocab_size,
    max_len=tokenizer.model_max_length,
    d_k=16,          # per-head key/query size (d_k * n_heads == d_model)
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1,
)

In [15]:
# Report CUDA availability, pick the first GPU when there is one (CPU
# otherwise), and move the model onto that device.
print("CUDA:", torch.cuda.is_available())
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)
model.to(device)

CUDA: True
cuda:0


Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

Main training loop

In [24]:
from datetime import datetime

# A function to encapsulate the training loop
# N - batch size
# T - sequence length (number of tokens in a sentence)
# V - vocab size
def train(model, criterion, optimizer, train_loader, epochs):
  """Train a causal language model with next-token prediction.

  Args:
    model: module called as model(input_ids, attention_mask) that returns
      logits of shape N x T x V.
    criterion: loss called as criterion(logits N x V x T, targets N x T).
      If it exposes an `ignore_index` attribute (as nn.CrossEntropyLoss
      does), that value is written into the last target position, which has
      no next token to predict.
    optimizer: optimizer stepping the model's parameters.
    train_loader: iterable of dicts holding 'input_ids' and
      'attention_mask' tensors.
    epochs: number of passes over train_loader.

  Returns:
    NumPy array of length `epochs` with the mean batch loss of each epoch
    (NaN for an epoch in which the loader yielded no batches).
  """
  # Send batches to wherever the model lives instead of relying on a
  # notebook-level `device` variable that may be stale or undefined.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  # The final target must be a value the loss ignores. Ask the criterion
  # for it; only fall back to the notebook's tokenizer when it has none.
  ignore_id = getattr(criterion, "ignore_index", None)
  if ignore_id is None:
    ignore_id = tokenizer.pad_token_id

  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    batch_losses = []
    for batch in train_loader:
      # move data to the model's device
      batch = {k: v.to(run_device) for k, v in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # shift targets backwards so position t is trained to predict t+1
      # Original: <CLS> The cat sat on the mat <SEP>
      # Becomes: The cat sat on the mat <SEP> <ignored>
      # torch.roll returns a new tensor, so the inputs are not modified.
      targets = torch.roll(batch['input_ids'], shifts=-1, dims=1)
      # the roll wraps the first token around to the end; mask it out
      targets[:, -1] = ignore_id

      # Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      # outputs are N x T x V
      # but PyTorch expects N x V x T
      loss = criterion(outputs.transpose(2, 1), targets)

      # Backward and optimize
      loss.backward()
      optimizer.step()
      batch_losses.append(loss.item())

    # Mean loss over the epoch's batches
    train_loss = np.mean(batch_losses) if batch_losses else float('nan')

    # Save losses
    train_losses[it] = train_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')

  return train_losses

In [21]:
# Optimizer: Adam over all model parameters, default hyperparameters.
optimizer = torch.optim.Adam(params=model.parameters())

# Loss: cross-entropy that skips every position whose target is the PAD id.
criterion = nn.CrossEntropyLoss(
    ignore_index=tokenizer.pad_token_id,
)

In [23]:
# Run the training loop and keep the per-epoch mean losses.
train_losses = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    epochs=15,
)

torch.Size([1, 9]) tensor([[  101,  1451,  2046, 11778,  1116,  1103,  6548,  3853,   102]],
       device='cuda:0')
[CLS] every shot enhances the excellent performances [SEP]
torch.Size([1, 4]) tensor([[  101,  5049, 22857,   102]], device='cuda:0')
[CLS] operates nicely [SEP]
torch.Size([1, 8]) tensor([[ 101, 2296, 1176,  170, 1632, 4007, 3767,  102]], device='cuda:0')
[CLS] feeling like a great missed opportunity [SEP]
torch.Size([1, 13]) tensor([[ 101, 1110, 1376, 1167, 1190,  170, 8796, 2523, 2011, 1106, 2311, 1159,
          102]], device='cuda:0')
[CLS] is little more than a mall movie designed to kill time [SEP]
torch.Size([1, 17]) tensor([[ 101, 1142, 1110,  170, 1843,  117,  176, 7729, 2340,  117, 2121, 6276,
         1376,  176, 5521,  119,  102]], device='cuda:0')
[CLS] this is a dark, gritty, sometimes funny little gem. [SEP]
torch.Size([1, 12]) tensor([[  101,   178, 11604, 22398, 15298,  8362,  7804,  3382,  1821,  9725,
          3633,   102]], device='cuda:0')
[CLS] ir

KeyboardInterrupt: 

Inference Routines

In [27]:
# Greedy generation: extend a prompt one token at a time with the model's
# most likely next token (at most 20 new tokens).
prompt = "it's a"

tokenized_prompt = tokenizer(prompt, return_tensors='pt')

# prepare inputs + get rid of SEP token at the end
input_ids = tokenized_prompt['input_ids'][:, :-1].to(device)
mask = tokenized_prompt['attention_mask'][:, :-1].to(device)

# Switch to evaluation mode so dropout is disabled and decoding is
# deterministic; train() leaves the model in training mode.
model.eval()

# Inference needs no gradients; skipping autograd saves memory and time.
with torch.no_grad():
  for _ in range(20):
    outputs = model(input_ids, mask)
    # logits at the last position -> id of the most likely next token
    prediction_id = torch.argmax(outputs[:, -1, :], dim=-1)

    # append the new token and attend to the whole (unpadded) sequence
    input_ids = torch.hstack((input_ids, prediction_id.view(1, 1)))
    mask = torch.ones_like(input_ids)

    # stop as soon as the model emits the end-of-sequence marker
    if prediction_id.item() == tokenizer.sep_token_id:
      break

tokenizer.decode(input_ids[0])

KeyboardInterrupt: 