In [1]:
# Modules to use
from transformers import AutoTokenizer, DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


Prepare the dataset with HF library

In [2]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

from datasets import load_dataset

In [3]:
# load the sst2 dataset of the glue benchmark
# sst2 is used for sentiment analysis and will have a target label 
# raw_datasets = load_dataset("glue", "sst2")
raw_datasets = load_dataset("sst2")

Using the latest cached version of the dataset since sst2 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Chaitanya Belwal\.cache\huggingface\datasets\sst2\default\0.0.0\8d51e7e4887a4caaa95b3fbebbf53c0490b58bbb (last modified on Thu Jan  9 06:32:01 2025).


In [4]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [5]:

tokenized_datasets['train'][0]

{'idx': 0,
 'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'input_ids': [101, 4750, 1207, 3318, 5266, 1121, 1103, 22467, 2338, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# remove columns not needed
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [7]:
# Load the train and validation test sets and set the batch size
from torch.utils.data import DataLoader

batch_size = 1

train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=False, # Will ad randomization to the training data, training data index will be different in each run if True
    batch_size=batch_size,
    collate_fn=data_collator
)
valid_loader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator
)

In [8]:
# check how it works
for batch in train_loader:
  for k, v in batch.items():
    print("k:", k, "v.shape:", v.shape)
  break

k: labels v.shape: torch.Size([1])
k: input_ids v.shape: torch.Size([1, 10])
k: attention_mask v.shape: torch.Size([1, 10])


In [9]:
# Convert to set: unordered collection of unique elements
set(tokenized_datasets['train']['labels'])

tokenizer.vocab_size

28996

    '_' is allowed between number for easier readibility
    vocab_size,max_len,d_k, d_model, n_heads,n_layers,n_classes,dropout_prob
    vocab_size = 20,000
    max_len = 1024
    d_k = 16
    d_model = 64
    n_heads = 4
    n_layers = 2
    n_classes = 5
    dropout_prob = 0.1
    

In [10]:
# Set autoreload
%reload_ext autoreload
%autoreload 2

from Encoder import Encoder
import torch
import torch.nn as nn
from datetime import datetime
import numpy as np

model = Encoder(
    vocab_size=tokenizer.vocab_size,
    max_len=512, #tokenizer.max_model_input_sizes[checkpoint],
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    n_classes=2,
    dropout_prob=0.1,
)

4.605170185988092


In [11]:
print ("CUDA:",torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

CUDA: True
cuda:0


Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

Start Training Loop

In [12]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())


See samples of training data

In [13]:
# In BERT token ids 101 and 102 represent start and end of sentence respectively 
"""
for batch in train_loader:
    # modify batch size to 1 before running this
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch)
""" 

'\nfor batch in train_loader:\n    # modify batch size to 1 before running this\n    batch = {k: v.to(device) for k, v in batch.items()}\n    print(batch)\n'

In [14]:
def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = 0
    n_train = 0
    for batch in train_loader:
      # move data to GPU
      batch = {k: v.to(device) for k, v in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # Forward pass
      # model forward function only takes two params (the attention is optional) 
      # and the output is a binary classification problem
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs, batch['labels'])
        
      # Backward and optimize
      loss.backward()
      optimizer.step()

      train_loss += loss.item()*batch['input_ids'].size(0)
      n_train += batch['input_ids'].size(0)

    # Get average train loss
    train_loss = train_loss / n_train
    
    model.eval()
    test_loss = 0
    n_test = 0
    for batch in valid_loader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs, batch['labels'])
      test_loss += loss.item()*batch['input_ids'].size(0)
      n_test += batch['input_ids'].size(0)
    test_loss = test_loss / n_test

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss
    
    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: {test_loss:.4f}, Duration: {dt}')
  
  return train_losses, test_losses

In [None]:
# Start the training
train_losses, test_losses = train(
    model, criterion, optimizer, train_loader, valid_loader, epochs=4)

# TODO: Evaluate size of Attention Mechanism