In [None]:
# Modules to use
from transformers import AutoTokenizer, DataCollatorWithPadding

Prepare the dataset with the Hugging Face libraries

In [None]:
# Pretrained checkpoint whose tokenizer is reused for this notebook
checkpoint = "distilbert-base-cased"

# Load the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

from datasets import load_dataset

In [None]:
# SST-2 (part of the GLUE benchmark) is a sentiment-analysis dataset with a target label.
# Alternative source: load_dataset("glue", "sst2")
raw_datasets = load_dataset(path="sst2")

In [None]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' field of `batch` with the notebook-level tokenizer.

  Truncation is enabled; the tokenizer's output is returned unchanged.
  """
  sentences = batch['sentence']
  encoded = tokenizer(sentences, truncation=True)
  return encoded

# Collator built on the same tokenizer; the DataLoaders use it as their collate_fn
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run tokenize_fn over the raw data, one batch of examples per call
tokenized_datasets = raw_datasets.map(function=tokenize_fn, batched=True)

tokenized_datasets

In [None]:

# Peek at the first tokenized training example
tokenized_datasets["train"][0]

In [None]:
# Drop the columns that are no longer needed, then rename the target column
# from "label" to "labels" (the key the training loop reads from each batch).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)

In [None]:
# Load the train and validation test sets and set the batch size
from torch.utils.data import DataLoader

# Number of examples per batch for both loaders
batch_size = 1

# Training batches are reshuffled on every pass
train_loader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches keep the dataset order
valid_loader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [None]:
# Sanity check: print the key and tensor shape of every entry in one training batch
for batch in train_loader:
  for name, tensor in batch.items():
    print("k:", name, "v.shape:", tensor.shape)
  break  # a single batch is enough

In [None]:
# Distinct label values in the training split. A notebook cell only echoes its
# final expression, so this one is printed explicitly to keep it visible.
print(set(tokenized_datasets['train']['labels']))

# Vocabulary size of the tokenizer (passed to the Encoder as vocab_size)
tokenizer.vocab_size

    '_' is allowed between digits for easier readability
    vocab_size,max_len,d_k, d_model, n_heads,n_layers,n_classes,dropout_prob
    vocab_size = 20_000
    max_len = 1024
    d_k = 16
    d_model = 64
    n_heads = 4
    n_layers = 2
    n_classes = 5
    dropout_prob = 0.1
    

In [None]:
# Set autoreload
%reload_ext autoreload
%autoreload 2

from Encoder import Encoder
import torch
import torch.nn as nn
from datetime import datetime
import numpy as np

# Instantiate the Encoder from the local Encoder module, sized to the tokenizer's vocabulary
model = Encoder(
    n_classes=2,
    n_layers=2,
    n_heads=4,
    d_model=64,
    d_k=16,
    dropout_prob=0.1,
    max_len=512,  # alternative: tokenizer.max_model_input_sizes[checkpoint]
    vocab_size=tokenizer.vocab_size,
)

In [None]:
# Report CUDA availability, then pick the first GPU if there is one, else the CPU
print("CUDA:", torch.cuda.is_available())
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Move the model's parameters to the chosen device
model.to(device)

Start Training Loop

In [None]:
# Cross-entropy loss on the model outputs
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters, library-default hyperparameters
optimizer = torch.optim.Adam(params=model.parameters())


See samples of training data

In [None]:
# In BERT token ids 101 and 102 represent start and end of sentence respectively.
#
# Optional debugging snippet, kept as comments so the cell does not echo a stray
# string literal as its output. Modify batch size to 1 before running it.
#
# for batch in train_loader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     print(batch)

In [None]:
def _run_epoch(model, criterion, loader, device, optimizer=None):
  """Run one pass over `loader` and return the sample-weighted mean loss.

  Args:
    model: callable as model(input_ids, attention_mask).
    criterion: loss called as criterion(outputs, labels).
    loader: iterable of dict batches with 'input_ids', 'attention_mask'
      and 'labels' tensors.
    device: device every batch tensor is moved to before the forward pass.
    optimizer: when given, each batch gets a backward pass and an optimizer
      step; when None, the pass is evaluation-only and autograd is disabled.

  Returns:
    Mean loss weighted by batch size, or NaN when the loader yields no samples.
  """
  is_training = optimizer is not None
  total_loss = 0.0
  n_samples = 0

  # Autograd is only needed when we are going to call backward(); disabling it
  # for evaluation avoids building graphs and saves memory.
  with torch.set_grad_enabled(is_training):
    for batch in loader:
      batch = {k: v.to(device) for k, v in batch.items()}

      if is_training:
        optimizer.zero_grad()

      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs, batch['labels'])

      if is_training:
        loss.backward()
        optimizer.step()

      # criterion returns a per-batch value; weight it by the batch size so the
      # final figure is an average over samples, not over batches
      batch_n = batch['input_ids'].size(0)
      total_loss += loss.item() * batch_n
      n_samples += batch_n

  return total_loss / n_samples if n_samples else float('nan')


def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
  """Train `model` for `epochs` epochs, evaluating on `valid_loader` after each.

  Args:
    model: module called as model(input_ids, attention_mask).
    criterion: loss called as criterion(outputs, labels).
    optimizer: optimizer stepped once per training batch.
    train_loader: iterable of training batches (dicts of tensors).
    valid_loader: iterable of validation batches (dicts of tensors).
    epochs: number of full passes over `train_loader`.

  Returns:
    (train_losses, test_losses): two numpy arrays of length `epochs` holding
    the per-epoch mean training and validation loss.
  """
  # Follow the model's own device instead of relying on a notebook-level
  # global, so batches always land where the parameters live.
  try:
    device = next(model.parameters()).device
  except StopIteration:
    device = torch.device("cpu")  # parameter-free model

  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    t0 = datetime.now()

    model.train()
    train_loss = _run_epoch(model, criterion, train_loader, device, optimizer)

    model.eval()
    test_loss = _run_epoch(model, criterion, valid_loader, device)

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: {test_loss:.4f}, Duration: {dt}')

  return train_losses, test_losses

In [None]:
# Start the training
train_losses, test_losses = train(
    model, criterion, optimizer, train_loader, valid_loader, epochs=4)