<a href="https://colab.research.google.com/github/abudubai16/NLP-from-Scratch/blob/main/NLP_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [1]:
! pip install portalocker --q
! pip install datasets --q --q
! pip install positional-encodings[pytorch] --q --q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
from positional_encodings.torch_encodings import PositionalEncoding1D
from datasets import load_dataset, concatenate_datasets

# Tokenizers
from tokenizers import Tokenizer
from tokenizers.normalizers import NFD, StripAccents, Sequence, Lowercase
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import WordPiece
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

import os
import time
import math

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load the dataset

In [4]:
if not os.path.exists('/content/wikitext'):
  ! git clone 'https://huggingface.co/datasets/wikitext' --q

In [5]:
dataset = load_dataset('parquet', data_files={
    'train1':'/content/wikitext/wikitext-103-v1/train-00000-of-00002.parquet',
    'train2': '/content/wikitext/wikitext-103-v1/train-00001-of-00002.parquet',
    'val':'/content/wikitext/wikitext-103-v1/validation-00000-of-00001.parquet',
    'test':'/content/wikitext/wikitext-103-v1/test-00000-of-00001.parquet'
})

# Dataset for training the model
train_ds = concatenate_datasets([
    dataset['train1'],
      dataset['train2'],
      dataset['test']
])['text']

# Dataset for creating the tokenizer
stacked = concatenate_datasets([
      dataset['train1'],
      dataset['train2'],
      dataset['val'],
      dataset['test']
  ])

Generating train1 split: 0 examples [00:00, ? examples/s]

Generating train2 split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## Create the tokenizer

In [6]:
tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'), )

tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = WhitespaceSplit()
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)
trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tokenizer.trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train_from_iterator(stacked['text'], trainer)

## Create the dataloader

In [7]:
class token_ds(Dataset):
  def __init__(self, ds, max_length):
    super().__init__()
    length = len(ds)
    max_length += 1
    collector = []
    temp = torch.zeros(1, max_length)

    with tqdm(total=length) as pbar:
      for pos, line in enumerate(ds):
        sentences = line.split('.')
        for sentence in sentences:
          a = torch.tensor(tokenizer.encode(sentence).ids)

          size = a.shape[0]

          if size==2:
            continue

          if size<max_length:
            a = torch.cat([a.long(), torch.zeros(max_length-size).long()])
          else:
            a = a[:max_length]

          collector.append(a.unsqueeze(0))

          # Performance purpose
          if pos%10000 == 0:
            temp = torch.stack([temp, *collector], dim=0)
            collector = []

        pbar.update(1)

    temp = torch.cat([temp, *collector], dim=0).long()

    self.value = temp[1:, :-1]
    self.target = temp[1:, 1:]

  def __len__(self):
    return self.value.shape[0]

  def __getitem__(self, i):
    return self.value[i,:], self.target[i, :]

In [8]:
seq_len = 100

In [9]:
batch_size = 32

val_dl = DataLoader(token_ds(dataset['val']['text'], seq_len), num_workers=2, batch_size=batch_size)

100%|██████████| 3760/3760 [00:01<00:00, 2734.87it/s]


In [10]:
for pos, (X,Y) in enumerate(val_dl):
  print(f'{X} \n {Y}')
  print(f'{X.shape} \n {Y.shape}')
  break

tensor([[    1,    33,  2893,  ...,     0,     0,     0],
        [    1,  2893, 12796,  ...,     0,     0,     0],
        [    1,  1417,  1424,  ...,     0,     0,     0],
        ...,
        [    1,  1347, 20578,  ...,     0,     0,     0],
        [    1,  2031,  1426,  ...,     0,     0,     0],
        [    1, 16222,  7371,  ...,     0,     0,     0]]) 
 tensor([[   33,  2893, 12796,  ...,     0,     0,     0],
        [ 2893, 12796, 16222,  ...,     0,     0,     0],
        [ 1417,  1424,  6192,  ...,     0,     0,     0],
        ...,
        [ 1347, 20578,  1381,  ...,     0,     0,     0],
        [ 2031,  1426,  1510,  ...,     0,     0,     0],
        [16222,  7371,  1510,  ...,     0,     0,     0]])
torch.Size([32, 100]) 
 torch.Size([32, 100])


## Creating the model

In [11]:
class TransformerModel(nn.Module):
  def __init__(self, ntokens, d_model, nhead, seq_len, device, nlayers=6, dim_feedforward=2048, dropout=0.1):
    super(TransformerModel, self).__init__()
    self.d_model = d_model
    self.device = device
    """
    ntokens: The number of words in the dictionary
    d_model: the size of each word embedding
    n_head: the number of heads in each encoder layer
    nlayers: the number of transformer encoder layers in the encoder
    """
    self.seq_len = seq_len

    self.embed = nn.Embedding(ntokens, d_model, padding_idx=0)
    self.pos_embed = PositionalEncoding1D(d_model)

    encoder_layer = nn.TransformerEncoderLayer(d_model,
                                               nhead=nhead,
                                               dim_feedforward=dim_feedforward,
                                               dropout=dropout,
                                               batch_first=True)

    self.encoder = nn.TransformerEncoder(encoder_layer,
                                         num_layers=nlayers,
                                         enable_nested_tensor=True)
    self.fc = nn.Linear(d_model, ntokens)
    self.dropout = nn.Dropout(dropout)

    self.src_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(device)
    self.init_weights()

  def init_weights(self):
    initrange = 0.1
    self.embed.weight.data.uniform_(-initrange, initrange)
    self.fc.bias.data.zero_()
    self.fc.weight.data.uniform_(-initrange, initrange)

  def forward(self, src, src_mask = None):
    src = self.embed(src) * math.sqrt(self.d_model)
    src = src + self.pos_embed(src)

    if src_mask is None:
      src_mask = self.src_mask

    try:
      encoded = self.encoder(src, src_mask)
    except Exception as e:
      print(src.device, src_mask.device)
    target = self.dropout(self.fc(encoded))

    return target

### Function for sequence generation

In [12]:
def generate(src: str, num_words: int):
  src = torch.tensor(tokenizer.encode(src).ids[:-1]).long()

  for _ in range(num_words):
    # Pad the input sequence properly so that the model can understand
    temp = src
    pos = 10
    if int(src.shape[0]) < seq_len:
      temp = torch.cat([src, torch.zeros(seq_len-src.shape[0], dtype=torch.long)])
      pos = src.shape[0]
    if int(src.shape[0]) > seq_len:
      temp = src[-seq_len:]

    # Get the predictions of words from the model
    temp = temp.unsqueeze(0).to(device)
    dist = model(temp) # (1, seq_len, word_probability)
    dist = dist.squeeze(0)[pos-1]  # <----- (pos-1)

    # Sort the words in most likely to come next
    dist = torch.sort(dist)[1]

    # Pick out the word from the top 5 recommendations of the model
    value = int(torch.rand(1)*5)
    token = dist[-(1+value)].unsqueeze(dim=0).to('cpu')

    # Add that word to the sentence

    src = torch.cat([src, token])

  src = tokenizer.decode(list(src))

  return src

## Training function

In [14]:
loss_fn = nn.CrossEntropyLoss()

In [15]:
def train(model, optim, train_dl, val_dl, l_fn, epochs, check_overfit=True):
  H = {
      'train_loss': [],
      'train_acc': [],
      'val_loss': [],
      'val_acc': [],
  }
  e0 = 0

  for e in range(epochs):
    model.train()

    print(f"------------------------------------------------------------")
    print(f"EPOCH : {e+1}")

    train_loss = 0
    val_loss = 0
    train_correct = 0
    val_correct = 0

    print("Training Step:")
    with tqdm(total=len(train_dl)) as pbar:
      for _, (X, Y) in enumerate(train_dl):
        Y = Y.view(-1)
        X, Y = (X.to(device), Y.to(device))

        # Forward prop
        pred = model(X)
        B,T,C = pred.shape
        pred = pred.view(B*T, C)
        loss = loss_fn(pred, Y)
        train_loss += loss.to("cpu").detach().numpy()

        # Back prop
        optim.zero_grad()
        loss.backward()
        optim.step()

        train_correct += sum([1 for i, val in enumerate(pred.argmax(1)) if int(val) == int(Y[i])])

        pbar.update(1)

    print("\nValidation Step:")
    with tqdm(total=len(val_dl)) as pbar:
      with torch.no_grad():
        model.eval()

        for _, (X, Y) in enumerate(val_dl):
          Y = Y.view(-1)
          X, Y = (X.to(device), Y.to(device))

          pred = model(X)
          B,T,C = pred.shape
          pred = pred.view(B*T, C)
          loss = loss_fn(pred, Y)

          val_loss += loss.to("cpu").detach().numpy()
          val_correct += sum([1 for i, val in enumerate(pred.argmax(1)) if int(val) == int(Y[i])])
          pbar.update(1)


    # Store the important specifications of the training process
    H["train_loss"].append(train_loss)
    H["train_acc"].append(train_correct / len(train_dl.dataset))
    H["val_loss"].append(val_loss)
    H["val_acc"].append(val_correct/ len(val_dl.dataset))

    # Print the messages appropriately
    print(f"\n\nTrain Loss : {train_loss:.2f}")
    print(f"Val Loss : {val_loss:.2f}")
    print(f"Train Accuracy : {train_correct/len(train_dl.dataset):.4f}")
    print(f"Val Accuracy : {val_correct/len(val_dl.dataset):.4f}")

    # Check for overfitting
    if check_overfit and (len(H['val_acc']) > 1 and H['val_acc'][-1] < H['val_acc'][-2]):
      print("The model is showing signs of over fitting enter Y to continue, or N for breaking the training loop")
      t = input()
      if t == 'Y':
        continue
      else:
        return H, e
    e0 = e
  return H, e0

# Training the model

In [16]:
# Hyperparameters
nhead = 8
d_model = 400
ntokens = tokenizer.get_vocab_size()
num_inputs = seq_len
model = TransformerModel(nhead=nhead, d_model=d_model, ntokens=ntokens, seq_len=num_inputs, device=device).to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-5)

In [20]:
epochs = 3
num_sequences = 10_000
num_repetitions = 3

for i in range(num_repetitions):
  train_dl = DataLoader(token_ds(train_ds[i*num_sequences:(i+1)*num_sequences], seq_len), num_workers=2, batch_size=batch_size)

  start_time = time.time()
  train(model, opt, train_dl, val_dl, loss_fn, epochs, check_overfit=True)
  end_time = time.time()

  print(f'\nElapsed Time: {end_time-start_time}')

100%|██████████| 10000/10000 [00:03<00:00, 2619.87it/s]


------------------------------------------------------------
EPOCH : 1
Training Step:


  self.pid = os.fork()
100%|██████████| 781/781 [03:56<00:00,  3.30it/s]



Validation Step:


100%|██████████| 303/303 [00:45<00:00,  6.62it/s]




Train Loss : 2111.83
Val Loss : 512.71
Train Accuracy : 70.1710
Val Accuracy : 79.0539
------------------------------------------------------------
EPOCH : 2
Training Step:


100%|██████████| 781/781 [03:55<00:00,  3.31it/s]



Validation Step:


100%|██████████| 303/303 [00:44<00:00,  6.75it/s]




Train Loss : 1893.46
Val Loss : 496.59
Train Accuracy : 71.1070
Val Accuracy : 79.6365
------------------------------------------------------------
EPOCH : 3
Training Step:


100%|██████████| 781/781 [03:56<00:00,  3.31it/s]



Validation Step:


100%|██████████| 303/303 [00:45<00:00,  6.70it/s]




Train Loss : 1831.96
Val Loss : 486.75
Train Accuracy : 71.5562
Val Accuracy : 80.0672

Elapsed Time: 844.3062949180603


100%|██████████| 10000/10000 [00:03<00:00, 2831.25it/s]


------------------------------------------------------------
EPOCH : 1
Training Step:


100%|██████████| 757/757 [03:48<00:00,  3.31it/s]



Validation Step:


100%|██████████| 303/303 [00:44<00:00,  6.76it/s]




Train Loss : 1814.57
Val Loss : 479.18
Train Accuracy : 71.1365
Val Accuracy : 80.1739
------------------------------------------------------------
EPOCH : 2
Training Step:


100%|██████████| 757/757 [03:48<00:00,  3.31it/s]



Validation Step:


100%|██████████| 303/303 [00:45<00:00,  6.68it/s]




Train Loss : 1766.52
Val Loss : 472.86
Train Accuracy : 71.4211
Val Accuracy : 80.3232
------------------------------------------------------------
EPOCH : 3
Training Step:


100%|██████████| 757/757 [03:48<00:00,  3.31it/s]



Validation Step:


100%|██████████| 303/303 [00:45<00:00,  6.72it/s]




Train Loss : 1729.99
Val Loss : 467.75
Train Accuracy : 71.6300
Val Accuracy : 80.4186

Elapsed Time: 821.4000256061554


100%|██████████| 10000/10000 [00:03<00:00, 2990.58it/s]


RuntimeError: Tensors must have same number of dimensions: got 3 and 2

## Testing the model

In [21]:
test = 'from what i understand'
generate(test, num_words=50)

"from what i understand that she 's first to the first world , the song in his own the last other in which was not the second @-@ home to become to a few years in a single and was a single the first season by <unk>"