<a href="https://colab.research.google.com/github/abudubai16/CNN-using-transfer-learning/blob/main/NLP_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [1]:
! pip install portalocker --q
! pip install datasets --q --q
! pip install positional-encodings[pytorch] --q --q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m307.2/542.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
from positional_encodings.torch_encodings import PositionalEncoding1D
from datasets import load_dataset, concatenate_datasets

from transformers import RobertaTokenizer
# Tokenizers
from tokenizers import Tokenizer
from tokenizers.normalizers import NFD, StripAccents, Sequence, Lowercase
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import WordPiece
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

import os
import time
import math

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load the dataset

In [4]:
if not os.path.exists('/content/wikitext'):
  ! git clone 'https://huggingface.co/datasets/wikitext' --q

In [5]:
dataset = load_dataset('parquet', data_files={
    'train1':'/content/wikitext/wikitext-103-v1/train-00000-of-00002.parquet',
    'train2': '/content/wikitext/wikitext-103-v1/train-00001-of-00002.parquet',
    'val':'/content/wikitext/wikitext-103-v1/validation-00000-of-00001.parquet',
    'test':'/content/wikitext/wikitext-103-v1/test-00000-of-00001.parquet'
})

# Dataset for training the model
train_ds = concatenate_datasets([
    dataset['train1'],
      dataset['train2'],
      dataset['test']
])['text']

# Dataset for creating the tokenizer
stacked = concatenate_datasets([
      dataset['train1'],
      dataset['train2'],
      dataset['val'],
      dataset['test']
  ])

Generating train1 split: 0 examples [00:00, ? examples/s]

Generating train2 split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## Create the tokenizer

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('FacebookAI/roberta-base')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Create the dataloader

In [7]:
class token_ds(Dataset):
  def __init__(self, ds, max_length):
    super().__init__()
    length = len(ds)
    max_length += 1
    collector = []
    temp = torch.zeros(1, max_length)

    with tqdm(total=length) as pbar:
      for pos, line in enumerate(ds):
        sentences = line.split('.')
        for sentence in sentences:
          a = torch.tensor(tokenizer.encode(sentence)).long()
          length = a.shape[0]

          if a[1] == 2:
            continue

          if length < max_length:
            a = torch.cat([a, torch.ones(max_length-length)])
          else:
            a = a[-max_length:]

          collector.append(a.unsqueeze(0))
          pos += 1

          # Performance purpose
          if pos%10000 == 0:
            temp = torch.cat([temp, *collector], dim=0).long()
            collector = []

        pbar.update(1)

    temp = torch.cat([temp, *collector], dim=0).long()

    self.value = temp[1:, :-1]
    self.target = temp[1:, 1:]

  def __len__(self):
    return self.value.shape[0]

  def __getitem__(self, i):
    return self.value[i,:], self.target[i, :]

In [8]:
seq_len = 100

In [9]:
batch_size = 32

val_dl = DataLoader(token_ds(dataset['val']['text'], seq_len), num_workers=2, batch_size=batch_size)

100%|██████████| 3760/3760 [00:04<00:00, 825.89it/s]


In [10]:
for pos, (X,Y) in enumerate(val_dl):
  print(f'{X} \n {Y}')
  print(f'{X.shape} \n {Y.shape}')
  break

  self.pid = os.fork()


tensor([[    0,  5457, 11858,  ...,     1,     1,     1],
        [    0, 11858, 42292,  ...,     1,     1,     1],
        [    0,    85,    16,  ...,     1,     1,     1],
        ...,
        [    0,    20,    80,  ...,     1,     1,     1],
        [    0,    20,  4533,  ...,     1,     1,     1],
        [    0, 38187, 40037,  ...,     1,     1,     1]]) 
 tensor([[ 5457, 11858, 42292,  ...,     1,     1,     1],
        [11858, 42292, 20577,  ...,     1,     1,     1],
        [   85,    16,  3615,  ...,     1,     1,     1],
        ...,
        [   20,    80,  4707,  ...,     1,     1,     1],
        [   20,  4533,  6031,  ...,     1,     1,     1],
        [38187, 40037, 10361,  ...,     1,     1,     1]])
torch.Size([32, 100]) 
 torch.Size([32, 100])


## Creating the model

In [11]:
class TransformerModel(nn.Module):
  def __init__(self, ntokens, d_model, nhead, seq_len, device, nlayers=6, dim_feedforward=2048, dropout=0.1):
    super(TransformerModel, self).__init__()
    self.d_model = d_model
    self.device = device
    """
    ntokens: The number of words in the dictionary
    d_model: the size of each word embedding
    n_head: the number of heads in each encoder layer
    nlayers: the number of transformer encoder layers in the encoder
    """
    self.seq_len = seq_len

    self.embed = nn.Embedding(ntokens, d_model, padding_idx=0)
    self.pos_embed = PositionalEncoding1D(d_model)

    encoder_layer = nn.TransformerEncoderLayer(d_model,
                                               nhead=nhead,
                                               dim_feedforward=dim_feedforward,
                                               dropout=dropout,
                                               batch_first=True)

    self.encoder = nn.TransformerEncoder(encoder_layer,
                                         num_layers=nlayers,
                                         enable_nested_tensor=True)
    self.fc = nn.Linear(d_model, ntokens)
    self.dropout = nn.Dropout(dropout)

    self.src_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(device)
    self.init_weights()

  def init_weights(self):
    initrange = 0.1
    self.embed.weight.data.uniform_(-initrange, initrange)
    self.fc.bias.data.zero_()
    self.fc.weight.data.uniform_(-initrange, initrange)

  def forward(self, src, src_mask = None):
    src = self.embed(src) * math.sqrt(self.d_model)
    src = src + self.pos_embed(src)

    if src_mask is None:
      src_mask = self.src_mask

    try:
      encoded = self.encoder(src, src_mask)
    except Exception as e:
      print(src.device, src_mask.device)
    target = self.dropout(self.fc(encoded))

    return target

### Function for sequence generation

In [22]:
def generate(src: str, num_words: int):
  src = torch.tensor(tokenizer.encode(src)[:-1]).long()

  for _ in range(num_words):
    # Pad the input sequence properly so that the model can understand
    temp = src
    pos = seq_len-1
    if int(src.shape[0]) < seq_len:
      temp = torch.cat([src, torch.ones(seq_len-src.shape[0], dtype=torch.long)])
      pos = src.shape[0]
    if int(src.shape[0]) > seq_len:
      temp = src[-seq_len:]

    # Get the predictions of words from the model
    temp = temp.unsqueeze(0).to(device)
    dist = model(temp) # (1, seq_len, word_probability)
    dist = dist.squeeze(0)[pos-1]  # <----- (pos-1)

    # Sort the words in most likely to come next
    dist = torch.sort(dist)[1]

    # Pick out the word from the top 3 recommendations of the model
    value = int(torch.rand(1)*3)
    token = dist[-(1+value)].unsqueeze(dim=0).to('cpu')

    # Add that word to the sentence
    src = torch.cat([src, token])

    if token == 2:
      break

  src = tokenizer.decode(list(src))

  return src

## Training function

In [17]:
loss_fn = nn.CrossEntropyLoss()

In [18]:
def train(model, optim, train_dl, val_dl, l_fn, epochs, check_overfit=True):
  H = {
      'train_loss': [],
      'train_acc': [],
      'val_loss': [],
      'val_acc': [],
  }
  e0 = 0

  for e in range(epochs):
    model.train()

    print(f"------------------------------------------------------------")
    print(f"EPOCH : {e+1}")

    train_loss = 0
    val_loss = 0
    train_correct = 0
    val_correct = 0

    print("Training Step:")
    with tqdm(total=len(train_dl)) as pbar:
      for _, (X, Y) in enumerate(train_dl):
        Y = Y.view(-1)
        X, Y = (X.to(device), Y.to(device))

        # Forward prop
        pred = model(X)
        B,T,C = pred.shape
        pred = pred.view(B*T, C)
        loss = loss_fn(pred, Y)
        train_loss += loss.to("cpu").detach().numpy()

        # Back prop
        optim.zero_grad()
        loss.backward()
        optim.step()

        train_correct += sum([1 for i, val in enumerate(pred.argmax(1)) if int(val) == int(Y[i])])

        pbar.update(1)

    print("\nValidation Step:")
    with tqdm(total=len(val_dl)) as pbar:
      with torch.no_grad():
        model.eval()

        for _, (X, Y) in enumerate(val_dl):
          Y = Y.view(-1)
          X, Y = (X.to(device), Y.to(device))

          pred = model(X)
          B,T,C = pred.shape
          pred = pred.view(B*T, C)
          loss = loss_fn(pred, Y)

          val_loss += loss.to("cpu").detach().numpy()
          val_correct += sum([1 for i, val in enumerate(pred.argmax(1)) if int(val) == int(Y[i])])
          pbar.update(1)


    # Store the important specifications of the training process
    H["train_loss"].append(train_loss)
    H["train_acc"].append(train_correct / len(train_dl.dataset))
    H["val_loss"].append(val_loss)
    H["val_acc"].append(val_correct/ len(val_dl.dataset))

    # Print the messages appropriately
    print(f"\n\nTrain Loss : {train_loss:.2f}")
    print(f"Val Loss : {val_loss:.2f}")
    print(f"Train Accuracy : {train_correct/len(train_dl.dataset):.4f}")
    print(f"Val Accuracy : {val_correct/len(val_dl.dataset):.4f}")

    # Check for overfitting
    if check_overfit and (len(H['val_acc']) > 1 and H['val_acc'][-1] < H['val_acc'][-2]):
      print("The model is showing signs of over fitting enter Y to continue, or N for breaking the training loop")
      t = input()
      if t == 'Y':
        continue
      else:
        return H, e
    e0 = e
  return H, e0

# Training the model

In [28]:
# Hyperparameters
nhead = 8
d_model = 400
ntokens = tokenizer.vocab_size
num_inputs = seq_len
model = TransformerModel(nhead=nhead, d_model=d_model, ntokens=ntokens, seq_len=num_inputs, nlayers=8, device=device).to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-5)

In [20]:
epochs = 3
num_sequences = 10_000
num_repetitions = 5

for i in range(num_repetitions):
  train_dl = DataLoader(token_ds(train_ds[i*num_sequences:(i+1)*num_sequences], seq_len), num_workers=2, batch_size=batch_size)

  start_time = time.time()
  train(model, opt, train_dl, val_dl, loss_fn, epochs, check_overfit=True)
  end_time = time.time()

  print(f'\nElapsed Time: {end_time-start_time}')

100%|██████████| 10000/10000 [00:11<00:00, 887.71it/s]


------------------------------------------------------------
EPOCH : 1
Training Step:


  self.pid = os.fork()
100%|██████████| 901/901 [06:08<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.71it/s]




Train Loss : 2356.98
Val Loss : 543.02
Train Accuracy : 71.9333
Val Accuracy : 81.1197
------------------------------------------------------------
EPOCH : 2
Training Step:


100%|██████████| 901/901 [06:10<00:00,  2.43it/s]



Validation Step:


100%|██████████| 352/352 [01:00<00:00,  5.80it/s]




Train Loss : 2100.93
Val Loss : 525.81
Train Accuracy : 72.9916
Val Accuracy : 81.5811
------------------------------------------------------------
EPOCH : 3
Training Step:


100%|██████████| 901/901 [06:10<00:00,  2.43it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.70it/s]




Train Loss : 2032.14
Val Loss : 514.77
Train Accuracy : 73.4145
Val Accuracy : 81.9697

Elapsed Time: 1294.1423873901367


100%|██████████| 10000/10000 [00:10<00:00, 967.85it/s]


------------------------------------------------------------
EPOCH : 1
Training Step:


100%|██████████| 876/876 [05:59<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.76it/s]




Train Loss : 2020.49
Val Loss : 504.75
Train Accuracy : 72.9830
Val Accuracy : 82.1839
------------------------------------------------------------
EPOCH : 2
Training Step:


100%|██████████| 876/876 [05:59<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.77it/s]




Train Loss : 1964.79
Val Loss : 496.30
Train Accuracy : 73.2178
Val Accuracy : 82.3296
------------------------------------------------------------
EPOCH : 3
Training Step:


100%|██████████| 876/876 [05:59<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:02<00:00,  5.68it/s]




Train Loss : 1914.27
Val Loss : 489.98
Train Accuracy : 73.5444
Val Accuracy : 82.4294

Elapsed Time: 1262.4538383483887


100%|██████████| 10000/10000 [00:10<00:00, 974.55it/s]


------------------------------------------------------------
EPOCH : 1
Training Step:


100%|██████████| 901/901 [06:12<00:00,  2.42it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.70it/s]




Train Loss : 1966.63
Val Loss : 488.04
Train Accuracy : 73.9728
Val Accuracy : 82.4483
------------------------------------------------------------
EPOCH : 2
Training Step:


100%|██████████| 901/901 [06:09<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.76it/s]




Train Loss : 1915.60
Val Loss : 481.87
Train Accuracy : 74.1956
Val Accuracy : 82.4733
------------------------------------------------------------
EPOCH : 3
Training Step:


100%|██████████| 901/901 [06:09<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.76it/s]




Train Loss : 1874.56
Val Loss : 476.74
Train Accuracy : 74.4402
Val Accuracy : 82.5493

Elapsed Time: 1295.0245747566223


100%|██████████| 10000/10000 [00:11<00:00, 880.41it/s]


------------------------------------------------------------
EPOCH : 1
Training Step:


100%|██████████| 959/959 [06:33<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:00<00:00,  5.80it/s]




Train Loss : 2071.34
Val Loss : 469.25
Train Accuracy : 73.9770
Val Accuracy : 82.6630
------------------------------------------------------------
EPOCH : 2
Training Step:


100%|██████████| 959/959 [06:33<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:00<00:00,  5.79it/s]




Train Loss : 2019.99
Val Loss : 465.45
Train Accuracy : 74.2058
Val Accuracy : 82.7212
------------------------------------------------------------
EPOCH : 3
Training Step:


100%|██████████| 959/959 [06:33<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.75it/s]




Train Loss : 1981.56
Val Loss : 463.47
Train Accuracy : 74.4348
Val Accuracy : 82.7741

Elapsed Time: 1363.278246164322


100%|██████████| 10000/10000 [00:09<00:00, 1087.75it/s]


------------------------------------------------------------
EPOCH : 1
Training Step:


100%|██████████| 873/873 [05:58<00:00,  2.43it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.75it/s]




Train Loss : 1830.16
Val Loss : 457.23
Train Accuracy : 74.4756
Val Accuracy : 82.8340
------------------------------------------------------------
EPOCH : 2
Training Step:


100%|██████████| 873/873 [05:58<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.73it/s]




Train Loss : 1788.11
Val Loss : 454.88
Train Accuracy : 74.6997
Val Accuracy : 82.8826
------------------------------------------------------------
EPOCH : 3
Training Step:


100%|██████████| 873/873 [05:58<00:00,  2.44it/s]



Validation Step:


100%|██████████| 352/352 [01:01<00:00,  5.75it/s]



Train Loss : 1756.21
Val Loss : 453.53
Train Accuracy : 74.9216
Val Accuracy : 82.9124

Elapsed Time: 1258.65167760849





## Testing the model

**Generated text from trained model: **

from what i understand the album, the city, is a new game <unk>, and a number, which was not the most common \n<pad><pad>, a single, and is the album, as well as well

**Generated text from untrained model: **

from what i understand attackedlit chocolate announgrading butcherfine butcher Paladinfine discriminationiationeterminedningseterminedDefault announ memorable1975fineeterminedConfiguration Paladin announfinerientavery defyeterminedningsinement fostering presidents Paladinfully announ½beta artisan Paladinamliseinement reconstruct__ Constitutional Paladin announ½ announ½ announ Wilmingtonlude Constitutional Constitutional Constitutional Paladin artisan tit Kodi Constitutional Constitutional Constitutional soldinement reconstruct Naruto announ½">< announinement reconstruct

In [27]:
test = 'from what i understand'
generate(test, num_words=500)

'<s>from what i understand the album, the city, is a new game <unk>, and a number, which was not the most common \n<pad><pad>, a single, and is the album, as well as well </s>'

In [29]:
test = 'from what i understand'
generate(test, num_words=500)

'<s>from what i understand attackedlit chocolate announgrading butcherfine butcher Paladinfine discriminationiationeterminedningseterminedDefault announ memorable1975fineeterminedConfiguration Paladin announfinerientavery defyeterminedningsinement fostering presidents Paladinfully announ½beta artisan Paladinamliseinement reconstruct__ Constitutional Paladin announ½ announ½ announ Wilmingtonlude Constitutional Constitutional Constitutional Paladin artisan tit Kodi Constitutional Constitutional Constitutional soldinement reconstruct Naruto announ½">< announinement reconstruct__514 Courtesy">< announ½iation presidentsinementarrett presidentselniation precludeRP announ Wilmingtoninement reconstruct__ tit Lanka Kodi hiding robbinglude Newsletterise boasted Survivalinement minced announ tumors Wilmington tit reconstructKTinement patriotism fostering announ uncons Wilmington mummy tit Wilmington circum tit helpsrienticipinement Lanka announlude Wilmington titettle tit tit Survival Survival HO