In [1]:
!pip install gdown --no-cache-dir -U



# Import libraries

In [1]:
import random
%matplotlib inline

In [2]:
import time
from timeit import default_timer as timer

import pathlib as pl

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader

---

In [None]:
!gdown "18d7-qbKjt2uS1ORdvVIr8LBrTqdZYaTI"
!tar xvjf "/content/C4_200M.hdf5-00001.3-of-00010.tar.bz2"

In [3]:
import h5py
from torch.utils.data import Dataset

class Hdf5Dataset(Dataset):
    """Custom Dataset for loading entries from HDF5 databases"""

    def __init__(self, h5_path, transform=None, num_entries=None):
        self.h5f = h5py.File(h5_path, "r")
        if num_entries:
            self.num_entries = num_entries
        else:
            self.num_entries = self.h5f["labels"].shape[0]
        self.transform = transform

    def __getitem__(self, index):
        if index > self.num_entries:
            raise StopIteration
        input = self.h5f["input"][index].decode("utf-8")
        label = self.h5f["labels"][index].decode("utf-8")
        if self.transform is not None:
            features = self.transform(input)
        return input, label

    def __len__(self):
        return self.num_entries

---

In [4]:
from typing import Iterable, List
from tqdm import tqdm
import pathlib as pl
from torchtext.data import get_tokenizer

# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, index: int) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    for data_sample in tqdm(data_iter):
        if data_sample[index] and isinstance(data_sample[index], str):
            yield token_transform(data_sample[index])

SRC_LANGUAGE = "incorrect"
TGT_LANGUAGE = "correct"

MAX_LENGTH = 512
VOCAB_SIZE = 20000
N_TRAIN_SAMPLES = 1000000
N_VAL_SAMPLES = 100000

# Place-holders
token_transform = get_tokenizer("basic_english")
vocab_transform = None

folder = "./data"
train_filename = "C4_200M.hdf5-00000-of-00010"
valid_filename = "C4_200M.hdf5-00001-of-00010"

embedding_path = "./glove.42B.300d.txt"

checkpoint_folder = "./checkpoints"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import gdown

gdown.download_folder(
    "https://drive.google.com/drive/folders/1FQ_jm765fgwcD5lLtjl6ef9k532hdADR",
    quiet=True,
)

In [5]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ["<UNK>", "<PAD>", "<BOS>", "<EOS>"]

In [7]:
from torchtext.vocab import GloVe, vocab

def pretrained_embs(name: str, dim: str, max_vectors: int = None):
    glove_vectors = GloVe(name=name, dim=dim, max_vectors=max_vectors)
    glove_vocab = vocab(glove_vectors.stoi)
    pretrained_embeddings = glove_vectors.vectors
    glove_vocab.insert_token("<UNK>", UNK_IDX)
    pretrained_embeddings = torch.cat(
        (torch.mean(pretrained_embeddings, dim=0, keepdims=True), pretrained_embeddings)
    )
    glove_vocab.insert_token("<PAD>", PAD_IDX)
    pretrained_embeddings = torch.cat(
        (torch.zeros(1, pretrained_embeddings.shape[1]), pretrained_embeddings)
    )
    glove_vocab.insert_token("<BOS>", PAD_IDX)
    pretrained_embeddings = torch.cat(
        (torch.rand(1, pretrained_embeddings.shape[1]), pretrained_embeddings)
    )
    glove_vocab.insert_token("<EOS>", PAD_IDX)
    pretrained_embeddings = torch.cat(
        (torch.rand(1, pretrained_embeddings.shape[1]), pretrained_embeddings)
    )
    glove_vocab.set_default_index(UNK_IDX)
    return glove_vocab, pretrained_embeddings

vocab, embeddings = pretrained_embs("42B", "300", 20000)

torch.save(embeddings, "glove.42B.300d.20K.pth")

.vector_cache/glove.42B.300d.zip: 1.88GB [05:53, 5.32MB/s]                                                                                                                                   
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 19999/20000 [00:01<00:00, 11888.61it/s]


In [6]:
# Load vocabulary and pretrained embeddings

vocab_transform = torch.load("vocab/vocab_20K.pth")
embeddings = torch.load("glove.42B.300d.20K.pth")

---

## Collation

In [7]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func


# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat(
        (torch.tensor([BOS_IDX]), torch.tensor(token_ids), torch.tensor([EOS_IDX]))
    )

# src and tgt language text transforms to convert raw strings into tensors indices
text_transform = sequential_transforms(
    token_transform, vocab_transform, tensor_transform
)  # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform(src_sample.rstrip("\n")))
        tgt_batch.append(text_transform(tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [8]:
text = "data mining is awesome!"
tokenized_input = token_transform(text)
print("tokenized input:\n", tokenized_input)

encoded_input = vocab_transform(tokenized_input)
print("encoded input:\n", encoded_input)

print("transformed input:\n", text_transform(text))

tokenized input:
 ['data', 'mining', 'is', 'awesome', '!']
encoded input:
 [157, 1185, 13, 1480, 32]
transformed input:
 tensor([   2,  157, 1185,   13, 1480,   32,    3])


## Unknown words

In [9]:
text = "dataminingisawesome!"
tokenized_input = token_transform(text)
print(tokenized_input)

encoded_input = vocab_transform(tokenized_input)
print(encoded_input)

['dataminingisawesome', '!']
[0, 32]


RNN Network

In [10]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

teacher_forcing_ratio = 0.5
torch.manual_seed(0)

EMB_SIZE = 300
HIDDEN_SIZE = 512
BATCH_SIZE = 16
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1

learning_rate = 0.001

cuda


In [11]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = (
        mask.float()
        .masked_fill(mask == 0, float("-inf"))
        .masked_fill(mask == 1, float(0.0))
    )
    return mask


def create_mask(src):
    src_seq_len = src.shape[0]

    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    return src_mask, src_padding_mask

In [12]:
def train(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0

    for src, trg in tqdm(iterator):

        optimizer.zero_grad()

        src = src.to(DEVICE)
        trg = trg.to(DEVICE)

        output = model(src, trg)

        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for src, trg in tqdm(iterator):

            src = src.to(DEVICE)
            trg = trg.to(DEVICE)

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [13]:
from torch.utils.data import DataLoader
from random import random

import torch
from torch import nn
import torch.nn.functional as F

MAX_LENGTH = 512

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout, embedding_weights=None):
        super().__init__()

        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=1)

        if embedding_weights is not None:
            self.embedding.weight = torch.nn.Parameter(
                torch.from_numpy(embedding_weights)
            )

        self.rnn = nn.GRU(emb_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):

        embedded = self.dropout(self.embedding(src).float())

        # embedded = [src len, batch size, emb dim]

        outputs, hidden = self.rnn(embedded)  # no cell state!

        # outputs = [src len, batch size, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]
        # outputs are always from the top hidden layer

        return hidden


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout, embedding_weights=None):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=1)

        if embedding_weights is not None:
            self.embedding.weight = torch.nn.Parameter(
                torch.from_numpy(embedding_weights)
            )

        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)

        # self.fc_out = nn.Linear(hid_dim, output_dim)
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, context):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hid dim]
        # context = [n layers * n directions, batch size, hid dim]

        # n layers and n directions in the decoder will both always be 1, therefore:
        # hidden = [1, batch size, hid dim]
        # context = [1, batch size, hid dim]

        input = input.unsqueeze(0)

        # input = [1, batch size]

        embedded = self.dropout(self.embedding(input).float())

        # embedded = [1, batch size, emb dim]
        emb_con = torch.cat((embedded, context), dim=2)

        # emb_con = [1, batch size, emb dim + hid dim]
        output, hidden = self.rnn(emb_con, hidden)

        # output = [seq len, batch size, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]

        # seq len, n layers and n directions will always be 1 in the decoder, therefore:
        # output = [1, batch size, hid dim]
        # hidden = [1, batch size, hid dim]

        output = torch.cat(
            (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1
        )

        # output = [batch size, emb dim + hid dim * 2]

        prediction = self.softmax(self.fc_out(output))

        # prediction = [batch size, output dim]

        return prediction, hidden

In [14]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert (
            encoder.hid_dim == decoder.hid_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [src len, batch size]
        # trg = [trg len, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # last hidden state of the encoder is the context
        context = self.encoder(src)

        # context also used as the initial hidden state of the decoder
        hidden = context

        # first input to the decoder is the <sos> tokens
        input = trg[0, :]

        for t in range(1, trg_len):
            # insert input token embedding, previous hidden state and the context state
            # receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, context)

            # place predictions in a tensor holding predictions for each token
            outputs[t] = output

            # decide if we are going to use teacher forcing or not
            teacher_force = np.random.rand() < teacher_forcing_ratio

            # get the highest predicted token from our predictions
            top1 = output.argmax(1)

            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [15]:
# attn = Attention(HIDDEN_SIZE, HIDDEN_SIZE)

encoder1 = Encoder(
    VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, 0, embedding_weights=np.array(embeddings)
)
decoder1 = Decoder(
    VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, 0.1, embedding_weights=np.array(embeddings)
)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# optimizer = torch.optim.Adam(encoder1.parameters(), lr = learning_rate , betas=(0.9, 0.98), eps=1e-9)
# decoder_optimizer = torch.optim.Adam(encoder1.parameters(), lr = learning_rate, betas=(0.9, 0.98), eps=1e-9)

model = Seq2Seq(encoder1, decoder1, DEVICE)
model = nn.DataParallel(model, device_ids=[0,1,2,3])
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 41,789,440 trainable parameters


In [16]:
current_time = lambda: time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
print(current_time())

2023-07-27-23:50:44


In [None]:
NUM_EPOCHS = 1
CLIP = 1 # gradient clipping
RESUME = False

train_iter = Hdf5Dataset(
    pl.Path(folder) / train_filename, num_entries=N_TRAIN_SAMPLES)
train_dataloader = DataLoader(
    train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
val_iter = Hdf5Dataset(pl.Path(folder) / valid_filename, num_entries=N_VAL_SAMPLES)
val_dataloader = DataLoader(
    val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# make sure folder exists
pl.Path("checkpoints").mkdir(parents=True, exist_ok=True)

model.train()
if RESUME:
    checkpoint = torch.load(
        pl.Path("checkpoints") /
        f"model-epoch_{NUM_EPOCHS-1}-{current_time()}.pt"
    )
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    print(
        f"\033[92mEpoch {epoch} of {NUM_EPOCHS} - time: {current_time()}\033[0m")
    print(f"\033[92mTraining...\033[0m")
    train_loss = train(model, train_dataloader, optimizer, loss_fn, 0)
    end_time = timer()
    print(f"\033[92mValidating...\033[0m")
    val_loss = evaluate(model, val_dataloader, loss_fn)
    print(
        (
            f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
            f"Epoch time = {(end_time - start_time):.3f}s"
        )
    )
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": val_loss,
        },
        pl.Path("checkpoints") /
        f"model-epoch_{NUM_EPOCHS-1}-{current_time()}.pt",
    )

[92mEpoch 1 of 1 - time: 2023-07-27-23:50:47[0m
[92mTraining...[0m


 15%|█████████████████████▏                                                                                                                           | 9109/62500 [37:27<3:40:05,  4.04it/s]

In [None]:
import re

# function to generate output sequence using greedy algorithm
def correct_sentence_vectorized(src_tensor, model, max_len=50):
    assert isinstance(src_tensor, torch.Tensor)

    model.eval()
    src_tensor = src_tensor.unsqueeze(1).to(DEVICE)
    # get length of input sentence
    src_len = src_tensor.shape[0]

    trg_vocab_size = model.decoder.output_dim

    # tensor to store decoder outputs
    outputs = torch.zeros(max_len, 1, trg_vocab_size).to(DEVICE)

    # last hidden state of the encoder is the context
    with torch.no_grad():
        context = model.encoder(src_tensor)

    # context also used as the initial hidden state of the decoder
    hidden = context

    # first input to the decoder is the <sos> tokens
    input = src_tensor[0, :]
    # enc_src = [batch_sz, src_len, hid_dim]
    # Even though some examples might have been completed by producing a <eos> token
    # we still need to feed them through the model because other are not yet finished
    # and all examples act as a batch. Once every single sentence prediction encounters
    # <eos> token, then we can stop predicting.
    for t in range(1, max_len):
        # insert input token embedding, previous hidden state and the context state
        # receive output tensor (predictions) and new hidden state
        output, hidden = model.decoder(input, hidden, context)

        # place predictions in a tensor holding predictions for each token
        outputs[t] = output

        # get the highest predicted token from our predictions
        top1 = output.argmax(1)

        # if teacher forcing, use actual next token as next input
        # if not, use predicted token
        input = top1

    pred_sentence = []

    for i in range(1, len(outputs)):
        topv, topi = outputs[i, :, :].topk(1)
        pred_sentence.append(vocab_transform.vocab.itos_[topi])
        if topi == EOS_IDX:
            break

    return " ".join(pred_sentence)

In [None]:
import os
from pathlib import Path

latest_checkpoint = sorted(Path("checkpoints").glob("*.pt"), key=os.path.getmtime)[-1]

checkpoint = torch.load(latest_checkpoint)
model.load_state_dict(checkpoint["model_state_dict"])

model.eval()

# Pick one in 18M examples
val_iter = Hdf5Dataset(pl.Path(folder) / valid_filename, num_entries=None)

src, trg = random.choice(val_iter)

print('input: "', src, '"')
print('target: "', trg, '"')

src = text_transform(src)

print(f"\033[91mModel's prediction: \033[0m", end="")
print(correct_sentence_vectorized(src, model))