# Transfer Learning Using Pretrained Embeddings for Document Classification

## 1. Imports

In [None]:
import os
import re

import requests
import gzip
import torch
import torch.nn.functional as F

from argparse import Namespace
from tqdm.notebook import tqdm

from sklearn.metrics import confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Downloading data

* glove pretrained embeddings
* dataset with splits for classification

In [None]:
import bz2
import shutil


def _download(url, target):
    """Stream ``url`` into the local file ``target``.

    Raises ``requests.HTTPError`` for a non-success status instead of saving
    the error page under the archive name (which the ``os.path.exists`` guard
    below would then treat as a finished download forever).
    """
    partial = target + ".part"
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(partial, "wb") as fp:
            for chunk in response.iter_content(chunk_size=1 << 20):
                fp.write(chunk)
    # Only a complete download ever gets the final name.
    os.replace(partial, target)


def _decompress(opener, archive, target):
    """Decompress ``archive`` into ``target`` using ``opener`` (``gzip.open`` or ``bz2.open``).

    Data is copied in chunks so the whole file is never held in memory.
    """
    partial = target + ".part"
    with opener(archive, "rb") as src, open(partial, "wb") as dst:
        shutil.copyfileobj(src, dst)
    os.replace(partial, target)


# GloVe pretrained embeddings (gzip archive).
download_name = "glove.6B.100d.txt.gz"
if not os.path.exists(download_name):
    _download(f"https://github.com/allenai/spv2/blob/master/model/{download_name}?raw=true", download_name)

name = "glove.6B.100d.txt"
if not os.path.exists(name):
    _decompress(gzip.open, download_name, name)

# News dataset with train/val/test splits (bzip2 archive).
download_name = "news_with_splits.csv.bz2"
if not os.path.exists(download_name):
    _download(f"https://raw.githubusercontent.com/bzitko/nlp_repo/main/assignments/a04/{download_name}", download_name)

name = "news_with_splits.csv"
if not os.path.exists(name):
    _decompress(bz2.open, download_name, name)

# 3. Settings

In [None]:
# Every notebook-wide setting lives on this single namespace object.
args = Namespace()

# input files
args.pretrained_embedding_file = "glove.6B.100d.txt"
args.data_file = "news_with_splits.csv"

# model hyper parameters
args.embedding_size = 100
args.hidden_dim = 100
args.num_channels = 100

# training hyper parameters
args.seed = 1337
args.learning_rate = 0.001
args.dropout_p = 0.1
args.batch_size = 128
args.num_epochs = 100
args.early_stop = 5

# file the model weights are saved to
args.model_filename = "model.pth"

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    args.device = torch.device("cuda")
else:
    args.device = torch.device("cpu")

# 4. Reading

Read the embedding vectors, the dataset, and the words from the dataset.

## 4.1. Read embeddings

👍  
Read embedding file and store embeddings into dictionary `embeddings`.  
Keys are the words and values are word embeddings represented as list of floats

In [None]:
embeddings = {}

# Each line of the GloVe text file is "<word> <v1> <v2> ... <vN>" with single
# spaces as separators. Split on the literal space only (not on arbitrary
# whitespace) so that unusual whitespace characters inside a word are kept.
with open(args.pretrained_embedding_file, encoding="utf-8") as fp:
    for line in fp:
        word, *values = line.rstrip("\n").split(" ")
        embeddings[word] = [float(value) for value in values]


assert len(embeddings) == 400000
assert len(embeddings["dog"]) == 100
assert embeddings["dog"][:3] == [0.30817, 0.30938, 0.52803]

## 4.2. Read words and classes from data with preprocessing

Preprocessing function is used to prepare text for tokenization.

In [None]:
def preprocess(txt):
    """Lower-case ``txt`` and space-separate punctuation and English clitics.

    The text is padded with one space on each side so that the
    whitespace-anchored patterns also fire at the very start and end; the
    padding is stripped again before returning.

    NOTE: inside the character classes ``+-,`` is a *range* (``+`` to ``,``),
    so a hyphen is not treated as punctuation. A run of several punctuation
    characters is replaced by its last character only, because a repeated
    group keeps just its final match.
    """
    rewrites = (
        # token ends with 1 or more non-words
        (r"(\w)([\"\'*/+-,.:;?!#&\(\)\\])+\s", r"\1 \2 "),
        # token begins with 1 or more non-words
        (r"\s([\"\'*/+-,.:;?!#&\(\)\\])+(\w)", r" \1 \2"),
        # negation clitic: "don't" -> "do n't"
        (r"(\w)n't\s", r"\1 n't "),
        # other clitics: "John's" -> "John 's"
        (r"(\w)'(s|re|ll|m|ve|d)\s", r"\1 '\2 "),
    )
    padded = " " + txt + " "
    for pattern, replacement in rewrites:
        padded = re.sub(pattern, replacement, padded)
    return padded.lower().strip()

preprocess("John's hand-made glasses don't fit on her nose!")

👍  
Read dataset file and store it into variable `df` as pandas DataFrame object.  
Columns of `df` are:
* title - title of the news,
* category - one of four possible categories of a title,
* split - train, val or test split.

Create set of `words` by preprocessing titles from `df`.  
`words` must include only words which appear also in `embeddings`.
Create set of `categories` by collecting all categories from `df`


In [None]:



df = pd.read_csv(args.data_file)

# Vocabulary candidates: every token of every preprocessed title, restricted
# to tokens that have a pretrained vector in `embeddings`.
words = {
    token
    for title in df.title
    for token in preprocess(title).split()
    if token in embeddings
}

# All distinct category labels occurring in the data.
categories = set(df.category)

assert set(df.columns) == {"title", "category", "split"}
assert len(df) == 120000, "not good"
assert len(words) == 29271
assert categories == {'Business', 'Sci/Tech', 'Sports', 'World'}

# 5. Create vocab and embedding

Class `Vocab` is used for indexing tokens. There can be two special tokens: padding token and unknown token.  
If special tokens are set, they would be the first entries in the vocabulary, having indexes 0 and 1 respectively.

Two vocabularies are created:
* `title_vocab`, built from the set of `words`, with special tokens,
* `category_vocab`, built from the set of `categories`, without special tokens.

In [None]:
class Vocab(object):
    """Two-way mapping between tokens and consecutive integer indexes.

    Optional padding and unknown tokens are registered before any other
    token (padding first, then unknown), so they receive the lowest indexes.
    """

    def __init__(self, tokens=None, pad_token=None, unk_token=None):
        """Create the vocabulary and register special and regular tokens.

        tokens    -- optional iterable of tokens to add after the special ones
        pad_token -- optional padding token; its index is stored in `pad_idx`
        unk_token -- optional unknown token; its index is stored in `unk_idx`
        """
        self._tok2idx = {}
        self._idx2tok = {}

        self.pad_token = pad_token
        self.pad_idx = None if pad_token is None else self.add_token(pad_token)

        self.unk_token = unk_token
        self.unk_idx = None if unk_token is None else self.add_token(unk_token)

        if tokens is not None:
            self.add_tokens(tokens)

    def add_token(self, token):
        """Return the index of `token`, registering it first if it is new."""
        known_idx = self._tok2idx.get(token)
        if known_idx is not None:
            return known_idx

        # New tokens get the next free index, keeping indexes consecutive.
        new_idx = len(self._tok2idx)
        self._tok2idx[token] = new_idx
        self._idx2tok[new_idx] = token
        return new_idx

    def add_tokens(self, tokens):
        """Add every token of `tokens` and return the list of their indexes."""
        return list(map(self.add_token, tokens))

    def ordered_indices(self):
        """Return all indexes in ascending order."""
        return sorted(self._idx2tok.keys())

    def ordered_tokens(self):
        """Yield all tokens in ascending order of their index."""
        for idx in self.ordered_indices():
            yield self._idx2tok[idx]

    def __getitem__(self, token_or_idx):
        """Translate a token (str) to its index, or an index (int) to its token.

        Unknown tokens map to `unk_idx`, unknown indexes to `unk_token`.
        Keys of any other type yield None.
        """
        if isinstance(token_or_idx, str):
            return self._tok2idx.get(token_or_idx, self.unk_idx)
        if isinstance(token_or_idx, int):
            return self._idx2tok.get(token_or_idx, self.unk_token)
        return None

    def __len__(self):
        """Return the number of registered tokens, special tokens included."""
        return len(self._tok2idx)

    def info(self):
        """Print the vocabulary size followed by up to four `token:index` pairs."""
        size = len(self)
        preview = "".join(f" {self[i]}:{i}" for i in range(min(4, size)))
        print(f"Vocabulary size:{size}{preview} ...")

# Title vocabulary: the padding and unknown tokens are registered first and so
# take indexes 0 and 1. The tokens are passed in sorted order, which fixes the
# token-to-index assignment regardless of the iteration order of `words`.
title_vocab = Vocab(tokens=sorted(words), pad_token="<PAD>", unk_token="<UNK>")
# Category vocabulary: no special tokens, so category indexes start at 0.
category_vocab = Vocab(tokens=sorted(categories))

# Print the size and the first few entries of each vocabulary.
title_vocab.info()
category_vocab.info()

👍  
Not all embeddings are going to be used for classification tasks.  
Only tokens from `title_vocab` will have their embeddings.  
`embeddings` does not have vectors for padding token and unknown token and they have to be created.
* padding token is zero vector
* unknown token is mean of all embeddings stored in `embeddings`

Create 2D tensor `emb` whose first two rows would be embeddings for padding and unknown token.  
Other rows must match tokens from `title_vocab`.

In [None]:


# One row per entry of `title_vocab`. Starting from zeros leaves the padding
# row as the required zero vector.
emb = torch.zeros(len(title_vocab), args.embedding_size)

# Unknown token: mean over *all* pretrained vectors, not just the ones used here.
emb[title_vocab.unk_idx] = torch.tensor(list(embeddings.values())).mean(dim=0)

# Every remaining row is the pretrained vector of the token with that index.
for idx in title_vocab.ordered_indices():
    if idx in (title_vocab.pad_idx, title_vocab.unk_idx):
        continue
    emb[idx] = torch.tensor(embeddings[title_vocab[idx]])

assert emb.shape == (29273, 100)
assert bool(torch.all(emb[0] == torch.zeros(100)))
assert bool(torch.all(torch.eq(emb[96], torch.tensor(embeddings[title_vocab[96]]))))
assert bool(torch.all(torch.eq(emb[345], torch.tensor(embeddings[title_vocab[345]]))))

# Vectorizer

* `vectorize(tokens)` should return a long tensor (vector) whose values are the vocabulary indexes of the tokens. The vector should be padded with the padding index up to the vectorizer's maximal size.

In [None]:
class Vectorizer():
    """Turn a sequence of tokens into a long tensor of vocabulary indexes."""

    def __init__(self, vocabulary, max_size=-1):
        """Store the vocabulary and the fixed output length.

        vocabulary -- object mapping a token to its index via `vocabulary[token]`
                      and exposing `pad_idx`
        max_size   -- fixed length of every produced vector; a negative value
                      (the default) means "no fixed length"
        """
        self.vocab = vocabulary
        self.max_size = max_size

    def vectorize(self, tokens):
        """Return a 1-D long tensor with the index of every token in `tokens`.

        With a non-negative `max_size` the result always has exactly that
        length: shorter inputs are right-padded with the padding index and
        longer inputs are truncated. Otherwise the result is as long as
        `tokens`.

        Raises ValueError when padding is required but the vocabulary has no
        padding token.
        """
        indices = [self.vocab[token] for token in tokens]

        if self.max_size >= 0:
            indices = indices[:self.max_size]
            missing = self.max_size - len(indices)
            if missing > 0:
                if self.vocab.pad_idx is None:
                    raise ValueError("padding requested but the vocabulary has no padding token")
                indices.extend([self.vocab.pad_idx] * missing)

        return torch.tensor(indices, dtype=torch.long)

# Length (in tokens) of the longest preprocessed title; every title vector is
# given this size.
title_max_size = max(len(preprocess(title).split()) for title in df.title)
title_vectorizer = Vectorizer(title_vocab, title_max_size)
# Categories keep the default max_size of -1.
category_vectorizer = Vectorizer(category_vocab)

# Sanity checks: a category maps to its single index, and a short title is
# followed by padding indexes (0) up to the maximal size.
assert torch.all(category_vectorizer.vectorize(["World"]) == torch.tensor([3]))
assert torch.all(title_vectorizer.vectorize(["john", "went", "home"]) == torch.tensor([14357, 28510, 12839, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

# 6. Dataset and vectorization

`NewsDataset` class inherits `torch.utils.data.Dataset`.  
Implemented methods are:
* `__init__(df, vectorizer_x, vectorizer_y)` initialization receives dataframe `df`, `vectorizer_x` vectorizer for data and `vectorizer_y` for targets.
* `set_split()` for setting current data split
* 👍 `__getitem__(idx)` should return pair of vectors for data and target

In [None]:
class NewsDataset(torch.utils.data.Dataset):
    """Dataset of news titles and their categories with switchable splits.

    The dataframe is partitioned by its `split` column once; `set_split`
    selects which partition `__len__` and `__getitem__` operate on.
    """

    def __init__(self, df, vectorizer_x, vectorizer_y):
        """Store the data and vectorizers and select the "train" split.

        df           -- dataframe with `title`, `category` and `split` columns
        vectorizer_x -- vectorizer applied to the tokens of a title
        vectorizer_y -- vectorizer applied to a category
        """
        self.df = df
        self.vectorizer_x = vectorizer_x
        self.vectorizer_y = vectorizer_y
        # One sub-frame per split value, computed once up front.
        self._lookup = {split: df[df.split == split] for split in set(df.split)}
        self.set_split("train")

    def set_split(self, split):
        """Make `split` the active split; raises KeyError if it does not exist."""
        self._target_split = split
        self._target_df = self._lookup[split]

    def vectorize_x(self, title):
        """Preprocess and tokenize `title`, then vectorize its tokens."""
        return self.vectorizer_x.vectorize(preprocess(title).split())

    def vectorize_y(self, category):
        """Return the index of `category` as a 0-dimensional tensor."""
        return self.vectorizer_y.vectorize([category]).squeeze()

    def frequency_x(self):
        """Return how often each vocabulary token occurs in all titles.

        Counts are taken over the whole dataframe (all splits) and ordered by
        token index. Titles are tokenized exactly as in `vectorize_x`, so the
        counts refer to tokens, not to whole titles.
        """
        from collections import Counter

        counts = Counter(
            token
            for title in self.df.title
            for token in preprocess(title).split()
        )
        return torch.tensor([counts[tok] for tok in self.vectorizer_x.vocab.ordered_tokens()])

    def frequency_y(self):
        """Return the number of rows per category, ordered by category index.

        Counts are taken over the whole dataframe (all splits) in one pass.
        """
        counts = self.df.category.value_counts()
        return torch.tensor([int(counts.get(tok, 0)) for tok in self.vectorizer_y.vocab.ordered_tokens()])

    def __getitem__(self, idx):
        """Return the `(title_vector, category_index)` pair at position `idx`
        of the active split."""
        row = self._target_df.iloc[idx]
        return self.vectorize_x(row["title"]), self.vectorize_y(row["category"])

    def __len__(self):
        """Return the number of rows in the active split."""
        return len(self._target_df)

    def get_num_batches(self, batch_size):
        """Return the number of batches needed to cover the active split.

        Rounded up, because a DataLoader with its default `drop_last=False`
        also yields the final, smaller batch.
        """
        return (len(self) + batch_size - 1) // batch_size

def generate_batches(dataset, batch_size, shuffle=True):
    """Yield `(x, y)` batches of `dataset`, each moved to `args.device`.

    Batching (and optional shuffling) is delegated to a
    `torch.utils.data.DataLoader` with otherwise default settings.
    """
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    for features, targets in loader:
        yield features.to(args.device), targets.to(args.device)

# The dataset starts on the "train" split (selected by NewsDataset.__init__).
dataset = NewsDataset(df, title_vectorizer, category_vectorizer)

# Sanity checks on the split size, on title/category vectorization and on
# item access.
assert len(dataset) == 84000
assert torch.all(dataset.vectorize_x("John was there.") == torch.tensor([14357, 28332, 26280, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
# vectorize_y squeezes its result, so a category is a 0-dimensional tensor.
assert dataset.vectorize_y("World").shape == tuple()
assert dataset.vectorize_y("World") == torch.tensor(3)
assert torch.all(dataset[4][0] == torch.tensor([ 4086,  9729,  1905, 10689,  6558, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

# 7. Classifier

👍  
`NewsClassifier` initialization receives 
* `num_channels` number of convolutional channels
* `hidden_dim` dimension of a hidden layer
* `num_classes` dimension of output layer
* `dropout_p` probability of dropout
* `embeddings` weights for embedding layer.

Model will consist of: 
* Embedding layer whose weights are passed by `embeddings`, 
* 4 Convolutional layers with ELU activations,
* 2 Fully connected layers with ReLU activation and dropout for the hidden layer

The forward pass applies average pooling after the convolutions to reduce the last dimension to 1, and then applies dropout before the fully connected layers.

In [None]:
class NewsClassifier(torch.nn.Module):
    """CNN text classifier on top of pretrained word embeddings.

    Architecture: embedding layer -> 4 x (Conv1d + ELU) -> average pooling
    over the sequence -> dropout -> Linear + ReLU + dropout -> Linear.
    """

    def __init__(self, num_channels, num_classes, hidden_dim, dropout_p, embeddings):
        """Build the layers.

        num_channels -- number of output channels of every convolution
        num_classes  -- size of the output layer
        hidden_dim   -- size of the hidden fully connected layer
        dropout_p    -- dropout probability
        embeddings   -- 2-D tensor (vocabulary size x embedding size) used as
                        the initial weights of the embedding layer
        """
        super(NewsClassifier, self).__init__()

        embedding_size = embeddings.shape[1]

        # Work on a copy so that fine-tuning never modifies the caller's tensor.
        # Row 0 is treated as the padding row (the notebook's title vocabulary
        # places the padding token at index 0) and receives no gradient.
        self.emb = torch.nn.Embedding.from_pretrained(
            embeddings.clone().float(), freeze=False, padding_idx=0)

        # With kernel size 3 and strides 1, 2, 2, 1 the input sequence must be
        # at least 17 tokens long for the last convolution to produce an output.
        self.convnet = torch.nn.Sequential(
            torch.nn.Conv1d(embedding_size, num_channels, kernel_size=3),
            torch.nn.ELU(),
            torch.nn.Conv1d(num_channels, num_channels, kernel_size=3, stride=2),
            torch.nn.ELU(),
            torch.nn.Conv1d(num_channels, num_channels, kernel_size=3, stride=2),
            torch.nn.ELU(),
            torch.nn.Conv1d(num_channels, num_channels, kernel_size=3),
            torch.nn.ELU(),
        )

        # A Dropout module (rather than a bare F.dropout call) is switched off
        # automatically by model.eval().
        self.dropout = torch.nn.Dropout(p=dropout_p)
        self.fc1 = torch.nn.Linear(num_channels, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x, apply_softmax=True):
        """Classify a batch of index vectors.

        x             -- long tensor of shape (batch, sequence length)
        apply_softmax -- when True return class probabilities, otherwise raw
                         scores; pass False when the result is fed to
                         torch.nn.CrossEntropyLoss, which expects raw scores

        Returns a tensor of shape (batch, num_classes).
        """
        # (batch, seq, emb) -> (batch, emb, seq): Conv1d convolves over the last axis.
        embedded = self.emb(x).permute(0, 2, 1)
        features = self.convnet(embedded)

        # Average over whatever sequence length is left, then drop that axis.
        features = F.avg_pool1d(features, features.size(dim=2)).squeeze(dim=2)
        features = self.dropout(features)

        hidden = self.dropout(F.relu(self.fc1(features)))
        scores = self.fc2(hidden)

        if apply_softmax:
            scores = F.softmax(scores, dim=1)
        return scores


# The number of classes equals the size of the category vocabulary; the
# embedding layer is initialised from the `emb` matrix.
classifier = NewsClassifier(num_channels=args.num_channels,
                            num_classes=len(dataset.vectorizer_y.vocab), 
                            hidden_dim=args.hidden_dim,
                            dropout_p=args.dropout_p,
                            embeddings=emb)



# Smoke test on one batch of three items: one score per category for each
# item, and one target per item.
x_batch, y_batch = next(generate_batches(dataset, batch_size=3))
assert classifier(x_batch).shape == (3, 4)
assert y_batch.shape == (3, )

# 8. Train routine

In [None]:
# accuracy
def compute_accuracy(y_hat, y):
    """Return the percentage (0-100) of rows of `y_hat` whose highest-scoring
    column equals the corresponding target in `y`."""
    predicted = torch.max(y_hat, dim=1).indices
    hits = torch.eq(predicted, y).sum().item()
    return hits / len(predicted) * 100

# early stopping
def early_stop(train_state, model):
    """Checkpoint the best model so far and decide whether to stop training.

    train_state -- dict whose "val_loss" entry lists the validation loss of
                   every finished epoch, latest last
    model       -- model whose state dict is written to `args.model_filename`

    The model is saved after the first epoch and afterwards only when the
    latest validation loss is lower than every earlier one, so the file always
    holds the best epoch (comparing against just the previous epoch could
    overwrite a better checkpoint with a worse one).

    Returns True when the last `args.early_stop` validation losses are
    strictly increasing, otherwise False.
    """
    val_loss = train_state["val_loss"]
    if len(val_loss) < 2:
        torch.save(model.state_dict(), args.model_filename)
        return False

    # Save only on a new overall best.
    if val_loss[-1] < min(val_loss[:-1]):
        torch.save(model.state_dict(), args.model_filename)

    if len(val_loss) >= args.early_stop:
        recent = val_loss[-args.early_stop:]
        return all(recent[i] < recent[i + 1]
                   for i in range(args.early_stop - 1))

    return False


In [None]:
# seed
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

# loss, optimizer, scheduler
# Class weights are the inverse log-frequency of each category, the same
# formula the evaluation cell uses (`1 - log(freq)` would be negative for any
# realistic class count). The weights must live on the same device as the
# model output, otherwise the loss fails on a GPU.
class_weights = 1 / torch.log(dataset.frequency_y().float())
loss_func = torch.nn.CrossEntropyLoss(class_weights.to(args.device))
optimizer = torch.optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the validation loss stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

# progress bars
epoch_bar = tqdm(desc='epochs', total=args.num_epochs, position=0)
dataset.set_split('train')
train_bar = tqdm(desc='train', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='val', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)

# train state tracker: one entry per finished epoch in every list
train_state = {"train_loss": [],
               "train_acc": [],
               "val_loss": [],
               "val_acc": [],}


classifier = classifier.to(args.device)
try:
    for epoch_index in range(args.num_epochs):
        # ---- training pass ----
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size)
        running_loss = running_acc = 0.0

        classifier.train()
        for batch_index, (x, y) in enumerate(batch_generator):
            optimizer.zero_grad()
            # CrossEntropyLoss applies log-softmax itself, so it must be given
            # raw scores rather than the classifier's default softmax output.
            y_hat = classifier(x, apply_softmax=False)

            loss = loss_func(y_hat, y)
            loss_t = loss.item()
            # incremental mean over the batches seen so far
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            loss.backward()
            optimizer.step()

            acc_t = compute_accuracy(y_hat, y)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # ---- validation pass ----
        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size)
        running_loss = running_acc = 0.0

        classifier.eval()
        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            for batch_index, (x, y) in enumerate(batch_generator):
                y_hat = classifier(x, apply_softmax=False)

                loss = loss_func(y_hat, y)
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                acc_t = compute_accuracy(y_hat, y)
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                # update bar
                val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        # early_stop also writes the model checkpoint
        if early_stop(train_state, classifier):
            print("Early stopping")
            break
        scheduler.step(train_state['val_loss'][-1])

        # rewind the per-epoch bars for the next epoch
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    # allow the run to be interrupted manually without losing the notebook state
    print("Exiting loop")


# 9. Evaluation

Calculating test accuracy

In [None]:
# Load the checkpoint written during training. `map_location` lets a model
# that was saved on a GPU be loaded on a CPU-only machine.
classifier.load_state_dict(torch.load(args.model_filename, map_location=args.device))

classifier = classifier.to(args.device)
# The class weights must be on the same device as the model output.
class_weights = 1 / torch.log(dataset.frequency_y().float())
loss_func = torch.nn.CrossEntropyLoss(class_weights.to(args.device))

dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size)

running_loss = 0.
running_acc = 0.

classifier.eval()
# Evaluation needs no gradients.
with torch.no_grad():
    for batch_index, (x, y) in enumerate(batch_generator):
        # CrossEntropyLoss expects raw scores, not softmax probabilities.
        y_hat = classifier(x, apply_softmax=False)

        # compute the loss (incremental mean over batches)
        loss = loss_func(y_hat, y)
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_hat, y)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

print(f"Test loss: {running_loss:.4f}")
print(f"Test Accuracy: {running_acc:.4f}")

👍  Show the confusion matrix as a heatmap.

# 10. Inference

👍  Make function for predicting

In [None]:
def predict(model, vectorizer_x, vectorizer_y, title):
    """Predict the category of a single news title.

    model        -- classifier called as `model(x, apply_softmax=True)`
    vectorizer_x -- vectorizer for the title tokens
    vectorizer_y -- vectorizer whose vocabulary maps a class index back to
                    the category name
    title        -- raw title text

    Returns a dict with the predicted "category" and its "probability".
    The model's train/eval mode is restored before returning.
    """
    tokens = preprocess(title).split()
    # Add a leading batch dimension: the model expects (batch, sequence length).
    x = vectorizer_x.vectorize(tokens).unsqueeze(0)
    # Run on whatever device the model currently lives on.
    device = next(model.parameters()).device

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            probabilities = model(x.to(device), apply_softmax=True)
    finally:
        model.train(was_training)

    probability, index = probabilities.max(dim=1)
    return {"category": vectorizer_y.vocab[index.item()],
            "probability": probability.item()}

predict(classifier, title_vectorizer, category_vectorizer, "John was there")
