Surname generation

In [1]:
import os

download_name = "eng_fra_simplest.csv.bz2"
if not os.path.exists(download_name):
    import requests
    response = requests.get(f"https://raw.githubusercontent.com/bzitko/nlp_repo/main/assignments/a05/{download_name}")
    with open(download_name, "wb") as fp:
        fp.write(response.content)
    response.close()
        
name = "eng_fra_simplest.csv"
if not os.path.exists(name):
    import bz2
    with open(download_name, 'rb') as bzf, open(name, 'wb') as fp:
        fp.write(bz2.decompress(bzf.read()))       

In [2]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm


# Settings

In [3]:
args = Namespace(
    # Data and path information
    dataset_csv="eng_fra_simplest.csv",
    model_filename="model.pth",
    # Model hyper parameter
    char_embedding_size=32,
    rnn_hidden_size=32,
    # Training hyper parameter
    num_epochs=100,
    learning_rate=5e-4,
    batch_size=64,
    seed=1337,
    early_stop=5,
    source_embedding_size=64,
    target_embedding_size=64,
    envoding_size=64
    # Runtime hyper parameter
)

args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.device

device(type='cpu')

# Reading

In [4]:
df = pd.read_csv(args.dataset_csv)
df


Unnamed: 0.1,Unnamed: 0,source_language,split,target_language
0,0,he 's the cutest boy in town .,train,c'est le garçon le plus mignon en ville .
1,1,he 's a nonsmoker .,train,il est non-fumeur .
2,2,he 's smarter than me .,train,il est plus intelligent que moi .
3,3,he 's a lovely young man .,train,c'est un adorable jeune homme .
4,4,he 's three years older than me .,train,il a trois ans de plus que moi .
...,...,...,...,...
13057,13057,you are n't invited .,test,vous n'êtes pas invités .
13058,13058,you are always watching tv .,test,tu regardes tout le temps la télé .
13059,13059,you are trusted by every one of us .,test,chacun de nous te fait confiance .
13060,13060,you are blinded by love .,test,vous êtes aveuglé par l'amour .


## Vocabulary

Generalized vocabulary can have:
* padding token - to fill up empty space
* unknown token - token for out-of-vocabulary tokens
* begin sequence - token for start of a sequence
* end sequence - token for end of a sequence


In [5]:
class Vocab(object):

    def __init__(self, tokens=None, pad_token=None, unk_token=None, begin_seq_token=None, end_seq_token=None):
        self._tok2idx = {}
        self._idx2tok = {}
        
        self.pad_token = pad_token
        self.pad_idx = None
        if pad_token is not None:
            self.pad_idx = self.add_token(pad_token)
        
        self.unk_token = unk_token
        self.unk_idx = None
        if unk_token is not None:
            self.unk_idx = self.add_token(unk_token)

        self.begin_seq_token = begin_seq_token
        self.begin_seq_idx = None
        if begin_seq_token is not None:
            self.begin_seq_idx = self.add_token(begin_seq_token)

        self.end_seq_token = end_seq_token
        self.end_seq_idx = None
        if end_seq_token is not None:
            self.end_seq_idx = self.add_token(end_seq_token)

        if tokens is not None:
            self.add_tokens(tokens)

    def add_token(self, token):
        if token not in self._tok2idx:
            idx = len(self._tok2idx)
            self._tok2idx[token] = idx
            self._idx2tok[idx] = token
            return idx
        return self._tok2idx[token]

    def add_tokens(self, tokens):
        return [self.add_token(token) for token in tokens]

    def ordered_indices(self):
        return sorted(self._idx2tok)

    def ordered_tokens(self):
        for i in sorted(self._idx2tok):
            yield self._idx2tok[i]

    def __getitem__(self, token_or_idx):
        if isinstance(token_or_idx, str):
            return self._tok2idx.get(token_or_idx, self.unk_idx)
        if isinstance(token_or_idx, int):
            return self._idx2tok.get(token_or_idx, self.unk_token)

    def __len__(self):
        return len(self._tok2idx)

    def __iter__(self):
        for i in sorted(self._idx2tok):
            yield self._idx2tok[i]

    def info(self):
        txt = f"Vocabulary size:{len(self)}"
        for i in range(min(4, len(self))):
            txt += f" {self[i]}:{i}"
        txt += " ..."
        print(txt)

source_words = {w for sent in df[df.split == "train"].source_language for w in sent.split()}
source_vocab = Vocab(sorted(source_words), pad_token="<PAD>", unk_token="<UNK>", begin_seq_token="<BOS>", end_seq_token="<EOS>")

target_words = {w for sent in df[df.split == "train"].target_language for w in sent.split()}
target_vocab = Vocab(sorted(target_words), pad_token="<PAD>", unk_token="<UNK>", begin_seq_token="<BOS>", end_seq_token="<EOS>")

source_vocab._tok2idx

{'<PAD>': 0,
 '<UNK>': 1,
 '<BOS>': 2,
 '<EOS>': 3,
 '!': 4,
 "'": 5,
 "''": 6,
 "'ll": 7,
 "'m": 8,
 "'re": 9,
 "'s": 10,
 "'ve": 11,
 ',': 12,
 '--': 13,
 '.': 14,
 '10': 15,
 '100': 16,
 '18': 17,
 '19': 18,
 '1:00': 19,
 '2': 20,
 '20': 21,
 '229': 22,
 '25': 23,
 '35': 24,
 '5': 25,
 '6': 26,
 '80': 27,
 '?': 28,
 '``': 29,
 'a': 30,
 'ability': 31,
 'able': 32,
 'about': 33,
 'above': 34,
 'abroad': 35,
 'absent': 36,
 'absent-minded': 37,
 'absolute': 38,
 'absolutely': 39,
 'absorbed': 40,
 'abuse': 41,
 'abusing': 42,
 'accept': 43,
 'accident': 44,
 'account': 45,
 'accredited': 46,
 'accusations': 47,
 'accustomed': 48,
 'accustoming': 49,
 'acquaintance': 50,
 'acquainted': 51,
 'across': 52,
 'act': 53,
 'acting': 54,
 'active': 55,
 'actor': 56,
 'actress': 57,
 'actresses': 58,
 'actually': 59,
 'adamant': 60,
 'adapt': 61,
 'adaptable': 62,
 'addict': 63,
 'addicted': 64,
 'adding': 65,
 'address': 66,
 'adequate': 67,
 'admit': 68,
 'admitting': 69,
 'adorable': 70,
 '

## Vectorizer

* `vectorizer(tokens)` should return long tensor (vector). Vector values corresponds to tokens. Vector should be filled with padding indexes to satisfy vector maximal size.  

* 👍  method `vectorize(tokens, seq=True)` receives 
    * `tokens` - a list of vocabulary entities, and
    * `seq` - if set to true, then resulting vector represents a sequence.

Let 0 is padding index, 2 is begin of sequence index and 3 is end of sequence index and maximal size is 10. Then for tokens whose indices are, for example, 56, 96 41, a resulting vector should be `[2 56 96 41 3 0 0 0 0]`.  
If `seq` is set to false, resulting vector should be `[56 96 41 0 0 0 0 0 0]`.

In [6]:
class Vectorizer():

    def __init__(self, vocabulary, max_size=-1):
        self.vocab = vocabulary
        self.max_size = max_size

    def vectorize(self, tokens, seq=True):
        indices = [self.vocab[tok] for tok in tokens]
        if seq:
            indices = [self.vocab.begin_seq_idx] + indices + [self.vocab.end_seq_idx]
        
        if self.max_size >= 0:
            indices = indices[:self.max_size]
            indices += [self.vocab.pad_idx] * (self.max_size - len(indices))
        return torch.LongTensor(indices)    

source_max_size = max(len(sent.split()) for sent in df.source_language)
source_vectorizer = Vectorizer(source_vocab, source_max_size + 2)

target_max_size = max(len(sent.split()) for sent in df.target_language)
target_vectorizer = Vectorizer(target_vocab, target_max_size + 2)

assert source_vectorizer.vectorize("i was there before you was .".split(), seq=True).tolist() == [2, 1371, 2901, 2682, 289, 3015, 2901, 14, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
assert target_vectorizer.vectorize("j'étais là avant toi .".split(), seq=True).tolist() == [2, 2264, 2581, 325, 4482, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## Dataset

`NMTDataset` class inherits `torch.utils.data.Dataset`.  
Implemented methods are:
* `__init__(df, vectorizer_x, vectorizer_h, nationalities)` initialization receives dataframe `df`, `vectorizer_x` vectorizer for surnames, and `vectorizer_h` for nationalities.
* `set_split()` for setting current data split
* 👍 `__getitem__(idx)` should return triple of vectors: x, y, h where 
    * x is vectorized surname, for example `[2 56 96 41 3 0 0 0 0]`
    * y is x moved to left, for example `[56 96 41 3 0 0 0 0 0]`
    * h is vector for nationality

    

In [7]:
class NMTDataset(torch.utils.data.Dataset):

    def __init__(self, df, source_vectorizer, target_vectorizer):
        self.df = df
        self.source_vectorizer = source_vectorizer
        self.target_vectorizer = target_vectorizer
        self._lookup = {split: self.df[self.df.split == split] for split in set(self.df.split)}
        self.set_split("train")
        
    def set_split(self, split):
        self._target_split = split
        self._target_df = self._lookup[split]

    def vectorize_source(self, sent):
        return self.source_vectorizer.vectorize(sent.split(), seq=True)

    def vectorize_target(self, sent):
        return self.target_vectorizer.vectorize(sent.split(), seq=True)

    def __getitem__(self, idx):
        data = self.df.iloc[idx]
        return self.vectorize_source(data.source_language), self.vectorize_target(data.target_language)
        
    def __len__(self):
        return len(self._target_df)

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True):
    for x, y in torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle):
        yield x.to(args.device), y.to(args.device)

dataset = NMTDataset(df, source_vectorizer, target_vectorizer)

assert len(dataset) == 9138
assert len(dataset[0]) == 2

x, y = dataset[0]
assert x.tolist() == [2, 1274, 10, 2676, 682, 367, 1396, 2754, 14, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
assert y.tolist() == [2, 510, 2498, 1874, 2498, 3396, 2814, 1505, 4682, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# Encoder

👍  
`SurnameGenerator` initialization receives 
* `embedding_size` dimension of embedding vector (for surnames)
* `num_embeddings` size of surname vocabulary
* `rnn_hidden_size` dimension of hidden RNN layer
* `num_rnn_hidden_embedding` size of nationality vocabulary
* `dropout_p` probability of dropout

Model will consist of: 
* $E_s$ - embedding layer for surnames, 
* $E_n$ - embedding layer for nationalities,
* GRU - gated reccurent unit
* FC - fully connected layer with dropout

Forward receives 
* $x$ indicies of surnames
* $h$ indicies of nationalityes

then $\hat{y} = FC(GRU(E_s(x), E_n(h)))$.

Apply softmax if `apply_softmax` is set to true.


In [8]:
class NMTEncoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NMTEncoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(num_embeddings=input_size,
                                            embedding_dim=hidden_size)

        self.rnn = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input, hidden=None):
        input = self.embedding(input)
        output, hidden = self.rnn(input, hidden)
        return output, hidden


encoder = NMTEncoder(input_size=len(dataset.source_vectorizer.vocab),
                     hidden_size=args.source_embedding_size)

batch_size = 3
source_vec_max_size = dataset.source_vectorizer.max_size
x, y = next(generate_batches(dataset, batch_size=batch_size))
y_hat, h = encoder(x)

assert y_hat.shape == (batch_size, source_vec_max_size, args.source_embedding_size)

# y_hat[:,-1,:] == h.squeeze()  # zadnji output je zapravo izlazni hidden

# Decoder

👍 

In [9]:
class NMTDecoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(NMTDecoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(num_embeddings=output_size, 
                                      embedding_dim=hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, 
                            out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden=None, apply_softmax=False):
        output = self.embedding(input)
        print(output.shape)
        output = F.relu(output)
        print(output.shape)
        output, hidden = self.rnn(output, hidden)
        print(output.shape, hidden.shape)
        output = self.softmax(self.fc(output[0]))
        return output, hidden


decoder = NMTDecoder(hidden_size=args.target_embedding_size,
                     output_size=len(dataset.target_vectorizer.vocab))

batch_size = 3
target_vec_max_size = dataset.target_vectorizer.max_size
x, y = next(generate_batches(dataset, batch_size=batch_size))
y_hat, h = decoder(y)




torch.Size([3, 26, 64])
torch.Size([3, 26, 64])
torch.Size([3, 26, 64]) torch.Size([1, 3, 64])


## Settings

In [10]:
# accuracy
def compute_accuracy(y_hat, y):
    _, y_hat_indices = y_hat.max(dim=1)
    n_correct = torch.eq(y_hat_indices, y).sum().item()
    return n_correct / len(y_hat_indices) * 100

# early stopping
def early_stop(train_state, model):
    val_loss = train_state["val_loss"]
    if len(val_loss) < 2:
        torch.save(model.state_dict(), args.model_filename)
        return False
    
    if val_loss[-1] < val_loss[-2]:
        torch.save(model.state_dict(), args.model_filename)
    
    if len(val_loss) >= args.early_stop:
        val_loss =  val_loss[-args.early_stop:]
        return all(val_loss[i] < val_loss[i + 1] 
                   for i in range(args.early_stop - 1))

    return False

# Defining loss function

For 
* $N$ - batch size
* $C$ - sequence size
* $V$ - vocabulary size

let $\hat{y}$ be a prediction tensor of shape $N \times C \times V$ and $y$ be a target tensor of shape $N \times C$.  
Function `compute_loss(y_hat, y)` is responsible for computing negative log-likelihood loss for each datapoint in the batch.

Before applying pyTorch's NLLLoss, each sequence in the batch $\hat{y}$ has to be turned into log of probabilities, i.e. $log(softmax(\hat{y}_i))$ for $i=1...N$. After calculating all $N$ losses by $NLLLoss(log(softmax(\hat{y}_i)), y_i)$ `compute_loss()` returns their mean.




In [11]:
loss_func = torch.nn.NLLLoss(ignore_index=0)

def compute_loss(y_hat, y):
    y_hat = F.log_softmax(y_hat, dim=-1)
    losses = []
    for b_y_hat, b_y in zip(y_hat, y):
        lv = loss_func(b_y_hat, b_y)
        losses.append(lv)
    return torch.stack(losses).mean()

batch_size = 3
seq_size = 2
vocab_size = 4

torch.manual_seed(42)
y_hat = torch.rand(batch_size, seq_size, vocab_size)
y = torch.tensor([[0, 1], [2, 1], [3, 0]])
loss = compute_loss(y_hat, y)
assert torch.all(loss == torch.tensor(1.33540785))


# Training loop

In [12]:
def compute_accuracy(y_hat, y):
    _, y_hat_indices = y_hat.max(dim=-1)
    y_hat_indices = y_hat_indices.ravel()
    y = y.ravel()
    n_correct = torch.eq(y_hat_indices, y).sum().item()
    return n_correct / len(y_hat_indices) * 100    

# encoder-decoder

encoder = NMTEncoder(hidden_size=args.source_embedding_size,
                     output_size=len(dataset.source_vectorizer.vocab))
encoder = encoder.to(args.device)
                     
decoder = NMTDecoder(hidden_size=args.target_embedding_size,
                     output_size=len(dataset.target_vectorizer.vocab))
decoder = decoder.to(args.device)                

# seed
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

# loss, optimizer, scheduler
loss_func = torch.nn.NLLLoss(ignore_index=dataset.vectorizer_x.vocab.pad_idx)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate)

#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

# progress bars
epoch_bar = tqdm(desc='epochs', total=args.num_epochs, position=0)
dataset.set_split('train')
train_bar = tqdm(desc='train', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='val', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)

# train state tracker
train_state = {"train_loss": [],
               "train_acc": [],
               "val_loss": [],
               "val_acc": [],}



try:
    for epoch_index in range(args.num_epochs):
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size)
        running_loss = running_acc = 0.0
        
        encoder.train()
        decoder.train()
        for batch_index, (x, y) in enumerate(batch_generator):
            encoder.zero_grad()
            y_hat, h = encoder(x, h)
            loss = compute_loss(y_hat, y)
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            loss.backward()
            encoder_optimizer.step()

            acc_t = compute_accuracy(y_hat, y)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)        

        # Iterate over val dataset
        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size)
        running_loss = running_acc = 0.0
        
        generator.eval()
        for batch_index, (x, y, h) in enumerate(batch_generator):
            y_hat =  generator(x, h)

            loss = compute_loss(y_hat, y)
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            acc_t = compute_accuracy(y_hat, y)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)   

        if early_stop(train_state, generator):
            print("Early stopping")
            break
        scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")


TypeError: __init__() got an unexpected keyword argument 'output_size'

In [None]:
plt.plot(train_state["val_loss"])
plt.plot(train_state["train_loss"])

# Testing

In [None]:
generator.load_state_dict(torch.load(args.model_filename))

generator = generator.to(args.device)
loss_func = torch.nn.NLLLoss()

dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size)

running_loss = 0.
running_acc = 0.

generator.eval()
for batch_index, (x, y, h) in enumerate(batch_generator):
    y_hat =  generator(x, h)
    
    # compute the loss
    loss = compute_loss(y_hat, y)
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_hat, y)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

print(f"Test loss: {running_loss:.4f}")
print(f"Test Accuracy: {running_acc:.4f}")

## Sampling

Function `sample_from_model(model, vectorizer, num_samples=10, nationality_idx=None)` must generate `num_samples` surnames. If `nationality_idx` is set to some nationality index, then generated surnames belong to specific nationality. Nationality is represented as first hidden input $h_0$ to GRU.

In [None]:
def sample_from_model(model, vectorizer, num_samples=10, nationality_idx=None):
    return []

for nationality in nationality_vocab:
    print(nationality)
    samples = sample_from_model(generator, surname_vectorizer, num_samples=3, nationality_idx=nationality_vocab[nationality])
    for sample in samples:
        print(" -", sample)

# Misc

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2, svd_solver='full')

emb = generator.emb.weight.data[1:,:]
labels = list(surname_vocab.ordered_tokens())[1:]
x = torch.tensor(pca.fit_transform(emb))


plt.axis("off")
plt.axis([torch.min(x).item(), torch.max(x).item(), torch.min(x).item(), torch.max(x).item()])
for (xi, yi), lbl in zip(x, labels):
    plt.text(xi, yi, lbl)