In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
from argparse import Namespace
import string
import re
import random
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from tqdm.notebook import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [159]:
args = Namespace(
    # Data and path information
    dataset_csv="eng_fra_simplest.csv",
    model_filename="model.pth",
    # Model hyper parameter
    source_embedding_size=64,
    target_embedding_size=64,
    encoding_size=64,    
    # Training hyper parameter
    num_epochs=100,
    learning_rate=5e-4,
    batch_size=128,
    seed=1337,
    early_stop=5,
    # Runtime hyper parameter
    teach_ratio=0.1
)

args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.device

device(type='cpu')

In [50]:
df = pd.read_csv(args.dataset_csv)
df

Unnamed: 0.1,Unnamed: 0,source_language,split,target_language
0,0,he 's the cutest boy in town .,train,c'est le garçon le plus mignon en ville .
1,1,he 's a nonsmoker .,train,il est non-fumeur .
2,2,he 's smarter than me .,train,il est plus intelligent que moi .
3,3,he 's a lovely young man .,train,c'est un adorable jeune homme .
4,4,he 's three years older than me .,train,il a trois ans de plus que moi .
...,...,...,...,...
13057,13057,you are n't invited .,test,vous n'êtes pas invités .
13058,13058,you are always watching tv .,test,tu regardes tout le temps la télé .
13059,13059,you are trusted by every one of us .,test,chacun de nous te fait confiance .
13060,13060,you are blinded by love .,test,vous êtes aveuglé par l'amour .


In [51]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


class Vocab(object):

    def __init__(self, tokens=None, pad_token=None, unk_token=None, begin_seq_token=None, end_seq_token=None):
        self._tok2idx = {}
        self._idx2tok = {}
        
        self.pad_token = pad_token
        self.pad_idx = None
        if pad_token is not None:
            self.pad_idx = self.add_token(pad_token)
        
        self.unk_token = unk_token
        self.unk_idx = None
        if unk_token is not None:
            self.unk_idx = self.add_token(unk_token)

        self.begin_seq_token = begin_seq_token
        self.begin_seq_idx = None
        if begin_seq_token is not None:
            self.begin_seq_idx = self.add_token(begin_seq_token)

        self.end_seq_token = end_seq_token
        self.end_seq_idx = None
        if end_seq_token is not None:
            self.end_seq_idx = self.add_token(end_seq_token)

        if tokens is not None:
            self.add_tokens(tokens)

    def add_token(self, token):
        if token not in self._tok2idx:
            idx = len(self._tok2idx)
            self._tok2idx[token] = idx
            self._idx2tok[idx] = token
            return idx
        return self._tok2idx[token]

    def add_tokens(self, tokens):
        return [self.add_token(token) for token in tokens]

    def ordered_indices(self):
        return sorted(self._idx2tok)

    def ordered_tokens(self):
        for i in sorted(self._idx2tok):
            yield self._idx2tok[i]

    def __getitem__(self, token_or_idx):
        if isinstance(token_or_idx, str):
            return self._tok2idx.get(token_or_idx, self.unk_idx)
        if isinstance(token_or_idx, int):
            return self._idx2tok.get(token_or_idx, self.unk_token)

    def __len__(self):
        return len(self._tok2idx)

    def __iter__(self):
        for i in sorted(self._idx2tok):
            yield self._idx2tok[i]

    def info(self):
        txt = f"Vocabulary size:{len(self)}"
        for i in range(min(4, len(self))):
            txt += f" {self[i]}:{i}"
        txt += " ..."
        print(txt)

source_words = {w for sent in df[df.split == "train"].source_language for w in sent.split()}
source_vocab = Vocab(sorted(source_words), pad_token="<PAD>", unk_token="<UNK>", begin_seq_token="<BOS>", end_seq_token="<EOS>")

target_words = {w for sent in df[df.split == "train"].target_language for w in sent.split()}
target_vocab = Vocab(sorted(target_words), pad_token="<PAD>", unk_token="<UNK>", begin_seq_token="<BOS>", end_seq_token="<EOS>")





In [52]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs



# Data

In [53]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
pair = random.choice(pairs)
print(pair)

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['vous etes aveuglee par l amour .', 'you are blinded by love .']


In [54]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device)# .view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

tensorsFromPair(pair)


class Vectorizer():

    def __init__(self, vocab, max_size=-1):
        self.vocab = vocab
        self.max_size = max_size

    def vectorize(self, tokens, seq=True):
        indices = [self.vocab[tok] for tok in tokens]
        if seq:
            indices = [self.vocab.begin_seq_idx] + indices + [self.vocab.end_seq_idx]
        
        if self.max_size >= 0:
            indices = indices[:self.max_size]
            indices += [self.vocab.pad_idx] * (self.max_size - len(indices))
        return torch.LongTensor(indices)    

#source_max_size = max(len(sent.split()) for sent in df.source_language)
source_vectorizer = Vectorizer(source_vocab)

#target_max_size = max(len(sent.split()) for sent in df.target_language)
target_vectorizer = Vectorizer(target_vocab)

datapoint = df.iloc[0]
source_vectorizer.vectorize(datapoint.source_language.split())
target_vectorizer.vectorize(datapoint.target_language.split())


class NMTDataset(torch.utils.data.Dataset):

    def __init__(self, df, source_vectorizer, target_vectorizer):
        self.df = df
        self.source_vectorizer = source_vectorizer
        self.target_vectorizer = target_vectorizer
        self._lookup = {split: self.df[self.df.split == split] for split in set(self.df.split)}
        self.set_split("train")
        
    def set_split(self, split):
        self._target_split = split
        self._target_df = self._lookup[split]

    def vectorize_source(self, sent):
        return self.source_vectorizer.vectorize(sent.split(), seq=True)

    def vectorize_target(self, sent):
        return self.target_vectorizer.vectorize(sent.split(), seq=True)

    def __getitem__(self, idx):
        data = self.df.iloc[idx]
        return self.vectorize_source(data.source_language), self.vectorize_target(data.target_language)
        
    def __len__(self):
        return len(self._target_df)

    def get_num_batches(self, batch_size):
        return len(self) // batch_size



def generate_batches(dataset, batch_size, shuffle=True):

    def collate_fn(batch):
        x, y = [], []
        for xi, yi in batch:
            x.append(xi)
            y.append(yi)
        x = torch.nn.utils.rnn.pad_sequence(x, padding_value=dataset.source_vectorizer.vocab.pad_idx).transpose(1, 0)
        y = torch.nn.utils.rnn.pad_sequence(y, padding_value=dataset.target_vectorizer.vocab.pad_idx).transpose(1, 0)
        return x, y

    for x, y in torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn, drop_last=True):
        yield x.to(args.device), y.to(args.device)

dataset = NMTDataset(df, source_vectorizer, target_vectorizer)

assert len(dataset) == 9138
assert len(dataset[0]) == 2

for x, y in generate_batches(dataset, batch_size=3):
    print(x)
    print(y)
    break


tensor([[   2, 1371,    8, 2868, 1257, 3015,    7,  271, 2877, 2732, 1801, 1731,
           14,    3],
        [   2, 1371,    8, 1818, 2262,   14,    3,    0,    0,    0,    0,    0,
            0,    0],
        [   2, 1371,    8,  202,   14,    3,    0,    0,    0,    0,    0,    0,
            0,    0]])
tensor([[   2, 2279, 4283, 4575,  877, 3656, 4576, 4699, 4485, 2498, 2847, 3491,
            9,    3],
        [   2, 2279, 2985, 4283, 3217, 4555,    9,    3,    0,    0,    0,    0,
            0,    0],
        [   2, 2279, 4283, 4267,    9,    3,    0,    0,    0,    0,    0,    0,
            0,    0]])


# Encoder

In [141]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input, hidden=None):
        embedded = self.embedding(input)# .view(1, 1, -1)
        output = embedded
        output, hidden = self.rnn(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


x, y = tensorsFromPair(pair)
hidden_size=64
encoder = EncoderRNN(input_lang.n_words, hidden_size)
y_hat, h = encoder(x, None)



encoder = EncoderRNN(len(dataset.source_vectorizer.vocab), args.source_embedding_size)
x, y = next(generate_batches(dataset, batch_size=3))
x.shape, y.shape

y_hat, h = encoder(x)
print(y_hat.shape)
print(h.shape)


torch.Size([3, 9, 64])
torch.Size([1, 3, 64])


# Decoder

In [142]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden=None):
        output = self.embedding(input) # .view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        output = self.fc(output)
        output = self.softmax(output)   
        return output, hidden

    def generate(self, input, hidden=None, size=-1):
        output = self.embedding(input)
        output = F.relu(output)

        outputs = [output]
        for _ in range(size - 1):
            output, hidden = self.rnn(output, hidden)
            outputs.append(output)
            output = F.relu(output)

        outputs = torch.cat(outputs, dim=1)
        outputs = self.fc(outputs)
        outputs = self.softmax(outputs)           
        return outputs, hidden


    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


encoder = EncoderRNN(len(dataset.source_vectorizer.vocab), args.source_embedding_size)
decoder = DecoderRNN(args.target_embedding_size, len(dataset.target_vectorizer.vocab))
x, y = next(generate_batches(dataset, batch_size=3))

y_hat, h = encoder(x)
print(y_hat.shape)
print(h.shape)
y_hat, h = decoder(y, h)


y_hat, h = decoder.generate(y[:,[0]], size=y.shape[1])
print(y.shape)
print(y_hat.shape)
print(h.shape)



torch.Size([3, 11, 64])
torch.Size([1, 3, 64])
torch.Size([3, 17])
torch.Size([3, 17, 4911])
torch.Size([1, 3, 64])


# Attention Decoder

In [143]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# D' Training

In [163]:
# early stopping
def early_stop(train_state, model):
    val_loss = train_state["val_loss"]
    if len(val_loss) < 2:
        torch.save(model.state_dict(), args.model_filename)
        return False
    
    if val_loss[-1] < val_loss[-2]:
        torch.save(model.state_dict(), args.model_filename)
    
    if len(val_loss) >= args.early_stop:
        val_loss =  val_loss[-args.early_stop:]
        return all(val_loss[i] < val_loss[i + 1] 
                   for i in range(args.early_stop - 1))

    return False

def compute_accuracy(y_hat, y):
    _, y_hat_indices = y_hat.max(dim=-1)
    y_hat_indices = y_hat_indices.ravel()
    y = y.ravel()
    n_correct = torch.eq(y_hat_indices, y).sum().item()
    return n_correct / len(y_hat_indices) * 100    


def compute_loss(y_hat, y):
    y_hat = F.log_softmax(y_hat, dim=-1)
    losses = []
    for b_y_hat, b_y in zip(y_hat, y):
        lv = loss_func(b_y_hat, b_y)
        losses.append(lv)
    return torch.stack(losses).mean()    

# encoder-decoder
encoder = EncoderRNN(len(dataset.source_vectorizer.vocab), args.source_embedding_size).to(args.device)
decoder = DecoderRNN(args.target_embedding_size, len(dataset.target_vectorizer.vocab)).to(args.device)
                     
# seed
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

# loss, optimizer, scheduler
loss_func = torch.nn.NLLLoss(ignore_index=dataset.target_vectorizer.vocab.pad_idx)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate)

#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

# progress bars
epoch_bar = tqdm(desc='epochs', total=args.num_epochs, position=0)
dataset.set_split('train')
train_bar = tqdm(desc='train', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='val', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)

# train state tracker
train_state = {"train_loss": [],
               "train_acc": [],
               "val_loss": [],
               "val_acc": [],}



try:
    for epoch_index in range(args.num_epochs):
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size)
        running_loss = running_acc = 0.0
        
        encoder.train()
        decoder.train()
    
        for batch_index, (x, y) in enumerate(batch_generator):
            encoder.zero_grad()
            decoder.zero_grad()

            if torch.rand(1).item() > args.teach_ratio:
                y_hat, he = encoder(x)
                y_hat, hd = decoder(y, he.detach().clone())
            else:
                y_hat, he = encoder(x)
                y_hat, hd = decoder.generate(y[:,[0]], he.detach().clone(), y.shape[1])

            loss = compute_loss(y_hat, y)
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            acc_t = compute_accuracy(y_hat, y)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)        

        # Iterate over val dataset
        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size)
        running_loss = running_acc = 0.0
        
        encoder.eval()
        decoder.eval()

        for batch_index, (x, y) in enumerate(batch_generator):
            if torch.rand(1).item() > args.teach_ratio:
                y_hat, he = encoder(x)
                y_hat, hd = decoder(y, he.detach().clone())
            else:
                y_hat, he = encoder(x)
                y_hat, hd = decoder.generate(y[:,[0]], he.detach().clone(), y.shape[1])

            loss = compute_loss(y_hat, y)
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            acc_t = compute_accuracy(y_hat, y)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)   

        if early_stop(train_state, decoder):
            print("Early stopping")
            break
        # scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")

epochs:   0%|          | 0/100 [00:00<?, ?it/s]

train:   0%|          | 0/71 [00:00<?, ?it/s]

val:   0%|          | 0/15 [00:00<?, ?it/s]

# Testing

In [161]:
#decoder.load_state_dict(torch.load(args.model_filename))


loss_func = torch.nn.NLLLoss()

dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size)

running_loss = 0.
running_acc = 0.

encoder.eval()
decoder.eval()
for batch_index, (x, y) in enumerate(batch_generator):
    y_hat, he = encoder(x)
    y_hat, hd = decoder(y, he)
    
    # compute the loss
    loss = compute_loss(y_hat, y)
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_hat, y)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

print(f"Test loss: {running_loss:.4f}")
print(f"Test Accuracy: {running_acc:.4f}")

Test loss: 8.1676
Test Accuracy: 11.6598


In [162]:
sent = "i love you ."


def translate(sent):
    x = source_vectorizer.vectorize(sent.split())
    y_hat, hidden = encoder(x)
    hidden = hidden.unsqueeze(0)
    y = torch.tensor([target_vocab.begin_seq_idx]).unsqueeze(0)
    output, _ = decoder.generate(y, hidden=hidden, size=100)
    output = output.squeeze().argmax(dim=1)
    words = []
    for i in output:
        i = i.item()
        if i == target_vocab.end_seq_idx:
            break
        if i == target_vocab.begin_seq_idx:
            continue
        words.append(target_vocab[i])
    return " ".join(words)


for sent in df.source_language:
    print(sent, translate(sent))










he 's the cutest boy in town . je suis suis . .
he 's a nonsmoker . je suis suis . .
he 's smarter than me . je suis suis . .
he 's a lovely young man . je suis suis . .
he 's three years older than me . je suis suis . .
he 's washing your car . je suis suis . .
he 's your typical workaholic . je suis suis . .
he 's waiting for you at home . je suis suis . .
he 's a historian . je suis suis . .
he 's studying . je suis suis . .
he 's down to earth . je suis suis . .
he 's starting to feel desperate . je suis suis . .
he 's sketching an apple . je suis suis . .
he 's worried about the result . je suis suis . .
he 's out of town . je suis suis . .
he 's a freelance journalist . je suis suis . .
he 's afraid of snakes . je suis suis . .
he 's nothing more than a common thug . je suis suis . .
he 's a journalist . je suis suis . .
he 's waiting for the train to leave . je suis suis . .
he 's not available . je suis suis . .
he 's very young . he 's much younger than tom . je suis suis . .


KeyboardInterrupt: 

# Evaluation