In [70]:
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import warnings
import numpy as np

warnings.filterwarnings("ignore")

Selecting device for calculations

In [71]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [72]:
device

device(type='cuda')

Defining vocabluary and index to token/token to index conversions

In [73]:
TOKENS = list(range(0, 10, 1))
INDEX_TO_TOKEN = ['SOS', 'EOS'] + TOKENS
TOKEN_TO_INDEX = { token : i for i, token in enumerate(INDEX_TO_TOKEN) }

In [74]:
TOKEN_TO_INDEX

{'SOS': 0,
 'EOS': 1,
 0: 2,
 1: 3,
 2: 4,
 3: 5,
 4: 6,
 5: 7,
 6: 8,
 7: 9,
 8: 10,
 9: 11}

In [75]:
TOKENS

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

Generating sequence pairs

In [76]:
MAX_LENGTH = 10

In [77]:
def generate_sequences(n):
    pairs = []

    for _ in range(n):
        x = list(np.random.randint(size=np.random.randint(1, MAX_LENGTH), low=0, high=10))
        y = [v if j == 0 else v + x[0] if v + x[0] < 10 else v + x[0] - 10 for j, v in enumerate(x)]

        pairs.append((x, y))

    return pairs

In [78]:
pairs = generate_sequences(100000)

In [79]:
pairs[0]

([2, 1, 9, 4, 0], [2, 3, 1, 6, 2])

Defining encoder and decoder with attention mechanism

In [80]:
class EncoderRNN(torch.nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size):
        super(EncoderRNN, self).__init__()

        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, embedding_size)

        self.gru = nn.GRU(embedding_size, hidden_size)

    def forward(self, input, hidden):
        embeddings = self.embedding(input).view(1, 1, -1)

        return self.gru(embeddings, hidden)

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [81]:
class AttentionDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttentionDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

Defining util functions for model training

In [82]:
def indexesFromSequence(sequence):
    return [TOKEN_TO_INDEX[t] for t in sequence]

def tensorFromSequence(sequence):
    indexes = indexesFromSequence(sequence)
    indexes.append(TOKEN_TO_INDEX['EOS'])
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSequence(pair[0])
    target_tensor = tensorFromSequence(pair[1])
    return (input_tensor, target_tensor)

In [83]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[TOKEN_TO_INDEX['SOS']]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == TOKEN_TO_INDEX['EOS']:
                break
    
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length

In [84]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / percent
    rs = es - s
    return '%s (- eta: %s)' % (asMinutes(s), asMinutes(rs))

In [85]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
plt.switch_backend('agg')


def showPlot(points):
    plt.figure()
    _, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [86]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]

    criterion = nn.NLLLoss()

    for epoch in range(1, n_iters + 1):
        training_pair = training_pairs[epoch - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_iters),
                                         epoch, epoch / n_iters * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    showPlot(plot_losses)

Training model:

In [145]:
hidden_size = 512
encoder1 = EncoderRNN(len(INDEX_TO_TOKEN), hidden_size, hidden_size).to(device)
decoder1 = AttentionDecoderRNN(hidden_size, len(INDEX_TO_TOKEN), dropout_p=0.1).to(device)
trainIters(encoder1, decoder1, 105000, print_every=5000)

KeyboardInterrupt: 

Defining utils for model test on new inputs

In [124]:
def predict_y_sequence(encoder, decoder, x_sequence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSequence(x_sequence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[TOKEN_TO_INDEX['SOS']]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        y_sequence = []

        for di in range(max_length):
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, 
                decoder_hidden, 
                encoder_outputs)
            
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == TOKEN_TO_INDEX['EOS']:
                break
            else:
                y_sequence.append(INDEX_TO_TOKEN[topi.item()])

            decoder_input = topi.squeeze().detach()

        return y_sequence

Checking model on new inputs:

In [140]:
def evaluateRandomly(encoder, decoder, n=10000):
    test_pairs = generate_sequences(n)

    refs = []
    hyps = []
    for pair in test_pairs:
        predicted_y_sequence = predict_y_sequence(encoder, decoder, pair[0])
        
        print('>', pair[0])
        print('=', pair[1])
        print('<', predicted_y_sequence)
        print('')

        refs.append([pair[1]])
        hyps.append(predicted_y_sequence)
    
    return refs, hyps

In [141]:
refs, hyps = evaluateRandomly(encoder1, decoder1)

> [0, 6, 4, 5, 9, 3, 0, 1]
= [0, 6, 4, 5, 9, 3, 0, 1]
< [0, 6, 4, 5, 9, 3, 0, 1]

> [3, 8, 1, 7, 3]
= [3, 1, 4, 0, 6]
< [3, 1, 4, 0, 6]

> [5, 6, 7, 5, 0, 3]
= [5, 1, 2, 0, 5, 8]
< [5, 1, 2, 0, 5, 8]

> [1, 2, 7, 6, 5, 7, 1]
= [1, 3, 8, 7, 6, 8, 2]
< [1, 3, 8, 7, 6, 8, 2]

> [1, 9, 4, 1, 7, 3, 0]
= [1, 0, 5, 2, 8, 4, 1]
< [1, 0, 5, 2, 8, 4, 1]

> [3, 9]
= [3, 2]
< [3, 2]

> [3, 7, 8, 5, 6, 7, 4, 0]
= [3, 0, 1, 8, 9, 0, 7, 3]
< [3, 0, 1, 8, 9, 0, 3, 3]

> [2, 4, 1, 3, 3, 0]
= [2, 6, 3, 5, 5, 2]
< [2, 6, 3, 5, 5, 2]

> [6, 4]
= [6, 0]
< [6, 0]

> [5, 6, 7, 4, 1, 7]
= [5, 1, 2, 9, 6, 2]
< [5, 1, 2, 9, 6, 2]

> [8]
= [8]
< [8]

> [7, 8, 7, 8, 8, 9]
= [7, 5, 4, 5, 5, 6]
< [7, 5, 4, 5, 5, 6]

> [5, 2, 4, 8, 2]
= [5, 7, 9, 3, 7]
< [5, 7, 9, 3, 7]

> [5, 1, 8, 0, 2, 6, 1]
= [5, 6, 3, 5, 7, 1, 6]
< [5, 6, 5, 5, 7, 7, 6]

> [4, 5, 4, 6]
= [4, 9, 8, 0]
< [4, 9, 8, 0]

> [7, 8, 2, 2, 6]
= [7, 5, 9, 9, 3]
< [7, 5, 9, 9, 3]

> [9]
= [9]
< [9]

> [7, 6, 2, 2, 9, 5, 8, 7, 6]
= [7, 3, 9, 9, 6, 2, 5, 4,

In [150]:
refs[100]

[[5, 9, 9, 2, 2, 0, 1]]

In [151]:
hyps[100]

[5, 9, 9, 2, 2, 0, 1]

In [152]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

corpus_bleu_score = corpus_bleu(refs, hyps)
print("Final BLEU Score:", corpus_bleu_score)

Final BLEU Score: 0.6796450709441539
