In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math

import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# START/END OF SENT TOKENS
SOS_token = 0
EOS_token = 1

# MAX SENTENCE LEN
MAX_LENGTH = 10

# HIDDEN DIM OF RNN
hidden_size = 256

# LIST OF PHRASES WE WANT SENTENCES TO START WITH
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


In [3]:
# USED TO DEFINE A LANGUAGE DICTIONARY W/ OTHER NECESSARY COMPONENTS
class Lang:
    def __init__(self, name):
        self.name = name
        # WORD INDEX REP, WORD COUNT, INDEX WORD REP
        self.word2index = {}
        self.word2count = {}
        # EVERY LANG NEEDS START/END TOKEN
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        # CHECK IF WORD IN DICT ALREADY, IF NOT INSERT
        # ELSE WE JUST UPDATE THE WORD COUNT
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [5]:
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse input/output order, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
def filterPair(p):
    # REMOVE PAIRS WHERE INPUT/OUTPUT TOO LONG AND ONLY USE
    # THE START PHRASES WE WANTED 
    # print(p)
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH #and \
        # p[1].startswith(eng_prefixes)

In [8]:
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [9]:
def prepareData(lang1, lang2, reverse=False):
    # GET OUR LANGUAGE DICTS AND THE SENTENCES
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    # REMOVE TOO LONG SENTENCES, SENTENCES W/ INCORRECT START
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    # FILL OUR DICT
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [10]:
# GET OUR INPUT/OUTPUT DICT AND THE USABLE SENTENCES
# input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
input_lang, output_lang, pairs = prepareData('sindarin', 'eng', True)
# PRINT RANDOM SENTENCE EXAMPLE
# print(random.choice(pairs))

random.shuffle(pairs)

eng_arr_len = [0,0,0,0,0,0,0,0,0,0,0]
sind_arr_len = [0,0,0,0,0,0,0,0,0,0,0]
for pair in pairs:
    eng_arr_len[len(pair[0].split())] += 1
    sind_arr_len[len(pair[1].split())] += 1

print(eng_arr_len)
print(sind_arr_len)
print(random.choice(pairs))

train_pairs = pairs[:round((len(pairs) * .9))]
test_pairs = pairs[round((len(pairs) * .9)):]
pairs = train_pairs

Reading lines...
Read 4004 sentence pairs
Trimmed to 3785 sentence pairs
Counting words...
Counted words:
eng 1702
sindarin 2260
[0, 107, 445, 704, 614, 682, 556, 391, 179, 107, 0]
[0, 285, 1036, 989, 671, 396, 196, 137, 57, 18, 0]
['i want water', 'aniron nen']


In [11]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        # EMBED OUR INPUT TO THE HIDDEN DIMENSION SPACE
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        # FEED OUR EMBEDDED INPUT TO AN RNN TO BUILD A 
        # SEQ REPRESENTATION OF THE EMBEDDED INPUT
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        # USED TO INIT THE HIDDEN INPUT
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [12]:
# THIS IS JUST A STANDARD RNN DECODER EXAMPLE
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        # EMBED OUR INPUT TO THE HIDDEN DIMENSION SPACE
        output = self.embedding(input).view(1, 1, -1)
        # ADD A NON-LINEARITY TO THE EMBEDDED INPUT
        output = F.relu(output)
        # FEED OUR EMBEDDED INPUT TO AN RNN TO BUILD A 
        # SEQ REPRESENTATION OF THE EMBEDDED INPUT
        output, hidden = self.gru(output, hidden)
        # PREDICT OUR SENTENCE BASED ON THE END OUTPUT
        # OF THE RNN
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        # USED TO INIT THE HIDDEN INPUT
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [13]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        # EMBED OUR INPUT TO THE HIDDEN DIMENSION SPACE
        embedded = self.embedding(input).view(1, 1, -1)
        # ADD SOME DROPOUT TO THE EMBEDDED SPACE TO PREVENT
        # OVERFITTING THAT MAY OCCUR DURING TRAINING
        embedded = self.dropout(embedded)
        
        # COMBINE THE HIDDEN STATE AND THE INPUT AND LEARN SOME
        # ATTN FOR THE INPUT, CONVERT TO WEIGHTS
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # APPLY WEIGHT TO ENCODER INFO SO WE FOCUS ON CORRECT
        # AREA OF SENTENCE
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        # COMBINE OUR ATTN EMBEDDING W/ EMBEDDED INPUT
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        # ADD A NON-LINEARITY TO THE ATTN SENTENCE
        output = F.relu(output)
        # FEED OUR ATTN SENTENCE TO AN RNN TO BUILD A 
        # SEQ REPRESENTATION OF THE OUTPUT
        output, hidden = self.gru(output, hidden)
        
        # PREDICT OUR SENTENCE BASED ON THE END OUTPUT
        # OF THE RNN
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        # USED TO INIT THE HIDDEN INPUT
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [14]:
# TOKENIZE SENTENCE BASED ON DICT INDEX
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

In [15]:
# CREATE TENSOR OF SENTENCE TOKENS
def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [16]:
# CREATE INPUT AND OUTPUT TENSORS
def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [17]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    # USE TO DECIDE IF WE TAKE OUR PREDICTION OR THE GROUND TRUTH
    teacher_forcing_ratio = 0.25
    # INIT HIDDEN STATE OF ENCODER
    encoder_hidden = encoder.initHidden()
    
    # SET TENSORS TO 0 FOR ENCODER/DECORDER
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # INPUT/OUTPUT SENTENCE LEN
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    # TENSOR TO HOLD ENCORDER REP OF SENTENCE
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    # FOR EACH WORD OF ENCODER BUILD REP
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # ADD START TOK TO DECODER INPUT
    decoder_input = torch.tensor([[SOS_token]], device=device)

    # USE THE LEARNED REP OF ENCODER FOR DECODER
    decoder_hidden = encoder_hidden

    # SEE IF WE SHOULD USE GROUND TRUTH OR NOT
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # PREDICT EACH WORD OF SENTENCE BASED ON PREV INPUT TERM
    # USE THE HIDDEN STATE OF ENCODER & THE REPRESENTATION 
    # OF EACH TERM FROM ENCODER
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    # UDPATE MODEL
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [18]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [19]:
def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [20]:
plt.switch_backend('agg')

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [21]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # DEFINE OPTIMIZERS FOR ENCODER & DECODER
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # GET RANDOM TRAINING SENTENCES
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    # NLL LOSS
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        # TRAIN ON SENTENCE
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [22]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [23]:
print(type(test_pairs[0][1]))

pair = test_pairs[0]
print(type(pair[1]))

<class 'str'>
<class 'str'>


In [24]:
def evaluateRandomly(encoder, decoder, n=10, showExamples=True):
    predictions_words = []
    references_words = []
    predictions = []
    references = []
    for i in range(n):
        # pair = random.choice(pairs)
        pair = test_pairs[i]
        if showExamples:
            print('>', pair[0])
            print('=', pair[1])
        references.append(pair[1])
        references_words.append(pair[1].split())
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        predictions_words.append(output_words[:-1])
        output_sentence = ' '.join(output_words[:-1])
        predictions.append(output_sentence)
        if showExamples:
            print(output_words)
            print('<', output_sentence)
            print('')
        

    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
    from evaluate import load
    # references = references_words
    candidates = predictions_words
    cherrychen = SmoothingFunction()
    bleu_4_sum_score = corpus_bleu(references_words, candidates, smoothing_function=cherrychen.method7)
    bleu_3_sum_score = corpus_bleu(references_words, candidates, weights=(0.33, 0.33, 0.33, 0), smoothing_function=cherrychen.method7)
    bleu_2_sum_score = corpus_bleu(references_words, candidates, weights=(0.5, 0.5, 0, 0), smoothing_function=cherrychen.method7)
    bleu_1_score = corpus_bleu(references_words, candidates, weights=(1, 0, 0, 0), smoothing_function=cherrychen.method7)
    bleu_2_score = corpus_bleu(references_words, candidates, weights=(0, 1, 0, 0), smoothing_function=cherrychen.method7)
    bleu_3_score = corpus_bleu(references_words, candidates, weights=(0, 0, 1, 0), smoothing_function=cherrychen.method7)
    bleu_4_score = corpus_bleu(references_words, candidates, weights=(0, 0, 0, 1), smoothing_function=cherrychen.method7)
    
    # BLEU SUM SCORES
    print("BLEU SUM SCORES:")
    print(bleu_4_sum_score)
    print(bleu_3_sum_score)
    print(bleu_2_sum_score)
    
    # BLEU IND SCORES
    print("BLEU IND SCORES:")
    print(bleu_4_score)
    print(bleu_3_score)
    print(bleu_2_score)
    print(bleu_1_score)


    wer = load("wer")
    wer_score = wer.compute(predictions=predictions, references=references)
    print("WER SCORE:")
    print(wer_score)

In [25]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [26]:
def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)

In [27]:
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

1m 35s (- 22m 12s) (5000 6%) 3.7931
3m 14s (- 21m 2s) (10000 13%) 2.9493
4m 56s (- 19m 44s) (15000 20%) 2.2029
6m 40s (- 18m 20s) (20000 26%) 1.6652
8m 25s (- 16m 50s) (25000 33%) 1.2103
10m 10s (- 15m 15s) (30000 40%) 0.9465
11m 56s (- 13m 39s) (35000 46%) 0.7247
13m 43s (- 12m 0s) (40000 53%) 0.5591
15m 31s (- 10m 21s) (45000 60%) 0.4671
17m 18s (- 8m 39s) (50000 66%) 0.3985
19m 5s (- 6m 56s) (55000 73%) 0.3430
20m 51s (- 5m 12s) (60000 80%) 0.3031
22m 39s (- 3m 29s) (65000 86%) 0.2835
24m 25s (- 1m 44s) (70000 93%) 0.2527
26m 13s (- 0m 0s) (75000 100%) 0.2496


In [28]:
evaluateRandomly(encoder1, attn_decoder1)

> you mean me ?
= ni thelig ?
['ni', 'thelidh', '?', '<EOS>']
< ni thelidh ?

> trap !
= gadas !
['gadas', '!', '<EOS>']
< gadas !

> where do you go ?
= na mhan menidh ?
['na', 'van', 'menig', '?', '<EOS>']
< na van menig ?

> we will not
= avomh
['avof', '<EOS>']
< avof

> may your horse be swift
= aen lagor i roch l n
['no', 'lim', 'i', 'aran', 'no', 'n', '<EOS>']
< no lim i aran no n

> follow the stream one day and go south
= aphado i hirion er arad a pado na charad
['mae', 'ah', 'i', 'edain', 'a', 'ar', '<EOS>']
< mae ah i edain a ar

> i don t understand now everything
=  u chenion hi bain
['avon', 'aniron', 'athen', '<EOS>']
< avon aniron athen

> i promise
= gweston
['im', '<EOS>']
< im

> i m not feeling happy
=  u vathon alu
['avon', 'cared', '<EOS>']
< avon cared

> i am a merchant
= ni vachor
['ni', 'nunadan', '<EOS>']
< ni nunadan



  from .autonotebook import tqdm as notebook_tqdm


BLEU SUM SCORES:
0.0893981108674668
0.1510855991601059
0.24759060481766332
BLEU IND SCORES:
0.019512942920593954
0.05339766550049111
0.14658403492746838
0.4181977090773145
WER SCORE:
0.7142857142857143


In [29]:
evaluateRandomly(encoder1, attn_decoder1, n=len(test_pairs), showExamples=False)

BLEU SUM SCORES:
0.07832875467128161
0.13742066430219577
0.23239602486815544
BLEU IND SCORES:
0.015351845629706175
0.04540115510278304
0.1345190972306758
0.40148881078131676
WER SCORE:
0.562351072279587


In [30]:
# from nltk.translate.bleu_score import corpus_bleu
# references = [[['this', 'is', 'a', 'test'], ['this', 'is' 'test']]]
# candidates = [['this', 'is', 'a', 'test']]
# score = corpus_bleu(references, candidates)
# print(score)

In [31]:
# from evaluate import load
# wer = load("wer")
# wer_score = wer.compute(predictions=["this is a test", "this is a test"], references=["this is a test", "this is test"])
# print(wer_score)

In [32]:
# output_words, attentions = evaluate(
#     encoder1, attn_decoder1, "je suis trop froid .")
# plt.matshow(attentions.numpy())

In [33]:
# evaluateAndShowAttention("elle a cinq ans de moins que moi .")

In [34]:
# evaluateAndShowAttention("elle est trop petit .")

In [35]:
# evaluateAndShowAttention("je ne crains pas de mourir .")

In [36]:
# evaluateAndShowAttention("c est un jeune directeur plein de talent .")