# 1 Translation Systems
We provide two files:  
• Train.tsv (the training data) contains a parallel text: each line contains six tab-separated expressions, with each column expressing the same meaning in a different language. The languages are:  
Chinese, English, Spanish, Hindi, Japanese, and Norwegian. Text in the same tab-separated column is
always in the same language.  
  
• Test.tsv (the test data) is in a similar format, but in each row, four of the tab-separated columns
are empty, and one of them contains a ? symbol. The other not-empty column contains an expression
in that column’s language. For example, the first line of this file contains text in the first column (a
Chinese expression) and ? in the third column (which is Spanish), indicating that this text should be
translated into Spanish.  
  
Your task is to build 30 translation systems: one for every ordered pair of the six languages represented
in the training data. The design is up to you, and you may use any resources that you can find to complete
the task. 

In [161]:
import torch
import random
import pandas as pd
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [162]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [163]:
# read the training data and test data and store them in a dataframe
df = pd.read_csv("./data/Train.tsv", sep="\t", names=["chinese", "english", "spanish", "hindi", "japanese", "norwegian"])

In [164]:
df.head()

Unnamed: 0,chinese,english,spanish,hindi,japanese,norwegian
0,零,zero,cero,शून्य,零,
1,一,one,uno,एक,一,en
2,二,two,dos,दो,二,to
3,三,three,tres,तीन,三,tre
4,四,four,cuatro,चार,四,fire


In [165]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

len(train_df), len(val_df)

(15740, 3936)

In [166]:
train_df.spanish

13649    seiscientos setenta y tres mil ochocientos nov...
9913     cuatrocientos setenta y tres mil quinientos se...
12752    seiscientos veintisiete mil quinientos setenta...
2132                       ochenta y siete mil ciento diez
14195    setecientos cuatro mil doscientos treinta y cinco
                               ...                        
11284     quinientos cuarenta y cinco mil ochenta y cuatro
11964    quinientos ochenta y tres mil seiscientos sete...
5390     doscientos treinta y nueve mil quinientos sese...
860               treinta y uno mil ochocientos diecisiete
15795    setecientos ochenta y nueve mil doscientos tre...
Name: spanish, Length: 15740, dtype: object

In [193]:
# let's first try to translate from chinese to english
# we will use the chinese column as the input and english column as the output
# tokenise the input and output and create a vocabulary for each
# we will use the vocabulary to convert the input and output to a sequence of integers
# we will use the sequence of integers as the input and output to the model
# we will use the vocabulary to convert the output of the model to a sequence of words

# the tokenisation of chinese will be done on basis of each chinese character (assuming that's the correct way)
generalised_vocab = set()

# create a chinese vocabulary
for chinese_number in df["chinese"]:
    for character in chinese_number:
        generalised_vocab.add(character)

# create an english vocabulary
for english_number in df["english"]:
    for word in english_number.split():
        if "-" in word:
            for sub_word in word.split("-"):
                generalised_vocab.add(sub_word)
        else:
            generalised_vocab.add(word)

# create a spanish vocabulary
for spanish_number in df["spanish"]:
    for word in spanish_number.split():
        generalised_vocab.add(word)

# create a hindi vocabulary
for hindi_number in df["hindi"]:
    for word in hindi_number.split():
        generalised_vocab.add(word)

# create a japanese vocabulary
for japanese_number in df["japanese"]:
    for word in japanese_number.split():
        generalised_vocab.add(word)

# create a norwegian vocabulary
for norwegian_number in df["norwegian"]:
    if isinstance(norwegian_number, float):
        continue
    for word in norwegian_number.split():
        generalised_vocab.add(word)

token2idx = {ch: idx+32 for idx, ch in enumerate(generalised_vocab)}
token2idx["<SOS>"] = 0
token2idx["<EOS>"] = 1
token2idx["<EN><CN>"] = 2
token2idx["<EN><ES>"] = 3
token2idx["<EN><HI>"] = 4
token2idx["<EN><JP>"] = 5
token2idx["<EN><NO>"] = 6
token2idx["<CN><EN>"] = 7
token2idx["CN><ES>"] = 8
token2idx["<CN><HI>"] = 9
token2idx["<CN><JP>"] = 10
token2idx["<CN><NO>"] = 11
token2idx["<ES><EN>"] = 12
token2idx["<ES><CN>"] = 13
token2idx["<ES><HI>"] = 14
token2idx["<ES><JP>"] = 15
token2idx["<ES><NO>"] = 16
token2idx["<HI><EN>"] = 17
token2idx["<HI><CN>"] = 18
token2idx["<HI><ES>"] = 19
token2idx["<HI><JP>"] = 20
token2idx["<HI><NO>"] = 21
token2idx["<JP><EN>"] = 22
token2idx["<JP><CN>"] = 23
token2idx["<JP><ES>"] = 24
token2idx["<JP><HI>"] = 25
token2idx["<JP><NO>"] = 26
token2idx["<NO><EN>"] = 27
token2idx["<NO><CN>"] = 28
token2idx["<NO><ES>"] = 29
token2idx["<NO><HI>"] = 30
token2idx["<NO><JP>"] = 31



idx2token = {idx: ch for ch, idx in token2idx.items()}

# add special tokens to the vocabulary
generalised_vocab.add("<SOS>")
generalised_vocab.add("<EOS>")
generalised_vocab.add("<EN><CN>")
generalised_vocab.add("<EN><ES>")
generalised_vocab.add("<EN><HI>")
generalised_vocab.add("<EN><JP>")
generalised_vocab.add("<EN><NO>")
generalised_vocab.add("<CN><EN>")
generalised_vocab.add("<CN><ES>")
generalised_vocab.add("<CN><HI>")
generalised_vocab.add("<CN><JP>")
generalised_vocab.add("<CN><NO>")
generalised_vocab.add("<ES><EN>")
generalised_vocab.add("<ES><CN>")
generalised_vocab.add("<ES><HI>")
generalised_vocab.add("<ES><JP>")
generalised_vocab.add("<ES><NO>")
generalised_vocab.add("<HI><EN>")
generalised_vocab.add("<HI><CN>")
generalised_vocab.add("<HI><ES>")
generalised_vocab.add("<HI><JP>")
generalised_vocab.add("<HI><NO>")
generalised_vocab.add("<JP><EN>")
generalised_vocab.add("<JP><CN>")
generalised_vocab.add("<JP><ES>")
generalised_vocab.add("<JP><HI>")
generalised_vocab.add("<JP><NO>")
generalised_vocab.add("<NO><EN>")
generalised_vocab.add("<NO><CN>")
generalised_vocab.add("<NO><ES>")
generalised_vocab.add("<NO><HI>")
generalised_vocab.add("<NO><JP>")


In [168]:
def en_to_tensors(english_sentence, add=None):
    token_ids = []
    if add:
        token_ids.append(token2idx[add])
    for token in english_sentence.split():
        if "-" in token:
            for sub_word in token.split("-"):
                token_ids.append(token2idx[sub_word])
        else:
            token_ids.append(token2idx[token])
    token_ids.append(token2idx["<EOS>"])
    return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1, 1)

def cn_to_tensors(chinese_sentence, add=None):
    token_ids = []
    if add:
        token_ids.append(token2idx[add])
    for token in chinese_sentence:
        token_ids.append(token2idx[token])
    token_ids.append(token2idx["<EOS>"])
    return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1, 1)

def es_to_tensors(spanish_sentence, add=None):
    token_ids = []
    if add:
        token_ids.append(token2idx[add])
    for token in spanish_sentence.split():
        token_ids.append(token2idx[token])
    token_ids.append(token2idx["<EOS>"])
    return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1, 1)

def hi_to_tensors(hindi_sentence, add=None):
    token_ids = []
    if add:
        token_ids.append(token2idx[add])
    for token in hindi_sentence.split():
        token_ids.append(token2idx[token])
    token_ids.append(token2idx["<EOS>"])
    return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1, 1)

def jp_to_tensors(japanese_sentence, add=None):
    token_ids = []
    if add:
        token_ids.append(token2idx[add])
    for token in japanese_sentence.split():
        token_ids.append(token2idx[token])
    token_ids.append(token2idx["<EOS>"])
    return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1, 1)

def no_to_tensors(norwegian_sentence, add=None):
    token_ids = []
    if add:
        token_ids.append(token2idx[add])
    for token in norwegian_sentence.split():
        token_ids.append(token2idx[token])
    token_ids.append(token2idx["<EOS>"])
    return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1, 1)

In [169]:
# english translation pairs
en_cn_pairs = [(en, cn) for en, cn in zip(train_df["english"], train_df["chinese"])]
en_es_pairs = [(en, es) for en, es in zip(train_df["english"], train_df["spanish"])]
en_hi_pairs = [(en, hi) for en, hi in zip(train_df["english"], train_df["hindi"])]
en_jp_pairs = [(en, jp) for en, jp in zip(train_df["english"], train_df["japanese"])]
en_no_pairs = [(en, no) for en, no in zip(train_df["english"], train_df["norwegian"]) if isinstance(no, str)]

# chinese translation pairs
cn_en_pairs = [(cn, en) for cn, en in zip(train_df["chinese"], train_df["english"])]
cn_es_pairs = [(cn, es) for cn, es in zip(train_df["chinese"], train_df["spanish"])]
cn_hi_pairs = [(cn, hi) for cn, hi in zip(train_df["chinese"], train_df["hindi"])]
cn_jp_pairs = [(cn, jp) for cn, jp in zip(train_df["chinese"], train_df["japanese"])]
cn_no_pairs = [(cn, no) for cn, no in zip(train_df["chinese"], train_df["norwegian"]) if isinstance(no, str)]

# spanish translation pairs
es_en_pairs = [(es, en) for es, en in zip(train_df["spanish"], train_df["english"])]
es_cn_pairs = [(es, cn) for es, cn in zip(train_df["spanish"], train_df["chinese"])]
es_hi_pairs = [(es, hi) for es, hi in zip(train_df["spanish"], train_df["hindi"])]
es_jp_pairs = [(es, jp) for es, jp in zip(train_df["spanish"], train_df["japanese"])]
es_no_pairs = [(es, no) for es, no in zip(train_df["spanish"], train_df["norwegian"]) if isinstance(no, str)]

# hindi translation pairs
hi_en_pairs = [(hi, en) for hi, en in zip(train_df["hindi"], train_df["english"])]
hi_cn_pairs = [(hi, cn) for hi, cn in zip(train_df["hindi"], train_df["chinese"])]
hi_es_pairs = [(hi, es) for hi, es in zip(train_df["hindi"], train_df["spanish"])]
hi_jp_pairs = [(hi, jp) for hi, jp in zip(train_df["hindi"], train_df["japanese"])]
hi_no_pairs = [(hi, no) for hi, no in zip(train_df["hindi"], train_df["norwegian"]) if isinstance(no, str)]

# japanese translation pairs
jp_en_pairs = [(jp, en) for jp, en in zip(train_df["japanese"], train_df["english"])]
jp_cn_pairs = [(jp, cn) for jp, cn in zip(train_df["japanese"], train_df["chinese"])]
jp_es_pairs = [(jp, es) for jp, es in zip(train_df["japanese"], train_df["spanish"])]
jp_hi_pairs = [(jp, hi) for jp, hi in zip(train_df["japanese"], train_df["hindi"])]
jp_no_pairs = [(jp, no) for jp, no in zip(train_df["japanese"], train_df["norwegian"]) if isinstance(no, str)]

# norwegian translation pairs
no_en_pairs = [(no, en) for no, en in zip(train_df["norwegian"], train_df["english"]) if isinstance(no, str)]
no_cn_pairs = [(no, cn) for no, cn in zip(train_df["norwegian"], train_df["chinese"]) if isinstance(no, str)]
no_es_pairs = [(no, es) for no, es in zip(train_df["norwegian"], train_df["spanish"]) if isinstance(no, str)]
no_hi_pairs = [(no, hi) for no, hi in zip(train_df["norwegian"], train_df["hindi"]) if isinstance(no, str)]
no_jp_pairs = [(no, jp) for no, jp in zip(train_df["norwegian"], train_df["japanese"]) if isinstance(no, str)]


featurised_pairs = []
for i in range(len(en_no_pairs)):
    featurised_pairs.append((en_to_tensors(en_cn_pairs[i][0], add="<EN><CN>"), cn_to_tensors(en_cn_pairs[i][1])))
    featurised_pairs.append((en_to_tensors(en_es_pairs[i][0], add="<EN><ES>"), es_to_tensors(en_es_pairs[i][1])))
    featurised_pairs.append((en_to_tensors(en_hi_pairs[i][0], add="<EN><HI>"), hi_to_tensors(en_hi_pairs[i][1])))
    featurised_pairs.append((en_to_tensors(en_jp_pairs[i][0], add="<EN><JP>"), jp_to_tensors(en_jp_pairs[i][1])))
    featurised_pairs.append((en_to_tensors(en_no_pairs[i][0], add="<EN><NO>"), no_to_tensors(en_no_pairs[i][1])))
    featurised_pairs.append((cn_to_tensors(cn_en_pairs[i][0], add="<CN><EN>"), en_to_tensors(cn_en_pairs[i][1])))
    featurised_pairs.append((cn_to_tensors(cn_es_pairs[i][0], add="<CN><ES>"), es_to_tensors(cn_es_pairs[i][1])))
    featurised_pairs.append((cn_to_tensors(cn_hi_pairs[i][0], add="<CN><HI>"), hi_to_tensors(cn_hi_pairs[i][1])))
    featurised_pairs.append((cn_to_tensors(cn_jp_pairs[i][0], add="<CN><JP>"), jp_to_tensors(cn_jp_pairs[i][1])))
    featurised_pairs.append((cn_to_tensors(cn_no_pairs[i][0], add="<CN><NO>"), no_to_tensors(cn_no_pairs[i][1])))
    featurised_pairs.append((es_to_tensors(es_en_pairs[i][0], add="<ES><EN>"), en_to_tensors(es_en_pairs[i][1])))
    featurised_pairs.append((es_to_tensors(es_cn_pairs[i][0], add="<ES><CN>"), cn_to_tensors(es_cn_pairs[i][1])))
    featurised_pairs.append((es_to_tensors(es_hi_pairs[i][0], add="<ES><HI>"), hi_to_tensors(es_hi_pairs[i][1])))
    featurised_pairs.append((es_to_tensors(es_jp_pairs[i][0], add="<ES><JP>"), jp_to_tensors(es_jp_pairs[i][1])))
    featurised_pairs.append((es_to_tensors(es_no_pairs[i][0], add="<ES><NO>"), no_to_tensors(es_no_pairs[i][1])))
    featurised_pairs.append((hi_to_tensors(hi_en_pairs[i][0], add="<HI><EN>"), en_to_tensors(hi_en_pairs[i][1])))
    featurised_pairs.append((hi_to_tensors(hi_cn_pairs[i][0], add="<HI><CN>"), cn_to_tensors(hi_cn_pairs[i][1])))
    featurised_pairs.append((hi_to_tensors(hi_es_pairs[i][0], add="<HI><ES>"), es_to_tensors(hi_es_pairs[i][1])))
    featurised_pairs.append((hi_to_tensors(hi_jp_pairs[i][0], add="<HI><JP>"), jp_to_tensors(hi_jp_pairs[i][1])))
    featurised_pairs.append((hi_to_tensors(hi_no_pairs[i][0], add="<HI><NO>"), no_to_tensors(hi_no_pairs[i][1])))
    featurised_pairs.append((jp_to_tensors(jp_en_pairs[i][0], add="<JP><EN>"), en_to_tensors(jp_en_pairs[i][1])))
    featurised_pairs.append((jp_to_tensors(jp_cn_pairs[i][0], add="<JP><CN>"), cn_to_tensors(jp_cn_pairs[i][1])))
    featurised_pairs.append((jp_to_tensors(jp_es_pairs[i][0], add="<JP><ES>"), es_to_tensors(jp_es_pairs[i][1])))
    featurised_pairs.append((jp_to_tensors(jp_hi_pairs[i][0], add="<JP><HI>"), hi_to_tensors(jp_hi_pairs[i][1])))
    featurised_pairs.append((jp_to_tensors(jp_no_pairs[i][0], add="<JP><NO>"), no_to_tensors(jp_no_pairs[i][1])))
    featurised_pairs.append((no_to_tensors(no_en_pairs[i][0], add="<NO><EN>"), en_to_tensors(no_en_pairs[i][1])))
    featurised_pairs.append((no_to_tensors(no_cn_pairs[i][0], add="<NO><CN>"), cn_to_tensors(no_cn_pairs[i][1])))
    featurised_pairs.append((no_to_tensors(no_es_pairs[i][0], add="<NO><ES>"), es_to_tensors(no_es_pairs[i][1])))
    featurised_pairs.append((no_to_tensors(no_hi_pairs[i][0], add="<NO><HI>"), hi_to_tensors(no_hi_pairs[i][1])))
    featurised_pairs.append((no_to_tensors(no_jp_pairs[i][0], add="<NO><JP>"), jp_to_tensors(no_jp_pairs[i][1])))

len(featurised_pairs), len(en_cn_pairs)

(78700, 15740)

In [170]:
# compute max length of input and output
max_length_ch = max([len(ch) for ch in train_df["chinese"]])
max_length_en = max([len(en.split()) for en in train_df["english"]])
max_length_es = max([len(es.split()) for es in train_df["spanish"]])
max_length_hi = max([len(hi.split()) for hi in train_df["hindi"]])
max_length_jp = max([len(jp.split()) for jp in train_df["japanese"]])
max_length_no = max([len(no.split()) for no in train_df["norwegian"]])
print(max_length_ch, max_length_en, max_length_es, max_length_hi, max_length_jp, max_length_no)
MAX_LENGTH = 11

11 7 9 7 1 9


## Model Code

In [171]:
# The Encoder

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [172]:
# The Decoder
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

## Training the model

In [173]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[token2idx["<SOS>"]]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == token2idx["<EOS>"]:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [174]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [175]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [(random.choice(featurised_pairs[:10]))
                      for i in range(n_iters)]
    # training_pairs = []
    # for i in range(n_iters):
    #     chosen_pair_en_cn = random.choice(en_cn_pairs[:10])
    #     chosen_pair_en_es = random.choice(en_es_pairs[:10])
    #     training_pairs.append((en_to_tensors(chosen_pair[0], add="<EN><CN>"), cn_to_tensors(chosen_pair[1])))

    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    # showPlot(plot_losses)

In [177]:
# start training
hidden_size = 256
encoder1 = EncoderRNN(len(generalised_vocab), hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, len(generalised_vocab), dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 10000, print_every=1000)

1m 45s (- 15m 53s) (1000 10%) 1.7336
7m 3s (- 28m 15s) (2000 20%) 0.0163
18m 4s (- 42m 10s) (3000 30%) 0.0047
20m 3s (- 30m 5s) (4000 40%) 0.0028
22m 4s (- 22m 4s) (5000 50%) 0.0020
24m 3s (- 16m 2s) (6000 60%) 0.0016
26m 1s (- 11m 9s) (7000 70%) 0.0013
27m 54s (- 6m 58s) (8000 80%) 0.0011
29m 50s (- 3m 18s) (9000 90%) 0.0009
31m 48s (- 0m 0s) (10000 100%) 0.0008


## Evaluation

In [143]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH, target_lang="cn", input_lang="en"):
    with torch.no_grad():
        if input_lang == "english":
            if target_lang == "chinese":
                input_tensor = en_to_tensors(sentence, add="<EN><CN>")
            if target_lang == "spanish":
                input_tensor = en_to_tensors(sentence, add="<EN><ES>")
            if target_lang == "hindi":
                input_tensor = en_to_tensors(sentence, add="<EN><HI>")
            if target_lang == "japanese":
                input_tensor = en_to_tensors(sentence, add="<EN><JP>")
            if target_lang == "norwegian":
                input_tensor = en_to_tensors(sentence, add="<EN><NO>")
        if input_lang == "chinese":
            if target_lang == "english":
                input_tensor = cn_to_tensors(sentence, add="<CN><EN>")
            if target_lang == "spanish":
                input_tensor = cn_to_tensors(sentence, add="<CN><ES>")
            if target_lang == "hindi":
                input_tensor = cn_to_tensors(sentence, add="<CN><HI>")
            if target_lang == "japanese":
                input_tensor = cn_to_tensors(sentence, add="<CN><JP>")
            if target_lang == "norwegian":
                input_tensor = cn_to_tensors(sentence, add="<CN><NO>")
        if input_lang == "spanish":
            if target_lang == "english":
                input_tensor = es_to_tensors(sentence, add="<ES><EN>")
            if target_lang == "chinese":
                input_tensor = es_to_tensors(sentence, add="<ES><CN>")
            if target_lang == "hindi":
                input_tensor = es_to_tensors(sentence, add="<ES><HI>")
            if target_lang == "japanese":
                input_tensor = es_to_tensors(sentence, add="<ES><JP>")
            if target_lang == "norwegian":
                input_tensor = es_to_tensors(sentence, add="<ES><NO>")
        if input_lang == "hindi":
            if target_lang == "english":
                input_tensor = hi_to_tensors(sentence, add="<HI><EN>")
            if target_lang == "chinese":
                input_tensor = hi_to_tensors(sentence, add="<HI><CN>")
            if target_lang == "spanish":
                input_tensor = hi_to_tensors(sentence, add="<HI><ES>")
            if target_lang == "japanese":
                input_tensor = hi_to_tensors(sentence, add="<HI><JP>")
            if target_lang == "norwegian":
                input_tensor = hi_to_tensors(sentence, add="<HI><NO>")
        if input_lang == "japanese":
            if target_lang == "english":
                input_tensor = jp_to_tensors(sentence, add="<JP><EN>")
            if target_lang == "chinese":
                input_tensor = jp_to_tensors(sentence, add="<JP><CN>")
            if target_lang == "spanish":
                input_tensor = jp_to_tensors(sentence, add="<JP><ES>")
            if target_lang == "hindi":
                input_tensor = jp_to_tensors(sentence, add="<JP><HI>")
            if target_lang == "norwegian":
                input_tensor = jp_to_tensors(sentence, add="<JP><NO>")
        if input_lang == "norwegian":
            if target_lang == "english":
                input_tensor = no_to_tensors(sentence, add="<NO><EN>")
            if target_lang == "chinese":
                input_tensor = no_to_tensors(sentence, add="<NO><CN>")
            if target_lang == "spanish":
                input_tensor = no_to_tensors(sentence, add="<NO><ES>")
            if target_lang == "hindi":
                input_tensor = no_to_tensors(sentence, add="<NO><HI>")
            if target_lang == "japanese":
                input_tensor = no_to_tensors(sentence, add="<NO><JP>")

        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[token2idx["<SOS>"]]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == token2idx["<EOS>"]:
                decoded_words.append('<EOS>')
                break
            else:
                # decoded_words.append(cn_idx2char[topi.item()])
                decoded_words.append(idx2token[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [178]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(en_es_pairs[:10])
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0], target_lang="spanish")
        output_sentence = ''.join(output_words)
        print('<', output_sentence)
        print(' ')

In [179]:
evaluateRandomly(encoder1, attn_decoder1)

> eighty-seven thousand one hundred ten
= ochenta y siete mil ciento diez
< seiscientossetentaytresmilquinientossesentaycuatro<EOS>
 
> eight hundred seventy-eight thousand two hundred twenty-four
= ochocientos setenta y ocho mil doscientos veinticuatro
< cuatrocientossetentaytresmilquinientossesentaycuatro<EOS>
 
> eight hundred seventy-eight thousand two hundred twenty-four
= ochocientos setenta y ocho mil doscientos veinticuatro
< cuatrocientossetentaytresmilquinientossesentaycuatro<EOS>
 
> two hundred two thousand two hundred one
= doscientos dos mil doscientos uno
< seiscientossetentaytresmilquinientossesentaycuatro<EOS>
 
> two hundred two thousand two hundred one
= doscientos dos mil doscientos uno
< seiscientossetentaytresmilquinientossesentaycuatro<EOS>
 
> two hundred two thousand two hundred one
= doscientos dos mil doscientos uno
< seiscientossetentaytresmilquinientossesentaycuatro<EOS>
 
> six hundred ninety-seven thousand five hundred twenty
= seiscientos noventa y siete

In [185]:
# read test data
test_df = pd.read_csv("./data/Test.tsv",sep="\t", names=["chinese", "english", "spanish", "hindi", "japanese", "norwegian"])

In [191]:
language_list = ["chinese", "english", "spanish", "hindi", "japanese", "norwegian"]
for idx in test_df.index:
    if isinstance(test_df["chinese"][idx], str) and test_df["chinese"][idx]!= "?":
        for language in language_list:
            if language == "chinese":
                continue
            output_words, _ = evaluate(encoder1, attn_decoder1, test_df["chinese"][idx], target_lang=language)
            output_words = [word for word in output_words if word not in ['<SOS>', '<EOS>']]
            output_sentence = ' '.join(output_words)
            test_df[language][idx] = output_sentence
    if isinstance(test_df["english"][idx], str) and test_df["english"][idx]!= "?":
        for language in language_list:
            if language == "english":
                continue
            output_words, _ = evaluate(encoder1, attn_decoder1, test_df["english"][idx], target_lang=language)
            output_words = [word for word in output_words if word not in ['<SOS>', '<EOS>']]
            output_sentence = ' '.join(output_words)
            test_df[language][idx] = output_sentence
    
    if isinstance(test_df["spanish"][idx], str) and test_df["spanish"][idx]!= "?":
        for language in language_list:
            if language == "spanish":
                continue
            output_words, _ = evaluate(encoder1, attn_decoder1, test_df["spanish"][idx], target_lang=language)
            output_words = [word for word in output_words if word not in ['<SOS>', '<EOS>']]
            output_sentence = ' '.join(output_words)
            test_df[language][idx] = output_sentence
    
    if isinstance(test_df["hindi"][idx], str) and test_df["hindi"][idx]!= "?":
        for language in language_list:
            if language == "hindi":
                continue
            output_words, _ = evaluate(encoder1, attn_decoder1, test_df["hindi"][idx], target_lang=language)
            output_words = [word for word in output_words if word not in ['<SOS>', '<EOS>']]
            output_sentence = ' '.join(output_words)
            test_df[language][idx] = output_sentence

    if isinstance(test_df["japanese"][idx], str) and test_df["japanese"][idx]!= "?":
        for language in language_list:
            if language == "japanese":
                continue
            output_words, _ = evaluate(encoder1, attn_decoder1, test_df["japanese"][idx], target_lang=language)
            output_words = [word for word in output_words if word not in ['<SOS>', '<EOS>']]
            output_sentence = ' '.join(output_words)
            test_df[language][idx] = output_sentence

    if isinstance(test_df["norwegian"][idx], str) and test_df["norwegian"][idx]!= "?":
        for language in language_list:
            if language == "norwegian":
                continue
            output_words, _ = evaluate(encoder1, attn_decoder1, test_df["norwegian"][idx], target_lang=language)
            output_words = [word for word in output_words if word not in ['<SOS>', '<EOS>']]
            output_sentence = ' '.join(output_words)
            test_df[language][idx] = output_sentence

五十二
六百二十六
一千零三十九
一千零八十二
一千八百四十一
二千一百一十一
三千一百四十二
三千七百
三千七百一十一
三千七百七十一
四千一百四十七
四千二百六十二
四千六百五十八
四千八百七十五
五千一百四十九
五千六百一十四
五千八百四十五
五千九百八十二
六千零六十八
七千零一十五
七千零一十八
七千三百二十二
七千三百五十八
七千八百一十八
七千八百四十一
七千九百七十五
八千四百四十八
九千三百一十七
九千六百九十四
一万零九百七十八
一万一千二百四十一
一万一千七百八十四
一万二千二百三十一
一万二千二百三十七
一万二千三百零二
一万二千五百四十
一万二千五百四十九
一万二千六百九十四
一万三千一百四十三
一万三千二百九十九
一万三千三百五十五
一万三千四百九十三
一万三千六百三十三
一万四千一百八十七
一万四千五百四十七
一万四千八百三十一
一万五千二百四十二
一万五千五百零三
一万六千三百五十
一万六千九百三十七
一万七千一百六十八
一万七千二百二十二
一万七千二百九十三
一万八千四百九十五
一万八千八百一十三
一万八千八百八十二
一万八千九百七十六
一万九千一百七十
一万九千七百零八
一万九千七百一十八
二万零四百九十
二万零九百零六
二万一千五百四十
二万一千六百零四
二万一千七百二十三
二万一千八百零四
二万二千二百二十四
二万二千五百九十七
二万二千六百三十
二万二千七百五十五
二万三千零三十八
二万三千一百七十三
二万三千二百七十四
二万三千四百九十
二万三千四百九十三
二万三千八百三十
二万四千一百二十一
二万四千一百四十九
二万四千六百五十二
二万四千七百六十七
二万四千七百八十八
二万四千八百八十一
二万五千三百三十四
二万五千三百四十三
二万五千六百零七
二万五千六百二十二
二万五千七百八十一
二万五千九百八十八
二万六千四百零五
二万六千六百一十三
二万六千七百九十三
二万七千三百一十七
二万七千三百五十三
二万七千五百九十
二万七千七百八十三
二万八千零六十三
二万八千三百零三
二万八千六百零三
二万九千零六十九
二万九千三百九十四
二万九千九百四十九
三万零二百二十三
三万零五百七十
三万零六百零七
三万零八百二十三
三万一千二百零一
三万一千七百七十七
三万二千零六十二
三万二千一百二十二
三万二千五百二十
三万三千零九

In [194]:
token2idx["fire"]

17069