# Note

The approach presented below, as well as the task itself, is far from perfect.

It should be considered merely an illustration of the general idea behind a machine translation pipeline.


# Setup

In [63]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import json
import spacy
from nltk.tokenize import WordPunctTokenizer
from collections import Counter

import random
import numpy as np

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR

from tqdm import tqdm as tqdma
from IPython.display import clear_output
from IPython import display
import matplotlib.pyplot as plt
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## OOP

In [64]:
# Reserved vocabulary entries: each special token has a fixed index (0-3)
# and a fixed string form.
sos_idx, eos_idx, unk_idx, pad_idx = range(4)
sos, eos, unk, pad = '<sos>', '<eos>', '<unk>', '<pad>'

In [65]:
class Lang:
    """Vocabulary of one language: token counts and token <-> index maps.

    Indices 0-3 are reserved for the special tokens ``<sos>``, ``<eos>``,
    ``<unk>`` and ``<pad>``; real tokens are numbered from 4 upwards in order
    of first appearance.
    """

    def __init__(self, name, tokenize):
        """Create an empty vocabulary.

        Args:
            name: Label of the language (only stored).
            tokenize: Callable turning a sentence into an iterable of tokens.
        """
        self.name = name
        self.word2count = {}
        self.word2index = {sos: sos_idx, eos: eos_idx, unk: unk_idx, pad: pad_idx}
        self.index2word = {sos_idx: sos, eos_idx: eos, unk_idx: unk, pad_idx: pad}
        # Next free index; counts every token ever registered plus the 4 specials.
        self.n_tokens = 4
        # Number of entries that still own an index once rare tokens are trimmed.
        self.n_tokens_trimmed = self.n_tokens
        self.tokenize = tokenize

    def addSentence(self, sentence):
        """Tokenize ``sentence`` and register every token."""
        for token in self.tokenize(sentence):
            self.addWord(token)

    def addWord(self, token):
        """Count ``token``, assigning it the next free index on first sight.

        A token that is literally one of the reserved special strings keeps
        its fixed index and is not counted (previously this raised KeyError).
        """
        if token in self.word2count:
            self.word2count[token] += 1
        elif token not in self.word2index:
            self.word2count[token] = 1
            self.word2index[token] = self.n_tokens
            # Store the reverse mapping BEFORE bumping the counter so that
            # index2word[word2index[token]] == token (it used to be off by one).
            self.index2word[self.n_tokens] = token
            self.n_tokens += 1
            self.n_tokens_trimmed += 1

    def trimDict(self, min_freq = 3):
        """Map every token seen fewer than ``min_freq`` times to ``<unk>``.

        Only ``word2index`` is rewritten; call ``normalizeWord2Index`` afterwards
        to obtain contiguous indices and a matching ``index2word``.
        """
        for token, count in self.word2count.items():
            # Skip tokens that are already trimmed so that a second call does
            # not decrement n_tokens_trimmed twice for the same token.
            if count < min_freq and self.word2index[token] != unk_idx:
                self.word2index[token] = unk_idx
                self.n_tokens_trimmed -= 1

    def normalizeWord2Index(self):
        """Renumber the surviving tokens contiguously from 4 and rebuild both maps.

        Tokens previously trimmed to ``<unk>`` stay mapped to ``unk_idx`` and get
        no entry in ``index2word``.
        """
        idx = 4
        word2index = {sos: sos_idx, eos: eos_idx, unk: unk_idx, pad: pad_idx}
        index2word = {sos_idx: sos, eos_idx: eos, unk_idx: unk, pad_idx: pad}
        for token in self.word2count:
            if self.word2index[token] != unk_idx:
                word2index[token] = idx
                index2word[idx] = token
                idx += 1
            else:
                word2index[token] = unk_idx
        self.word2index = word2index
        self.index2word = index2word
        # Keep the size counter in sync with the indices actually handed out.
        self.n_tokens_trimmed = idx

# Real data

In [66]:
def _read_jsonl(path, keys):
    """Read a JSON-lines file and return, for each line, the values of ``keys``."""
    rows = []
    # The sentences contain non-ASCII glyphs, so the encoding is stated
    # explicitly instead of relying on the platform default.
    # NOTE(review): assumes the files are UTF-8 encoded - confirm.
    with open(path, encoding = 'utf-8') as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            record = json.loads(line)
            rows.append([record[key] for key in keys])
    return rows


# Train/val entries are [src, dst] pairs; test entries are bare src strings.
train_data = _read_jsonl('/kaggle/input/dataset1/train', ('src', 'dst'))
val_data = _read_jsonl('/kaggle/input/dataset1/val', ('src', 'dst'))
test_data = [row[0] for row in _read_jsonl('/kaggle/input/dataset1/test_no_reference', ('src',))]

print(f'#train: {len(train_data)}, #val: {len(val_data)}, #test: {len(test_data)}')
train_data[0], val_data[0], test_data[0]

#train: 300000, #val: 500, #test: 1000


(['◄▴◓◠▨ ◨▽◠▦◈◬◓▪▼◬▵', '- Intriguing.'],
 ["◘◚ ◞◠▷◫◀◗ ▫◠▨◬◎ ▨◪▦◈◫▦◫ ▫◧▻▱◠◈▪ ◚◪ ◝◂▾▼▷◠◓◈'◬▦ 27:37'◈▴▨◗ ◕◂▱◭ ◀◗◓ ▨▴▢ ◈◠▷◠ ◞▨◂◓◨ ▴◒◗▫▱◪◈◗▵",
  'The hosts regrouped, and Bouchard evened the score again, scoring a goal with a 27-37 man advantage.'],
 '◲▦◠▦◬▦■ ◉◗▢◕◗ ◍◗▱◎ ▽◠▽▪▦◠ ◕▴◉◗▦▼▴ ◀◗◓◉◧▨ ◎▴◞◠▸ ◠▱◈▪▨ ◚◪ ◀◨ ◎◪◞◠▸▱◠◓◬▦ ◀◠▢▪▱◠◓▪ ▻◪▨ ◈◂◞▫◉◠ ◈▴◐◫▱◈◗▵')

In [67]:
# Shared NLTK tokenizer instance; tokenize_dst applies it to normalised target sentences.
tokenizer = WordPunctTokenizer()

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is ``Mn`` (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s, dst = False):
    """Lowercase ``s``, strip accents and keep only ASCII letters, ``!`` and ``?``.

    A space is inserted before ``.``, ``!`` and ``?``; every run of other
    characters (periods included) is then collapsed into a single space.
    The ``dst`` argument is accepted but not used.
    """
    text = s.lower().strip()
    text = unicodeToAscii(text)
    # Detach sentence punctuation from the preceding word.
    text = re.sub(r"([.!?])", r" \1", text)
    # Anything that is not a letter, '!' or '?' becomes a single space.
    text = re.sub(r"[^a-zA-Z!?]+", r" ", text)
    return text.strip()

def split_src_str(s, n = 3):
    """Split ``s`` into chunks of ``n`` characters; the last chunk is right-aligned.

    Chunks are taken left to right while more than ``n`` characters remain.
    The final chunk is always the last ``n`` characters of ``s``, so it overlaps
    the previous chunk when ``len(s)`` is not a multiple of ``n``
    (``"abcde"`` -> ``["abc", "cde"]``). A string no longer than ``n`` is returned
    as a single chunk.

    Args:
        s: String to split.
        n: Chunk length, must be positive.

    Returns:
        List of chunks (``[""]`` for an empty string).

    Raises:
        ValueError: If ``n`` is not positive (the loop used to never terminate).
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")
    chunks = [s[start:start + n] for start in range(0, len(s) - n, n)]
    chunks.append(s[-n:])
    return chunks

def tokenize_src(sent):
    """Tokenize a source sentence into overlapping character chunks.

    Every '▵' is removed, one leading '►' is dropped, and a space is inserted
    before ``.``, ``!`` and ``?``. The sentence is then split on whitespace; each
    word is cut by ``split_src_str`` and followed by a ``' '`` separator token
    (also after the last word).

    Returns:
        List of tokens; an empty list for an empty sentence.
    """
    sent = sent.replace('▵', '')
    # startswith() instead of sent[0]: a sentence that is empty after removing
    # '▵' used to raise IndexError.
    if sent.startswith('►'):
        sent = sent[1:]
    sent = re.sub(r"([.!?])", r" \1", sent)
    res = []
    for word in sent.split():
        res.extend(split_src_str(word))
        res.append(' ')
    return res

def tokenize_dst(sent):
    """Normalise a target sentence with normalizeString, then split it with the shared tokenizer."""
    normalized = normalizeString(sent)
    return tokenizer.tokenize(normalized)

# Show the tokenisation (and token counts) of the first five training pairs.
for i in range(5):
    src_sentence, dst_sentence = train_data[i]
    src_tok = tokenize_src(src_sentence)
    dst_tok = tokenize_dst(dst_sentence)
    print(src_tok, len(src_tok))
    print(dst_tok, len(dst_tok))

['◄▴◓', '◓◠▨', ' ', '◨▽◠', '▦◈◬', '◓▪▼', '▪▼◬', ' '] 8
['intriguing'] 1
['▽◪◎', '◗▦◫', '◫▦◫', ' ', '▫▴▨', '◓◠◓', ' ', '▴▫◎', '◪▱◫', ' ', '◚▴', ' ', '◞◧▦', '◞▾▢', '▱◨▨', ' ', '◒◠◓', '◠◀▪', '▦◈◠', '◈◠▦', ' ', '◫◉◎', '▴▱◫', ' '] 24
['he', 'would', 'need', 'to', 'repeat', 'his', 'vows', 'in', 'the', 'land', 'of', 'the', 'living', 'and', 'drink', 'from', 'the', 'wine', 'of', 'ages'] 20
['◄▴◞', '◠▸▱', '◠◓▪', '▪◎◠', ' ', '◀◫▱', '◫▱◪', ' ', '▼◪◚', '◚◠▻', ' ', '◚▴◓', '▴◎◪', '◈◗▦', ' ', '◎◫', ' ', '?', ' '] 19
['you', 'couldn', 't', 'even', 'answer', 'my', 'texts', '?'] 8
['▯◪', ' ', '▨◠◈', '◈◠◓', ' ', '◞◭◓', '◓◠▫', ' ', '◳◠▻', '◬◳◧', '◳◧◓', ' ', '◞▴▦', '◗▦▨', '▦▨◫', ' ', '?', ' '] 18
['how', 'fast', 'do', 'you', 'go', '?'] 6
['◈◠', ' ', '◧▱◠', '▱◠▦', ' ', '◀◫◓', ' ', '▨◠◉', ' ', '◂▱◠', '▽◈◠', '◈◠▦', ' ', '◀◠▷', '◞◪◈', '◗◳◧', '◧◓■', ' ', '◉◧◐', '▾▦▱', '◨◐▾', ' ', '○▱◎', '◠▦▱', '◠◓◈', '◈◠▦', ' ', '▨◠◉', '◉◠▦', ' ', '▽◠▷', '◨◈◫', '▱▴◓', ' '] 34
['he', 's', 'talking', 'about', 'a', 'few', 'right', 

In [68]:
# # trimming train
# trimmed_train_data = []
# for i in tqdma(range(len(train_data))):
#     src = train_data[i][0]
#     dst = train_data[i][1]
#     src_tok = tokenize_src(src)
#     dst_tok = tokenize_dst(dst)
#     max_len = max(len(src_tok), len(dst_tok))
#     min_len = max(0.01, min(len(src_tok), len(dst_tok)))
#     if max_len <= 5:
#         if max_len / min_len <= 2:
#             trimmed_train_data.append([src, dst])
#     else:
#         if max_len / min_len <= 1.65:
#             trimmed_train_data.append([src, dst])
# print(len(trimmed_train_data))
# train_data = trimmed_train_data

# for i in range(10):
#     print(tokenize_src(train_data[i][0]))
#     print(tokenize_dst(train_data[i][1]))

# Preprocessing

## Making Langs

In [69]:
# Build the source and target vocabularies.
lang_src = Lang("src", tokenize_src)
lang_dst = Lang("dst", tokenize_dst)

# Both the training and the validation sentences contribute to the vocabularies.
for src, dst in tqdma(train_data):
     lang_src.addSentence(src)
     lang_dst.addSentence(dst)
for src, dst in tqdma(val_data):
     lang_src.addSentence(src)
     lang_dst.addSentence(dst)
# Tokens seen only once in train+val are mapped to <unk>.
lang_src.trimDict(min_freq = 2)
lang_dst.trimDict(min_freq = 2)
# Test sentences are added AFTER trimming, so a token that first appears here
# keeps its own index regardless of its frequency; a token already trimmed
# above stays mapped to <unk>.
# NOTE(review): tokens that occur only in the test set are never seen during
# training - confirm that giving them their own index is intended.
for src in tqdma(test_data):
     lang_src.addSentence(src)

# Renumber the surviving tokens contiguously and rebuild index2word.
lang_src.normalizeWord2Index()
lang_dst.normalizeWord2Index()

100%|██████████| 300000/300000 [00:10<00:00, 28668.81it/s]
100%|██████████| 500/500 [00:00<00:00, 10248.36it/s]
100%|██████████| 1000/1000 [00:00<00:00, 23995.84it/s]


In [70]:
print(lang_src.n_tokens, lang_src.n_tokens_trimmed)
print(lang_dst.n_tokens, lang_dst.n_tokens_trimmed)
print(max(lang_src.word2index.values()))

32358 21702
50485 26630
21701


## Indexing

In [71]:
def indexing_src(src, lang_src = lang_src):
    """Convert a source sentence into ``[sos] + token indices + [eos]``.

    Tokens missing from ``lang_src.word2index`` map to ``unk_idx`` instead of
    raising KeyError. The default ``lang_src`` is bound when this function is
    defined, so re-run this cell after rebuilding the vocabulary.
    """
    tokens = tokenize_src(src)
    return [sos_idx] + [lang_src.word2index.get(token, unk_idx) for token in tokens] + [eos_idx]
def indexing_dst(dst, lang_dst = lang_dst):
    """Convert a target sentence into ``[sos] + token indices + [eos]``.

    Tokens missing from ``lang_dst.word2index`` map to ``unk_idx`` instead of
    raising KeyError. The default ``lang_dst`` is bound when this function is
    defined, so re-run this cell after rebuilding the vocabulary.
    """
    tokens = tokenize_dst(dst)
    return [sos_idx] + [lang_dst.word2index.get(token, unk_idx) for token in tokens] + [eos_idx]

def translate_src(indexed_src, lang_src = lang_src):
    """Map source indices back to their tokens via ``lang_src.index2word``."""
    tokens = []
    for index in indexed_src:
        tokens.append(lang_src.index2word[index])
    return tokens
def translate_dst(indexed_dst, lang_dst = lang_dst):
    """Map target indices back to their tokens via ``lang_dst.index2word``."""
    tokens = []
    for index in indexed_dst:
        tokens.append(lang_dst.index2word[index])
    return tokens


In [72]:
src, dst = random.sample(train_data, 1)[0]
indexed_src = indexing_src(src)
indexed_dst = indexing_dst(dst)
translated_src = translate_src(indexed_src)
translated_dst = translate_dst(indexed_dst)
print(src, indexed_src, translated_src)
print(dst, indexed_dst, translated_dst)

◦◈◬◎ ◆◗◀◞◂▦▵ [0, 6456, 420, 6, 2788, 524, 6, 1] ['<sos>', '◦◈◬', '◈◬◎', ' ', '◆◗◀', '◞◂▦', ' ', '<eos>']
Bob Gibson. [0, 5874, 11830, 1] ['<sos>', 'bob', 'gibson', '<eos>']


In [73]:
def indexing_data(data, test = False):
    """Index a dataset and sort it by sentence length.

    Args:
        data: Iterable of ``(src, dst)`` pairs, or - with ``test = True`` - of
            source strings (pairs are also accepted; only ``src`` is used).
        test: When true only the source side is indexed.

    Returns:
        With ``test = False``: list of ``[src_indices, dst_indices]`` sorted by
        (source length, target length). With ``test = True``: list of
        ``src_indices`` sorted by length.

    NOTE(review): sorting discards the original order of ``data``; predictions
    for the test set cannot be matched back to their input lines from the
    result alone - confirm this is acceptable.
    """
    if not test:
        indexed_data = [[indexing_src(src), indexing_dst(dst)] for src, dst in tqdma(data)]
        return sorted(indexed_data, key = lambda pair: (len(pair[0]), len(pair[1])))

    indexed_data = []
    for item in tqdma(data):
        # Decide per item whether it is a bare string or a pair; this avoids
        # turning the whole dataset into a NumPy array just to inspect its shape.
        src = item if isinstance(item, str) else item[0]
        indexed_data.append(indexing_src(src))
    return sorted(indexed_data, key = len)

In [74]:
indexed_train_data = indexing_data(train_data)
indexed_val_data = indexing_data(val_data)
indexed_test_data = indexing_data(test_data, test = True)

100%|██████████| 300000/300000 [00:11<00:00, 25557.98it/s]
100%|██████████| 500/500 [00:00<00:00, 11917.74it/s]
100%|██████████| 1000/1000 [00:00<00:00, 32952.07it/s]


In [75]:
print(indexed_train_data[0])
print(indexed_val_data[0])
print(indexed_test_data[0])

[[0, 2683, 6, 1], [0, 1]]
[[0, 4496, 640, 6, 548, 6, 316, 496, 6, 1], [0, 48, 34, 123, 425, 1]]
[0, 1979, 2997, 6, 1]


## Batches

In [118]:
batch_size = 128

In [77]:
indices = np.sort(np.random.choice(10, size = 10, p = None, replace = False))
indices

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [122]:
def prepare_epoch(indexed_data, batch_size = batch_size, pad_idx = pad_idx, probs = None, N = 300000, factor = 1):
    """Sample ``N`` pairs and pack them into padded batches.

    Indices are drawn without replacement (optionally weighted by ``probs``)
    and then sorted, so the sampled pairs keep the order of ``indexed_data``.
    Consecutive groups of ``batch_size * factor`` pairs form one batch; a
    trailing incomplete group is dropped.

    Args:
        indexed_data: Sequence of ``[src_indices, dst_indices]`` pairs.
        batch_size: Base number of pairs per batch.
        pad_idx: Value used to pad shorter sequences.
        probs: Optional sampling probabilities, one per pair.
        N: Number of pairs to sample; clamped to ``len(indexed_data)``.
        factor: Multiplier applied to ``batch_size``.

    Returns:
        List of ``[batch_src, batch_dst]`` LongTensor pairs, each padded to
        shape (longest sequence in the batch, batch_size * factor).
    """
    # Sampling without replacement cannot draw more items than exist; without
    # the clamp the default N raised ValueError for any smaller dataset.
    N = min(N, len(indexed_data))
    indices = np.sort(np.random.choice(len(indexed_data), size = N, p = probs, replace = False))

    step = batch_size * factor
    batches = []
    for start in range(0, N - step + 1, step):
        chunk = [indexed_data[k] for k in indices[start:start + step]]
        batch_src = pad_sequence([torch.LongTensor(pair[0]) for pair in chunk], padding_value = pad_idx)
        batch_dst = pad_sequence([torch.LongTensor(pair[1]) for pair in chunk], padding_value = pad_idx)
        batches.append([batch_src, batch_dst])
    return batches

In [121]:
batches = prepare_epoch(indexed_train_data)
len(batches)


128


2343

In [117]:
print(batches[0][0].shape)

torch.Size([4, 32])


In [123]:
# Pre-build the batches of every epoch up front.
# NOTE(review): with prepare_epoch's default N = 300000 and the 300000-pair
# training set, every index is drawn and then sorted, so each epoch holds the
# same batches in the same order - confirm that storing n_epochs copies is intended.
n_epochs = 10
data_epochs = []
for i in tqdma(range(n_epochs)):
    data_epochs.append(prepare_epoch(indexed_train_data))

100%|██████████| 10/10 [01:07<00:00,  6.76s/it]


In [81]:
indexed_train_data[1]

[[0, 6240, 6, 1], [0, 293, 1]]

# Models

## Seq2seq with attention, num_layers = 1

In [95]:
class EncoderAttn(nn.Module):
    """Bidirectional GRU encoder for the attention-based seq2seq model.

    Args:
        input_size: Size of the source vocabulary.
        embedding_size: Dimension of the token embeddings.
        hidden_size: Hidden size of each GRU direction.
        num_layers: Number of stacked GRU layers.
        dropout_p: Dropout probability applied to the embeddings.
    """

    def __init__(
            self,
            input_size,
            embedding_size,
            hidden_size,
            num_layers = 1,
            dropout_p = 0.0
            ):

        super(EncoderAttn, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(
            input_size = embedding_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            bidirectional = True
            )
        self.dropout = nn.Dropout(dropout_p)
        # Projects the concatenated forward/backward state to a single state.
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, input):
        """Encode a batch of source sequences.

        Args:
            input: LongTensor of shape (seq_len, batch_size).

        Returns:
            output: (seq_len, batch_size, 2 * hidden_size) states of the last layer.
            hidden: (num_layers, batch_size, hidden_size) projected final states.
        """
        embedded = self.dropout(self.embedding(input))
        # embedded: (seq_len, batch_size, embedding_size)

        output, hidden = self.gru(embedded)
        # output: (seq_len, batch_size, 2 * hidden_size)
        # hidden: (2 * num_layers, batch_size, hidden_size), ordered layer by
        # layer with the forward state before the backward state.
        batch_size = hidden.shape[1]
        hidden = hidden.view(self.num_layers, 2, batch_size, self.hidden_size)
        # Merge both directions of every layer; for num_layers = 1 this equals
        # the former cat(hidden[0:1], hidden[1:2]), which used only layer 0.
        hidden = self.fc_hidden(torch.cat((hidden[:, 0], hidden[:, 1]), dim = 2))
        return output, hidden


In [83]:
src_ex, dst_ex = batches[0][0], batches[0][1]
print(src_ex.shape, dst_ex.shape)

torch.Size([4, 32]) torch.Size([3, 32])


In [84]:
inp_size = lang_src.n_tokens_trimmed
emb_size = 16
hid_size = 24
encoder = EncoderAttn(inp_size, emb_size, hid_size)
output, hidden = encoder(src_ex)
output.shape, hidden.shape

(torch.Size([4, 32, 48]), torch.Size([1, 32, 24]))

In [85]:
class DecoderAttn(nn.Module):
    """Single-step GRU decoder with additive attention over the encoder states.

    Args:
        input_size: Size of the target vocabulary (embedding rows).
        embedding_size: Dimension of the token embeddings.
        hidden_size: Hidden size of the GRU; encoder states must have
            ``2 * hidden_size`` features.
        output_size: Number of output classes.
        num_layers: Number of stacked GRU layers.
        dropout_p: Dropout probability applied to the embeddings.
    """

    def __init__(
            self,
            input_size,
            embedding_size,
            hidden_size,
            output_size,
            num_layers = 1,
            dropout_p = 0.0
            ):

        super(DecoderAttn, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        # The GRU input is the attention context concatenated with the embedding.
        self.gru = nn.GRU(
            embedding_size + hidden_size * 2,
            hidden_size,
            num_layers
            )

        # Scores one (decoder state, encoder state) pair with a scalar.
        self.energy = nn.Linear(hidden_size * 3, 1)
        # Normalises the scores over the source positions (dim 0).
        self.softmax = nn.Softmax(dim = 0)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, encoder_states, hidden):
        """Run one decoding step.

        Args:
            input: LongTensor (batch_size,) with the previous target tokens.
            encoder_states: (seq_length, batch_size, 2 * hidden_size).
            hidden: (num_layers, batch_size, hidden_size) decoder state.

        Returns:
            predictions: (batch_size, output_size) unnormalised scores.
            hidden: (num_layers, batch_size, hidden_size) updated state.

        NOTE(review): padded source positions are not masked, so they receive
        attention weight as well - confirm this is acceptable.
        """
        input = input.unsqueeze(0)
        # input: (1, batch_size)

        embedded = self.dropout(self.embedding(input))
        # embedded: (1, batch_size, embedding_size)

        seq_length = encoder_states.shape[0]
        # Attention is driven by the top layer's state; slicing keeps the shapes
        # valid for num_layers > 1 and is a no-op for num_layers = 1.
        h_reshaped = hidden[-1:].repeat(seq_length, 1, 1)
        # h_reshaped: (seq_length, batch_size, hidden_size)
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim = 2)))
        attention = self.softmax(energy)
        # attention: (seq_length, batch_size, 1)
        attention = attention.permute(1, 2, 0)
        # attention: (batch_size, 1, seq_length)
        encoder_states = encoder_states.permute(1, 0, 2)
        # encoder_states: (batch_size, seq_length, hidden_size * 2)

        context_vector = torch.bmm(attention, encoder_states).permute(1, 0, 2)
        # context_vector: (batch_size, 1, hidden_size * 2) -> (1, batch_size, hidden_size * 2)

        rnn_input = torch.cat((context_vector, embedded), dim = 2)
        output, hidden = self.gru(rnn_input, hidden)
        # output: (1, batch_size, hidden_size)

        predictions = self.fc(output)
        # predictions: (1, batch_size, output_size)

        predictions = predictions.squeeze(0)

        return predictions, hidden

In [86]:
inp_size = lang_dst.n_tokens_trimmed
emb_size = 18
hid_size = hid_size
out_size = lang_dst.n_tokens_trimmed
encoder_states = output

decoder = DecoderAttn(inp_size, emb_size, hid_size, out_size)
pred, hidden = decoder(dst_ex[0], encoder_states, hidden)
pred.shape, hidden.shape

(torch.Size([32, 26630]), torch.Size([1, 32, 24]))

In [127]:
class Seq2SeqAttn(nn.Module):
    """Encoder-decoder wrapper with teacher forcing and greedy decoding.

    Args:
        encoder: Module returning ``(encoder_states, hidden)`` for a source batch.
        decoder: Module called as ``decoder(tokens, encoder_states, hidden)``;
            its ``fc`` layer defines the output vocabulary size.
        max_length: Number of output rows produced when no target is given.
    """

    def __init__(self, encoder, decoder, max_length = 25):
        super(Seq2SeqAttn, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.max_length = max_length

    @staticmethod
    def _best_non_unk(output):
        """Return the top-scoring token per row, or the runner-up if the top is <unk>."""
        top2 = torch.topk(output, 2, dim = 1)[1]
        return torch.where(top2[:, 0] == unk_idx, top2[:, 1], top2[:, 0])

    def forward(self, src, dst = None, tf_ratio = 0.5):
        """Decode ``src``, with teacher forcing when ``dst`` is given.

        Args:
            src: LongTensor (src_len, batch_size).
            dst: Optional LongTensor (dst_len, batch_size) starting with <sos>.
            tf_ratio: Per-step probability of feeding the reference token
                instead of the model's own prediction (only used with ``dst``).

        Returns:
            Tensor (len_dst, batch_size, vocab) of decoder scores. Row 0 is
            always zero; without ``dst``, rows after the step at which every
            sequence has produced <eos> stay zero as well.
        """
        batch_size = src.shape[1]
        len_dst = dst.shape[0] if dst is not None else self.max_length
        # Take the size from the decoder and the device from the input rather
        # than from the globals lang_dst / device.
        vocab_size_dst = self.decoder.fc.out_features
        outputs = torch.zeros(len_dst, batch_size, vocab_size_dst, device = src.device)

        encoder_states, hidden = self.encoder(src)

        if dst is not None:
            x = dst[0]
            for t in range(1, len_dst):
                output, hidden = self.decoder(x, encoder_states, hidden)
                outputs[t] = output
                if random.random() < tf_ratio:
                    x = dst[t]
                else:
                    x = self._best_non_unk(output)

        else:
            # One <sos> per sequence so that inference also works for
            # batch_size > 1 (it used to be hard-wired to a single sequence).
            x = torch.full((batch_size,), sos_idx, dtype = torch.long, device = src.device)
            finished = torch.zeros(batch_size, dtype = torch.bool, device = src.device)
            for t in range(1, len_dst):
                output, hidden = self.decoder(x, encoder_states, hidden)
                outputs[t] = output
                x = self._best_non_unk(output)
                finished |= (x == eos_idx)
                # Stop once every sequence in the batch has emitted <eos>.
                if finished.all():
                    break

        return outputs

In [88]:
seq2seq = Seq2SeqAttn(encoder, decoder)
seq2seq(src_ex, dst_ex).shape

torch.Size([3, 32, 26630])

In [89]:
test_tens = torch.tensor([[unk_idx, 1],
                          [1, unk_idx],
                          [8, 3],
                          [unk_idx, 4],
                          [3, 6],
                          [5, unk_idx]]
                         )
x = (test_tens[:, 0] == unk_idx)
ans = (~x) * test_tens[:, 0] + x * test_tens[:, 1]
ans

tensor([1, 1, 8, 4, 3, 5])

In [90]:
T = 2
test_tens = torch.tensor(
    [[1., 3., 5.],
     [4., 2., 8.],
     [3., 7., 1.],
     [9., 4., 2.]])
sm = F.softmax(test_tens/T, dim = 1)
print(sm)
top2 = torch.multinomial(sm, 1)

top2

tensor([[0.0900, 0.2447, 0.6652],
        [0.1142, 0.0420, 0.8438],
        [0.1142, 0.8438, 0.0420],
        [0.8991, 0.0738, 0.0271]])


tensor([[2],
        [1],
        [0],
        [0]])

In [91]:
sm.gather(1, top2)

tensor([[0.6652],
        [0.0420],
        [0.1142],
        [0.8991]])

In [92]:
sm[range(len(sm)), top2.squeeze()] = 0

In [93]:
sm[range(len(sm)), top2.squeeze()]

tensor([0., 0., 0., 0.])

In [94]:
sm

tensor([[0.0900, 0.2447, 0.0000],
        [0.1142, 0.0000, 0.8438],
        [0.0000, 0.8438, 0.0420],
        [0.0000, 0.0738, 0.0271]])

### Train

In [128]:
num_epochs = 20
lr = 3e-4
batch_size = batch_size

input_size_encoder = lang_src.n_tokens_trimmed
input_size_decoder = lang_dst.n_tokens_trimmed
output_size = lang_dst.n_tokens_trimmed
encoder_embedding_size = 400
decoder_embedding_size = 400
hidden_size = 1024
num_layers = 1
enc_dropout = 0.2
dec_dropout = 0.2

In [129]:
encoder = EncoderAttn(
    input_size_encoder,
    encoder_embedding_size,
    hidden_size,
    num_layers,
    enc_dropout
    ).to(device)

decoder = DecoderAttn(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout
    ).to(device)

model = Seq2SeqAttn(encoder, decoder).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [98]:
# Ten random validation pairs, moved to the device as (src, dst) LongTensors.
val_cases = [
    (torch.LongTensor(src_ids).to(device), torch.LongTensor(dst_ids).to(device))
    for src_ids, dst_ids in random.sample(indexed_val_data, 10)
]

In [99]:
def translate_cases(val_cases, T = 0.5):
    """Print a sampled translation next to the reference for each case.

    Uses the global ``model``. For every position two distinct tokens are
    sampled from the temperature-scaled softmax; the second one is used when
    the first is <unk>. Printing stops at the first sampled <eos>.

    Args:
        val_cases: Iterable of (src, dst) 1-D LongTensors.
        T: Softmax temperature (the model scores are divided by it).

    NOTE(review): the model itself decodes greedily and leaves the rows after
    its own <eos> at zero; those rows become a uniform distribution here, so
    the printed sample can continue with random tokens - confirm intent.
    """
    with torch.no_grad():
        for src, dst in val_cases:
            output = model(src.unsqueeze(1)) / T
            # squeeze(1) drops only the batch axis.
            sm = F.softmax(output.squeeze(1), dim = 1)
            top2 = torch.multinomial(sm, 2)
            unk_test = (top2[:, 0] == unk_idx)
            sampled = torch.where(unk_test, top2[:, 1], top2[:, 0])
            # Row 0 of the model output is never filled; force <sos> there.
            sampled[0] = sos_idx
            pred = []
            for idx in sampled.tolist():
                pred.append(lang_dst.index2word[idx])
                if idx == eos_idx:
                    break
            tgt = [lang_dst.index2word[idx] for idx in dst.tolist()]
            print(' '.join(pred), ' | ', ' '.join(tgt))
translate_cases(val_cases)

<sos> vexes banquet dereliction arabia expanded cadogan salads neuter georgian rigid scaling tarzan margot sweated atlantis islands thermostat maximus microscope  |  <sos> syria has stated that it is ready for the voluntary return of refugees and is seeking assistance in rebuilding the country which has been devastated by more than seven years of war <eos>
<sos> aurora swanson disarmed adversities forrest ensure tidings passengers jolly ya sat historically operations filtered works lena bonus inc troop  |  <sos> it s not an easy task to get him out there but i ll only consider it a success if i can bring the club back <eos>
<sos> hartley fellow talker merci sneakers infect dummy gaby freshman swamp preparing heights dissipated gentleman sails brow clu mclean immune  |  <sos> mrs humphries who has been friends with mrs davis for many years lost mark her husband of years shortly after his mother died <eos>
<sos> compassion nascar undone complained deemed b bop stocking dior daring past l

In [259]:
#torch.save(model.state_dict(), 'params1' + str(0) + '.pt')
#model.load_state_dict(torch.load('/kaggle/working/params12.pt'))

  model.load_state_dict(torch.load('/kaggle/working/params12.pt'))


<All keys matched successfully>

In [53]:
#model.load_state_dict(torch.load('/kaggle/working/params12.pt'))

  model.load_state_dict(torch.load('/kaggle/working/params18.pt'))


<All keys matched successfully>

In [100]:
# Ten random training pairs, moved to the device as (src, dst) LongTensors.
train_cases = [
    (torch.LongTensor(src_ids).to(device), torch.LongTensor(dst_ids).to(device))
    for src_ids, dst_ids in random.sample(indexed_train_data, 10)
]

In [101]:
translate_cases(train_cases)

<sos> insides pilar intruding screw unsafe tommy cinderella brainer involved interplanetary inner jealousy struggles mixer fortress minnesota snap fluctuate black  |  <sos> yeah we ve got him <eos>
<sos> overweight clipped pacified winks coffins roko suggests girdle verified altogether rhythm next penthouse dictator which achilles supplement worm courtesans  |  <sos> nobody bothers me and i m totally happy <eos>
<sos> mopes shark tash vicente rustling pakistani waiter mired rejecting bloodstain drunkard entertainer concord convenient mitnick markham mates mandar makoto  |  <sos> you ditched the kids for her <eos>
<sos> jaga zelena behold witnessed bayou stunned extras forbidden ran zeus messenger opinions undergone incoming cowards vin centimetres occupants zooey  |  <sos> is that ok with you ? <eos>
<sos> <eos>  |  <sos> ! then why did you take my order ? ! <eos>
<sos> honey gina julianna weld loving restraining scones obstructing linda chiang neech leaves traumas striptease glandular

In [239]:
translate_cases(val_cases)

<sos> i had the had one a a past days ago <eos>  |  <sos> he writes on twitter i had a stroke a month ago <eos>
<sos> but if is the one to s the color the his and it judge of the <eos>  |  <sos> talk show host karen hunter tweeted that west was just who he is and that s absolutely wonderful <eos>
<sos> when he first was the first one who the was the first one he the s was a source  |  <sos> according to one source the truly brand was holly s top priority <eos>
<sos> i m sure ask for to my ass in my ass to the ass in the ass in attaching  |  <sos> for me success will depend on whether i can get this club back into the premier league <eos>
<sos> when the new of the world the former the world will media the or or the is the world  |  <sos> a new <unk> poll shows that percent of <unk> strongly or somewhat believe ford s testimony while percent said they strongly or somewhat believe kavanaugh s <eos>
<sos> but was the first of the night the first time of the night the <eos>  |  <sos> but it

In [110]:
from IPython.display import clear_output 

def train(
        model,
        data,
        start_epoch = 0,
        name = 'params',
        num_epochs = num_epochs,
        optimizer = optimizer,
        criterion = criterion,
        tf_ratio_start = 0.99,
        min_tf_ratio = 0.25,
        delta_tf = 0.025):
    """Train ``model`` on pre-batched epochs, saving a checkpoint per epoch.

    Args:
        model: Seq2seq model called as ``model(src, dst, tf_ratio = ...)``.
        data: Sequence of epochs; ``data[epoch]`` is a list of
            ``[batch_src, batch_dst]`` tensors.
        start_epoch: Index of the first epoch to run; epochs run up to ``len(data)``.
        name: Prefix of the checkpoint files (``name + str(epoch) + '.pt'``).
        num_epochs: Unused; kept for backward compatibility.
        optimizer: Optimizer to step. The default is bound when this function
            is defined, so pass it explicitly after re-creating the model.
        criterion: Loss taking (scores, targets).
        tf_ratio_start: Teacher-forcing ratio at epoch 0.
        min_tf_ratio: Lower bound of the teacher-forcing ratio.
        delta_tf: Amount the ratio decreases per epoch.

    NOTE(review): translate_cases reads the global ``model``, not this
    function's ``model`` argument - confirm they are the same object.
    """
    for epoch in tqdma(range(start_epoch, len(data))):
        model.train()
        batches = data[epoch]
        # The ratio depends only on the epoch; computing it here also keeps it
        # defined for the final print when an epoch has no batches.
        tf_ratio = max(min_tf_ratio, tf_ratio_start - delta_tf*epoch)
        loss_epoch = []
        for i, batch in enumerate(tqdma(batches)):
            src, dst = batch[0], batch[1]
            src = src.to(device)
            dst = dst.to(device)

            output = model(src, dst, tf_ratio = tf_ratio)
            # Row 0 corresponds to <sos> and is never predicted.
            output = output[1:].reshape(-1, output.shape[2])
            target = dst[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()
            # Clip BEFORE the update; clipping after optimizer.step() had no
            # effect because the gradients were zeroed before the next backward.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
            optimizer.step()
            loss_epoch.append(loss.item())
            if i % 500 == 0:
                print(np.average(loss_epoch))
                model.eval()
                translate_cases(val_cases)
                model.train()

        clear_output()
        torch.save(model.state_dict(), name + str(epoch) + '.pt')
        print(np.average(loss_epoch), tf_ratio)
        model.eval()
        translate_cases(val_cases)

In [131]:
train(model, data_epochs, name = 'last_params', start_epoch = 0, tf_ratio_start = 0.9, min_tf_ratio = 0.5, delta_tf = 0.1)

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/2343 [00:00<?, ?it/s][A

10.21047306060791
<sos> frowned pacing ports viagra street curling appearance flats robbins petitions manhood gears garland plinking humanichs meacham foothold gaggle eatin homies zeitung honest stoner legs  |  <sos> syria has stated that it is ready for the voluntary return of refugees and is seeking assistance in rebuilding the country which has been devastated by more than seven years of war <eos>
<sos> thriving audience zach preferable clary catalogues showed danton moony sae fundamentally mathematician stickup compressed trimming humbled heightened enabling jez caine apiece skank eats firms  |  <sos> it s not an easy task to get him out there but i ll only consider it a success if i can bring the club back <eos>
<sos> cope originally hydrogen carroll amiss afraid recovery veer cater farce greta draper cesar notwithstanding whisk vagrant hires fluffy intrusions neighbour lasted ya kayla stefani  |  <sos> mrs humphries who has been friends with mrs davis for many years lost mark her


  0%|          | 1/2343 [00:00<19:45,  1.98it/s][A

<sos> handouts quadruple intertwined turtle starling moustache costing leaky muscles ut vocal sliders diner nonviolence grinding dawning terrance minus hammering joan hallucinating chaise franchise billionaire  |  <sos> despite his colleagues fleeing for their lives year old <unk> <unk> <unk> refused to leave his post in the wildly swaying control tower of <unk> sis al <unk> airport in palu <eos>
<sos> handmade guts electronics cut bluff badly upriver feminists madman strangers frightening beneath battleships pulsar sheppard ricardo arial overstep fulton horrific tradition drones torn mendoza  |  <sos> nickel mining is also important for the province s economy but it is mainly concentrated in <unk> on the opposite coast of sulawesi <eos>
<sos> buffs buttons chatter galleries negotiations monstrosity cousin quantico starbase jamie separating nelly forger presided bates starting hooper clippings castillo posed juicy detestable repressed cyclops  |  <sos> george w bush is calling senators


  0%|          | 2/2343 [00:00<12:28,  3.13it/s][A
  0%|          | 4/2343 [00:00<06:08,  6.35it/s][A
  0%|          | 6/2343 [00:00<04:31,  8.60it/s][A
  0%|          | 8/2343 [00:01<03:58,  9.79it/s][A
  0%|          | 10/2343 [00:01<03:58,  9.80it/s][A
  1%|          | 12/2343 [00:01<05:47,  6.71it/s][A
  1%|          | 14/2343 [00:01<04:40,  8.31it/s][A
  1%|          | 16/2343 [00:02<03:56,  9.85it/s][A
  1%|          | 18/2343 [00:02<03:27, 11.20it/s][A
  1%|          | 20/2343 [00:02<03:08, 12.29it/s][A
  1%|          | 22/2343 [00:02<02:56, 13.18it/s][A
  1%|          | 24/2343 [00:02<02:52, 13.43it/s][A
  1%|          | 26/2343 [00:02<02:59, 12.92it/s][A
  1%|          | 28/2343 [00:02<03:05, 12.47it/s][A
  1%|▏         | 30/2343 [00:03<03:20, 11.52it/s][A
  1%|▏         | 32/2343 [00:03<03:34, 10.79it/s][A
  1%|▏         | 34/2343 [00:03<03:41, 10.42it/s][A
  2%|▏         | 36/2343 [00:03<03:46, 10.18it/s][A
  2%|▏         | 38/2343 [00:03<03:49, 10.05it/s]

KeyboardInterrupt: 

In [None]:
train(model, data_epochs, start_epoch = 0, tf_ratio_start = 0.8, min_tf_ratio = 0.3, delta_tf = 0.02)

In [241]:
n_epochs = 3
data_epochs = []
for i in tqdma(range(n_epochs)):
    data_epochs.append(prepare_epoch(indexed_train_data, N = len(indexed_train_data)))

100%|██████████| 3/3 [00:26<00:00,  8.81s/it]


In [None]:
train(model, data_epochs, name = 'full3grams', start_epoch = 0, tf_ratio_start = 0.5, min_tf_ratio = 0.5, delta_tf = 0.02)

In [None]:
n_epochs = 20
data_epochs = []
for i in tqdma(range(n_epochs)):
    data_epochs.append(prepare_epoch(indexed_train_data))

In [None]:
train(model, data_epochs, tf_ratio_start = 0.75, min_tf_ratio = 0.25, delta_tf = 0.05)

## A little test

In [None]:
from collections import Counter

In [None]:
test_lang_src = Lang("test", tokenize_src)
for src in tqdma(test_data):
     test_lang_src.addSentence(src)
print(len(test_lang_src.word2count))

In [None]:
# For every training pair, count how many distinct indices of `res` occur in
# its source sequence.
# NOTE(review): the keys come from test_lang_src's own index space, whereas
# indexed_train_data was indexed with lang_src - the two numberings are
# unrelated; confirm whether lang_src.word2index should be used to map the
# test tokens instead.
res = {i: 0 for i in test_lang_src.index2word.keys() if i > 4}
vals = np.zeros(len(indexed_train_data))

for i, pair in enumerate(tqdma(indexed_train_data)):
    # One set intersection per sentence replaces a list scan per vocabulary
    # entry; the resulting count is the same.
    vals[i] = len(res.keys() & set(pair[0]))
print(vals)

In [None]:
# For every training pair, count how many distinct indices of `res` occur in
# its source sequence (same computation as the previous cell).
# NOTE(review): the keys come from test_lang_src's own index space, whereas
# indexed_train_data was indexed with lang_src - the two numberings are
# unrelated; confirm whether lang_src.word2index should be used to map the
# test tokens instead.
res = {i: 0 for i in test_lang_src.index2word.keys() if i > 4}
vals = np.zeros(len(indexed_train_data))

for i, pair in enumerate(tqdma(indexed_train_data)):
    # One set intersection per sentence replaces a list scan per vocabulary
    # entry; the resulting count is the same.
    vals[i] = len(res.keys() & set(pair[0]))
print(vals)

In [None]:
# Weight every training pair by its hit count, damped by the log of its
# source length, and normalise the weights into sampling probabilities.
src_lengths = [len(pair[0]) for pair in indexed_train_data]
mults = np.array([1 / (1 + np.log(length)) for length in src_lengths])
probs = mults * vals
probs = probs / np.sum(probs)

In [None]:
np.random.choice(len(indexed_train_data), size = 50000, replace = False, p = probs)

In [None]:
# For every index in `res`, count the source tokens (excluding <sos>/<eos>)
# of all training sentences that contain that index.
# NOTE(review): the keys come from test_lang_src's index space while the
# sentences use lang_src indices - confirm the intended mapping.
res = {i: {} for i in test_lang_src.index2word.keys() if i > 4}

for pair in tqdma(indexed_train_data):
    src_ids = pair[0]
    inner_tokens = src_ids[1:-1]
    # Visit only the keys present in this sentence instead of scanning the
    # sentence once per vocabulary entry.
    for search in res.keys() & set(src_ids):
        counts = res[search]
        for token in inner_tokens:
            counts[token] = counts.get(token, 0) + 1
print(res)

In [None]:
src_idx = lang_src.word2index['◠◎◠']
src_idx

In [None]:
# Count which source tokens (excluding <sos>/<eos>) occur in training pairs
# whose target sequence contains the index dst_idx.
dst_idx = 88

res_dst = {i: {} for i in lang_dst.index2word.keys() if i > 4}

for pair in tqdma(indexed_train_data):
    src_ids, dst_ids = pair[0], pair[1]
    if dst_idx not in dst_ids:
        continue
    for token in src_ids[1:-1]:
        res_dst[dst_idx][token] = res_dst[dst_idx].get(token, 0) + 1
