In [1]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from google.colab import drive
drive.mount('/content/drive')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Mounted at /content/drive
cuda


# 1. Возьмите англо-русскую пару фраз
www.manythings.org....org/anki/

In [2]:
!tail drive/MyDrive/eng-rus.txt

We need to uphold laws against discrimination — in hiring, and in housing, and in education, and in the criminal justice system. That is what our Constitution and our highest ideals require.	Нам нужно отстаивать законы против дискриминации при найме на работу, в жилищной сфере, в сфере образования и правоохранительной системе. Этого требуют наша Конституция и высшие идеалы.	CC-BY 2.0 (France) Attribution: tatoeba.org #5762728 (BHO) & #6390439 (odexed)
I've heard that you should never date anyone who is less than half your age plus seven. Tom is now 30 years old and Mary is 17. How many years will Tom need to wait until he can start dating Mary?	Я слышал, что никогда не следует встречаться с кем-то вдвое младше вас плюс семь лет. Тому 30 лет, a Мэри 17. Сколько лет Тому нужно ждать до тех пор, пока он сможет начать встречаться с Мэри?	CC-BY 2.0 (France) Attribution: tatoeba.org #10068197 (CK) & #10644473 (notenoughsun)
I do have one final ask of you as your president, the same thing I a

In [3]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: 'SOS', 1: 'EOS'}
        self.n_words = 2

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s, alphabet='latin'):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    if alphabet == 'latin':
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    if alphabet == 'cyrillic':
        s = re.sub(r"[^а-яёА-ЯЁ.!?]+", r" ", s)
    return s

In [5]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('drive/MyDrive/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(l.split('\t')[0]), normalizeString(l.split('\t')[1], alphabet='cyrillic')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [7]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng', 'rus', True)
print(random.choice(pairs))

Reading lines...
Read 487600 sentence pairs
Trimmed to 28240 sentence pairs
Counting words...
Counted words:
rus 10116
eng 4289
['он умен .', 'he is intelligent .']


The Encoder
-----------





In [8]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

The Decoder
-----------




In [9]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        output = self.attn_combine(torch.cat((embedded[0], attn_applied[0]), 1))

        output = F.relu(output.unsqueeze(0))
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [10]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    output_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, output_tensor)

In [11]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]

    else:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [12]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [13]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

In [14]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(MAX_LENGTH):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di+1]

In [15]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

# 2. Обучите на них seq2seq with attention

In [16]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

1m 43s (- 24m 2s) (5000 6%) 3.1142
3m 19s (- 21m 33s) (10000 13%) 2.5832
4m 54s (- 19m 38s) (15000 20%) 2.3000
6m 30s (- 17m 52s) (20000 26%) 2.1352
8m 5s (- 16m 11s) (25000 33%) 1.9915
9m 42s (- 14m 33s) (30000 40%) 1.8541
11m 17s (- 12m 54s) (35000 46%) 1.7350
12m 54s (- 11m 17s) (40000 53%) 1.6757
14m 30s (- 9m 40s) (45000 60%) 1.5614
16m 7s (- 8m 3s) (50000 66%) 1.5005
17m 43s (- 6m 26s) (55000 73%) 1.4318
19m 19s (- 4m 49s) (60000 80%) 1.3728
21m 0s (- 3m 13s) (65000 86%) 1.3169
22m 39s (- 1m 37s) (70000 93%) 1.2805
24m 20s (- 0m 0s) (75000 100%) 1.2310


In [17]:
evaluateRandomly(encoder1, attn_decoder1)

> я немного боюсь .
= i m a bit scared .
< i m a bit scared . <EOS>

> они закрыты .
= they re closed .
< they re closed . <EOS>

> я очень сильная .
= i m very strong .
< i m very strong . <EOS>

> я все еще голодна .
= i m still hungry .
< i m still hungry . <EOS>

> ты интересная девушка .
= you re an interesting girl .
< you re an girl girl . <EOS>

> вы хорошии человек .
= you are a good person .
< you re a good person . <EOS>

> я поеду в бостон в эти выходные .
= i m going to boston this weekend .
< i m going to to the the <EOS>

> ты врач .
= you are a doctor .
< you re a doctor . <EOS>

> они опоздали как обычно .
= they re late as usual .
< they re almost as busy as . <EOS>

> я тебя не боюсь том .
= i m not afraid of you tom .
< i m not afraid of you tom . <EOS>



# a. На основе скалярного произведения

In [18]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def dot_score(self, hidden, encoder_outputs):
        return torch.sum(hidden * encoder_outputs, dim=2)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(self.dot_score(hidden, encoder_outputs), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        output = self.attn_combine(torch.cat((embedded[0], attn_applied[0]), 1))

        output = F.relu(output.unsqueeze(0))
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [19]:
hidden_size = 256
encoder2 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder2 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder2, attn_decoder2, 75000, print_every=5000)

1m 38s (- 22m 53s) (5000 6%) 3.1093
3m 10s (- 20m 37s) (10000 13%) 2.5911
4m 42s (- 18m 49s) (15000 20%) 2.3030
6m 14s (- 17m 9s) (20000 26%) 2.1102
7m 45s (- 15m 31s) (25000 33%) 1.9292
9m 17s (- 13m 56s) (30000 40%) 1.8111
10m 53s (- 12m 26s) (35000 46%) 1.6855
12m 26s (- 10m 52s) (40000 53%) 1.5981
13m 58s (- 9m 19s) (45000 60%) 1.5213
15m 33s (- 7m 46s) (50000 66%) 1.4209
17m 7s (- 6m 13s) (55000 73%) 1.3642
18m 45s (- 4m 41s) (60000 80%) 1.2984
20m 25s (- 3m 8s) (65000 86%) 1.2371
22m 4s (- 1m 34s) (70000 93%) 1.2011
23m 39s (- 0m 0s) (75000 100%) 1.1603


In [20]:
evaluateRandomly(encoder2, attn_decoder2)

> уверен что ничего серьезного .
= i m sure it s nothing serious .
< i m sure that s ll . <EOS>

> мы ищем где остановиться .
= we re looking for a place to stay .
< we re looking for a place to . . <EOS>

> я тебе не слуга .
= i m not your servant .
< i m not your servant . <EOS>

> ты прям как моя мать .
= you re just like my mother .
< you re just a mother . <EOS>

> ты очень храбрыи .
= you re really brave .
< you re really brave . <EOS>

> я тому не пара .
= i m no match for tom .
< i m no match for tom . <EOS>

> я ухожу .
= i m going away .
< i m going . <EOS>

> я многому учусь .
= i m learning a lot .
< i m really a . <EOS>

> я постоянно теряю ключи .
= i m always losing my keys .
< i m always losing my my . <EOS>

> он владелец этои земли .
= he is the owner of this land .
< he is the owner of this land . <EOS>



# b. На основе MLP

In [31]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
        self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        query = self.linear_q(embedded)
        key = self.linear_k(hidden)

        attn_weights = F.softmax(self.attn(torch.cat((query[0], key[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        output = self.attn_combine(torch.cat((embedded[0], attn_applied[0]), 1))

        output = F.relu(output.unsqueeze(0))
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [32]:
hidden_size = 256
encoder3 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder3 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder3, attn_decoder3, 75000, print_every=5000)

1m 55s (- 26m 52s) (5000 6%) 3.0748
3m 43s (- 24m 15s) (10000 13%) 2.5683
5m 33s (- 22m 14s) (15000 20%) 2.3316
7m 26s (- 20m 29s) (20000 26%) 2.1436
9m 19s (- 18m 39s) (25000 33%) 2.0165
11m 12s (- 16m 49s) (30000 40%) 1.8727
13m 3s (- 14m 55s) (35000 46%) 1.7605
14m 59s (- 13m 7s) (40000 53%) 1.6615
16m 51s (- 11m 14s) (45000 60%) 1.6038
18m 41s (- 9m 20s) (50000 66%) 1.5357
20m 32s (- 7m 28s) (55000 73%) 1.4951
22m 23s (- 5m 35s) (60000 80%) 1.4643
24m 14s (- 3m 43s) (65000 86%) 1.4365
26m 6s (- 1m 51s) (70000 93%) 1.3688
27m 57s (- 0m 0s) (75000 100%) 1.3294


In [33]:
evaluateRandomly(encoder3, attn_decoder3)

> он на тебя сердится .
= he is angry with you .
< he s mad at you . <EOS>

> ты неправильно это делаешь .
= you re doing that wrong .
< you re doing it wrong this . <EOS>

> простите что не смог вам помочь .
= i m sorry i couldn t help you .
< i m sorry i can t help you . <EOS>

> я с нетерпением жду вашего письма .
= i m looking forward to getting your letter .
< i m looking forward your your . <EOS>

> я на три месяца младше вас .
= i m three months younger than you .
< i m three years younger than you . <EOS>

> прости что ушел без тебя .
= i m sorry i left without you .
< i m sorry i hurt you . <EOS>

> вы упрямая .
= you re obstinate .
< you re lying . <EOS>

> я честныи .
= i m fair .
< i m an . . <EOS>

> ты идешь не в ту сторону .
= you re going in the wrong direction .
< you re going the wrong the direction . <EOS>

> я устала от ее жалоб .
= i m tired of her complaints .
< i m tired of your complaints . <EOS>

