In [0]:
import os
os.chdir('/content/drive/My Drive/PMLDL_HW4')
import random
import torch
from torch import nn, optim, tensor, device
from neural_machine_translation import NMTEncoder, NMTDecoder
from prepare_data import NMTDataset

In [0]:
BATCH_SIZE = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-3
hidden_dim = 128

dataset = NMTDataset('data/corpus.en_ru.1m.ru',
                      'data/corpus.en_ru.1m.en')

train_data, validation_data = dataset.get_iterators(batch_size=BATCH_SIZE, 
                                                    debug=False)
input_dim, output_dim = dataset.INPUT_DIM, dataset.OUTPUT_DIM

In [0]:
def train(data, encoder, decoder, criterion, e_optimizer, de_optimizer, tag_ix,
          save_path, debug_steps=100, epochs=10):
    encoder.train(True)
    decoder.train(True)
    encoder.to(device)
    decoder.to(device)
    for e in range(epochs):
        ovr_loss = 0
        hidden = encoder.init_hidden(BATCH_SIZE)
        for i, batch in enumerate(data):
            e_optimizer.zero_grad()
            de_optimizer.zero_grad()
            src = batch.Russian.to(device)
            trg = batch.English.to(device)
            if src.size(0) != BATCH_SIZE:
                continue
            loss = 0
            encoder_output, hidden = encoder(src, hidden)
            decoder_input = torch.tensor([[tag_ix]] * BATCH_SIZE, device=device)

            if random.random() < 0.5:
                for t in range(1, trg.size(1)):
                    out, hidden = decoder(decoder_input,
                                          hidden,
                                          encoder_output)
                    loss += criterion(out, trg[:, t])
                    decoder_input = trg[:, t].unsqueeze(1)
            else:
                for t in range(1, trg.size(1)):
                    out, hidden = decoder(decoder_input,
                                          hidden,
                                          encoder_output)
                    target = trg[:, t]
                    decoder_input = torch.argmax(out, 1).unsqueeze(1)
                    loss += criterion(out, target)

            loss.backward()
            e_optimizer.step()
            de_optimizer.step()
            ovr_loss += loss.item()/trg.size(1)
            if i % debug_steps == 0:
                print(f'Epoch {e}({int(i/debug_steps)}/{int(len(data)/debug_steps)}): loss={ovr_loss / (i + 1)}')
                torch.save(encoder.state_dict(), save_path + "/encoder.pth")
                torch.save(decoder.state_dict(), save_path + "/decoder.pth")

In [4]:
encoder = NMTEncoder(input_dim, hidden_dim)
encoder.to(device)
decoder = NMTDecoder(hidden_dim, output_dim)
decoder.to(device)
criterion = nn.CrossEntropyLoss()
e_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
de_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
encoder.load_state_dict(torch.load("encoder.pth"))
decoder.load_state_dict(torch.load("decoder.pth"))

<All keys matched successfully>

In [0]:
train(train_data, encoder, decoder, criterion, e_optimizer, de_optimizer, 
      dataset.get_src_tag_idx('<eos>'), '.')

Epoch 0(0/56): loss=11.387696486253004
Epoch 0(1/56): loss=4.752938019665981
Epoch 0(2/56): loss=4.155878453873728
Epoch 0(3/56): loss=3.93823202231801
Epoch 0(4/56): loss=3.823199090551774
Epoch 0(5/56): loss=3.746782815459615
Epoch 0(6/56): loss=3.6862691253304796
Epoch 0(7/56): loss=3.649385690140653
Epoch 0(8/56): loss=3.614423568019849
Epoch 0(9/56): loss=3.5757562366070794
Epoch 0(10/56): loss=3.5449130313566415
Epoch 0(11/56): loss=3.523602135456819
Epoch 0(12/56): loss=3.503735603835977
Epoch 0(13/56): loss=3.480380233180145
Epoch 0(14/56): loss=3.4645474788243904
Epoch 0(15/56): loss=3.4521505529342433
Epoch 0(16/56): loss=3.4413458328455753
Epoch 0(17/56): loss=3.4224586864513116
Epoch 0(18/56): loss=3.408133254361655
Epoch 0(19/56): loss=3.394728290239431
Epoch 0(20/56): loss=3.380780796951879
Epoch 0(21/56): loss=3.372747285749492
Epoch 0(22/56): loss=3.3669236647639367
Epoch 0(23/56): loss=3.3563901980646627
Epoch 0(24/56): loss=3.3536187168648657
Epoch 0(25/56): loss=3.34

In [0]:
def test(data, encoder, decoder, test_path, tag_ix):
    with open(test_path) as fp:
        test_lines = fp.readlines()

    hidden = encoder.init_hidden(1)
    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        for line in test_lines:
            line = line.lower()
            tokens = data._tokenize_ru(line)
            tokens = list(map(data.get_src_tag_idx, tokens))
            tokens = [data.get_src_tag_idx('<sos>')] + tokens + [data.get_src_tag_idx('<eos>')]
            t = torch.tensor([tokens], device=device)
            encoder_output, hidden = encoder(t, hidden)
            encoder_output = encoder_output.to(device)
            hidden = hidden.to(device)
            decoder_input = tensor([[tag_ix]] * 1)
            predicted = []
            i = 0
            while i != len(tokens):
                i += 1
                decoder_input = decoder_input.type(torch.LongTensor)
                decoder_input[decoder_input < 0] = 0
                decoder_input = decoder_input.to(device)

                out, hidden = decoder(decoder_input,
                                      hidden,
                                      encoder_output)
                predicted_token = out[0].argmax()
                
                if predicted_token == tag_ix:
                    predicted.append('<EOS>')
                    break
                else:
                    predicted.append(data.get_trg_tag_from_idx(predicted_token))
                decoder_input = predicted_token.unsqueeze(0).unsqueeze(0)
            print(line[:-1])
            print(predicted)
            print()

            with open('out.txt', 'a') as f:
                f.write(" ".join(predicted) + "\n")

In [35]:
test(dataset, encoder, decoder, "data/eval-ru-100.txt", dataset.get_src_tag_idx('<eos>'))

26. вопрос о лесах необходимо вывести на более высокий уровень в рамках целей устойчивого развития, в том числе посредством включения в такие цели убедительных и четких целевых и рабочих показателей по лесам.
['the', 'question', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the', 'development', 'of', 'the']

в рамках экологической экспертизы определены пять вариантов строительства и эксплуатации замещающей электростанции, которая восстановит мощность энергораспределительной сети управления по состоянию до стихийного бедствия.
['in', 'the', 'and', 'the', 'and', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

в ходе рассмотрения данного пункта пове