Переписать загрузку данных с python функций на Dataset и Dataloader и применить сеть с attention

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import re

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
batch_size = 64
epochs = 3
latent_dim = 256
num_samples = 5000
data_path = '/content/drive/MyDrive/fra.txt'

In [7]:
class CreateDataset(torch.utils.data.Dataset):
    def __init__(self, file_name=data_path, num_samples=num_samples):

        texts = []
        text_words = []

        input_vocab = set()
        output_vocab = set()
        
        print('Загружаем ', file_name)
        
        with open(file_name, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')

        for line in lines[:num_samples]:
            input_text, output_text, _ = line.split('\t')

            texts.append((input_text, output_text))
            
            input_words = re.findall(r'\w+', input_text) 
            output_words = re.findall(r'\w+', output_text) 
            text_words.append((input_words, output_words))
            
            for word in input_words:
                input_vocab.add(word)
            for word in output_words:
                output_vocab.add(word)
    
        input_vocab2index = {word: i+2 for i, word in enumerate(input_vocab)}
        output_vocab2index = {word: i+2 for i, word in enumerate(output_vocab)}        

        def ws2i(words, vocab):
            indexes = [vocab.get(word, 0) for word in words] + [1]
            return torch.tensor(indexes, dtype=torch.long).view(-1, 1)
        
        self.texts = texts
        self.encoded_texts = [ (ws2i(p[0], input_vocab2index), ws2i(p[1], output_vocab2index)) for p in text_words ]
        self.input_vocab2index = input_vocab2index
        self.output_vocab2index = output_vocab2index
        self.input_vocabulary_size = len(self.input_vocab2index) + 2
        self.output_vocabulary_size = len(self.output_vocab2index) + 2
        
        print('Загружен ', file_name)
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        return self.encoded_texts[index] + self.texts[index]
    

In [8]:
ds = CreateDataset()

Загружаем  /content/drive/MyDrive/fra.txt
Загружен  /content/drive/MyDrive/fra.txt


In [10]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        embedded_hidden = torch.cat((embedded[0], hidden[0]), 1)

        attn_weights = self.attn(embedded_hidden)
        
        attn_weights = F.softmax(attn_weights, dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)

        output = self.attn_combine(output).unsqueeze(0)

        #output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [11]:
def train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[0]])

    decoder_hidden = encoder_hidden

    for di in range(target_length):
        
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        
        topv, topi = decoder_output.topk(1)
        
        decoder_input = topi.squeeze().detach()  # detach from history as input
        
        loss += criterion(decoder_output, target_tensor[di])
        if decoder_input.item() == 1:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [15]:
encoder = EncoderRNN(ds.input_vocabulary_size, 30)
attn_decoder1 = AttnDecoderRNN(30, ds.output_vocabulary_size, dropout_p=0.1)

#attn_decoder1 = DecoderRNN(len(output_vocab2index)+2, 30)

encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.SGD(attn_decoder1.parameters(), lr=0.01)
criterion = nn.NLLLoss()

print_loss_total = 0

dl = torch.utils.data.DataLoader(ds, shuffle=True, batch_size=1)

for epoch in range(epochs):

    for i, ([input_tensor], [target_tensor], _, _) in enumerate(dl):
        loss = train_step(input_tensor, target_tensor, encoder,
                   attn_decoder1, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if (i + 1) % 100 == 0:
            print_loss_avg = print_loss_total / 100
            print_loss_total = 0
            print('(%d) %.4f' % (i + 1, print_loss_avg))

(100) 4.1395
(200) 4.8085
(300) 4.1936
(400) 3.7496
(500) 3.3145
(600) 3.3971
(700) 3.5364
(800) 3.4263
(900) 3.2721
(1000) 3.2551
(1100) 3.4408
(1200) 3.5266
(1300) 3.7015
(1400) 3.8589
(1500) 3.9917
(1600) 3.6409
(1700) 3.8599
(1800) 3.8006
(1900) 3.8166
(2000) 3.9237
(2100) 3.6791
(2200) 3.8601
(2300) 3.7720
(2400) 3.6414
(2500) 3.6018
(2600) 3.7819
(2700) 3.7817
(2800) 3.8103
(2900) 3.6280
(3000) 3.5573
(3100) 3.5878
(3200) 3.7210
(3300) 3.8912
(3400) 3.6209
(3500) 3.7677
(3600) 3.8179
(3700) 3.5698
(3800) 3.7089
(3900) 3.8023
(4000) 3.8132
(4100) 3.6196
(4200) 3.7133
(4300) 3.5282
(4400) 3.6326
(4500) 3.5040
(4600) 3.5475
(4700) 3.4710
(4800) 3.5793
(4900) 3.6646
(5000) 3.6134
(100) 3.5933
(200) 3.8060
(300) 3.6371
(400) 3.5163
(500) 3.5559
(600) 3.7114
(700) 3.6184
(800) 3.4579
(900) 3.7121
(1000) 3.4953
(1100) 3.6496
(1200) 3.6619
(1300) 3.3415
(1400) 3.5032
(1500) 3.4168
(1600) 3.6382
(1700) 3.5350
(1800) 3.4928
(1900) 3.5878
(2000) 3.5671
(2100) 3.5264
(2200) 3.3915
(2300) 3.3