# Задание 6
## UltraPro
### Реализовать seq2seq с механизмом внимания

In [48]:
import time
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext.data as data # версия 0.4

import spacy
from spacy.lang.ru import Russian

from tqdm import tqdm
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

# Device that the iterators, the model and all inference tensors are placed on (CPU for this run).
device = torch.device('cpu')

In [49]:
# Russian language object, used below to tokenize the source sentences.
nlp_ru = Russian()
# English model with parser, tagger and NER disabled; the code below only calls its tokenizer.
nlp_en = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "ner"])

In [50]:
def tokenize_ru(text):
  """Split Russian ``text`` into a list of token strings using ``nlp_ru``'s tokenizer."""
  doc = nlp_ru.tokenizer(text)
  return [token.text for token in doc]
def tokenize_en(text):
  """Split English ``text`` into a list of token strings using ``nlp_en``'s tokenizer."""
  doc = nlp_en.tokenizer(text)
  return [token.text for token in doc]

In [51]:
# Source (Russian) field: lower-cased, tokenized with tokenize_ru; include_lengths
# makes every batch attribute a (tokens, lengths) pair.
SRC = data.Field(tokenize = tokenize_ru, 
                 include_lengths = True, 
                 lower = True)

# Target (English) field: same settings plus explicit start/end markers around
# every sentence.
TRG = data.Field(tokenize = tokenize_en, 
                 init_token = '<sos>', # "start" token
                 eos_token = '<eos>', # "end" token
                 include_lengths = True, 
                 lower = True)

# Column order of the CSV: first column -> attribute `rus`, second -> attribute `eng`.
fields = [('rus', SRC), ('eng', TRG)]

In [52]:
# Load the parallel corpus; columns are mapped positionally onto `fields`.
# NOTE(review): skip_header is not set, so a header row (if the file has one)
# would be read as a training example -- confirm against train.csv.
nmt_data = data.TabularDataset(path="./train.csv", format='csv', fields=fields)

In [53]:
# Build both vocabularies from the whole dataset, each capped at 4000 corpus tokens.
# The sizes shown in the recorded output are larger than 4000, presumably because
# the special tokens are added on top of the cap.
SRC.build_vocab(nmt_data, max_size=4000)
TRG.build_vocab(nmt_data, max_size=4000)
len(SRC.vocab), len(TRG.vocab)

(4002, 4004)

In [54]:
# 80% / 20% train / validation split.
# NOTE(review): the vocabularies were built on the full dataset before this split,
# so validation tokens contributed to the vocabulary.
train_data, val_data = nmt_data.split(split_ratio=0.8)

In [55]:
# Batches of 64 examples grouped by similar source length; inside each batch the
# examples are sorted by the number of Russian tokens.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data), 
    batch_size = 64, 
    sort_within_batch = True, 
    sort_key = lambda x:len(x.rus),
    device = device)

In [56]:
class Encoder(nn.Module):
    """Multi-layer GRU encoder over embedded source-token ids.

    Args:
        hidden_size: size of the GRU hidden state.
        embedding_size: size of the token embeddings fed to the GRU.
        num_layers: number of stacked GRU layers.
        dropout: dropout probability applied between GRU layers.
        input_size: number of rows in the embedding table.  When ``None``
            (the default, matching the previous behaviour) the size of the
            module-level ``SRC`` vocabulary is used.
    """

    def __init__(self, hidden_size, embedding_size, num_layers=2, dropout=0.3,
                 input_size=None):

        super().__init__()

        if input_size is None:
            # Backward-compatible default: fall back to the global source vocabulary.
            input_size = len(SRC.vocab)

        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.input_size = input_size

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size,
                          num_layers=num_layers,
                          dropout=dropout)

    def forward(self, input_sequence, lengths=None):
        """Encode a batch of token ids.

        Args:
            input_sequence: integer tensor laid out as (seq_len, batch), the
                default (non ``batch_first``) layout of ``nn.GRU``.
            lengths: optional per-sequence true lengths (tensor or list).
                When given, the batch is packed so the GRU never steps over
                padding: the returned hidden state is the state after each
                sequence's last real token and outputs at padded positions
                are zero.  When omitted, the padded batch is fed as-is.

        Returns:
            ``(outputs, hidden)`` exactly as produced by ``nn.GRU``; with
            ``lengths`` the outputs are re-padded to the input's seq_len.
        """
        embedded = self.embedding(input_sequence)

        if lengths is None:
            return self.gru(embedded)

        # pack_padded_sequence requires the lengths on the CPU;
        # enforce_sorted=False accepts batches in any length order.
        cpu_lengths = torch.as_tensor(lengths, dtype=torch.long).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, cpu_lengths, enforce_sorted=False)
        packed_outputs, hidden = self.gru(packed)
        # total_length keeps the time dimension identical to the input so
        # downstream masks built from the padded input still line up.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outputs, total_length=input_sequence.size(0))

        return outputs, hidden

In [57]:
class Attention(nn.Module):
    """Parameter-free dot-product attention with padding masking."""

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

    @staticmethod
    def dot_score(hidden_state, encoder_states):
        """Return the element-wise product of the two tensors summed over dim 2."""
        products = hidden_state * encoder_states
        return products.sum(dim=2)

    def forward(self, hidden, encoder_outputs, mask):
        """Compute attention weights over the encoder positions.

        The scores are transposed so that they line up with ``mask``;
        positions where ``mask == 0`` receive a large negative score and
        therefore (numerically) zero weight after the softmax over dim 1.
        The result carries an extra singleton dimension at index 1.
        """
        scores = self.dot_score(hidden, encoder_outputs).t()
        scores = scores.masked_fill(mask == 0, -1e5)

        weights = F.softmax(scores, dim=1)
        return weights.unsqueeze(1)

In [58]:
class Decoder(nn.Module):
  """Single-step GRU decoder that attends over the encoder outputs.

  Each call embeds the current token, advances the GRU by one step, mixes
  the GRU output with an attention-weighted context vector through a tanh
  layer, and projects the result to vocabulary logits.
  """

  def __init__(self, embedding_size, hidden_size, output_size, n_layers=2, dropout=0.3):
    super().__init__()

    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout

    # Submodules are created in a fixed order: embedding, gru, concat, out, attn.
    self.embedding = nn.Embedding(output_size, embedding_size)
    self.gru = nn.GRU(embedding_size, hidden_size, n_layers, dropout=dropout)
    # Takes [gru_output ; context], hence twice the hidden size on the input side.
    self.concat = nn.Linear(hidden_size * 2, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)
    self.attn = Attention(hidden_size)

  def forward(self, current_token, hidden_state, encoder_outputs, mask):
    """Run one decoding step.

    Returns the vocabulary logits for this step together with the updated
    GRU hidden state.
    """
    token_embedding = self.embedding(current_token)
    step_output, hidden_state = self.gru(token_embedding, hidden_state)

    # Weighted sum of the encoder outputs, then drop the singleton step dimension.
    weights = self.attn(step_output, encoder_outputs, mask)
    context = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)

    combined = torch.cat((step_output.squeeze(0), context), 1)
    projected = torch.tanh(self.concat(combined))

    return self.out(projected), hidden_state

In [59]:
class seq2seq(nn.Module):
  """Encoder-decoder translation model with attention.

  Training / evaluation uses teacher forcing (the reference target token is
  fed at every step); inference feeds back the model's own greedy
  prediction until every sequence in the batch has produced ``<eos>`` or
  the length limit is reached.
  """

  def __init__(self, embedding_size, hidden_size, vocab_size, device, pad_idx, eos_idx, sos_idx):
    super().__init__()

    # NOTE: this embedding is never used in forward() (the encoder and the
    # decoder own their embeddings).  It is kept on purpose so that
    # state_dict keys stay unchanged and saved checkpoints still load.
    self.embedding = nn.Embedding(vocab_size, embedding_size)

    self.encoder = Encoder(hidden_size,
                           embedding_size,
                           num_layers=2,
                           dropout=0.3)

    self.decoder = Decoder(embedding_size,
                           hidden_size,
                           vocab_size,
                           n_layers=2,
                           dropout=0.3)

    self.pad_idx = pad_idx
    self.eos_idx = eos_idx
    self.sos_idx = sos_idx
    self.device = device

  def create_mask(self, input_sequence):
    """Return a (batch, seq_len) mask that is True where the token is not padding.

    NOTE(review): the comparison uses ``self.pad_idx`` on *source* tokens, so
    the pad index passed to the constructor must match the source
    vocabulary's pad index.
    """
    return (input_sequence != self.pad_idx).permute(1, 0)

  def forward(self, input_sequence, output_sequence, max_len=100):
    """Translate a batch.

    Args:
      input_sequence: ``(tokens, lengths)`` pair; ``tokens`` is
        (src_len, batch).
      output_sequence: ``(tokens, lengths)`` pair for teacher forcing, or
        ``None`` to run greedy inference.
      max_len: maximum number of decoding steps (including the initial
        ``<sos>`` slot) in inference mode.

    Returns:
      Logits of shape (steps, batch, vocab).  Row 0 is always zeros (it
      corresponds to the ``<sos>`` position).  In inference mode the
      result is cut at the step where the last unfinished sequence
      emitted ``<eos>``; that step itself is excluded.  Sequences that
      finished earlier keep decoding, so callers handling batches should
      truncate each one at its first ``<eos>``.
    """
    input_tokens = input_sequence[0]
    batch_size = input_tokens.shape[1]
    inference = output_sequence is None

    if inference:
      # Placeholder target filled with <sos>; only row 0 is ever read.
      output_tokens = torch.full((max_len, batch_size), self.sos_idx,
                                 dtype=torch.long, device=self.device)
    else:
      output_tokens = output_sequence[0]

    vocab_size = self.decoder.output_size
    max_seq_len = output_tokens.shape[0]

    outputs = torch.zeros(max_seq_len, batch_size, vocab_size, device=self.device)

    encoder_outputs, hidden = self.encoder(input_tokens)
    mask = self.create_mask(input_tokens)

    output = output_tokens[0, :]
    # Tracks which sequences have already produced <eos>; checking a single
    # .item() would raise for any batch larger than one.
    finished = torch.zeros(batch_size, dtype=torch.bool, device=self.device)

    for t in range(1, max_seq_len):
      logits, hidden = self.decoder(output.unsqueeze(0), hidden, encoder_outputs, mask)
      outputs[t] = logits

      if inference:
        output = logits.max(1)[1]
        finished = finished | (output == self.eos_idx)
        if finished.all():
          return outputs[:t]
      else:
        output = output_tokens[t]

    return outputs

In [60]:
# Special-token indices, all looked up in the *target* vocabulary.
# NOTE(review): seq2seq.create_mask compares pad_idx against source tokens, so
# this relies on SRC and TRG sharing the same <pad> index -- confirm.
pad_idx = TRG.vocab.stoi['<pad>']
eos_idx = TRG.vocab.stoi['<eos>']
sos_idx = TRG.vocab.stoi['<sos>']

# Model hyper-parameters.
embedding_dim = 100
hidden_dim = 256
# Size of the decoder's output layer (target vocabulary).
vocab_size = len(TRG.vocab)

In [61]:
# Build the translation model and move its parameters to `device`.
model = seq2seq(embedding_dim,
                hidden_dim, 
                vocab_size, 
                device, pad_idx, eos_idx, sos_idx).to(device)

In [62]:
# Display the module tree (notebook cell output).
model

seq2seq(
  (embedding): Embedding(4004, 100)
  (encoder): Encoder(
    (embedding): Embedding(4002, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(4004, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
    (concat): Linear(in_features=512, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=4004, bias=True)
    (attn): Attention()
  )
)

In [63]:
# Adam with default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())
# Cross-entropy over the vocabulary; target positions equal to pad_idx do not contribute.
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [64]:
def train(model, ld, criterion, optimizer):
  """Run one training epoch over ``ld`` and return the mean per-batch loss.

  Each batch must expose ``rus`` and ``eng`` attributes holding
  ``(tokens, lengths)`` pairs.  The model is called with both pairs
  (teacher forcing) and the first time step is excluded from the loss.
  """
  model.train()

  running_loss = 0

  for batch in tqdm(ld):
    source, target = batch.rus, batch.eng

    optimizer.zero_grad()

    logits = model(source, target)

    # Skip position 0 and flatten the remaining steps into one long batch
    # of (logits, target id) pairs for the criterion.
    flat_logits = logits[1:].view(-1, logits.shape[-1])
    flat_targets = target[0][1:].view(-1)

    loss = criterion(flat_logits, flat_targets)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

  return running_loss / len(ld)

In [65]:
def evaluate(model, iterator, criterion):
  """Return the mean per-batch loss of ``model`` over ``iterator``.

  The model is put in eval mode and gradients are disabled.  Each batch must
  expose ``rus`` and ``eng`` attributes holding ``(tokens, lengths)`` pairs;
  the first time step is excluded from the loss.
  """
  model.eval()

  running_loss = 0

  with torch.no_grad():
    for batch in iterator:
      source, target = batch.rus, batch.eng

      logits = model(source, target)

      # Skip position 0 and flatten steps x batch for the criterion.
      flat_logits = logits[1:].view(-1, logits.shape[-1])
      flat_targets = target[0][1:].view(-1)

      running_loss += criterion(flat_logits, flat_targets).item()

  return running_loss / len(iterator)

In [66]:
def epoch_time(start_time, end_time):
  """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds."""
  duration = end_time - start_time
  minutes = int(duration / 60)
  # Whatever is left after removing the whole minutes, truncated to an int.
  seconds = int(duration - 60 * minutes)
  return minutes, seconds

In [67]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

# Lowest validation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
  start_time = time.time()
  
  # One training pass followed by one validation pass.
  train_loss = train(model, train_iterator, criterion, optimizer)
  valid_loss = evaluate(model, valid_iterator, criterion)
  
  end_time = time.time()
  
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  
  # Keep only the weights of the best epoch (by validation loss) on disk.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'best_model.pt')
  
  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f}')

100%|██████████| 2339/2339 [03:08<00:00, 12.44it/s]


Epoch: 01 | Time: 3m 21s
	Train Loss: 3.060
	 Val. Loss: 2.141


100%|██████████| 2339/2339 [03:03<00:00, 12.76it/s]


Epoch: 02 | Time: 3m 16s
	Train Loss: 1.866
	 Val. Loss: 1.647


100%|██████████| 2339/2339 [03:10<00:00, 12.26it/s]


Epoch: 03 | Time: 3m 23s
	Train Loss: 1.500
	 Val. Loss: 1.460


100%|██████████| 2339/2339 [03:12<00:00, 12.18it/s]


Epoch: 04 | Time: 3m 25s
	Train Loss: 1.317
	 Val. Loss: 1.363


100%|██████████| 2339/2339 [03:10<00:00, 12.31it/s]


Epoch: 05 | Time: 3m 23s
	Train Loss: 1.206
	 Val. Loss: 1.314


In [68]:
# Restore the weights saved by the training loop at its best validation loss.
# NOTE(review): torch.load is called without map_location, so the checkpoint is
# restored onto the device it was saved from.
path = 'best_model.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [69]:
def translate_sentence(model, sentence):
    """Translate one Russian sentence into English with greedy decoding.

    Args:
        model: a trained ``seq2seq`` instance; it is switched to eval mode.
        sentence: raw Russian text.

    Returns:
        The predicted English tokens joined by single spaces.  An input that
        produces no tokens (e.g. the empty string) yields ``""``.
    """
    model.eval()
    with torch.no_grad():
        tokens = [t.lower_ for t in nlp_ru(sentence)]
        if not tokens:
            # A zero-length sequence cannot be fed through the GRU encoder.
            return ""

        token_ids = [SRC.vocab.stoi[t] for t in tokens]

        # Batch of one: tokens shaped (src_len, 1) plus the matching length tensor.
        sentence_length = torch.LongTensor([len(token_ids)]).to(model.device)
        tensor = torch.LongTensor(token_ids).unsqueeze(1).to(model.device)

        # Passing None as the target switches the model to inference mode.
        translation_tensor_logits = model((tensor, sentence_length), None)
        predicted_ids = torch.argmax(translation_tensor_logits.squeeze(1), 1)

        # Row 0 of the logits is the all-zero <sos> placeholder, so skip it.
        words = [TRG.vocab.itos[i] for i in predicted_ids[1:].tolist()]
        return " ".join(words)

In [78]:
# Quick smoke test on a single short sentence.
sentence = ('я человек')
response = translate_sentence(model, sentence)
print(response)

i 'm a man


In [71]:
# Evaluation sentences; the `rus` column is translated below.
test_df = pd.read_csv('./data/translation.csv')

In [72]:
# Translate every Russian sentence one at a time (batch size 1) with a progress bar.
attn_translations = [translate_sentence(model, sent) for sent in tqdm(test_df["rus"])]

100%|██████████| 46668/46668 [01:43<00:00, 449.79it/s]


In [73]:
# Store the attention model's output next to the existing columns and show a
# random sample of 20 rows for manual comparison.
test_df["attn_translations"] = attn_translations
test_df.sample(20)

Unnamed: 0,rus,eng,translations,attn_translations
39417,все на меня смотрят,everyone is looking at me,everybody 's looking at me,everybody is looking at me
15923,я услышал позади себя шум,i heard a noise behind me,i heard a noise,i heard myself all the noise
17578,мы подтолкнули машину,we gave the car a push,we 're fixing the car,we 're the car
33961,том — хороший мальчик,tom is a good boy,tom is a good boy,tom is a good boy
32653,ты это переживёшь,you'll get over it,did you sign that,you deserved it
30,я принёс обед,i've brought lunch,i brought lunch,i brought lunch
41396,это можно было понять,it was understandable,it can happen,it was just to understand
36272,как ты познакомилась с томом,how did you and tom become acquainted,how did you get to tom,how did you get to get with tom
25925,крыша действительно нуждается в ремонте,the roof is really in need of repair,the roof of the roof is going to play,the roof is really good at <unk>
9175,дай их ему,give them to him,give them to him,give them it
