# RNN & Attention: HW

Привет! Это твоё домашнее задание: сделать модель, которая может переводить тексты с немецкого языка в англиский. Для обучения будет использоваться датасет [wmt-14](https://huggingface.co/datasets/wmt14). Для проверки будет использоваться BLEU на тестовой выборке и 10 примеров перевода вашей модели. В этом ноутбуке есть скелет для обучения модели трансформера. Но вы можете пользоваться и RNN, если вы считаете что можете обучить её под эту задачу. Главное -- получить `submission.yaml`, используя нейросети.

**!Внимание!** В этой домашней работе нельзя пользоваться библиотекой `transformers`.


In [1]:
import subprocess
import sys


IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    subprocess.run("pip install datasets nltk gensim einops evaluate", shell=True)
    subprocess.run("python -m nltk.downloader punkt", shell=True)

In [3]:
import torch
import nltk
import einops
import evaluate
import numpy as np

from datasets import load_dataset

In [4]:
bleu = evaluate.load("bleu")

# Данные

В этой части подготовьте данные для обучения. Не забудьте добавить "BOS", "EOS" и "UNK" токены в ваши словари.


In [77]:
wmt14 = load_dataset("wmt14", "de-en")

Found cached dataset wmt14 (/home/natitov/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [78]:
wmt14["train"][0]

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode',
  'en': 'Resumption of the session'}}

In [79]:
tokenizer = nltk.WordPunctTokenizer()
lemmatizer = nltk.WordNetLemmatizer()
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/natitov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [80]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/natitov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [81]:
def tokenize_pipeline(sentence):
    tokens = tokenizer.tokenize(sentence.lower())
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]
    return ['BOS'] + lemmatized_tokens + ['EOS']

In [83]:
tokenized_en = [tokenize_pipeline(t["translation"]["en"]) for t in wmt14["train"].select(range(100000))]
tokenized_de = [tokenize_pipeline(t["translation"]["de"]) for t in wmt14["train"].select(range(100000))]

In [84]:
all_tokenized_en_words = set(w for t in tokenized_en for w in t)
all_tokenized_de_words = set(w for t in tokenized_de for w in t)

In [85]:
en_words_to_ids = {w: idx+4 for idx, w in enumerate(all_tokenized_en_words)}
de_words_to_ids = {w: idx+4 for idx, w in enumerate(all_tokenized_de_words)}

In [86]:
en_words_to_ids["PAD"] = 0
en_words_to_ids["BOS"] = 1
en_words_to_ids["EOS"] = 2
en_words_to_ids["UNK"] = 3

de_words_to_ids["PAD"] = 0
de_words_to_ids["BOS"] = 1
de_words_to_ids["EOS"] = 2
de_words_to_ids["UNK"] = 3

In [87]:
len(en_words_to_ids), len(de_words_to_ids)

(22317, 67707)

In [37]:
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, data, en_words_to_ids, de_words_to_ids):
        self.data = data
        self.en_words_to_ids = en_words_to_ids
        self.de_words_to_ids = de_words_to_ids
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        en_sentence = self.data[index]["translation"]["en"]
        de_sentence = self.data[index]["translation"]["de"]
        
        # 3 -- for UNK token
        en_tokens = [self.en_words_to_ids.get(w, 3) for w in en_sentence]
        de_tokens = [1] + [self.de_words_to_ids.get(w, 3) for w in de_sentence] + [2]
        return {'de': de_tokens, 'en': en_tokens}

In [38]:
train_dataset = TranslationDataset(wmt14["train"].select(range(100000)), en_words_to_ids, de_words_to_ids)
valid_dataset = TranslationDataset(wmt14["validation"], en_words_to_ids, de_words_to_ids)
test_dataset = TranslationDataset(wmt14["test"], en_words_to_ids, de_words_to_ids)

  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/3000 [00:00<?, ?ex/s]

  0%|          | 0/3003 [00:00<?, ?ex/s]

In [41]:
def collate_fn(batch):
    de_batch = []
    en_batch = []
    for item in batch:
        de_batch.append(torch.tensor(item['de']))
        en_batch.append(torch.tensor(item['en']))
    
    en_batch = torch.nn.utils.rnn.pad_sequence(en_batch, padding_value = en_words_to_ids['PAD'], batch_first=True)
    de_batch = torch.nn.utils.rnn.pad_sequence(de_batch, padding_value = de_words_to_ids['PAD'], batch_first=True)
    return de_batch, en_batch


In [42]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=collate_fn, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, collate_fn=collate_fn)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

In [43]:
next(iter(train_dataloader))[1]

tensor([[   2,   36,    6,  ...,    0,    0,    0],
        [   2,  254,  916,  ..., 1083,  104,    3],
        [   2,   32,  252,  ...,    0,    0,    0],
        ...,
        [   2,  254,  273,  ...,    0,    0,    0],
        [   2,    7,  119,  ...,    0,    0,    0],
        [   2,  254,   37,  ...,    0,    0,    0]])

# Model

Сделайте модель, которая может в перевод. Для этой модели потребуется сделать `Encoder` и `Decoder`. Первый будет брать текст на немецком и отдавать информацию про него. Decoder будет брать информацию про немецкий текст и превращать его в английский.


In [26]:
# Если вам нужны дополнительные модули, такие как Attention или Transformer layer, то можете добавить их сюда

Для слоев Encoder можете скопировать код из семинара:


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [45]:
import math

def attention(K, V, Q, num_heads, mask=None):
    batch_size, q_len, hidden_dim = Q.size()
    k_len, v_len = K.size(1), V.size(1) 
    K = K.reshape(batch_size, k_len, num_heads, -1) 
    V = V.reshape(batch_size, v_len, num_heads, -1) 
    Q = Q.reshape(batch_size, q_len, num_heads, -1)   
    energy = torch.einsum('bqhd,bkhd->bhqk', [Q, K]) 
    if mask is not None:
        energy = energy.masked_fill(mask == 0, -torch.inf)
    attention = torch.softmax(energy / math.sqrt(hidden_dim // num_heads), dim=3)
    result_headed = torch.einsum('bhql,blhd->bqhd', [attention, V])
    return result_headed.reshape(batch_size, q_len, hidden_dim)

In [46]:
class AttentionModule(torch.nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super().__init__()

        self.q_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.k_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = torch.nn.Linear(hidden_dim, hidden_dim)

        self.out_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.num_heads = num_heads
    
    def forward(self, key, value, query, mask):
        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)
        attention_output = attention(K, V, Q, self.num_heads, mask)
        return self.out_linear(attention_output) + query



class MLP(torch.nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        
        self.linear_0 = torch.nn.Linear(hidden_dim, 4 * hidden_dim)
        self.linear_1 = torch.nn.Linear(4 * hidden_dim, hidden_dim)
    
    def forward(self, hidden_state):
        return self.linear_1(torch.relu(self.linear_0(hidden_state))) + hidden_state



class TransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        
        self.attention_layer = AttentionModule(hidden_dim, num_heads)
        self.mlp_layer = MLP(hidden_dim)
        
        self.norm = torch.nn.LayerNorm(hidden_dim)
        self.dropout = torch.nn.Dropout(0.1)
        
    def forward(self, value, key, query, mask):
        attn_output = self.dropout(self.norm(self.attention_layer(value, key, query, mask)))
        mlp_output  = self.dropout(self.norm(self.mlp_layer(attn_output)))
        return mlp_output


class AutoregressiveTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        
        self.self_attention = AttentionModule(hidden_dim, num_heads)
        self.out_attention = AttentionModule(hidden_dim, num_heads)
        self.mlp = MLP(hidden_dim)
        
        self.norm = torch.nn.LayerNorm(hidden_dim)
        self.dropout = torch.nn.Dropout(0.1)
                
    def forward(self, hidden_state, encoder_layer_output, source_mask, target_mask):
        self_attn = self.dropout(self.norm(self.self_attention(hidden_state, hidden_state, hidden_state, target_mask)))
        out_attn = self.dropout(self.norm(self.out_attention(encoder_layer_output, encoder_layer_output, self_attn, source_mask)))
        mlp_output = self.dropout(self.norm(self.mlp(out_attn)))
        return mlp_output
        

In [49]:
class Encoder(torch.nn.Module):
    def __init__(self, embedding_size, hidden_dim, max_seq_len, num_heads, num_layers):
        super().__init__()
        
        self.word_embedding = torch.nn.Embedding(embedding_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        
        self.attention_layer_0 = TransformerLayer(hidden_dim, num_heads)
        self.attention_layer_1 = TransformerLayer(hidden_dim, num_heads)
        self.attention_layer_2 = TransformerLayer(hidden_dim, num_heads)
        
        self.dropout = torch.nn.Dropout(0.1)
        
    def forward(self, inputs, mask):
        batch_size, seq_len = inputs.shape
        word_embs = self.word_embedding(inputs)
        pos_embs = self.pos_embedding(torch.arange(0, seq_len).expand(batch_size, seq_len).to(device))
        embs =  word_embs + pos_embs
        hidden_state = self.dropout(embs)
        
        hidden_state = self.attention_layer_0(hidden_state, hidden_state, hidden_state, mask)
        hidden_state = self.attention_layer_1(hidden_state, hidden_state, hidden_state, mask)
        hidden_state = self.attention_layer_2(hidden_state, hidden_state, hidden_state, mask)
        
        return hidden_state

Для Decoder слоя потребуется модифицировать код. Не забудьте, что для декодера требуется другой механизм внимания.


In [51]:
class Decoder(torch.nn.Module):
    def __init__(self, embedding_size, hidden_dim, max_seq_len, num_heads, num_layers):
        super().__init__()
        
        self.word_embedding = torch.nn.Embedding(embedding_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        
        self.attention_layer_0 = AutoregressiveTransformerLayer(hidden_dim, num_heads)
        self.attention_layer_1 = AutoregressiveTransformerLayer(hidden_dim, num_heads)
        self.attention_layer_2 = AutoregressiveTransformerLayer(hidden_dim, num_heads)
        
        self.linear = torch.nn.Linear(hidden_dim, embedding_size)
        self.dropout = torch.nn.Dropout(0.1)
        
    def forward(self, input_ids, encoder_output, source_mask, target_mask):
        batch_size, seq_len = input_ids.shape
        position_ids = torch.arange(0, seq_len).expand(batch_size, seq_len).to('cuda')
        word_embds = self.word_embedding(input_ids)
        pos_embds = self.pos_embedding(position_ids)
        hidden_state = self.dropout(word_embds + pos_embds)
        hidden_state = self.attention_layer_0(hidden_state, encoder_output, src_mask, trg_mask)
        hidden_state = self.attention_layer_1(hidden_state, encoder_output, src_mask, trg_mask)
        hidden_state = self.attention_layer_2(hidden_state, encoder_output, src_mask, trg_mask)
        return self.linear(outputs)

In [None]:
def source_mask(x):
    mask = (x != en_words_to_ids['PAD']).unsqueeze(1).unsqueeze(2)
    return mask

In [None]:
def target_mask(input):
    batch_size, inpt_len = input.shape
    mask = (input != en_words_to_ids['PAD']).unsqueeze(1).unsqueeze(2)
    sub_mask = torch.tril(torch.ones((inpt_len, inpt_len))).expand(batch_size, 1, inpt_len, inpt_len).bool()
    mask = mask.to('cpu')
    mask = mask & sub_mask
    mask = mask.to(device)
    return mask

In [53]:
class TranslateTransformer(torch.nn.Module):
    def __init__(self, source_embeddibg_size, target_embedding_size, hidden_dim, num_heads, max_seq_len, num_layers):
        super().__init__()
        
        self.encoder = Encoder(source_embeddibg_size, hidden_dim, max_seq_len, num_heads, num_layers)
        self.decoder = Decoder(target_embedding_size, hidden_dim, max_seq_len, num_heads, num_layers)
        
    def forward(self, inputs):
        x, y = inputs
        mask_orig = source_mask(x)
        mask_translation = target_mask(y)
        encoder_output = self.encoder(x, mask_orig)
        decoder_output = self.decoder(y, encoder_output, mask_orig, mask_translation)
        return decoder_output

In [54]:
model =  TranslateTransformer(len(en_words_to_ids), len(de_words_to_ids), 64, 4, 256, 3)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index = en_words_to_ids['PAD'])
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

In [64]:
from tqdm import tqdm

def train_epoch(model, data_loader, criterion, optimizer, scheduler, device):
    epoch_loss = 0
    for x, y in tqdm(data_loader):
        x = x.to(device)
        y = y.to(device)
        
        optimizer.zero_grad()
        preds = model((x,y[:, : -1]))
        y = y[:, 1:].contiguous().view(-1)
        en_vocab_len = preds.size(-1)
        preds = preds.view(-1, en_vocab_len)
        
        loss = loss_function(preds, y)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    scheduler.step()
    
    return epoch_loss / len(data_loader)


def eval_epoch(model, data_loader, criterion, device):
    epoch_loss = 0
    for x, y in tqdm(data_loader):
        with torch.no_grad():
            x = x.to(device)
            y = y.to(device)

            preds = model((x,y[:, : -1]))
            y = y[:, 1:].contiguous().view(-1)
            en_vocab_len = preds.size(-1)
            preds = preds.view(-1, en_vocab_len)

            loss = loss_function(preds, y)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [67]:
model.to(device)
criterion.to(device)
num_epochs = 5


for i in range(num_epochs):
    train_metrics = train_epoch(model, train_dataloader, criterion, optimizer, scheduler, device)
    eval_metrics = eval_epoch(model, valid_dataloader, criterion, device)
    print(f"EPOCH: {i+1}")
    print(f'Train loss: {train_metrics}')
    print(f'Val loss: {eval_metrics}')


  0%|          | 0/3125 [00:00<?, ?it/s]

EPOCH: 1
Train loss: 1.7527447938919067
Val loss: 2.0938753878816647


  0%|          | 0/3125 [00:00<?, ?it/s]

EPOCH: 2
Train loss: 1.8044270277023315
Val loss: 2.0920537951144764


  0%|          | 0/3125 [00:00<?, ?it/s]

EPOCH: 3
Train loss: 1.5102092027664185
Val loss: 2.0921324313955103


  0%|          | 0/3125 [00:00<?, ?it/s]

EPOCH: 4
Train loss: 1.5152244567871094
Val loss: 2.0904776415926345


  0%|          | 0/3125 [00:00<?, ?it/s]

EPOCH: 5
Train loss: 1.8482121229171753
Val loss: 2.0902800458542843


Чтобы получить перевод, надо сделать функцию для декодинга. Она будет брать предсказания токена на последней позиции и отдавать нужный токен.


In [68]:
ids_to_en_words = {v: k for k, v in en_words_to_ids.items()}

In [69]:
def get_translation(sentence, model):
    
    sentence = ['BOS'] + list(tokenize_pipeline(sentence)) + ['EOS']
    sentence = torch.tensor([de_words_to_ids[word] if word in de_words_to_ids else de_words_to_ids['UNK'] for word in sentence]).unsqueeze(0).to(device)
    sentence_mask = source_mask(sentence).to(device)
    
    model.eval()
    with torch.no_grad():
        enc_sentence = model.encoder(sentence, sentence_mask)
    
        target = [de_words_to_ids['BOS']]
        target_tokens = []
        while True:
            target_tensor = torch.tensor(target).unsqueeze(0).to(device)
            trg_mask = target_mask(target_tensor)
            
            preds = model.decoder(target_tensor, enc_sentence, sentence_mask, trg_mask)
            last_id = torch.argmax(preds[:, -1, :], dim = -1).item()
            
            if last_id == de_words_to_ids['EOS']:
                break
            target.append(last_id)
            target_tokens.append(ids_to_en_words[last_id])
    return " ".join(target_tokens)

# Result

В качестве результата вы должны предоставить bleu вашей модели на тестовой выборке wmt14 и перевод 10 предложений с немецкого на английский.


In [71]:
references = [[" ".join(reference)] for reference in wmt14["test"]["en_tokens"]]
predictions = [decode(example["de"], model) for example in wmt14["test"]["translation"]]

In [75]:
test_bleu = bleu.compute(predictions=predictions, references=references)
test_bleu['bleu']

0.02294269871806114

In [73]:
de_sentences = [
    "Gutach: Noch mehr Sicherheit für Fußgänger",
    "Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?",
    "Dies bestätigt auch Peter Arnold vom Landratsamt Offenburg.",
    "Daher sei der Bau einer weiteren Ampel mehr als notwendig: \"Sicherheit geht hier einfach vor\", so Arnold.",
    "Pro Fahrtrichtung gibt es drei Lichtanlagen.",
    "Drückt der Fußgänger den Ampelknopf, testet der obere Radarsensor die Verkehrslage.",
    "Ein weiteres Radarsensor prüft, ob die Grünphase für den Fußgänger beendet werden kann.",
    "Josef Winkler schreibt sich seit mehr als 30 Jahren die Nöte seiner Kindheit und Jugend von der Seele.",
    "Dabei scheint Regisseur Fresacher dem Text wenig zu vertrauen.",
    "Sie werden hart angefasst, mit dem Kopf unter Wasser getaucht, mit ihren Abendroben an die Wand getackert.",
]
en_sentences = [decode(src, model) for src in de_sentences]
en_sentences

['UNK : more important .',
 'two questions : " UNK or UNK ?',
 'this also applies to the UNK of the UNK of UNK .',
 'therefore , the main idea of a more important step as a whole : " " " " " " is not a more UNK .',
 'there are three problems of UNK .',
 'the UNK UNK UNK UNK UNK .',
 'a UNK UNK UNK , whether the UNK of the UNK can be taken .',
 'UNK UNK has been UNK as a result of UNK , since the UNK of his UNK and UNK .',
 'the UNK UNK UNK UNK UNK .',
 'they will be UNK with the UNK , with the UNK UNK , with the UNK of their UNK .']

In [76]:
import yaml


submission = {
    "tasks": [
        {"task1": {"answer": test_bleu['bleu']}},
        {"task2": {"answer": en_sentences}}
    ]
}

yaml.safe_dump(submission, open("submission.yaml", "w"))