In [7]:
import random
import time
from typing import Dict, List, Tuple, Iterable, Sized

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch CPU and CUDA) and ask cuDNN for deterministic kernels.
SEED = 42
torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Используемое устройство: {device}")

Используемое устройство: cuda


In [8]:
from typing import Iterable

class logger:
    """Opt-in call tracer for debugging functions that work with tensors.

    Decorate a function with ``@logger.trace("name")``. While tracing is
    switched on (``logger.on()``), every call appends the arguments and the
    result to ``log_file``; tensors are logged with shape, dtype and device.
    The messages are also printed after ``logger.silent(False)``.
    All state lives on the class, so the tracer is used without instantiation.
    """
    active = False
    _calls_ = {}
    log_file = "logger_output.txt"  # Default log file path
    # Console-echo flag. It is stored under a private name because the public
    # name `silent` is the setter classmethod below; sharing one name made the
    # method shadow the flag and broke the second call to `logger.silent(...)`.
    _silent = True

    @classmethod
    def on(cls):
        """Enable tracing of decorated functions."""
        cls.active = True

    @classmethod
    def off(cls):
        """Disable tracing; decorated functions run without logging."""
        cls.active = False

    @classmethod
    def silent(cls, silent: bool = True):
        """Choose whether messages go only to the log file (True) or are also printed (False)."""
        cls._silent = silent

    @classmethod
    def zero(cls):
        """Reset the per-name call counters."""
        cls._calls_ = {}

    @classmethod
    def clear_log(cls):
        """Truncate the log file."""
        with open(cls.log_file, 'w', encoding='utf-8') as f:
            f.write("")

    @classmethod
    def write_log(cls, msg):
        """Append `msg` and a newline to the log file."""
        with open(cls.log_file, 'a', encoding='utf-8') as f:
            f.write(msg + "\n")

    @classmethod
    def _emit(cls, msg):
        """Write `msg` to the log file and echo it to stdout unless silenced."""
        if not cls._silent:
            print(msg)
        cls.write_log(msg)

    @staticmethod
    def _tensor_line(label, tensor):
        """Format one tensor as a log entry: metadata line followed by its values."""
        return (f'\t {label}: shape={tensor.shape}, dtype={tensor.dtype}, '
                f'device={tensor.device}\n {tensor} \n')

    @classmethod
    def trace(cls, name):
        """Return a decorator that logs calls of the wrapped function under `name`.

        Args:
            name: label used in the log and as the key of the call counter.

        Returns:
            A decorator; the wrapped function returns exactly what the
            original function returns.
        """
        import functools

        def log_fn(func):
            @functools.wraps(func)  # keep the traced function's name and docstring
            def wrapper(*args, **kwargs):
                if cls.active:
                    call_no = cls._calls_.setdefault(name, 0)
                    msg = f'>>> {name} call {call_no}: \n Args: \n'
                    for i, arg in enumerate(args):
                        if isinstance(arg, torch.Tensor):
                            msg += cls._tensor_line(f'arg[{i}]', arg)
                        else:
                            msg += f'\t arg[{i}]: {arg}\n'

                    for k, arg in kwargs.items():
                        if isinstance(arg, torch.Tensor):
                            msg += cls._tensor_line(f'kwarg[{k}]', arg)
                        else:
                            msg += f'\t kwarg[{k}]: {arg} \n'

                    cls._emit(msg)

                result = func(*args, **kwargs)

                if cls.active:
                    msg = 'Result: \n'
                    # Tensors are checked first: they are iterable themselves,
                    # and iterating a 0-d tensor raises TypeError.
                    if isinstance(result, torch.Tensor):
                        msg += cls._tensor_line('output', result)
                    # Only tuples/lists are treated as "several outputs", so
                    # strings are not split into characters and generators are
                    # not consumed by the logger.
                    elif isinstance(result, (tuple, list)):
                        for i, outp in enumerate(result):
                            if isinstance(outp, torch.Tensor):
                                msg += cls._tensor_line(f'output[{i}]', outp)
                            else:
                                msg += f'\t output[{i}]: {outp}\n'
                    else:
                        msg += f'\t output: {result}\n'

                    cls._emit(msg)
                    # .get(): tracing may have been switched on during the call,
                    # in which case the counter entry does not exist yet.
                    cls._calls_[name] = cls._calls_.get(name, 0) + 1

                return result
            return wrapper

        return log_fn

# 1. Токенизатор

In [140]:
from transformers import AutoTokenizer, AutoModelForPreTraining
# Load the pretrained tokenizer that belongs to the "google/mobilebert-uncased" checkpoint.
# Tokenizer API reference: https://huggingface.co/docs/transformers/main_classes/tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

In [4]:
# Keep the tokenizer's vocabulary size in a notebook-level constant and report it.
VOCAB_SIZE = tokenizer.vocab_size
print(f'Vocabulary size: {VOCAB_SIZE}')

Vocabulary size: 30522


In [10]:
def collect_special_tokens(tokenizer: "AutoTokenizer") -> Dict:
    """Collect the special tokens a tokenizer defines, together with their ids.

    For each of PAD, BOS, EOS, UNK, SEP and CLS the attributes
    ``<name>_token`` and ``<name>_token_id`` are looked up on `tokenizer`.
    Tokens that are missing (attribute absent or ``None``) are reported on
    stdout and skipped; the ones that exist are printed as a summary.

    Args:
        tokenizer: object exposing ``pad_token``, ``pad_token_id``, ... attributes.

    Returns:
        Dict with two entries per available token:
        ``{'PAD': '[PAD]', 'PAD_id': 0, ...}``.
    """
    special_tokens = ['PAD', 'BOS', 'EOS', 'UNK', 'SEP', 'CLS']
    outp = {}
    for name in special_tokens:
        attr = f'{name.lower()}_token'
        # Default of None: a tokenizer object without the attribute is treated
        # the same way as one whose token is unset.
        token = getattr(tokenizer, attr, None)
        if token is not None:
            outp[name] = token
            outp[f'{name}_id'] = getattr(tokenizer, f'{attr}_id', None)
        else:
            print(f'{name} token  is not present in tokenizer')

    for name in special_tokens:
        if name in outp:
            # The id is pulled into a local first: reusing the same quote
            # character inside an f-string field is a SyntaxError before Python 3.12.
            token_id = outp[f'{name}_id']
            print(f'{name} token: token {outp[name]}, id {token_id}')

    return outp

# Gather the special tokens of the loaded tokenizer (also prints a summary).
special_tokens = collect_special_tokens(tokenizer)

# Publish every entry as a notebook-level global (e.g. `PAD`, `PAD_id`),
# so that later cells can refer to the tokens by bare name.
for k, v in special_tokens.items():
    globals()[k] = v

BOS token  is not present in tokenizer
EOS token  is not present in tokenizer
PAD token: token [PAD], id 0
UNK token: token [UNK], id 100
SEP token: token [SEP], id 102
CLS token: token [CLS], id 101


In [11]:
# `PAD` is one of the globals created from the `special_tokens` dict.
PAD

'[PAD]'

In [76]:
# Show every special token string known to the tokenizer.
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [5]:
# Sample text used to demonstrate the encode -> decode round trip.
text = 'NLP (Neuro-Linguistic Programming) is a psychological approach that involves analyzing the patterns of thought, language, and behavior to understand how they interact with and influence human experience. There is no scientific evidence supporting the effectiveness of NLP; it is recognized as a pseudoscience.'
# Encode with default settings: the result holds plain Python lists
# (input_ids, token_type_ids, attention_mask).
seq = tokenizer(text)
print(seq)
# Decode the ids back into a string; the special tokens are part of the output.
print(tokenizer.decode(seq['input_ids']))

{'input_ids': [101, 17953, 2361, 1006, 11265, 10976, 1011, 12158, 4730, 1007, 2003, 1037, 8317, 3921, 2008, 7336, 20253, 1996, 7060, 1997, 2245, 1010, 2653, 1010, 1998, 5248, 2000, 3305, 2129, 2027, 11835, 2007, 1998, 3747, 2529, 3325, 1012, 2045, 2003, 2053, 4045, 3350, 4637, 1996, 12353, 1997, 17953, 2361, 1025, 2009, 2003, 3858, 2004, 1037, 18404, 11020, 13684, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] nlp ( neuro - linguistic programming ) is a psychological approach that involves analyzing the patterns of thought, language, and behavior to understand how they interact with and influence human experience. there is no s

In [87]:
# NOTE(review): `texts` is defined here but never used - the tokenizer call below
# still receives `text` (the single string from the previous cell), which is why
# the output contains one sequence. Confirm whether `texts` was intended.
texts = ['NLP (Neuro-Linguistic Programming) is a psychological approach that involves analyzing the patterns of thought, language, and behavior to understand how they interact with and influence human experience.',
         'There is no scientific evidence supporting the effectiveness of NLP; it is recognized as a pseudoscience.']
# Encode to PyTorch tensors, padded/truncated to exactly 64 tokens;
# token_type_ids are dropped and the sequence length is returned as well.
seq = tokenizer(text, max_length=64, padding="max_length",\
                truncation="longest_first", return_tensors="pt",\
                return_token_type_ids = False, return_length=True)
print(seq)
# Decode the first (and only) row of the batch back into a string.
print(tokenizer.decode(seq['input_ids'][0]))

{'input_ids': tensor([[  101, 17953,  2361,  1006, 11265, 10976,  1011, 12158,  4730,  1007,
          2003,  1037,  8317,  3921,  2008,  7336, 20253,  1996,  7060,  1997,
          2245,  1010,  2653,  1010,  1998,  5248,  2000,  3305,  2129,  2027,
         11835,  2007,  1998,  3747,  2529,  3325,  1012,  2045,  2003,  2053,
          4045,  3350,  4637,  1996, 12353,  1997, 17953,  2361,  1025,  2009,
          2003,  3858,  2004,  1037, 18404, 11020, 13684,  1012,   102,     0,
             0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]), 'length': tensor([64])}
[CLS] nlp ( neuro - linguistic programming ) is a psychological approach that involves analyzing the patterns of thought, language, and behavior to understand how they interact with and influence human experi

In [None]:
# The tokenizer does not return a plain dict but its own mapping type,
# which offers dict-like access plus additional methods.
type(seq)

In [89]:
# Print the token strings of the encoded sequence, separated by spaces.
print(*seq.tokens())

[CLS] nl ##p ( ne ##uro - linguistic programming ) is a psychological approach that involves analyzing the patterns of thought , language , and behavior to understand how they interact with and influence human experience . there is no scientific evidence supporting the effectiveness of nl ##p ; it is recognized as a pseudo ##sc ##ience . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]


In [71]:
# List every attribute of the encoding object on one line, three spaces after each name.
print(''.join(f'{attr_name}   ' for attr_name in dir(seq)))

_MutableMapping__marker   __abstractmethods__   __class__   __class_getitem__   __contains__   __copy__   __delattr__   __delitem__   __dict__   __dir__   __doc__   __eq__   __format__   __ge__   __getattr__   __getattribute__   __getitem__   __getstate__   __gt__   __hash__   __init__   __init_subclass__   __ior__   __iter__   __le__   __len__   __lt__   __module__   __ne__   __new__   __or__   __reduce__   __reduce_ex__   __repr__   __reversed__   __ror__   __setattr__   __setitem__   __setstate__   __sizeof__   __slots__   __str__   __subclasshook__   __weakref__   _abc_impl   _encodings   _n_sequences   char_to_token   char_to_word   clear   convert_to_tensors   copy   data   encodings   fromkeys   get   is_fast   items   keys   n_sequences   pop   popitem   sequence_ids   setdefault   to   token_to_chars   token_to_sequence   token_to_word   tokens   update   values   word_ids   word_to_chars   word_to_tokens   words   


In [90]:
# The encoding's entries are also reachable as attributes, e.g. `input_ids`.
print(seq.input_ids)

tensor([[  101, 17953,  2361,  1006, 11265, 10976,  1011, 12158,  4730,  1007,
          2003,  1037,  8317,  3921,  2008,  7336, 20253,  1996,  7060,  1997,
          2245,  1010,  2653,  1010,  1998,  5248,  2000,  3305,  2129,  2027,
         11835,  2007,  1998,  3747,  2529,  3325,  1012,  2045,  2003,  2053,
          4045,  3350,  4637,  1996, 12353,  1997, 17953,  2361,  1025,  2009,
          2003,  3858,  2004,  1037, 18404, 11020, 13684,  1012,   102,     0,
             0,     0,     0,     0]])


# 2. Данные и контейнеры

In [17]:
%%bash
# Fetch the recipes dataset (CSV) into ../data unless it is already there.
# Abort on the first failing command so that a failed download or extraction
# is reported as an error instead of being followed by a success message.
set -euo pipefail

data_dir="../data"
csv_name="1_Recipe_csv.csv"
zip_name="recipes-dataset-64k.zip"
zip_path="$data_dir/$zip_name"
dataset_url="https://www.kaggle.com/api/v1/datasets/download/prashantsingh001/recipes-dataset-64k-dishes"

if [ ! -d "$data_dir" ]; then
    mkdir -p "$data_dir"
    echo "Directory created: $data_dir"
else
    echo "Found data directory: $data_dir"
fi

if [ ! -f "$data_dir/$csv_name" ]; then
    echo "CSV file $csv_name not found..."
    if [ ! -f "$zip_path" ]; then
        echo "Archive $zip_name not found, downloading..."
        # -f: fail on HTTP errors instead of saving the error page as the archive.
        # The download goes to a temporary name and is renamed only on success,
        # so an interrupted transfer is never mistaken for a complete archive.
        curl -fL -o "$zip_path.part" "$dataset_url"
        mv "$zip_path.part" "$zip_path"
        echo "Dataset downloaded: $zip_path"
    else
        echo "Found archive $zip_name"
    fi
    echo "Extracting to $data_dir ..."
    # -o: overwrite without prompting (the cell has no interactive stdin).
    unzip -o "$zip_path" -d "$data_dir"
    echo "Dataset unzipped to: $data_dir"
else
    echo "Found dataset: $data_dir/$csv_name"
fi

Found data directory: ../data
Found dataset: ../data/1_Recipe_csv.csv


In [174]:
import re
import tqdm
from torch.utils.data import Dataset
import pandas as pd
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from typing import List, Dict, Tuple

class RecipeDataset(Dataset):
    """In-memory dataset of (source text, target text) pairs built from a recipe CSV.

    Every CSV row is rendered twice with `process_row`: once from
    `source_columns` and once from `target_columns`. Items are raw strings;
    tokenization happens per batch in `collate_fn`.
    """

    def __init__(
            self, path: str,
            source_columns: List[str],
            target_columns: List[str],
            tokenizer: AutoTokenizer,
            nrows = None,
            device = 'cpu',
            max_len = 256,
            padding_type = "max_length",
            ):
        """
        Args:
            path: CSV file to read.
            source_columns: columns rendered into the source text.
            target_columns: columns rendered into the target text.
            tokenizer: tokenizer called in `collate_fn`; its `pad_token_id` is stored.
            nrows: forwarded to `pd.read_csv` (number of rows to read).
            device: stored on the instance; not used by the methods of this class.
            max_len: `max_length` passed to the tokenizer.
            padding_type: `padding` argument passed to the tokenizer.
        """
        self.source_columns = source_columns
        self.target_columns = target_columns

        frame = pd.read_csv(path, usecols=source_columns + target_columns, nrows=nrows)

        self.src_ = []
        self.tgt_ = []
        self.tokenizer = tokenizer
        self.device = device
        self.PAD_idx = tokenizer.pad_token_id
        self.max_len = max_len
        self.padding_type = padding_type

        # Render both texts of every row up front; tqdm shows the progress.
        for row_no in tqdm.trange(len(frame)):
            record = frame.iloc[row_no]
            self.src_.append(self.process_row(record, source_columns))
            self.tgt_.append(self.process_row(record, target_columns))

        self.size = len(self.src_)

    def process_row(self, row: pd.Series, columns: List[str]):
        """Render the non-missing `columns` of `row` as "name: value" lines.

        Underscores in column names become spaces. Values that look like a
        bracketed list have the characters matched by the regex below removed.
        Lines are joined with a newline; the result is '' if nothing is present.
        """
        lines = []
        for col in columns:
            value = row[col]
            if not pd.notna(value):
                continue
            text = str(value)
            if text.startswith('[') and text.endswith(']'):
                text = re.sub(r'["\\$$\\\\$$]', '', text)
            lines.append(f'{col.replace("_", " ")}: {text}')
        return '\n'.join(lines)

    def __len__(self) -> int:
        """Number of (source, target) pairs."""
        return self.size

    def __getitem__(self, idx) -> Tuple[str, str]:
        """Return the raw (source, target) strings stored at `idx`."""
        return self.src_[idx], self.tgt_[idx]

    def single(self, idx):
        """Collate the pair at `idx` as a batch of size one."""
        pair = (self.src_[idx], self.tgt_[idx])
        return self.collate_fn([pair])

    def collate_fn(self, batch: List[Tuple[str, str]]) -> Dict[str, List]:
        """Tokenize a list of (source, target) pairs into one batch dict.

        Sources and targets are encoded separately with the same settings
        (max length, padding type, truncation, PyTorch tensors, lengths).

        Returns:
            Dict with keys 'input_ids', 'attention_mask', 'input_lengths'
            (from the sources) and 'labels', 'labels_lengths' (from the targets).
        """
        def encode(texts):
            return self.tokenizer(texts, max_length=self.max_len,
                                  padding=self.padding_type, truncation=True,
                                  return_tensors="pt", return_length=True)

        source_enc = encode([pair[0] for pair in batch])
        target_enc = encode([pair[1] for pair in batch])

        return {
            'input_ids': source_enc['input_ids'],
            'attention_mask': source_enc['attention_mask'],
            'input_lengths': source_enc['length'],
            'labels': target_enc['input_ids'],
            'labels_lengths': target_enc['length'],
        }

# Dataset configuration: CSV location, which columns form the model input
# and the expected output, and how many rows to read.
DATA_PATH = '../data/1_Recipe_csv.csv'
INPUT_COLS = ['recipe_title', 'category']
TARGET_COLS = ['description', 'ingredients', 'directions']
NSAMPLES = 1024

# Batches are padded to their longest sequence instead of a fixed length.
data = RecipeDataset(
    path=DATA_PATH,
    source_columns=INPUT_COLS,
    target_columns=TARGET_COLS,
    tokenizer=tokenizer,
    nrows=NSAMPLES,
    padding_type="longest",
)


100%|██████████| 1024/1024 [00:00<00:00, 50289.41it/s]


In [175]:
# Show one random (source, target) pair. The index is drawn from the dataset's
# actual size: `nrows=NSAMPLES` is only an upper bound, so a CSV with fewer rows
# would otherwise lead to an IndexError.
idx = np.random.randint(0, len(data))
print(data[idx][0])
print(data[idx][1])

recipe title: Wasabi Shrimp Sushi Cups
category: Allrecipes Allstar Recipes
description: These wasabi shrimp sushi cups are perfect for you if rolling sushi is not in your wheelhouse. Certainly filling a muffin cup is! Prepare sushi rice according to the package instructions, but do not fluff—the success of this dish depends on very sticky rice. Use a silicone muffin pan, if you have one. While the rice cooks, prepare the filling ingredients.
ingredients: [1 tablespoon wasabi paste, 1 teaspoon soy sauce, 2 tablespoons mayonnaise]
directions: [For wasabi aioli; combine wasabi paste, soy sauce, and mayonnaise in a small, zippered food storage bag. Gently squeeze the bag to combine ingredients. Taste and add more wasabi paste, if desired. Seal the bag and refrigerate., Place about 1/8 cup cooked and cooled rice in each of 8 silicone muffin cups. Press rice firmly into the cup, using damp fingertips. Shape some of the rice up the sides of the cup. Refrigerate at least 20 minutes., To serve

In [185]:
# Tokenize the first sample as a batch of size one and display the resulting dict.
data.single(0)

{'input_ids': tensor([[  101, 17974,  2516,  1024,  2250, 14744,  2121, 14557, 25609,  2007,
          23427, 12901,  4696,  1024,  2250, 14744,  2121, 19328,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_lengths': tensor([19]),
 'labels': tensor([[  101,  6412,  1024,  2122,  2250, 14744,  2121, 14557, 25609,  1010,
           2366,  2007,  1037,  5404, 17710, 10649,  6279, 23427, 12901,  1010,
           2024,  1037, 11937, 21756,  4344,  2833,  4873,  2090,  1037,  2413,
          14744,  1998,  1037, 14557,  9090,  1012,  2079,  2202,  1996,  2051,
           2000,  2191,  1996, 23427, 12901,  1517,  2009,  1005,  1055,  4276,
           2009,  1012, 12760,  1024,  1031,  1017,  1013,  1018,  2452, 17710,
          10649,  6279,  1010,  1015,  1013,  1016,  2452,  5404,  1010,  1015,
           7251, 24667,  2078, 20218, 12901,  1010,  1015,  1013,  1016,  5572,
          13102,  7828, 20949,  9898,  1010,  1015,  1013,  1

In [176]:
from torch.utils.data import Subset, DataLoader

def get_dataloaders(dataset: Dataset, collate_fn=None, train_ratio=0.8, val_ratio=0.1, batch_size=1):
    """
    Shuffle a dataset, split it and wrap the parts in DataLoaders.

    The indices are shuffled with the global `random` module (so the split is
    reproducible once `random.seed` has been called), then cut into
    train / validation / test parts; the test part receives the remainder.

    Args:
        dataset: any object with `__len__` and `__getitem__`.
        collate_fn: forwarded to every DataLoader.
        train_ratio: fraction of samples for training.
        val_ratio: fraction of samples for validation.
        batch_size: batch size of all three loaders.

    Returns:
        (train_loader, val_loader, test_loader); only the training loader
        reshuffles its samples on every epoch.
    """
    indices = list(range(len(dataset)))
    random.shuffle(indices)

    train_end = int(len(indices) * train_ratio)
    val_end = int(len(indices) * (train_ratio + val_ratio))

    train_set = Subset(dataset, indices[:train_end])
    val_set = Subset(dataset, indices[train_end:val_end])
    test_set = Subset(dataset, indices[val_end:])

    print("Data split:")
    print(f"Training set size: {len(train_set)}")
    print(f"Validation set size: {len(val_set)}")
    print(f"Test set size: {len(test_set)}")

    # Padding only matters when several samples share a batch, so the reminder
    # is shown for batch sizes above one (the old `>= 1` test always fired).
    if batch_size > 1:
        print("Using batching with padding. Ensure your training loop can handle batched data!")

    train_loader = DataLoader(train_set, batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_set, batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_set, batch_size, shuffle=False, collate_fn=collate_fn)

    print(f"DataLoaders created with batch size {batch_size}.")

    return train_loader, val_loader, test_loader

# Split the recipe dataset and build the three loaders; batches of 8 samples
# are tokenized on the fly by the dataset's own collate function.
train_loader, val_loader, test_loader = get_dataloaders(data, collate_fn=data.collate_fn, batch_size=8)

Data split:
Training set size: 819
Validation set size: 102
Test set size: 103
Using batching with padding. Ensure your training loop can handle batched data!
DataLoaders created with batch size 8.


In [177]:
batch_n = 1

# Print the test batch with index `batch_n` and stop immediately, so that no
# further batch is tokenized after the one of interest has been shown.
for n, X in enumerate(test_loader):
    if n == batch_n:
        print(X)
        break

{'input_ids': tensor([[  101, 17974,  2516,  1024,  2250, 14744,  2121, 13501,  6763,  4696,
          1024,  2035,  2890,  6895, 10374,  2035, 14117, 19328,   102,     0,
             0,     0],
        [  101, 17974,  2516,  1024,  7095,  6548,  2098,  6763,  4696,  1024,
          2035,  2890,  6895, 10374,  2035, 14117, 19328,   102,     0,     0,
             0,     0],
        [  101, 17974,  2516,  1024, 20665,  2004, 28689, 12349, 16521,  4696,
          1024,  2035,  2890,  6895, 10374,  2035, 14117, 19328,   102,     0,
             0,     0],
        [  101, 17974,  2516,  1024, 24514,  2100, 24881, 16521,  2007, 23605,
         14580, 11225,  4696,  1024,  2035,  2890,  6895, 10374,  2035, 14117,
         19328,   102],
        [  101, 17974,  2516,  1024,  2190,  2412,  2630,  9766, 16324,  4696,
          1024,  2035,  2890,  6895, 10374,  2035, 14117, 19328,   102,     0,
             0,     0],
        [  101, 17974,  2516,  1024,  1018,  1011, 21774,  7987,  2271, 2031

# 4.1. Seq2seq без внимания

In [178]:
# Notebook-wide dtype constants.
# NOTE(review): presumably IDTYPE is meant for token ids and FDTYPE for
# floating-point tensors; neither is referenced in the cells shown here - confirm.
IDTYPE = torch.int32
FDTYPE = torch.float32
# Alternative 8-bit float type, currently disabled:
# FDTYPE = torch.float8_e5m2

In [186]:
class Encoder(nn.Module):
    """Encodes a padded batch of embedded sequences with a bidirectional GRU."""
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.):
        """
        Args:
            input_size: size of the input embeddings.
            hidden_size: GRU hidden size per direction.
            num_layers: number of stacked GRU layers.
            dropout: dropout between GRU layers.
        """
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        # Inter-layer dropout has no effect for a single layer and only
        # triggers a warning there, so it is passed on for stacked GRUs only.
        self.rnn = nn.GRU(input_size, hidden_size, num_layers,
                          batch_first=True, bidirectional=True,
                          dropout=dropout if num_layers > 1 else 0.)

    def forward(self, x, mask, lengths):
        """
        Applies a bidirectional GRU to a sequence of embeddings x.

        Args:
            x: embeddings with dimensions [batch, time, dim]; the batch does
               NOT need to be sorted by length.
            mask: accepted for interface compatibility; not used.
            lengths: true (unpadded) length of every sequence, tensor or list.

        Returns:
            output: [batch, time, 2*hidden_size], zero at padded positions;
                    `time` always equals x.size(1).
            hidden: final states, [2*num_layers, batch, hidden_size].
        """
        # Packing needs the lengths as a 1-D int64 tensor on the CPU.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        # enforce_sorted=False: shuffled batches are not ordered by length.
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False)
        output, hidden = self.rnn(packed)
        # total_length keeps the time dimension aligned with x (and its mask)
        # even when the longest sequence is shorter than the padded width.
        output, _ = nn.utils.rnn.pad_packed_sequence(
            output, batch_first=True, total_length=x.size(1))

        return output, hidden


In [184]:
import torch
import torch.nn as nn

class Decoder(nn.Module):
    """A conditional GRU decoder without attention.

    At every time step the GRU receives the current input embedding
    concatenated with a fixed context vector: the final encoder state of the
    top layer with forward and backward directions joined (size 2*hidden_size).
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.5,
                 bridge=True):
        """
        Args:
            input_size: size of the decoder input embeddings.
            hidden_size: GRU hidden size; the encoder is expected to use the
                same size per direction.
            num_layers: number of stacked GRU layers.
            dropout: dropout probability (between GRU layers and in `dropout_layer`).
            bridge: if True, the initial hidden state is computed from the
                final encoder state; otherwise the GRU starts from zeros.
        """
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        # Inter-layer dropout is only meaningful for stacked GRUs; passing it
        # for a single layer merely triggers a warning.
        self.rnn = nn.GRU(input_size + 2*hidden_size, hidden_size, num_layers,
                          batch_first=True,
                          dropout=dropout if num_layers > 1 else 0.)

        # Used to initialise the decoder from the final encoder state.
        self.bridge = nn.Linear(2*hidden_size, hidden_size, bias=True) if bridge else None

        self.dropout_layer = nn.Dropout(p=dropout)

    def forward(self, inputs, encoder_hidden,
                src_mask=None, trg_mask=None, hidden=None, max_len=None):
        """
        Unroll the decoder over a batch of embedded input sequences.

        Args:
            - inputs: torch.Tensor - embedded decoder inputs (batch, seq_len, input_size)
            - encoder_hidden: torch.Tensor - final encoder state, either the raw
              bidirectional GRU state (2*num_layers, batch, hidden_size) or the
              direction-merged form (num_layers, batch, 2*hidden_size);
              None means a zero context
            - src_mask: accepted for interface compatibility; not used
            - trg_mask: only its last dimension is read, as default for max_len
            - hidden: optional initial decoder state (num_layers, batch, hidden_size)
            - max_len: number of steps to unroll; defaults to the width of
              trg_mask, or to inputs.size(1) when no mask is given

        Returns:
            decoder_states: (batch, steps, hidden_size)
            hidden: final decoder state (num_layers, batch, hidden_size)
        """
        if max_len is None:
            max_len = trg_mask.size(-1) if trg_mask is not None else inputs.size(1)

        encoder_final = self._merge_directions(encoder_hidden)
        if hidden is None:
            hidden = self.init_hidden(encoder_final)

        steps = inputs[:, :max_len]
        if encoder_final is None:
            context = steps.new_zeros(steps.size(0), 1, 2 * self.hidden_size)
        else:
            # Summary of the source: top encoder layer, both directions. [B, 1, 2H]
            context = encoder_final[-1].unsqueeze(1)

        # Without attention the context is identical at every step, so the
        # whole sequence can be processed by a single GRU call.
        rnn_input = torch.cat(
            [steps, context.expand(-1, steps.size(1), -1)], dim=2)
        decoder_states, hidden = self.rnn(rnn_input, hidden)
        return decoder_states, hidden # [B, L, H]

    def _merge_directions(self, encoder_hidden):
        """Join forward/backward final states: [2*L, B, H] -> [L, B, 2H].

        A tensor that already has 2*hidden_size features (or None) is returned
        unchanged. In the raw GRU state, even indices hold the forward and odd
        indices the backward direction of each layer.
        """
        if encoder_hidden is None or encoder_hidden.size(-1) == 2 * self.hidden_size:
            return encoder_hidden
        return torch.cat([encoder_hidden[0::2], encoder_hidden[1::2]], dim=2)

    def init_hidden(self, encoder_final):
        """Returns the initial decoder state,
        conditioned on the final encoder state.

        Returns None (the GRU then starts from zeros) when there is no encoder
        state or the decoder was built with bridge=False.
        """

        if encoder_final is None or self.bridge is None:
            return None

        return torch.tanh(self.bridge(self._merge_directions(encoder_final)))

In [None]:
class Seq2Seq(nn.Module):
    """
    A standard Seq2Seq architecture: shared token embedding, encoder, decoder
    and a projection of the decoder states onto the vocabulary.
    """
    def __init__(self,
                 tokenizer,
                 embed_dim,
                 hidden_size,
                 n_layers,
                 dropout,
                 EncoderCls,
                 DecoderCls,
                 tie_projection=False
                ):
        """
        Args:
            - tokenizer: AutoTokenizer - provides `vocab_size` and `pad_token_id`
            - embed_dim: int - size of embedding dimension
            - hidden_size: int - hidden size of encoder and decoder
            - n_layers: number of layers in rnn
            - dropout: float - dropout probability passed to encoder and decoder
            - EncoderCls: encoder class, constructed as
              EncoderCls(input_size, hidden_size, num_layers, dropout)
            - DecoderCls: decoder class, constructed as
              DecoderCls(input_size, hidden_size, num_layers, dropout)
            - tie_projection: bool - if true, embedding and projection layer will use same weights
        """
        super(Seq2Seq, self).__init__()
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size
        self.dropout = dropout

        # Id of the padding token; falls back to 0 when the tokenizer has none.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        self.pad_id = 0 if pad_id is None else pad_id

        self.embedding = nn.Embedding(self.vocab_size, embed_dim, padding_idx=self.pad_id)
        self.encoder = EncoderCls(embed_dim, hidden_size, n_layers, dropout)
        # The decoder consumes embeddings, hence input_size=embed_dim.
        self.decoder = DecoderCls(embed_dim, hidden_size, n_layers, dropout)
        # Decoder states are first mapped back to the embedding size, so that
        # the vocabulary projection has the same shape as the embedding matrix
        # and the two can share weights.
        self.pre_projection = nn.Linear(hidden_size, embed_dim)
        self.projection = nn.Linear(embed_dim, self.vocab_size)

        if tie_projection:
            self.projection.weight = self.embedding.weight

    def forward(self, tokens, src_mask=None, src_lengths=None, tgt_mask=None, tgt_lengths=None,
                tgt_tokens=None):
        """
        Teacher-forced forward pass.

        Args:
            - tokens: torch.Tensor[int] : input (source) tokens [B, S]
            - src_mask: optional mask of valid source positions
            - src_lengths: optional source lengths; derived from src_mask, or
              from the non-padding tokens, when omitted
            - tgt_mask: optional target mask, forwarded to the decoder
            - tgt_lengths: accepted for interface compatibility; not used
            - tgt_tokens: torch.Tensor[int] : decoder input tokens [B, T] (required)

        Returns:
            logits [B, T, vocab_size]; logits[:, t] is computed after feeding
            tgt_tokens[:, t], so targets must be shifted by one position when
            the loss is computed.

        Raises:
            ValueError: if tgt_tokens is not given.
        """
        if tgt_tokens is None:
            raise ValueError("tgt_tokens is required: the decoder is run with teacher forcing")

        if src_lengths is None:
            if src_mask is not None:
                src_lengths = src_mask.reshape(src_mask.size(0), -1).sum(dim=1)
            else:
                src_lengths = (tokens != self.pad_id).sum(dim=1)

        encoder_hidden, encoder_final = self.encode(tokens, src_mask, src_lengths)
        decoder_states, _ = self.decode(encoder_hidden, encoder_final, src_mask,
                                        tgt_tokens, tgt_mask)

        logits = self.projection(self.pre_projection(decoder_states))
        return logits

    def encode(self, src, src_mask, src_lengths):
        """Embed the source tokens and run the encoder; returns (outputs, final state)."""
        return self.encoder(self.embedding(src), src_mask, src_lengths)

    def decode(self, encoder_hidden, encoder_final, src_mask, trg, trg_mask,
               decoder_hidden=None):
        """Embed the target tokens and run the decoder from the final encoder state.

        `encoder_hidden` (the per-position encoder outputs) is not used by a
        decoder without attention; it stays in the signature for compatibility.
        Returns (decoder states, final decoder state).
        """
        return self.decoder(self.embedding(trg), encoder_final,
                            src_mask, trg_mask, hidden=decoder_hidden)

    def calculate_loss(self, loss_fn, src_mask, trg_mask, src_lengths, trg_lengths):
        """Placeholder: loss computation is not implemented yet (returns None)."""

        pass


In [None]:
# Build the attention-free baseline from the classes defined in this notebook.
# NOTE(review): the hyper-parameter values below are placeholders that complete
# the previously unterminated call - tune them for real training runs.
model = Seq2Seq(
    tokenizer,
    embed_dim=128,
    hidden_size=256,
    n_layers=1,
    dropout=0.1,
    EncoderCls=Encoder,
    DecoderCls=Decoder,
)

# 4.2. Цикл обучения и другие рутины

# 4.3. Seq2seq с вниманием

In [None]:
class BahdanauAttention(nn.Module):
    """Implements Bahdanau (additive, MLP-based) attention."""

    def __init__(self, hidden_size, key_size=None, query_size=None):
        """
        Args:
            hidden_size: size of the internal attention space.
            key_size: feature size of the encoder states; defaults to 2*hidden_size.
            query_size: feature size of the decoder state; defaults to hidden_size.
        """
        super(BahdanauAttention, self).__init__()

        # The encoder is a bidirectional GRU, hence key_size = 2*hidden_size.
        key_size = 2 * hidden_size if key_size is None else key_size
        query_size = hidden_size if query_size is None else query_size

        self.key_layer = nn.Linear(key_size, hidden_size, bias=False)
        self.query_layer = nn.Linear(query_size, hidden_size, bias=False)
        self.energy_layer = nn.Linear(hidden_size, 1, bias=False)

        # to store attention scores
        self.scores = None

    def forward(self, query=None, proj_key=None, value=None, mask=None):
        """Compute the context vector for one decoder step.

        Args:
            query: decoder state, [B, 1, query_size].
            proj_key: encoder states already passed through `key_layer`, [B, M, hidden_size].
            value: encoder states, [B, M, key_size].
            mask: non-zero at valid source positions; [B, 1, M] or [B, M].

        Returns:
            context [B, 1, key_size] and attention weights [B, 1, M].
            A row whose mask is entirely zero yields NaN weights.
        """
        assert mask is not None, "mask is required"

        # Accept plain [B, M] attention masks as produced by the tokenizer.
        if mask.dim() == 2:
            mask = mask.unsqueeze(1)

        # We first project the query (the decoder state).
        # The projected keys (the encoder states) were already pre-computed.
        query = self.query_layer(query)

        # Calculate scores.
        scores = self.energy_layer(torch.tanh(query + proj_key))
        scores = scores.squeeze(2).unsqueeze(1)

        # Mask out invalid positions.
        # The mask marks valid positions, so the entries where `mask == 0` are
        # filled with -inf. The out-of-place masked_fill keeps the operation
        # visible to autograd (unlike an in-place edit of `.data`).
        scores = scores.masked_fill(mask == 0, float('-inf'))

        # Turn scores to probabilities.
        self.scores = torch.softmax(scores, dim=-1)

        # The context vector is the weighted sum of the values.
        context = torch.bmm(self.scores, value)

        # context shape: [B, 1, 2D], alphas shape: [B, 1, M]
        return context, self.scores


In [None]:
# Реализуйте и обучите Seq2Seq с вниманием

# YOUR CODE

# 5. Контрольные вопросы

In [None]:
# Насколько корректно модель предсказывает рецепты из тренировочной выборки?

In [None]:
# Насколько корректно модель предсказывает рецепты из тестовой выборки?

In [None]:
# Что будет, если подать рецепт из категорий, на которых модель не обучалась?

In [None]:
# Что будет, если подать промпт, не предполагающий наличие рецепта?