In [None]:
import random
import time
from typing import Dict, List, Tuple, Iterable, Sized

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

# Fix every random seed the notebook relies on so that runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed all visible GPUs, not only the current one (harmless without CUDA).
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# The cuDNN autotuner may select different kernels from run to run;
# switch it off so that the deterministic flag above is actually effective.
torch.backends.cudnn.benchmark = False

In [None]:
from typing import Iterable

class logger:
    """Opt-in call tracer for debugging tensor code.

    Functions decorated with ``@logger.trace(name)`` have their positional
    arguments, keyword arguments and result appended to ``log_file`` (and
    echoed to stdout unless silenced) while tracing is switched on with
    ``logger.on()``. All state lives on the class, so it is used without
    instantiation.
    """
    active = False
    _calls_ = {}
    log_file = "logger_output.txt"  # Default log file path
    # Console-echo flag. It is stored under a private name because a class
    # attribute called `silent` would be overwritten by the `silent()`
    # classmethod below (and vice versa after the first call).
    _silent = True

    @classmethod
    def on(cls):
        """Enable tracing for all decorated functions."""
        cls.active = True

    @classmethod
    def off(cls):
        """Disable tracing; decorated functions run without any logging."""
        cls.active = False

    @classmethod
    def silent(cls, silent: bool = True):
        """Choose whether messages are only written to the file (True) or also printed (False)."""
        cls._silent = bool(silent)

    @classmethod
    def zero(cls):
        """Reset the per-name call counters."""
        cls._calls_ = {}

    @classmethod
    def clear_log(cls):
        """Truncate the log file."""
        with open(cls.log_file, 'w') as f:
            f.write("")

    @classmethod
    def write_log(cls, msg):
        """Append ``msg`` followed by a newline to the log file."""
        with open(cls.log_file, 'a') as f:
            f.write(msg + "\n")

    @classmethod
    def _emit(cls, msg):
        """Print ``msg`` unless silenced and always append it to the log file."""
        if not cls._silent:
            print(msg)
        cls.write_log(msg)

    @staticmethod
    def _describe(label, value, tail='\n'):
        """Format one logged value; tensors additionally get shape, dtype and device."""
        if isinstance(value, torch.Tensor):
            return (f'\t {label}: shape={value.shape}, dtype={value.dtype}, '
                    f'device={value.device}\n {value} \n')
        return f'\t {label}: {value}{tail}'

    @classmethod
    def trace(cls, name):
        """Return a decorator that logs every call of the wrapped function under ``name``.

        Args:
            name: label used in the log messages and as the call-counter key.

        Returns:
            A decorator; the decorated function returns exactly what the
            original function returns.
        """
        import functools  # local import: keeps this cell independent of the notebook's import cell

        def log_fn(func):
            @functools.wraps(func)  # keep __name__ / __doc__ of the traced function
            def wrapper(*args, **kwargs):
                if cls.active:
                    call_no = cls._calls_.setdefault(name, 0)
                    msg = f'>>> {name} call {call_no}: \n Args: \n'
                    for i, arg in enumerate(args):
                        msg += cls._describe(f'arg[{i}]', arg)
                    for k, arg in kwargs.items():
                        msg += cls._describe(f'kwarg[{k}]', arg, tail=' \n')
                    cls._emit(msg)

                result = func(*args, **kwargs)

                if cls.active:
                    msg = 'Result: \n'
                    if isinstance(result, torch.Tensor):
                        # Checked first: a tensor is itself iterable and would
                        # otherwise be logged row by row.
                        msg += cls._describe('output', result)
                    elif isinstance(result, (list, tuple)):
                        # Only concrete sequences are expanded; iterating an
                        # arbitrary iterable would exhaust generators before
                        # the caller sees them and split strings into characters.
                        for i, outp in enumerate(result):
                            msg += cls._describe(f'output[{i}]', outp)
                    else:
                        msg += cls._describe('output', result)
                    cls._emit(msg)
                    # .get(): the counter may be missing if tracing was enabled
                    # or the counters were reset while `func` was running.
                    cls._calls_[name] = cls._calls_.get(name, 0) + 1

                return result
            return wrapper

        return log_fn

# 1. Токенизатор

In [None]:
from transformers import AutoTokenizer, AutoModelForPreTraining
# Load the tokenizer published under the name "google/mobilebert-uncased".
# Tokenizer API reference: https://huggingface.co/docs/transformers/main_classes/tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

In [None]:
# Vocabulary size as reported by the tokenizer, kept as a notebook-wide constant.
VOCAB_SIZE = tokenizer.vocab_size
print(f'Vocabulary size: {VOCAB_SIZE}')

In [None]:
def collect_special_tokens(tokenizer: "AutoTokenizer") -> Dict:
    """Collect the special tokens a tokenizer defines, together with their ids.

    For each of PAD, BOS, EOS, UNK, SEP and CLS the attributes
    ``<name>_token`` and ``<name>_token_id`` are read from ``tokenizer``.
    A token whose ``<name>_token`` attribute is ``None`` or absent is
    reported on stdout and left out of the result.

    Args:
        tokenizer: tokenizer object exposing the ``*_token`` / ``*_token_id``
            attributes (e.g. the object returned by
            ``AutoTokenizer.from_pretrained``).

    Returns:
        Dict with, for every defined token, the entries ``NAME`` (token
        string) and ``NAME_id`` (token id).
    """
    special_tokens = ['PAD', 'BOS', 'EOS', 'UNK', 'SEP', 'CLS']
    outp = {}
    for name in special_tokens:
        attr = f'{name.lower()}_token'
        token = getattr(tokenizer, attr, None)
        if token is None:
            print(f'{name} token  is not present in tokenizer')
            continue
        outp[name] = token
        outp[f'{name}_id'] = getattr(tokenizer, f'{attr}_id', None)

    for name in special_tokens:
        if name in outp:
            # Looked up outside the f-string: re-using the same quote character
            # inside a replacement field is a SyntaxError before Python 3.12.
            token_id = outp[f'{name}_id']
            print(f'{name} token: token {outp[name]}, id {token_id}')

    return outp

special_tokens = collect_special_tokens(tokenizer)

# Publish every entry (e.g. PAD / PAD_id) as a module-level variable so that
# later cells can refer to the special tokens by name.
for k, v in special_tokens.items():
    globals()[k] = v

In [None]:
# Display the PAD global created from `special_tokens`
# (it exists only if the tokenizer defines a padding token).
PAD

In [None]:
# Display the tokenizer's `all_special_tokens` attribute.
tokenizer.all_special_tokens

In [None]:
# Sample paragraph used to try the tokenizer on a single string.
text = 'NLP (Neuro-Linguistic Programming) is a psychological approach that involves analyzing the patterns of thought, language, and behavior to understand how they interact with and influence human experience. There is no scientific evidence supporting the effectiveness of NLP; it is recognized as a pseudoscience.'
# apply tokenizer:
seq = tokenizer(text)
print(seq)
# decode with tokenizer (round trip from the token ids back to text):
print(tokenizer.decode(seq['input_ids']))

In [None]:
# A batch of two sentences of different length, so that the effect of
# padding and truncation is visible.
texts = ['NLP (Neuro-Linguistic Programming) is a psychological approach that involves analyzing the patterns of thought, language, and behavior to understand how they interact with and influence human experience.',
         'There is no scientific evidence supporting the effectiveness of NLP; it is recognized as a pseudoscience.']
# Encode the whole batch (`texts`, not the single string `text` of the previous
# cell); every row is padded / truncated to exactly 64 tokens.
seq = tokenizer(texts, max_length=64, padding="max_length",
                truncation="longest_first", return_tensors="pt",
                return_token_type_ids=False, return_length=True)
print(seq)
# decode the first sequence of the batch with the tokenizer:
print(tokenizer.decode(seq['input_ids'][0]))

In [None]:
# What the tokenizer returns is not a plain dict but a dedicated type that
# offers dict-style methods and others.
type(seq)

In [None]:
# Print the token strings returned by seq.tokens(), separated by spaces.
print(*seq.tokens())

In [None]:
# List every attribute name of the encoding object on one line,
# each name followed by three spaces.
print(''.join(f'{attr_name}   ' for attr_name in dir(seq)))

In [None]:
# The fields of the encoding are also reachable as attributes.
print(seq.input_ids)

# 2. Данные и контейнеры

In [None]:
%%bash
# Fetch and unpack the recipes dataset once; each step is skipped when its
# result is already present in $data_dir.
# Stop at the first failing command or unset variable instead of carrying on
# with a missing or broken archive.
set -euo pipefail

data_dir="../data"
csv_name="1_Recipe_csv.csv"
zip_name="recipes-dataset-64k.zip"
zip_path="$data_dir/$zip_name"
dataset_url="https://www.kaggle.com/api/v1/datasets/download/prashantsingh001/recipes-dataset-64k-dishes"

if [ ! -d "$data_dir" ]; then
    mkdir -p "$data_dir"
    echo "Directory created: $data_dir"
else
    echo "Found data directory: $data_dir"
fi

if [ ! -f "$data_dir/$csv_name" ]; then
    echo "CSV file $csv_name not found..."
    if [ ! -f "$zip_path" ]; then
        echo "Archive $zip_name not found, downloading..."
        # -f makes curl fail on HTTP errors instead of saving the error page.
        # The download goes to a temporary name first, so an interrupted
        # transfer is never mistaken for a complete archive on the next run.
        curl -fL -o "$zip_path.part" "$dataset_url"
        mv "$zip_path.part" "$zip_path"
        echo "Dataset downloaded: $zip_path"
    else
        echo "Found archive $zip_name"
    fi
    echo "Extracting to $data_dir ..."
    # -o: overwrite without asking; the cell has no terminal to answer a prompt.
    unzip -o "$zip_path" -d "$data_dir"
    echo "Dataset unzipped to: $data_dir"
else
    echo "Found dataset: $data_dir/$csv_name"
fi

In [None]:
import re
import tqdm
from torch.utils.data import Dataset
import pandas as pd
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from typing import List, Dict, Tuple

class RecipeDataset(Dataset):
    """In-memory text-to-text dataset built from a recipes CSV file.

    Every CSV row becomes a ``(source, target)`` pair of strings: the source
    is assembled from ``source_columns``, the target from ``target_columns``.
    Tokenisation is deferred to ``collate_fn`` so that padding can be decided
    per batch.
    """

    def __init__(
            self, path: str,
            source_columns: List[str],
            target_columns: List[str],
            tokenizer: "AutoTokenizer",
            nrows = None,
            device = 'cpu',
            max_len = 256,
            padding_type = "max_length",
            ):
        """Read the CSV and pre-render every row into source / target text.

        Args:
            path: location of the CSV file.
            source_columns: columns rendered into the model input text.
            target_columns: columns rendered into the text to be generated.
            tokenizer: tokenizer callable used by ``collate_fn``; must expose
                ``pad_token_id``.
            nrows: number of rows to read (``None`` reads the whole file).
            device: stored on the instance; not used by this class itself.
            max_len: maximum number of tokens per sequence (longer ones are truncated).
            padding_type: value passed as ``padding=`` to the tokenizer.
        """
        columns = source_columns + target_columns
        self.source_columns = source_columns
        self.target_columns = target_columns

        data = pd.read_csv(path, usecols=columns, nrows=nrows)

        self.src_ = []
        self.tgt_ = []
        self.tokenizer = tokenizer
        self.device = device
        self.PAD_idx = tokenizer.pad_token_id
        self.max_len = max_len
        self.padding_type = padding_type

        for i in tqdm.trange(len(data)):
            row = data.iloc[i]
            self.src_.append(self.process_row(row, source_columns))
            self.tgt_.append(self.process_row(row, target_columns))

        self.size = len(self.src_)

    def process_row(self, row: pd.Series, columns: List[str]) -> str:
        """Render the given columns of one row as ``"<column name>: <value>"`` lines.

        Missing values are skipped. Values that look like a serialised list
        (``[...]``) are flattened by dropping the brackets and double quotes.
        Underscores in column names are shown as spaces.

        Returns:
            The lines joined with a newline; an empty string if every column is missing.
        """
        entry_parts = []
        for col in columns:
            if pd.isna(row[col]):
                continue
            content = str(row[col])
            if content.startswith('[') and content.endswith(']'):
                # Strip list punctuation: double quotes and square brackets.
                # (The previous character class was garbled: it removed `$`
                # and `\` but left the brackets it was guarded by.)
                content = re.sub(r'["\[\]]', '', content)
            entry_parts.append(f'{col.replace("_", " ")}: {content}')
        return '\n'.join(entry_parts)

    def __len__(self) -> int:
        """Number of (source, target) pairs."""
        return self.size

    def __getitem__(self, idx) -> Tuple[str, str]:
        """Return the raw (untokenised) source and target text of item ``idx``."""
        return self.src_[idx], self.tgt_[idx]

    def single(self, idx):
        """Tokenise item ``idx`` on its own, as a batch of size one."""
        return self.collate_fn([(self.src_[idx], self.tgt_[idx])])

    def _encode(self, texts: List[str]):
        """Tokenise a list of texts with the dataset's length / padding settings."""
        return self.tokenizer(texts, max_length=self.max_len,
                              padding=self.padding_type, truncation=True,
                              return_tensors="pt")

    def collate_fn(self, batch: List[Tuple[str, str]]) -> Dict[str, torch.Tensor]:
        """Tokenise a list of (source, target) pairs into padded tensors.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``input_lengths``
            for the sources and ``labels`` / ``labels_lengths`` for the targets.
        """
        source_enc = self._encode([pair[0] for pair in batch])
        target_enc = self._encode([pair[1] for pair in batch])

        # Lengths are counted from the attention mask: the tokenizer's own
        # `length` output can report the padded width rather than the number
        # of real tokens, which is useless for packing sequences.
        return {
            'input_ids': source_enc['input_ids'],
            'attention_mask': source_enc['attention_mask'],
            'input_lengths': source_enc['attention_mask'].sum(dim=1),
            'labels': target_enc['input_ids'],
            'labels_lengths': target_enc['attention_mask'].sum(dim=1),
        }

# CSV file placed in ../data by the download cell above.
DATA_PATH = '../data/1_Recipe_csv.csv'
# Columns rendered into the model input and into the text to be generated.
INPUT_COLS = ['recipe_title', 'category']
TARGET_COLS = ['description', 'ingredients', 'directions']
# Only the first NSAMPLES rows of the CSV are read.
NSAMPLES = 1024

# padding="longest": each batch is padded to its own longest sequence.
data = RecipeDataset(DATA_PATH, INPUT_COLS, TARGET_COLS,\
                     tokenizer, nrows=NSAMPLES, padding_type="longest")


In [None]:
# Show one random (source, target) pair. The upper bound is len(data) rather
# than NSAMPLES because the CSV may hold fewer rows than were requested.
idx = np.random.randint(0, len(data))
source_text, target_text = data[idx]
print(source_text)
print(target_text)

In [None]:
# Tokenised form of the first item, collated as a batch of size one.
data.single(0)

In [None]:
from torch.utils.data import Subset, DataLoader

def get_dataloaders(dataset: Dataset, collate_fn=None, train_ratio=0.8, val_ratio=0.1, batch_size=1):
    """Randomly split ``dataset`` and wrap the parts in train / validation / test DataLoaders.

    The indices are shuffled with the global ``random`` module, so the split
    is reproducible only if ``random.seed`` was called beforehand. Whatever
    remains after the train and validation shares becomes the test set.

    Args:
        dataset: indexable dataset with a length.
        collate_fn: passed to every ``DataLoader`` unchanged.
        train_ratio: share of the items used for training.
        val_ratio: share of the items used for validation.
        batch_size: batch size of all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``; only the training
        loader reshuffles its items.

    Raises:
        ValueError: if a ratio lies outside [0, 1] or the two ratios add up
            to more than 1.
    """
    ratios_in_range = 0 <= train_ratio <= 1 and 0 <= val_ratio <= 1
    # Small tolerance so that e.g. 0.9 + 0.1 is not rejected by float rounding.
    if not ratios_in_range or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"train_ratio and val_ratio must lie in [0, 1] and sum to at most 1, "
            f"got {train_ratio} and {val_ratio}"
        )

    indices = list(range(len(dataset)))
    random.shuffle(indices)

    train_end = int(len(indices) * train_ratio)
    val_end = int(len(indices) * (train_ratio + val_ratio))

    train_set = Subset(dataset, indices[:train_end])
    val_set = Subset(dataset, indices[train_end:val_end])
    test_set = Subset(dataset, indices[val_end:])

    print("Data split:")
    print(f"Training set size: {len(train_set)}")
    print(f"Validation set size: {len(val_set)}")
    print(f"Test set size: {len(test_set)}")

    # The notice is only relevant when items are really batched together.
    if batch_size > 1:
        print("Using batching with padding. Ensure your training loop can handle batched data!")

    train_loader = DataLoader(train_set, batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_set, batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_set, batch_size, shuffle=False, collate_fn=collate_fn)

    print(f"DataLoaders created with batch size {batch_size}.")

    return train_loader, val_loader, test_loader

# Split with the default 80/10/10 ratios; batches of 8 are tokenised by data.collate_fn.
train_loader, val_loader, test_loader = get_dataloaders(data, collate_fn=data.collate_fn, batch_size=8)

In [None]:
# Print the test batch with index `batch_n` and stop right after it, so that
# no further batch is fetched and tokenised needlessly.
batch_n = 1

for n, X in enumerate(test_loader):
    if n == batch_n:
        print(X)
        break

# 3.1. Seq2seq без внимания

In [None]:
# Notebook-wide dtype constants; presumably IDTYPE is meant for token-id
# tensors and FDTYPE for floating-point tensors (neither is used in the cells above).
IDTYPE = torch.int32
FDTYPE = torch.float32
# Alternative 8-bit float type, left disabled:
# FDTYPE = torch.float8_e5m2

In [None]:
class Encoder(nn.Module):
    """Encodes a sequence of word embeddings with a bidirectional GRU."""
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        # nn.GRU applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer, hence the guard.
        self.rnn = nn.GRU(input_size, hidden_size, num_layers,
                          batch_first=True, bidirectional=True,
                          dropout=dropout if num_layers > 1 else 0.)

    def forward(self, x, mask, lengths):
        """
        Applies a bidirectional GRU to sequence of embeddings x.

        Args:
            x: padded embeddings with dimensions [batch, time, dim].
            mask: accepted for interface compatibility; not used here.
            lengths: number of real (non-padding) steps per sequence, as a
                tensor or list of `batch` ints. The batch does NOT have to be
                sorted by length.

        Returns:
            output: [batch, time, 2 * hidden_size] states, zero at padded steps.
            hidden: [2 * num_layers, batch, hidden_size] final states.
        """
        # Packing needs the lengths as an int64 tensor on the CPU, wherever x lives.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False)
        output, hidden = self.rnn(packed)
        # total_length restores the input's time dimension even when the
        # longest sequence in the batch is shorter than the padded width.
        output, _ = nn.utils.rnn.pad_packed_sequence(
            output, batch_first=True, total_length=x.size(1))

        return output, hidden


In [None]:
import torch
import torch.nn as nn

class Decoder(nn.Module):
    """A conditional GRU decoder (exercise skeleton: `forward` is still to be written).

    The GRU consumes vectors of size ``input_size + 2 * hidden_size``, i.e. an
    input embedding concatenated with a vector of the bidirectional encoder's
    width. An optional "bridge" layer maps the final encoder state to the
    initial decoder state.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.5,
                 bridge=True):
        """
        Args:
            input_size: size of the decoder input embeddings.
            hidden_size: size of the decoder state (the encoder is assumed to
                deliver 2 * hidden_size features).
            num_layers: number of stacked GRU layers.
            dropout: probability for `dropout_layer` and, with several layers,
                for the dropout between GRU layers.
            bridge: create the linear layer used by `init_hidden`.
        """
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        # nn.GRU warns when dropout is requested for a single layer, hence the guard.
        self.rnn = nn.GRU(input_size + 2*hidden_size, hidden_size, num_layers,
                          batch_first=True,
                          dropout=dropout if num_layers > 1 else 0.)

        # Used to initialise the decoder state from the final encoder state.
        self.bridge = nn.Linear(2*hidden_size, hidden_size, bias=True) if bridge else None

        self.dropout_layer = nn.Dropout(p=dropout)

    def forward(self, inputs, encoder_hidden, hidden=None, max_len=None):
        """
        Unroll the decoder for a batch of text sequences

        Args:
            - inputs: torch.Tensor - encoder output (batch, seq_len, encoder_output)
            - encoder_hidden: torch.Tensor - last encoder hidden state
            - hidden: optional initial decoder state
            - max_len: optional number of steps to unroll

        The implementation has to define `decoder_states` and `hidden`
        before the return statement.
        """

        # YOUR CODE

        return decoder_states, hidden # [B, L, H]

    def init_hidden(self, encoder_final):
        """Returns the initial decoder state,
        conditioned on the final encoder state.

        Returns None (which lets nn.GRU start from zeros) when no encoder
        state is given or when the decoder was built with ``bridge=False``.
        """
        # Without the bridge there is no layer to map the encoder state onto
        # the decoder's state size.
        if encoder_final is None or self.bridge is None:
            return None

        return torch.tanh(self.bridge(encoder_final))

In [None]:
class Seq2Seq(nn.Module):
    """
    A standard Seq2Seq architecture: embedding -> encoder -> decoder -> vocabulary projection.

    Exercise skeleton: `forward` and `calculate_loss` are still to be written.
    """
    def __init__(self,
                 tokenizer,
                 embed_dim,
                 hidden_size,
                 n_layers,
                 dropout,
                 EncoderCls,
                 DecoderCls,
                 tie_projection=False
                ):
        """
        Args:
            - tokenizer: tokenizer providing `vocab_size` and `pad_token_id`
            - embed_dim: int - size of embedding dimension
            - hidden_size: int - hidden size handed to the encoder / decoder
            - n_layers: int - number of layers in rnn
            - dropout: float - dropout probability handed to the encoder / decoder
            - EncoderCls: encoder class, called as
              EncoderCls(embed_dim, hidden_size, n_layers, dropout)
            - DecoderCls: decoder class, called as
              DecoderCls(hidden_size, embed_dim, n_layers, dropout)
            - tie_projection: bool - if true, embedding and projection layer will use same weights
        """
        super(Seq2Seq, self).__init__()
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size
        self.dropout = dropout
        # The padding row is taken from the tokenizer instead of assuming id 0.
        self.embedding = nn.Embedding(self.vocab_size, embed_dim,
                                      padding_idx=tokenizer.pad_token_id)
        self.encoder = EncoderCls(embed_dim, hidden_size, n_layers, dropout)
        # NOTE(review): the decoder receives (hidden_size, embed_dim), i.e. its
        # state size is embed_dim. That matches the projection below, but check
        # that it is what the chosen DecoderCls expects.
        self.decoder = DecoderCls(hidden_size, embed_dim, n_layers, dropout)
        self.projection = nn.Linear(embed_dim, self.vocab_size)

        if tie_projection:
            # Both weights have shape [vocab_size, embed_dim]; sharing the
            # Parameter object ties them for training as well.
            self.projection.weight = self.embedding.weight

    def forward(self, tokens, src_mask=None, src_lengths=None, tgt_mask=None, tgt_lengths=None):
        """
        Args:
            - tokens: torch.Tensor[int] : input tokens
            - src_mask, src_lengths: mask / lengths of the source sequences
            - tgt_mask, tgt_lengths: mask / lengths of the target sequences

        The implementation has to define `projection` (the scores over the
        vocabulary) before the final lines.
        """

        # YOUR CODE

        # Normalise over the vocabulary axis explicitly; without `dim` the
        # softmax of a 3-D tensor runs over the batch axis.
        # NOTE(review): the result holds probabilities, not logits. If the loss
        # is nn.CrossEntropyLoss, return `projection` itself instead.
        logits = torch.softmax(projection, dim=-1)
        return logits

    def calculate_loss(self, loss_fn, src_mask, trg_mask, src_lengths, trg_lengths):
        """Placeholder for the loss computation (not implemented yet)."""

        pass


# 3.2. Seq2seq с вниманием

In [None]:
class BahdanauAttention(nn.Module):
    """Implements Bahdanau (additive, MLP-based) attention.

    The attention weights of the latest call are kept in `self.scores`.
    """

    def __init__(self, hidden_size, key_size=None, query_size=None):
        """
        Args:
            hidden_size: size of the internal attention space.
            key_size: feature size of the keys; defaults to 2 * hidden_size.
            query_size: feature size of the query; defaults to hidden_size.
        """
        super(BahdanauAttention, self).__init__()

        # The encoder GRU is bidirectional, hence key_size defaults to 2 * hidden_size.
        key_size = 2 * hidden_size if key_size is None else key_size
        query_size = hidden_size if query_size is None else query_size

        self.key_layer = nn.Linear(key_size, hidden_size, bias=False)
        self.query_layer = nn.Linear(query_size, hidden_size, bias=False)
        self.energy_layer = nn.Linear(hidden_size, 1, bias=False)

        self.scores = None

    def forward(self, query=None, proj_key=None, value=None, mask=None):
        """Compute the context vector for one decoder step.

        Args:
            query: decoder state, [B, 1, query_size].
            proj_key: keys already projected with `key_layer`, [B, M, hidden_size].
            value: encoder states to be averaged, [B, M, 2D].
            mask: [B, 1, M]; positions equal to 0 receive no attention.

        Returns:
            context: [B, 1, 2D] weighted sum of `value`.
            scores: [B, 1, M] attention weights (each row sums to 1).
        """
        assert mask is not None, "mask is required"

        query = self.query_layer(query)

        # Additive scoring; the query is broadcast over the M key positions.
        scores = self.energy_layer(torch.tanh(query + proj_key))
        scores = scores.squeeze(2).unsqueeze(1)

        # Out-of-place masking on the tensor itself: writing through `.data`
        # would hide the modification from autograd.
        scores = scores.masked_fill(mask == 0, float('-inf'))

        self.scores = torch.softmax(scores, dim=-1)

        context = torch.bmm(self.scores, value)

        # context shape: [B, 1, 2D], scores shape: [B, 1, M]
        return context, self.scores


In [None]:
# Реализуйте и обучите Seq2Seq с вниманием

# YOUR CODE

# 5. Контрольные вопросы

In [None]:
# Насколько корректно модель предсказывает рецепты из тренировочной выборки?

In [None]:
# Насколько корректно модель предсказывает рецепты из тестовой выборки?

In [None]:
# Что будет, если подать рецепт из категорий, на которых модель не обучалась?

In [None]:
# Что будет, если подать промпт, не предполагающий наличие рецепта?