*Большую часть NLP-специфичного материала (TF-IDF, BagOfWords, N-grams и т.д.) мы пропускаем. Если эти темы вам в дальнейшем понадобятся, могут быть полезны следующие материалы:*
- *Подробнее про эмбеддинги на примере word2vec: [хабр](https://habr.com/ru/articles/446530/)*
- *Серия лекций DLS МФТИ по NLP, начало тут: [вк](https://vk.com/video-155161349_456239178), [youtube](https://www.youtube.com/watch?v=StZaHBNWiOs)*
- *Серия видео 3Blue1Brown по LLM: [youtube](https://www.youtube.com/watch?v=LPZh9BOjkQs&list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi&index=5)*
- *Оригинальная работа, в которой введено внимание Баданау: [arxiv](https://arxiv.org/abs/1409.0473)*

*На этой лекции мы очень кратко рассмотрим токенизацию и эмбеддинги, реализуем простой текстовый классификатор на основе GRU-энкодера, а затем модифицируем его, реализовав механизм внимания Баданау.*

In [None]:
!pip install gensim
!pip install torchinfo



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import numpy as np
import random
import math
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

# Fix the random seeds of every RNG used below (Python, NumPy, PyTorch CPU
# and CUDA) so that repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

Используемое устройство: cpu


## 1. Токенизация и эмбеддинги

`gensim` — библиотека для работы с текстами и тематического моделирования в области обработки естественного языка (NLP). В частности, она предоставляет широкий функционал для векторизации текста, но этим её возможности не ограничиваются (см. https://pypi.org/project/gensim/).

In [None]:
import gensim.downloader as api

# Load the pretrained 'glove-wiki-gigaword-50' embeddings through the gensim
# downloader API (presumably fetched and cached on first use).
glove_vectors = api.load('glove-wiki-gigaword-50')
# Show the type of the loaded object; its API is described at the link below.
type(glove_vectors)
# https://radimrehurek.com/gensim/models/keyedvectors.html



In [None]:
vec1 = glove_vectors['cat']
print(vec1)

In [None]:
glove_vectors

In [None]:
# Embedding arithmetic: take the direction from 'men' to 'women' ...
vec1 = - glove_vectors['men'] + glove_vectors['women']
# ... and apply it to 'boy'.
vec2 = glove_vectors['boy']
# List the vocabulary entries closest to the resulting vector.
glove_vectors.most_similar(vec1+vec2)

In [None]:
import seaborn as sns

# Look up the neighbours of 'student'; each returned item is indexed below as
# (word, similarity score).
vec = glove_vectors['student']
near = glove_vectors.most_similar(vec)
words = [val[0] for val in near]
cosine = [val[1] for val in near]  # presumably cosine similarity, as the name suggests
# Stack the neighbours' embeddings into one 2-D array: one row per word.
vecs = [glove_vectors[word] for word in words]
vecs = np.stack(vecs)

# Row labels of the form "word:score".
labels = [f'{w}:{float(s):.2f}' for w, s in zip(words, cosine)]
fig = plt.figure(figsize=(25, 5))
# One heatmap row per neighbour, one column per embedding dimension.
sns.heatmap(vecs, yticklabels=labels, annot=True, fmt=".1f", linewidths=1, square=True)

plt.tight_layout()
plt.show()

In [None]:
# Build a 50-dimensional offset that changes only component 29.
shift = np.zeros(50)
shift[29] = 3.0

# See which words are nearest to 'human' after moving along that single axis.
glove_vectors.most_similar(glove_vectors['human'] + shift)

In [None]:
# Problem: by default there are no vectors for the special tokens.

special_keys = ['<pad>', '<bos>', '<eos>', '<unk>']

# Report, for every special token, whether the embedding already contains it.
for key in special_keys:
    if key not in glove_vectors:
        print(f'{key} is not present')
    else:
        print(f'{key}: {glove_vectors[key]}')

In [None]:
# Special tokens used by the tokenizer below.
PAD = '<pad>'
BOS = '<bos>'
EOS = '<eos>'
UNK = '<unk>'

# One 50-dimensional float32 vector per special token: padding is all zeros,
# the other three are drawn from a standard normal distribution.
special_vecs = {
        PAD: np.zeros((50,), dtype=np.float32),
        BOS: np.random.normal(size=(50,)).astype(np.float32),
        EOS: np.random.normal(size=(50,)).astype(np.float32),
        UNK: np.random.normal(size=(50,)).astype(np.float32),
}

# Register the new keys together with their vectors in the embedding.
glove_vectors.add_vectors([*special_vecs.keys()], [*special_vecs.values()])
# NOTE(review): presumably syncs the vector matrix size with the vocabulary
# after the addition -- confirm against the gensim KeyedVectors docs.
glove_vectors.resize_vectors(seed=0)

# Sanity check: the special tokens can now be queried like ordinary words.
# `special_keys` is defined in an earlier cell.
for key in special_keys:
    print(glove_vectors.most_similar(key))

In [None]:
from gensim.models import KeyedVectors

class Tokenizer():
    """Whitespace tokenizer backed by the vocabulary of a pretrained embedding.

    Words are mapped to their row index in ``vectors``; words missing from
    the vocabulary map to the ``<unk>`` index. Every sequence is wrapped in
    ``<bos>`` ... ``<eos>``.

    Attributes:
        vectors: the embedding object providing ``key_to_index``,
            ``index_to_key`` and item access.
        preprocessor: optional ``str -> str`` callable used by ``tokenize``
            when ``preprocess=True``.
        PAD_idx, BOS_idx, EOS_idx, UNK_idx: indices of the special tokens.
        PAD_v, BOS_v, EOS_v, UNK_v: the corresponding embedding vectors.
    """

    PAD = '<pad>'
    BOS = '<bos>'
    EOS = '<eos>'
    UNK = '<unk>'
    special = [PAD, BOS, EOS, UNK]

    def __init__(self, vectors: 'KeyedVectors', preprocessor=None):
        """Bind the tokenizer to an embedding.

        Args:
            vectors: embedding that already contains all special tokens.
            preprocessor: optional text-normalisation callable.

        Raises:
            ValueError: if a special token is missing from ``vectors``.
        """
        # Validate the embedding that was actually passed in, not a global.
        for key in self.special:
            if key not in vectors:
                raise ValueError(f'Cannot instantiate: special token {key} is not present in given embedding')
        self.vectors = vectors
        self.preprocessor = preprocessor

        self.PAD_idx = self.vectors.key_to_index[self.PAD]
        self.PAD_v = self.vectors[self.PAD_idx]
        self.BOS_idx = self.vectors.key_to_index[self.BOS]
        self.BOS_v = self.vectors[self.BOS_idx]
        self.EOS_idx = self.vectors.key_to_index[self.EOS]
        self.EOS_v = self.vectors[self.EOS_idx]
        self.UNK_idx = self.vectors.key_to_index[self.UNK]
        self.UNK_v = self.vectors[self.UNK_idx]

    def tokenize(self, text: str, preprocess=False):
        """Converts a string to a list of token indices, adding BOS and EOS tokens.

        Args:
            text: raw input text.
            preprocess: when true, run ``self.preprocessor`` on the text first.

        Returns:
            ``[BOS_idx, <word indices...>, EOS_idx]``.

        Raises:
            ValueError: if ``preprocess`` is requested but no preprocessor is set.
        """
        if preprocess:
            if self.preprocessor is None:
                raise ValueError('preprocess=True requires a preprocessor to be set')
            text = self.preprocessor(text)
        # str.split() without arguments splits on any whitespace run (spaces,
        # tabs, newlines) and never yields empty words.
        word_ids = [
            self.vectors.key_to_index.get(word, self.UNK_idx)
            for word in text.split()
        ]
        return [self.BOS_idx, *word_ids, self.EOS_idx]

    def detokenize(self, idxs: list):
        """Convert token indices back to a space-separated string of tokens."""
        return ' '.join(self.vectors.index_to_key[i] for i in idxs)

In [None]:
tokenizer = Tokenizer(glove_vectors)

In [None]:
text = 'abra cadabra'
seq = tokenizer.tokenize(text)
print(seq)
print(tokenizer.detokenize(seq))


In [None]:
text = 'NLP (Neuro-Linguistic Programming) is a psychological approach that involves analyzing the patterns of thought, language, and behavior to understand how they interact with and influence human experience. There is no scientific evidence supporting the effectiveness of NLP; it is recognized as a pseudoscience.'
seq = tokenizer.tokenize(text)
print(seq)
print(tokenizer.detokenize(seq))

In [None]:
text = 'General Relativity is a physical theory, which explains gravity as purely geometrical effect: curved spacetime tells matter how to move, while matter influences the curvature of spacetime'
seq = tokenizer.tokenize(text)
print(seq)
print(tokenizer.detokenize(seq))

In [None]:
import re
from string import punctuation

def text_cleanup(text: str):
    """Normalise raw text so that it can be split on whitespace.

    The text is lower-cased, runs of repeated commas or periods are collapsed
    to a single character, every punctuation character is surrounded by
    spaces (so it becomes its own token), and whitespace runs are squeezed to
    one space.

    Args:
        text: raw input string.

    Returns:
        The cleaned-up string without leading or trailing whitespace.
    """
    text = text.lower()
    # Collapse repeated commas / periods ("..", ",,,") into one character.
    text = re.sub(r'([,.])\1+', r'\1', text)
    # Surround punctuation with spaces. The punctuation set has to be escaped
    # before it is placed inside a character class: unescaped, the "\]" pair
    # is read as an escaped bracket and the backslash itself is never matched.
    text = re.sub(f'([{re.escape(punctuation)}])', r' \1 ', text)
    # Squeeze whitespace runs.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

text_cleanup('sample text: with commas,  double  spaces  ([[and brackets]])..!')

In [None]:
tokenizer.preprocessor = text_cleanup
text = 'General Relativity is a physical theory, which explains gravity as purely geometrical effect: curved spacetime tells matter how to move, while matter influences the curvature of spacetime'
seq = tokenizer.tokenize(text, preprocess=True)
print(seq)
print(tokenizer.detokenize(seq))

## 2. Загрузка и предобработка данных

In [None]:
!mkdir ./data
!curl -L -o ./data/recipes-dataset-64k-dishes.zip\
  https://www.kaggle.com/api/v1/datasets/download/prashantsingh001/recipes-dataset-64k-dishes
!unzip ./data/recipes-dataset-64k-dishes.zip -d ./data/

In [None]:
import pandas as pd

# In general this is not a great approach
data_path = './data/1_Recipe_csv.csv'
dataframe = pd.read_csv(data_path)
# Drop every row that has a missing value and renumber the rows from zero.
dataframe = dataframe.dropna().reset_index(drop=True)


In [None]:
dataframe.info()

In [None]:
dataframe.head()

In [None]:
dataframe['category'].unique()

In [None]:
NCLASSES = 16
most_commons = dataframe['category'].value_counts()[:NCLASSES]
categories = list(most_commons.index)
most_commons

In [None]:
df_reduced = dataframe[dataframe['category'].isin(categories)]

In [None]:
df_reduced.info()

In [None]:
import re
import string

def process_recipe(row: pd.Series, columns_to_use: list[str]):
    """
    Flatten selected fields of one recipe row into a single text string.

    Every requested column that exists in the row and is not NaN contributes
    a ``"<column name with spaces>: <value>"`` fragment followed by one space.
    Values that look like a serialised list (``[...]``) are stripped of
    quotes, backslashes and square brackets.
    """
    def _render(col: str) -> str:
        # str() always produces a string, so only the list-like shape matters.
        content = str(row[col])
        if content.startswith('[') and content.endswith(']'):
            content = re.sub(r'["\\\[\\\\\]]', '', content)
        return f'{col.replace("_", " ")}: {content}'

    usable = [col for col in columns_to_use if col in row and pd.notna(row[col])]
    return ''.join(_render(col) + ' ' for col in usable)

In [None]:
COLUMNS = [
    'recipe_title',
    'description',
    'ingredients',
    # 'directions'
    ]

process_recipe(dataframe.iloc[0], COLUMNS)

In [None]:
seq = tokenizer.tokenize(process_recipe(dataframe.iloc[0], COLUMNS), preprocess=True)
print(seq)
print(tokenizer.detokenize(seq))

## 3. Контейнеры данных

In [None]:
from typing import List, Tuple

Inpt = List
'''Input vector'''
Tgt = List | torch.Tensor
'''Output vector (can take a form of List or torch.Tensor)'''

class RecipeDataset(Dataset):

    def __init__(self, df, columns, cat_vec, tokenizer, device='cpu', max_len=256):

        self.data_: List[Inpt] = []
        self.tgt_: List[Tgt] = []
        self.columns = columns
        self.tokenizer = tokenizer
        self.device = device
        self.PAD_idx = (tokenizer.PAD_idx)
        self.max_len = max_len
        self.idtype: torch.dtype = torch.int32
        self.fdtype: torch.dtype = torch.float32

        for i, row in df.iterrows():
            entry = process_recipe(row, columns)
            tokenized_recipe = tokenizer.tokenize(entry, preprocess=True)

            if len(tokenized_recipe) > 10:
                self.tgt_.append(cat_vec[i].tolist())
                self.data_.append(tokenized_recipe)

        self.size = len(self.data_)
        # self.subsample()

    def __len__(self):
        return self.size

    def __getitem__(self, idx) -> Tuple[Inpt, Tgt]:
        return self.data_[idx], self.tgt_[idx]

    # def subsample(self, last_idx=None, stride=1):
    #     if last_idx == None:
    #       last_idx = self.max_size
    #     else:
    #       last_idx = min(last_idx, self.max_size)

    #     self.data = self.data_[:last_idx:stride]
    #     self.tgt = self.data_[:last_idx:stride]
    #     self.size = len(self.data)

    def collate_fn(self, batch: List[Tuple[Inpt, Tgt]]):
        seq_lenghts = [len(x[0]) for x in batch]
        new_len = min(max(seq_lenghts), self.max_len)

        new_inp = []
        for i, seq_len in enumerate(seq_lenghts):
            _inp_ = batch[i][0][:new_len]
            if seq_len < new_len:
                _inp_.extend([self.PAD_idx]*(new_len - seq_len))
            new_inp.append(_inp_)

        # Для RNN размерность входа выбирают (seq_len, batch_size ...)
        # либо (batch_size, seq_len, ...). Будем использовать второй вариант.
        new_inp = torch.tensor(new_inp, dtype=self.idtype, device=self.device)
        new_tgt = torch.tensor([v[1] for v in batch], dtype=self.fdtype, device=self.device)

        return (new_inp, new_tgt)

In [None]:
from sklearn.preprocessing import OneHotEncoder

def get_dataloaders(df, columns_to_use, tokenizer, batch_size=1):
    """
    Split a recipe DataFrame 80/10/10 and wrap each part in a DataLoader.

    The 'category' column is one-hot encoded, the row positions are shuffled
    with the global ``random`` module, and one ``RecipeDataset`` is built per
    split. Only the training loader reshuffles its data.

    Returns:
        (train_loader, val_loader, test_loader, cat_encoder), where
        ``cat_encoder`` is the fitted ``OneHotEncoder``.
    """
    cat_encoder = OneHotEncoder(sparse_output=False)
    cat_vec = cat_encoder.fit_transform(df[['category']])
    indices = list(range(len(df)))
    print(len(cat_vec))
    random.shuffle(indices)

    train_ratio = 0.8
    val_ratio = 0.1
    n_rows = len(indices)
    train_end = int(n_rows * train_ratio)
    val_end = int(n_rows * (train_ratio + val_ratio))

    train_indices, val_indices, test_indices = (
        indices[:train_end],
        indices[train_end:val_end],
        indices[val_end:],
    )

    def _take_rows(positions):
        # Select rows by position and renumber them from zero.
        return df.iloc[positions].reset_index(drop=True)

    train_df = _take_rows(train_indices)
    val_df = _take_rows(val_indices)
    test_df = _take_rows(test_indices)

    print(f"Data split:")
    print(f"Training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")

    def _make_dataset(frame, positions):
        # Targets are selected with the same positions as the rows.
        return RecipeDataset(frame, columns_to_use, cat_vec[positions], tokenizer)

    train_dataset = _make_dataset(train_df, train_indices)
    val_dataset = _make_dataset(val_df, val_indices)
    test_dataset = _make_dataset(test_df, test_indices)

    print(f"Number of samples:")
    print(f"Training: {len(train_dataset)}")
    print(f"Validation: {len(val_dataset)}")
    print(f"Test: {len(test_dataset)}")

    if batch_size >= 1:
        print("Using batching with padding. Ensure your training loop can handle batched data!")

    def _make_loader(dataset, shuffle):
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=dataset.collate_fn,
        )

    train_loader = _make_loader(train_dataset, True)
    val_loader = _make_loader(val_dataset, False)
    test_loader = _make_loader(test_dataset, False)

    print(f"DataLoaders created with batch size {batch_size}.")

    return train_loader, val_loader, test_loader, cat_encoder
# Work with a reproducible 1600-row sample of the reduced DataFrame.
df = df_reduced.sample(1600, random_state=42)
# Build the three loaders (batches of 8) and keep the fitted category encoder.
train_loader, val_loader, test_loader, cat_encoder = get_dataloaders(df, COLUMNS, tokenizer, batch_size=8)

1600
Data split:
Training set size: 1280
Validation set size: 160
Test set size: 160
Number of samples:
Training: 1280
Validation: 160
Test: 160
Using batching with padding. Ensure your training loop can handle batched data!
DataLoaders created with batch size 8.


## 4. Классификация текста

In [None]:
import os
from typing import Iterable

class logger:
    """Minimal call tracer configured through class-level state.

    Decorate a callable with ``@logger.trace(name)``. While tracing is
    switched on, every call appends a description of its arguments and of its
    result to ``log_file`` and, unless silenced, prints it as well.

    Attributes:
        active: whether tracing is switched on.
        log_file: path of the file that messages are appended to.
    """
    active = False
    _calls_ = {}
    log_file = "logger_output.txt"  # Default log file path
    # Kept under a private name: an attribute called ``silent`` would be
    # replaced by the ``silent()`` classmethod defined below.
    _silent = True

    @classmethod
    def on(cls):
        """Switch tracing on."""
        cls.active = True

    @classmethod
    def off(cls):
        """Switch tracing off."""
        cls.active = False

    @classmethod
    def silent(cls, silent: bool = True):
        """Choose whether messages go only to the file (True) or are also printed."""
        cls._silent = silent

    @classmethod
    def zero(cls):
        """Reset the per-name call counters."""
        cls._calls_ = {}

    @classmethod
    def clear_log(cls):
        """Truncate the log file."""
        with open(cls.log_file, 'w') as f:
            f.write("")

    @classmethod
    def write_log(cls, msg):
        """Append one message to the log file."""
        with open(cls.log_file, 'a') as f:
            f.write(msg + "\n")

    @classmethod
    def _emit(cls, msg):
        # Print unless silenced; always write to the file.
        if not cls._silent:
            print(msg)
        cls.write_log(msg)

    @staticmethod
    def _describe(label, value):
        # One log entry for a value; tensors also get shape / dtype / device.
        if isinstance(value, torch.Tensor):
            return f'\t {label}: shape={value.shape}, dtype={value.dtype}, device={value.device}\n {value} \n'
        return f'\t {label}: {value}\n'

    @classmethod
    def trace(cls, name):
        """Return a decorator that logs the calls of the wrapped function under ``name``."""
        from functools import wraps

        def log_fn(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                # Decide once per call, so the argument and result halves of a
                # record always appear together.
                tracing = cls.active

                if tracing:
                    call_no = cls._calls_.setdefault(name, 0)
                    msg = f'>>> {name} call {call_no}: \n Args: \n'
                    for i, arg in enumerate(args):
                        msg += cls._describe(f'arg[{i}]', arg)
                    for k, arg in kwargs.items():
                        msg += cls._describe(f'kwarg[{k}]', arg)
                    cls._emit(msg)

                result = func(*args, **kwargs)

                if tracing:
                    msg = 'Result: \n'
                    # Only unpack plain tuples/lists. Iterating an arbitrary
                    # iterable would exhaust generators before the caller sees
                    # them, and fails on 0-d tensors.
                    if isinstance(result, (tuple, list)):
                        for i, outp in enumerate(result):
                            msg += cls._describe(f'output[{i}]', outp)
                    else:
                        msg += cls._describe('output', result)
                    cls._emit(msg)
                    cls._calls_[name] = cls._calls_.get(name, 0) + 1

                return result
            return wrapper

        return log_fn

In [None]:
# Wrap the GloVe matrix (including the special-token rows added earlier) in an
# embedding layer. freeze=True keeps the pretrained weights fixed in training.
embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(glove_vectors.vectors),
            freeze=True
            )

In [None]:
import torch.nn.functional as F

class Encoder(nn.Module):
    """
    Encoder module: Embedding + GRU. Returns outputs and hidden state.

    Args:
        embedding: embedding layer; its ``embedding_dim`` sets the GRU input size.
        hidden_size: GRU hidden size per direction.
        n_layers: number of stacked GRU layers.
        dropout: dropout applied to the embedded tokens and, when
            ``n_layers > 1``, between GRU layers.
        bidirectional: whether the GRU runs in both directions.
        freeze_emb: currently not used by this class; accepted so existing
            callers keep working.
    """
    def __init__(self, embedding, hidden_size, n_layers=1, dropout=0.5, bidirectional=True, freeze_emb=True):
        super().__init__()
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = embedding
        self.rnn = nn.GRU(
            embedding.embedding_dim,
            hidden_size,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            # nn.GRU warns when dropout is set on a single-layer network.
            dropout=dropout if n_layers > 1 else 0
            )

        self.dropout = nn.Dropout(dropout)
        self.max_len = 256

    def _infer_lengths(self, text):
        # Sequence lengths for a batch given without explicit lengths. When the
        # embedding declares a padding index, the non-padding tokens are
        # counted (assumes right-padded batches); otherwise every sequence is
        # treated as full length.
        pad_idx = getattr(self.embedding, 'padding_idx', None)
        if pad_idx is None:
            return torch.full((text.size(0),), text.size(1), dtype=torch.int64)
        # pack_padded_sequence rejects zero lengths, hence the clamp.
        return (text != pad_idx).sum(dim=1).clamp(min=1).cpu()

    @logger.trace('ENC')
    def forward(self, text, text_lengths = None):
        """Encode a batch of token-index sequences.

        Args:
            text: integer tensor of shape (batch, seq_len).
            text_lengths: optional true length of each sequence (list or
                tensor). Inferred from ``text`` when omitted.

        Returns:
            ``(output, hidden)``: the padded GRU outputs (batch first) and the
            final hidden state of the GRU.
        """
        if text_lengths is None:
            text_lengths = self._infer_lengths(text)
        elif isinstance(text_lengths, torch.Tensor):
            # pack_padded_sequence requires the lengths to live on the CPU.
            text_lengths = text_lengths.cpu()
        X = self.dropout(self.embedding(text))
        X_packed = nn.utils.rnn.pack_padded_sequence(X, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, hidden = self.rnn(X_packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        return output, hidden


In [None]:
class TextClassifier(nn.Module):
    """
    Attention-free text classifier: Encoder followed by one linear layer.

    The final GRU hidden state (both directions concatenated when the encoder
    is bidirectional) is projected to ``num_classes`` scores and passed
    through a softmax.
    """
    def __init__(self, embedding, hidden_size, num_classes, n_layers=1, dropout=0.5, bidirectional=True):
        super().__init__()

        self.bidirectional = bidirectional
        self.encoder = Encoder(embedding, hidden_size, n_layers, dropout, bidirectional)
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * n_directions, num_classes)

    @logger.trace('CLSF')
    def forward(self, text, text_lengths = None):
        """Return per-class probabilities for a batch of token sequences."""
        _, hidden = self.encoder(text, text_lengths)

        if not self.bidirectional:
            # Last layer of a unidirectional GRU.
            summary = hidden[-1]
        else:
            # The last two entries of the hidden state are joined feature-wise.
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)

        logits = self.fc(summary)
        return F.softmax(logits, dim=1)


In [None]:
import torchinfo
# Model hyperparameters
HIDDEN_SIZE = 50
N_LAYERS = 1
DROPOUT = 0.2

# Baseline classifier without attention; NCLASSES is defined in an earlier cell.
modelA = TextClassifier(embedding, HIDDEN_SIZE, NCLASSES, N_LAYERS, DROPOUT)
# Print a summary of the model's layers and parameters.
torchinfo.summary(modelA)

In [None]:
# Example input data
sample_text = ["this is a test recipe"]
tokenized = [tokenizer.tokenize(sample, preprocess=True) for sample in sample_text]
input_tensor = torch.tensor(tokenized, dtype=torch.long)  # (batch, seq_len)
text_lengths = torch.tensor([len(seq) for seq in tokenized], dtype=torch.long)  # (batch,)

# Run the model
modelA.eval()
# Start from an empty log file and trace this single forward pass.
logger.clear_log()
logger.on()
with torch.no_grad():
    y_pred = modelA(input_tensor, text_lengths)
    print(y_pred)
    # Map the predicted probability vector back to a category label.
    print(f"Predicted class: {cat_encoder.inverse_transform(y_pred)}" )
logger.off()

In [None]:
# Цикл обучения
from typing import Dict
import tqdm

class Helper:
    """Training / evaluation helpers configured through class attributes.

    Attributes:
        eval_fn: metric called as ``eval_fn(y_true, y_pred)`` and returning a
            scalar. When it is None, the loss is used as the score.
        eval_int: evaluate every ``eval_int`` epochs.
        in_notebook: selects notebook-style or interactive-window plotting.
    """

    eval_fn = None
    eval_int = 1
    in_notebook = True

    @classmethod
    def plot_history(cls, history, model=None):
        """Redraw the loss curve and the train/validation score curves."""

        if cls.in_notebook:
            from IPython.display import clear_output
            clear_output(wait=True)
        else:
            plt.close('all')

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        axes[0].plot(history['train_loss'], label='train loss')
        axes[0].set_xlabel('Epochs')
        axes[0].set_ylabel('Loss')
        axes[0].legend()

        axes[1].plot(history['epochs'], history['train_score'], label='train score')
        axes[1].plot(history['epochs'], history['val_score'], label='val score')
        axes[1].set_xlabel('Epochs')
        axes[1].set_ylabel('Score')
        axes[1].legend()

        # NOTE(review): appending to ``fig.axes`` probably does not attach the
        # axis to the figure -- confirm against the matplotlib Figure docs.
        if hasattr(model, 'vis_ax') and model.vis_ax is not None:
            fig.axes.append(model.vis_ax)

        plt.tight_layout()

        if cls.in_notebook:
            plt.show()
        else:
            plt.pause(0.001)

    @classmethod
    def train(cls,
              model,
              optimizer,
              dataloaders: Dict[str, DataLoader],
              loss,
              n_epochs = 30,
              plot = False,
             ):
        """Train ``model`` and return the recorded history.

        Args:
            model: module called as ``model(X_batch)``.
            optimizer: optimizer over the model parameters.
            dataloaders: mapping with at least the 'train' and 'val' loaders.
            loss: callable ``loss(y_pred, y_true)`` returning a scalar tensor.
            n_epochs: number of passes over the training loader.
            plot: redraw the history plot after every epoch.

        Returns:
            dict with 'train_loss' (per-epoch sum of batch losses),
            'train_score', 'val_score' and 'epochs'.
        """

        history = {'train_loss': [], 'train_score':[], 'val_score': [], 'epochs': []}
        if not cls.in_notebook: plt.ion()

        for epoch in tqdm.trange(n_epochs):
            model.train()
            history['train_loss'].append(0)
            # Stays None when the training loader yields no batches.
            loss_train = None

            for X_batch, y_batch in dataloaders['train']:

                optimizer.zero_grad()
                y_pred = model(X_batch)

                loss_train = loss(y_pred, y_batch)
                history['train_loss'][-1] += loss_train.item()

                loss_train.backward()
                optimizer.step()

            if (epoch % cls.eval_int) == 0:
                cls.evaluation(model, dataloaders, loss, loss_train, history, epoch)

            if plot:
                cls.plot_history(history, model)

        return history

    @classmethod
    def evaluation(cls, model, dataloaders, loss, loss_train, history, epoch):
        """Append the train and validation scores of this epoch to ``history``.

        With ``eval_fn`` set, both loaders are scored with it. Without it the
        train score is the last batch loss and the validation score is the
        mean batch loss on the validation loader.
        """

        model.eval()
        with torch.no_grad():
            if cls.eval_fn is None:
                # The loss takes (y_pred, y_true); adapt it to metric order.
                eval_val = cls.eval_fn_dl(
                    model,
                    dataloaders['val'],
                    eval_fn=lambda y_true, y_pred: loss(y_pred, y_true),
                )
                eval_train = loss_train.item() if loss_train is not None else float('nan')
            else:
                eval_val = cls.eval_fn_dl(model, dataloaders['val'])
                eval_train = cls.eval_fn_dl(model, dataloaders['train'])

        history['epochs'].append(epoch)
        history['train_score'].append(eval_train)
        history['val_score'].append(eval_val)

    @classmethod
    def eval_fn_dl(cls, model, loader, eval_fn = None):
        """Return the metric averaged over the batches of ``loader``.

        Args:
            model: module called as ``model(X_batch)``.
            loader: iterable of ``(X_batch, y_batch)`` pairs.
            eval_fn: metric ``f(y_true, y_pred)``; defaults to ``cls.eval_fn``.

        Raises:
            ValueError: if no metric is available.
        """
        metric = cls.eval_fn if eval_fn is None else eval_fn
        if metric is None:
            raise ValueError('No evaluation function: set Helper.eval_fn or pass eval_fn')

        model.eval()
        score = 0.0
        n_batches = 0
        # No gradients are needed while scoring.
        with torch.no_grad():
            for X_batch, y_batch in loader:
                y_pred = model(X_batch)
                # Metrics are defined as f(y_true, y_pred).
                score += float(metric(y_batch, y_pred))
                n_batches += 1

        return score / n_batches if n_batches else 0.0

In [None]:
class Scores:
    """Batch-level metrics; every metric takes ``(y_true, y_pred)``.

    Attributes:
        threshold: probability above which a prediction counts as positive
            in ``accuracy``.
        eps: small constant that guards the divisions in ``f1_score``.
    """

    threshold = 0.7
    eps = 1e-8

    @classmethod
    def accuracy(cls, y_true, y_pred):
        """Fraction of elements where the thresholded prediction equals the label."""

        return ((y_pred > cls.threshold) == y_true.bool()).float().mean()

    @classmethod
    def f1_score(cls, y_true, y_pred):
        """Soft F1 computed over all elements of the batch.

        Predictions are used as given (no thresholding), so the true/false
        positive counts are sums of probabilities.
        """
        # Cast first so that bool or integer label tensors are accepted:
        # ``1 - tensor`` is not defined for bool tensors.
        y_true = y_true.to(torch.float32)
        y_pred = y_pred.to(torch.float32)

        tp = (y_pred * y_true).sum()
        fp = (y_pred * (1 - y_true)).sum()
        fn = ((1 - y_pred) * y_true).sum()

        precision = tp / (tp + fp + cls.eps)
        recall = tp / (tp + fn + cls.eps)
        f1 = 2 * (precision * recall) / (precision + recall + cls.eps)

        return f1

In [None]:
optimizer = torch.optim.RMSprop(modelA.parameters(), lr=3e-2)
dataloaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}

# nn.KLDivLoss expects its *input* as log-probabilities, while the classifier
# returns probabilities. 'batchmean' is the reduction that matches the
# mathematical definition of the KL divergence.
_kl_div = torch.nn.KLDivLoss(reduction='batchmean')


def loss(y_pred, y_true):
    """KL divergence between the target distribution and predicted probabilities."""
    # Clamp before the logarithm so that a zero probability cannot produce -inf.
    return _kl_div(torch.log(y_pred.clamp_min(1e-8)), y_true)


Helper.eval_fn = Scores.f1_score

In [None]:
_ = Helper.train(modelA, optimizer, dataloaders, loss, n_epochs=10, plot=True)

In [None]:
Helper.eval_fn_dl(modelA, test_loader)

In [None]:
# Predict a single test example. Dropout must be off for inference.
modelA.eval()
with torch.no_grad():
    X, y_true = test_loader.dataset[1]
    # The model expects a (batch, seq_len) tensor, so wrap the single token
    # list into a batch of one.
    y_pred = modelA(torch.IntTensor([X]))
    print(y_pred.shape)
    # inverse_transform needs 2-D input: one row per sample.
    print(f"True class: {cat_encoder.inverse_transform([y_true])}")
    print(f"Predicted class: {cat_encoder.inverse_transform(y_pred.cpu().numpy())}")

In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = v^T tanh(W_a s + U_a h).

    Args:
        encoder_hidden_dim: feature size of the encoder states ``h``.
        decoder_hidden_dim: feature size of the query state ``s``; also the
            size of the internal projection space.
    """

    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        # All three projections are bias-free.
        self.W_a = nn.Linear(decoder_hidden_dim, decoder_hidden_dim, bias=False)
        self.U_a = nn.Linear(encoder_hidden_dim, decoder_hidden_dim, bias=False)
        self.v = nn.Linear(decoder_hidden_dim, 1, bias=False)

    @logger.trace('ATN')
    def forward(self, decoder_hidden, encoder_hidden):
        """Return ``(context_vector, attention_weights)`` for one query per batch item."""
        # The projected query gets a length-1 axis so it broadcasts over the
        # sequence axis of the projected encoder states.
        query = self.W_a(decoder_hidden).unsqueeze(1)
        keys = self.U_a(encoder_hidden)
        energies = self.v(torch.tanh(query + keys))
        # Normalise over the sequence axis, then drop the trailing size-1 axis.
        attention_weights = F.softmax(energies, dim=1).squeeze(-1)
        # Weighted sum of the encoder states.
        context_vector = torch.einsum('bs,bsh->bh', attention_weights, encoder_hidden)
        return context_vector, attention_weights

class AttentionTextClassifier(nn.Module):
    """Text classifier that pools encoder outputs with windowed additive attention.

    The encoder outputs are cut into consecutive windows of ``window_size``
    steps. Each window is summarised by attention, using the window's last
    step as the query. The window summaries are averaged and classified.

    Note: ``stride`` is accepted but not used by this implementation; windows
    always advance by ``window_size``.
    """

    def __init__(self, embeddings, hidden_size, num_classes, window_size=10, stride=2, n_layers=1, dropout=0.5, bidirectional=True):
        super().__init__()
        self.window_size = window_size
        self.encoder = Encoder(embeddings, hidden_size, n_layers, dropout, bidirectional)
        n_directions = 2 if bidirectional else 1
        rnn_output_size = hidden_size * n_directions
        self.attention = BahdanauAttention(rnn_output_size, rnn_output_size)
        self.fc = nn.Linear(rnn_output_size, num_classes)

    def forward(self, text, text_lengths=None):
        """Return per-class probabilities for a batch of token sequences."""
        encoded, _ = self.encoder(text, text_lengths)
        _, seq_length, _ = encoded.shape

        window_contexts = []
        for start in range(0, seq_length, self.window_size):
            stop = start + self.window_size
            # The last window may be shorter: clamp the query position.
            query_pos = min(stop - 1, seq_length - 1)
            context, _ = self.attention(encoded[:, query_pos, :], encoded[:, start:stop, :])
            window_contexts.append(context)

        # Average the per-window context vectors.
        pooled = torch.stack(window_contexts, dim=1).mean(dim=1)
        return F.softmax(self.fc(pooled), dim=1)

# Attention-based classifier; the positional arguments 10 and 2 are
# window_size and stride.
modelB = AttentionTextClassifier(embedding, HIDDEN_SIZE, NCLASSES, 10, 2, N_LAYERS, DROPOUT)
torchinfo.summary(modelB)


In [None]:
# Example input data
sample_text = ["this is a test recipe, very long test recipe !"]
tokenized = [tokenizer.tokenize(sample, preprocess=True) for sample in sample_text]
input_tensor = torch.tensor(tokenized, dtype=torch.long)  # (batch, seq_len)
text_lengths = torch.tensor([len(seq) for seq in tokenized], dtype=torch.long)  # (batch,)

# Run the model
modelB.eval()
# Reset the call counters and the log file, then trace this forward pass.
logger.zero()
logger.clear_log()
logger.on()
with torch.no_grad():
    y_pred = modelB(input_tensor, text_lengths)
    print(y_pred)
    # Map the predicted probability vector back to a category label.
    print(f"Predicted class: {cat_encoder.inverse_transform(y_pred)}" )
logger.off()

In [None]:
optimizer = torch.optim.RMSprop(modelB.parameters(), lr=3e-4)
dataloaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}

# nn.KLDivLoss expects its *input* as log-probabilities, while the classifier
# returns probabilities. 'batchmean' is the reduction that matches the
# mathematical definition of the KL divergence.
_kl_div = torch.nn.KLDivLoss(reduction='batchmean')


def loss(y_pred, y_true):
    """KL divergence between the target distribution and predicted probabilities."""
    # Clamp before the logarithm so that a zero probability cannot produce -inf.
    return _kl_div(torch.log(y_pred.clamp_min(1e-8)), y_true)


Helper.eval_fn = Scores.f1_score

In [None]:
_ = Helper.train(modelB, optimizer, dataloaders, loss, n_epochs=5, plot=True)