In [89]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, precision_score, precision_recall_curve
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from collections import Counter
from typing import Union
import torch.nn as nn
import pandas as pd
import joblib
import numpy as np
import torch

In [90]:
# Scalar type aliases used in annotations further down in this notebook.
# Int: any Python or NumPy integer.
Int = Union[int, np.integer]
# Real: any Python or NumPy float, plus everything covered by Int
# (nested unions are flattened, so this is a plain four-member Union).
Real = Union[float, np.floating, Int]

In [91]:
class RatioValidator:
    """Data descriptor that only accepts real numbers in the closed interval [0, 1].

    The validated value is stored on the owning instance under the private
    attribute ``"_" + <public attribute name>``. Reading the attribute before
    it has ever been assigned yields ``None``; reading it on the class itself
    yields the descriptor object.

    Raises (on assignment):
        TypeError: the value is not a Python/NumPy float or integer.
        ValueError: the value lies outside [0, 1] (this includes NaN, for
            which both comparisons are false).
    """

    def __set_name__(self, owner, name):
        # Keep the public name for error messages and the private one as the
        # per-instance storage slot.
        self.public_name = name
        self.name = "_" + name

    def __set__(self, instance, value):
        if not isinstance(value, (float, np.floating, int, np.integer)):
            raise TypeError(f"{self.public_name} must be a real number")
        if not (0 <= value <= 1):
            raise ValueError(f"{self.public_name} must be between 0 and 1")

        setattr(instance, self.name, value)

    def __get__(self, instance, owner=None):
        # Class-level access (instance is None) returns the descriptor itself,
        # which is the usual descriptor convention and keeps introspection working.
        if instance is None:
            return self
        return getattr(instance, self.name, None)


class StratifiedMultiLabelSplitter:
    """Split a multi-label sample into train, validation and test parts.

    For every class a quota is derived from its number of occurrences:
    ``floor(count * valid_size)`` for validation, ``floor(count * test_size)``
    for test and the remainder for train. Rows are then handed out greedily,
    from the rows carrying the most labels down to the rows carrying none
    (shuffled inside each group): a row goes to the part that would have the
    fewest classes at or above quota after receiving it; ties are resolved in
    the order train, validation, test. Because the quotas are per-class label
    counts rather than row counts, the resulting part sizes are only
    approximately proportional to ``valid_size`` / ``test_size``.

    To split into two parts only, pass ``valid_size=0.0``.

    Constructor parameters:
        valid_size: real number in [0, 1], share of the validation part.
        test_size: real number in [0, 1], share of the test part.
        random_state: integer seed for the shuffling done before assignment.

    Call parameters:
        x: features (``np.ndarray``, ``pd.DataFrame`` or ``pd.Series``).
        y: targets (``np.ndarray`` or ``pd.Series``); every element must be a
            ``np.ndarray`` holding the class labels of that row.

    Call returns:
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` as NumPy arrays.

    Raises:
        TypeError: wrong container type for ``x``/``y``, ``y`` elements that
            are not arrays, or an invalid ``random_state``.
        ValueError: ``valid_size + test_size > 1``, ``x`` and ``y`` of
            different length, or empty input.
    """

    valid_size, test_size = RatioValidator(), RatioValidator()

    def __init__(self, valid_size: Real, test_size: Real, random_state: Int = 21):
        self.random_state = random_state
        self.test_size = test_size
        self.valid_size = valid_size

        if self.test_size + self.valid_size > 1:
            raise ValueError("test_size + valid_size must be <= 1")

    def __call__(self, x: pd.DataFrame | pd.Series | np.ndarray, y: pd.Series | np.ndarray, *args, **kwargs):
        if not isinstance(x, (pd.DataFrame, np.ndarray, pd.Series)):
            raise TypeError("x must be a pandas.DataFrame or a pandas.Series or a np.ndarray")
        if not isinstance(y, (pd.Series, np.ndarray)):
            raise TypeError("y must be a pandas.Series or a np.ndarray")

        # Work on plain NumPy containers from here on.
        if isinstance(x, (pd.DataFrame, pd.Series)):
            x = x.values
        if isinstance(y, pd.Series):
            y = y.values

        if any(not isinstance(elem, np.ndarray) for elem in y):
            raise TypeError("y must contain arrays of class labels")

        if len(x) != len(y):
            raise ValueError("x and y must have the same length")
        if len(y) == 0:
            raise ValueError("x and y must contain at least one sample")

        # Number of labels carried by every row.
        lens = np.array([len(row_labels) for row_labels in y], dtype=np.uint64)

        # Occurrences of every class over the whole sample. Rows without labels
        # contribute nothing and are left out of the concatenation.
        labelled_rows = [row_labels for row_labels in y if len(row_labels) > 0]
        total_amount = dict(Counter(np.concatenate(labelled_rows))) if labelled_rows else {}
        class_lengths = np.array(list(total_amount.values()), dtype=np.int64)
        labels = np.array(list(total_amount.keys()))

        # Per-class quotas of every part.
        valid_prop = np.floor(class_lengths * self.valid_size)
        test_prop = np.floor(class_lengths * self.test_size)
        train_prop = class_lengths - valid_prop - test_prop

        if np.any(train_prop < 0):
            raise ValueError("Due to rounding, train size became negative. Use another valid/test sizes.")

        indices = np.arange(0, len(y), dtype=np.uint64)
        max_len = int(lens.max())

        # Row k of `quotas` / `counts` and key k of `ind` refer to:
        # 0 - train, 1 - validation, 2 - test.
        quotas = np.stack([train_prop, valid_prop, test_prop])
        counts = np.zeros(quotas.shape, dtype=np.int64)
        ind = {0: [], 1: [], 2: []}

        rng = np.random.default_rng(seed=self.random_state)
        # Go from the rows with the most labels down to length 0, so that rows
        # without any label are assigned as well instead of being dropped.
        for curr_len in range(max_len, -1, -1):
            curr_idx = indices[lens == curr_len]
            rng.shuffle(curr_idx)

            for index in curr_idx:
                # Boolean vector shaped like `labels`: which classes this row carries.
                if curr_len:
                    curr_row = np.isin(labels, y[index])
                else:
                    curr_row = np.zeros(labels.shape, dtype=bool)

                # For every part: how many classes would be at or above quota
                # if the row were added to it.
                rates = np.sum(np.greater_equal(counts + curr_row, quotas), axis=1)

                # argmin returns the first minimum, hence the tie order train, valid, test.
                target = int(np.argmin(rates))

                ind[target].append(index)
                counts[target] += curr_row

        train_idx, valid_idx, test_idx = (np.array(ind[part], dtype=np.intp) for part in (0, 1, 2))

        # X_train, X_valid, X_test, y_train, y_valid, y_test
        return x[train_idx], x[valid_idx], x[test_idx], y[train_idx], y[valid_idx], y[test_idx]

    @property
    def random_state(self):
        """Integer seed passed to ``np.random.default_rng`` on every call."""
        return self._random_state

    @random_state.setter
    def random_state(self, random_state):
        if not isinstance(random_state, (int, np.integer)):
            raise TypeError("Parameter 'random_state' must be an integer")
        elif not (0 <= random_state <= 4294967295):
            # NOTE(review): an out-of-range value would normally be a ValueError;
            # TypeError is kept so that existing callers catching it keep working.
            raise TypeError("Parameter 'random_state' must be between 0 and 2^32 - 1")

        self._random_state = random_state

In [92]:
# Read both task tables and stack them (rows of data.csv first, then the
# rows of fake_data.csv) under a fresh 0..n-1 index.
real_data = pd.read_csv("./data.csv", header=0)
fake_data = pd.read_csv("./fake_data.csv", header=0)
data = pd.concat((real_data, fake_data), axis=0, ignore_index=True)

# The source frames are not needed once merged.
del fake_data
del real_data

data

Unnamed: 0,description,classes,duration
0,Разметить данные,Работа,20160
1,Сделать ТАУ,Образование,60
2,Сходить в магазин за продуктами,Покупки,45
3,Сходить в магазин,Покупки,120
4,Приготовить обед,Быт,90
...,...,...,...
2814,Записаться на йогу,Спорт,20
2815,Сходить в океанариум,Развлечения,120
2816,Купить новый шарф,Покупки,30
2817,Погулять в парке с семьей,"Отношения, Развлечения",90


In [93]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Используемое устройство:", device)

# Pretrained encoder and its matching tokenizer, both loaded by checkpoint name.
model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model = bert_model.to(device)

# Evaluation mode: the encoder is only used for inference in get_embeddings.
bert_model.eval()

def get_embeddings(texts: list[str], batch_size: int = 32) -> np.ndarray:
    """Encode texts with the module-level ``bert_model`` and return one vector per text.

    The texts are tokenized with the module-level ``tokenizer`` (padded to the
    longest text of the batch, truncated to 128 tokens), run through
    ``bert_model`` on ``device`` without gradient tracking, and the hidden
    state at sequence position 0 of every text is kept as its embedding.

    Args:
        texts: texts to encode. Any sequence of strings is accepted; a single
            string is treated as one text.
        batch_size: number of texts processed per forward pass (must be >= 1).

    Returns:
        Array with one row per text. For empty input an array of shape
        ``(0, bert_model.config.hidden_size)`` is returned.

    Raises:
        ValueError: ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Normalise the input to a real list so that slicing yields lists of str.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if not texts:
        # Nothing to encode: np.vstack would fail on an empty list.
        # float32 is assumed to match the encoder output dtype - TODO confirm.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,                  # pad up to the longest text in the batch
            truncation=True,               # cut texts longer than max_length
            max_length=128,                # maximum sequence length (in tokens)
            return_tensors="pt"            # return PyTorch tensors
        ).to(device)

        with torch.no_grad():
            # last_hidden_state has shape (batch_size, sequence_length, hidden_size)
            outputs = bert_model(**inputs)
        # Keep the vector at position 0 of every sequence.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embeddings)

    return np.vstack(embeddings)

# Embed every task description once; the resulting matrix is the feature
# set used by the rest of the notebook.
descriptions = data["description"].tolist()
X_emb = get_embeddings(descriptions)

Используемое устройство: cuda


In [94]:
def _parse_classes(raw):
    """Turn the raw ``classes`` cell (class names separated by commas/whitespace) into an array of labels.

    Values that are already arrays are returned unchanged, which makes this
    cell safe to re-run after ``data["classes"]`` has been converted.
    """
    if isinstance(raw, np.ndarray):
        return raw
    # dtype=str keeps a string dtype even when the cell contains no class at all.
    return np.array(raw.replace(",", "").split(), dtype=str)


# valid_size=0.0 leaves the validation part empty: only train and test are produced.
splitter = StratifiedMultiLabelSplitter(valid_size=0.0, test_size=0.15, random_state=21)
data["classes"] = data["classes"].apply(func=_parse_classes)
X_train, X_valid, X_test, y_train, y_valid, y_test = splitter(X_emb, data["classes"])

print(f"X_train.shape: {X_train.shape}\nX_valid.shape: {X_valid.shape}\nX_test.shape: {X_test.shape}")

X_train.shape: (2206, 312)
X_valid.shape: (0, 312)
X_test.shape: (613, 312)


In [95]:
# Learn the label vocabulary from the training labels only, then encode the
# train and test label sets as binary indicator matrices.
# NOTE: y_train / y_test are overwritten here, so this cell must not be
# re-run without re-running the split cell first.
mlb = MultiLabelBinarizer().fit(y_train)
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)

y_train

array([[1, 0, 0, ..., 1, 0, 1],
       [1, 0, 0, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [96]:
# float32 tensor copies of the features and indicator targets.
# Note that the "*_val_t" tensors are built from the test split
# (the validation part of the split is empty).
X_train_t, y_train_t, X_val_t, y_val_t = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train, y_train, X_test, y_test)
)

In [97]:
class AdvancedMultiLabelMLP(nn.Module):
    """MLP classifier head: input projection, two residual blocks, dropout, output layer.

    The forward pass returns raw logits (one per label); no sigmoid is applied
    inside the module.

    Args:
        input_dim: size of the input feature vector.
        num_labels: number of output logits.
        hidden_dim: width of the hidden representation.
    """

    def __init__(self, input_dim, num_labels, hidden_dim=512):
        super().__init__()
        # Layers are created in the same order in which they are applied.
        stem = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
        ]
        trunk = [ResidualBlock(hidden_dim), ResidualBlock(hidden_dim)]
        head = [nn.Dropout(0.3), nn.Linear(hidden_dim, num_labels)]
        self.net = nn.Sequential(*stem, *trunk, *head)

    def forward(self, x):
        logits = self.net(x)
        return logits

class ResidualBlock(nn.Module):
    """Two-layer residual unit: ``GELU(x + block(x))`` with input and output of size ``dim``.

    ``block`` is Linear -> BatchNorm1d -> GELU -> Dropout(0.2) -> Linear -> BatchNorm1d.
    """

    def __init__(self, dim):
        super().__init__()
        layers = [
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        ]
        self.block = nn.Sequential(*layers)
        self.gelu = nn.GELU()

    def forward(self, x):
        # Skip connection: add the transformed signal to the input, then activate.
        residual = self.block(x)
        return self.gelu(x + residual)

# Seed the PyTorch RNG so that weight initialisation, dropout and batch
# shuffling, and therefore the reported metrics, can be reproduced on a
# re-run (GPU kernels may still introduce small non-deterministic differences).
torch.manual_seed(21)

# Model initialisation: one output logit per label column of the target matrix.
num_labels = y_train_t.shape[1]
model = AdvancedMultiLabelMLP(input_dim=X_emb.shape[1], num_labels=num_labels).to(device)

# Loss function and optimiser. BCEWithLogitsLoss works on the raw logits
# returned by the model.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Data loaders
train_dataset = torch.utils.data.TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [98]:
# NOTE: earlier baseline architecture, kept commented out for reference only;
# the notebook trains AdvancedMultiLabelMLP instead.
# class MultiLabelMLP(nn.Module):
#     def __init__(self, input_dim, num_labels, hidden_dim=256): # input_dim - dimensionality of the feature space; num_labels - number of classes; hidden_dim - size of the first hidden layer
#         super().__init__()
#         self.net = nn.Sequential(                               # sequential structure, i.e. the layers are applied one after another
#             nn.Linear(input_dim, hidden_dim),                   # fully connected linear layer: R^(input_dim) -> R^(hidden_dim)
#             nn.ReLU(),                                          # adds non-linearity by applying max(x, 0) element-wise
#             nn.Dropout(0.3),                                    # regularisation: randomly "switches off" 30% of the neurons during training
#             nn.Linear(hidden_dim, hidden_dim // 2),             # the second fully connected layer halves the dimensionality (256 → 128), i.e. keeps only the most important information
#             nn.ReLU(),                                          # non-linearity again
#             nn.Dropout(0.3),                                    # and regularisation
#             nn.Linear(hidden_dim // 2, num_labels)              # the output layer maps the compressed representation to num_labels values
#         )

#     def forward(self, x):
#         return self.net(x)

# # Model initialisation
# num_labels = y_train_t.shape[1]
# model = MultiLabelMLP(input_dim=X_emb.shape[1], num_labels=num_labels).to(device)

# # Loss function and optimiser
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# # Data loaders
# train_dataset = torch.utils.data.TensorDataset(X_train_t, y_train_t)
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [99]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean per-batch loss.

    Args:
        model: module to train; it is switched to training mode.
        loader: iterable of ``(features, targets)`` batches that supports ``len()``.
        criterion: callable ``criterion(outputs, targets)`` returning the loss tensor.
        optimizer: optimiser updating the parameters of ``model``.
        device: device the batches are moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()                                   # training mode
    batch_losses = []                               # loss value of every batch of this epoch
    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()                       # drop the gradients of the previous step
        loss = criterion(model(features), targets)  # forward pass (raw outputs) and loss
        loss.backward()                             # back-propagate the error
        optimizer.step()                            # update the weights from the gradients
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(loader)          # mean loss over the epoch

def evaluate(model, X, y, device, threshold=0.5):
    """Predict binary label indicators for ``X``.

    The model is switched to evaluation mode, run on ``X`` without gradient
    tracking, its outputs are passed through a sigmoid and compared with
    ``threshold``.

    Args:
        model: module returning one logit per label.
        X: input tensor; it is moved to ``device`` before the forward pass.
        y: not used by this function; kept so existing calls keep working.
        device: device the forward pass runs on.
        threshold: decision threshold on the sigmoid probabilities. Either a
            scalar or an array broadcastable against the output (for example
            one value per label). Defaults to 0.5.

    Returns:
        Boolean NumPy array with the same shape as the model output; ``True``
        where the probability is strictly greater than the threshold.
    """
    model.eval()                                    # evaluation mode
    with torch.no_grad():
        probabilities = torch.sigmoid(model(X.to(device))).cpu().numpy()
    return probabilities > threshold


N_EPOCHS = 10     # number of passes over the training data
EVAL_EVERY = 5    # report hold-out metrics every EVAL_EVERY epochs

# NOTE: the "val" tensors are built from the test split, so the metrics
# printed here are computed on the same data as the final report.
for epoch in range(N_EPOCHS):
    loss = train_epoch(model, train_loader, criterion, optimizer, device)
    if (epoch + 1) % EVAL_EVERY == 0:
        val_preds = evaluate(model, X_val_t, y_val_t, device)
        # zero_division=0 on both metrics: a class without any positive
        # prediction scores 0 without raising a warning.
        f1 = f1_score(y_test, val_preds, average='macro', zero_division=0)
        prec = precision_score(y_test, val_preds, average='macro', zero_division=0)
        print(f"Эпоха {epoch+1}, Loss: {loss:.4f}, Val F1-macro: {f1:.4f}, Precision: {prec:.4f}")

Эпоха 5, Loss: 0.1082, Val F1-macro: 0.5933, Precision: 0.5727
Эпоха 10, Loss: 0.0611, Val F1-macro: 0.5559, Precision: 0.4933


In [100]:
# Final predictions (threshold 0.5) on the held-out tensors and a per-class report.
final_preds = evaluate(model, X_val_t, y_val_t, device)

print("\n=== Итоговый отчёт ===")
report = classification_report(y_test, final_preds, target_names=mlb.classes_, zero_division=0)
print(report)


=== Итоговый отчёт ===
              precision    recall  f1-score   support

         Быт       0.53      0.78      0.63       130
     Встречи       0.45      0.63      0.53        30
   Документы       0.58      0.68      0.62        28
    Здоровье       0.40      0.57      0.47        37
     Красота       0.67      0.57      0.62        14
 Образование       0.49      0.63      0.55        57
   Отношения       0.25      0.40      0.31        15
     Покупки       0.80      0.94      0.87        48
 Путешествия       0.58      0.69      0.63        16
      Работа       0.56      0.70      0.62        99
 Развлечения       0.40      0.67      0.50        64
       Спорт       0.46      0.72      0.56        25
       Хобби       0.25      0.47      0.33        55

   micro avg       0.48      0.68      0.56       618
   macro avg       0.49      0.65      0.56       618
weighted avg       0.50      0.68      0.57       618
 samples avg       0.53      0.68      0.58       618



In [101]:
# Per-label probabilities on the held-out tensors, used for the threshold search.
model.eval()
with torch.no_grad():
    X_val_t = X_val_t.to(device)
    outputs = model(X_val_t)
    proba = outputs.sigmoid().cpu().numpy()

In [102]:
# For every label pick the first point of the precision-recall curve whose
# precision exceeds 0.8 and remember its decision threshold.
# NOTE(review): the thresholds are chosen on y_test, the same data the
# metrics are reported on - confirm this is intended.
thlds = dict()
for i, label in enumerate(mlb.classes_):
    precision, recall, thresholds = precision_recall_curve(y_test[:, i], proba[:, i])
    idx = np.argmax(precision > 0.8)
    # precision/recall hold one more element than thresholds: the final point
    # (precision 1, recall 0) has no threshold of its own. If it is the only
    # point above 0.8, fall back to the last real threshold instead of
    # indexing past the end of `thresholds`.
    idx = min(idx, len(thresholds) - 1)
    print(f"{label}: precision = {precision[idx]:.2f}; recall = {recall[idx]:.2f}; threshold = {thresholds[idx]}")
    thlds[label] = thresholds[idx]

Быт: precision = 0.80; recall = 0.44; threshold = 0.9906858801841736
Встречи: precision = 1.00; recall = 0.10; threshold = 0.9988874793052673
Документы: precision = 0.81; recall = 0.61; threshold = 0.9688215255737305
Здоровье: precision = 1.00; recall = 0.11; threshold = 0.9814250469207764
Красота: precision = 0.88; recall = 0.50; threshold = 0.6611655354499817
Образование: precision = 0.83; recall = 0.09; threshold = 0.9979484677314758
Отношения: precision = 1.00; recall = 0.13; threshold = 0.9996955394744873
Покупки: precision = 0.80; recall = 0.94; threshold = 0.539891242980957
Путешествия: precision = 0.88; recall = 0.44; threshold = 0.994171679019928
Работа: precision = 0.83; recall = 0.10; threshold = 0.9964026212692261
Развлечения: precision = 0.83; recall = 0.16; threshold = 0.9985381364822388
Спорт: precision = 0.82; recall = 0.56; threshold = 0.9726100564002991
Хобби: precision = 1.00; recall = 0.05; threshold = 0.9989047050476074


In [103]:
# Persist the trained weights, the per-label thresholds and the fitted binarizer.
# NOTE(review): the first two files are written to the current working
# directory while the binarizer goes to ../ml-services/duration_service/ -
# confirm that the consumers expect exactly these locations.
# The .pkl files are pickles: load them only from trusted sources.
state = model.state_dict()
torch.save(state, "mlp_model.pth")
joblib.dump(thlds, "thresholds.pkl")
joblib.dump(mlb, "../ml-services/duration_service/mlb.pkl")

['mlb.pkl']