In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error

import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

In [2]:
real_data = pd.read_csv("./data.csv", header=0)
fake_data = pd.read_csv("./fake_data.csv", header=0)

data = pd.concat([real_data, fake_data], axis=0, ignore_index=True)
del real_data, fake_data

binarizer = MultiLabelBinarizer()
X_classes = binarizer.fit_transform(data["classes"].str.replace(",", "").str.split())

data.drop(["classes"], inplace=True, axis=1)

data

Unnamed: 0,description,duration
0,Разметить данные,20160
1,Сделать ТАУ,60
2,Сходить в магазин за продуктами,45
3,Сходить в магазин,120
4,Приготовить обед,90
...,...,...
2814,Записаться на йогу,20
2815,Сходить в океанариум,120
2816,Купить новый шарф,30
2817,Погулять в парке с семьей,90


In [3]:
target = data["duration"].values
mask_1, mask_2 = target > np.quantile(target, 0.98), target < np.quantile(target, 0.04)
idxs = np.concatenate([np.where(mask_1)[0], np.where(mask_2)[0]], axis=0)
X_classes = np.delete(X_classes, idxs, axis=0)

data.drop(labels=idxs, inplace=True)
data.reset_index(inplace=True, drop=True)

data

Unnamed: 0,description,duration
0,Сделать ТАУ,60
1,Сходить в магазин за продуктами,45
2,Сходить в магазин,120
3,Приготовить обед,90
4,Погулять с собакой,30
...,...,...
2711,Записаться на йогу,20
2712,Сходить в океанариум,120
2713,Купить новый шарф,30
2714,Погулять в парке с семьей,90


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Используемое устройство:", device)

model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name).to(device)

bert_model.eval()

def get_embeddings(texts: list[str], batch_size: int = 32) -> np.ndarray:
    # batch_size - кол-во текстов, которые нужно обработать за 1 проход
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(
            batch,
            padding=True,                  # добавить паддинг до длины самого длинного в батче
            truncation=True,               # обрезать, если длиннее max_length
            max_length=128,                # макс. длина последовательности (в токенах)
            return_tensors="pt"            # вернуть PyTorch тензоры
        ).to(device)

        with torch.no_grad():
            outputs = bert_model(**inputs) # тензор формы (batch_size, sequence_length, hidden_size)
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embeddings)

    return np.vstack(embeddings)

X_emb = get_embeddings(data["description"].tolist())

Используемое устройство: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [5]:
X_emb = np.concatenate([X_emb, X_classes], axis=1)

print(X_emb.shape, X_emb.dtype)

(2716, 325) float64


In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_emb, data["duration"].values, test_size=0.15, random_state=21)

print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape: {X_val.shape}")

X_train.shape: (2308, 325)
X_test.shape: (408, 325)


In [7]:
def regression_report(y_true: np.ndarray | pd.Series, y_pred: np.ndarray | pd.Series, model_name: str = ""):
    print(f"\n===== {model_name} =====\n")
    print(f"RMSE: {root_mean_squared_error(y_true, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"MAPE: {mean_absolute_percentage_error(y_true, y_pred):.4f}")

In [8]:
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)
y_val_t = torch.tensor(y_val, dtype=torch.float32)

train_dataset = torch.utils.data.TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim)
        )
        self.gelu = nn.GELU()

    def forward(self, x):
        return self.gelu(x + self.block(x))


class AdvancedRegressionMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=384):
        super().__init__()
        self.input_proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU()
        )

        self.blocks = nn.Sequential(
            ResidualBlock(hidden_dim),
            ResidualBlock(hidden_dim)
        )

        self.head = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.input_proj(x)
        x = self.blocks(x)
        return self.head(x).squeeze(-1)

In [10]:
def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, X, y, device):
    model.eval()
    with torch.no_grad():
        X = X.to(device)
        outputs = model(X)
        preds = outputs.cpu().numpy()
    return preds

In [26]:
def mape_loss(y_pred, y_true):
    """MAPE loss: mean absolute percentage error"""
    eps = 1e-8
    return torch.mean(torch.abs((y_true - y_pred) / (y_true + eps))) * 100

In [29]:
input_dim = X_emb.shape[1]
model = AdvancedRegressionMLP(input_dim=input_dim, hidden_dim=256).to(device)

criterion = mape_loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(15):
    loss = train_epoch(model, train_loader, criterion, optimizer, device)
    if (epoch + 1) % 5 == 0:
        val_preds = evaluate(model, X_val_t, y_val_t, device)

        mae = np.mean(np.abs(y_val - val_preds))
        mape = np.mean(np.abs((y_val - val_preds) / y_val)) * 100

        print(f"Эпоха {epoch+1}, Loss: {loss:.4f}, Val MAE: {mae:.2f} мин, Val MAPE: {mape:.2f}%")

Эпоха 5, Loss: 55.2482, Val MAE: 66.94 мин, Val MAPE: 58.04%
Эпоха 10, Loss: 52.8933, Val MAE: 61.14 мин, Val MAPE: 59.83%
Эпоха 15, Loss: 37.8257, Val MAE: 59.79 мин, Val MAPE: 55.25%


In [30]:
final_preds = evaluate(model, X_val_t, y_val_t, device)

mae = np.mean(np.abs(y_val - final_preds))
mape = np.mean(np.abs((y_val - final_preds) / y_val)) * 100

print("\n=== Итоговая оценка ===")
print(f"MAE: {mae:.2f} минут")
print(f"MAPE: {mape:.2f}%")


=== Итоговая оценка ===
MAE: 59.79 минут
MAPE: 55.25%


In [23]:
torch.save({
    'model_state_dict': model.state_dict(),
    'input_dim': X_emb.shape[1],
    'hidden_dim': 256
}, "regression_model.pth")