# Классификатор текстов с описаниями товаров (с использованием RuBERT)

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
from _funcs import transform_frame, feature_creator, image_path

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool, cv

import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import f_classif

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
import tqdm

In [None]:
# Read the raw training table, then run it through the project helpers
# imported from _funcs: transform_frame first, feature_creator second.
df = pd.read_csv("ml_ozon_counterfeit_train.csv", encoding="utf-8")
df_upd = feature_creator(df.pipe(transform_frame))
# Last expression of the cell: shows the resulting column index.
df_upd.columns

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: The raw texts. Either an object exposing ``tolist()`` (for
            example a pandas Series or a NumPy array) or any plain iterable
            such as a list or tuple.
        labels: Integer class ids, positionally aligned with ``texts``.
            Accepted container types are the same as for ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``.
            It is expected to return a mapping whose values support
            ``.squeeze(0)`` (tensors with a leading batch axis of size 1).
        max_len: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = self._as_list(texts)
        self.labels = self._as_list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a plain Python list."""
        # Prefer ``tolist()`` when present (Series / ndarray / tensor) because
        # it also converts the elements to native Python scalars; otherwise
        # fall back to ``list`` so that ordinary sequences are accepted too.
        to_list = getattr(values, "tolist", None)
        return to_list() if callable(to_list) else list(values)

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label.

        Returns:
            A dict holding every entry produced by the tokenizer, each with
            its leading batch axis squeezed away, plus a ``"labels"`` entry
            containing the label as a ``torch.long`` scalar tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer is called on a single text, so drop the batch axis of
        # size 1; the DataLoader adds its own batch axis when collating.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Fine-tune RuBERT as a two-class sequence classifier, evaluate it on the
# held-out split after every epoch, and save the final weights to disk.
MODEL_NAME = "DeepPavlov/rubert-base-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# NOTE(review): df_text_train_x / df_text_train_y / df_text_test_x /
# df_text_test_y are not defined in the visible part of this notebook;
# they must already exist in the session before this cell runs.
train_dataset = TextDataset(df_text_train_x['name_description'], df_text_train_y['resolution'], tokenizer)
test_dataset = TextDataset(df_text_test_x['name_description'], df_text_test_y['resolution'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=196, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=196)

EPOCHS = 4

optimizer = AdamW(model.parameters(), lr=2e-5)
# The linear schedule has to span every optimizer step that is actually taken.
# It used to be sized for 3 epochs while the loop ran 4, so the learning rate
# hit zero after the third epoch and the last epoch could not update weights.
num_training_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
# Not used by the loop below: the model returns its own loss when `labels`
# is passed. Kept so that the name stays defined for the rest of the session.
criterion = nn.CrossEntropyLoss()

for epoch in tqdm.tqdm(range(EPOCHS)):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model compute the loss internally.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        # The scheduler is stepped once per batch, matching num_training_steps.
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)

    # ---- evaluation pass on the test split ----
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm.tqdm(test_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Predicted class = index of the largest logit.
            predictions = torch.argmax(logits, dim=-1)

            preds.extend(predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Test Acc: {acc:.4f} | F1: {f1:.4f}")
    print()
    print(classification_report(true_labels, preds))

# Persist only the weights (state dict) of the model after the last epoch.
torch.save(model.state_dict(), 'rubert-texts.pth')

In [None]:
# Load the training table again and rebuild the engineered frame with the
# same transform_frame -> feature_creator helpers used earlier in the file.
df = pd.read_csv("ml_ozon_counterfeit_train.csv", encoding="utf-8")
df_upd = feature_creator(df.pipe(transform_frame))
# Last expression of the cell: shows the resulting column index.
df_upd.columns

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

MODEL_NAME = "DeepPavlov/rubert-base-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Build the two-label classifier, then replace its weights with the
# checkpoint stored in rubert-texts.pth (tensors are mapped onto `device`).
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.load_state_dict(torch.load("rubert-texts.pth", map_location=device))

# Move the model to the target device and switch it to evaluation mode.
model.to(device).eval()



def predict_texts(texts, batch_size=16, max_length=64):
    """Return the class probabilities predicted for each text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: List of strings to score. It is sliced into batches and each
            slice is handed to the tokenizer as-is.
        batch_size: Number of texts tokenized and scored per forward pass.
        max_length: Token limit that longer texts are truncated to.
            NOTE(review): ``TextDataset`` in this file defaults to
            ``max_len=128``; pass ``max_length=128`` if inference should see
            the same amount of text as training did.

    Returns:
        A list with one NumPy array per input text, in input order. Each
        array is the softmax over the last axis of the model's logits.
    """
    probs = []
    for start in tqdm.tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        # padding=True pads only up to the longest sequence in this batch.
        encodings = tokenizer(
            batch,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        input_ids = encodings["input_ids"].to(device)
        attention_mask = encodings["attention_mask"].to(device)

        # No gradients are needed for inference.
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits
            batch_probs = torch.softmax(logits, dim=-1)

        # Extending with a 2-D array appends one row (one text) at a time.
        probs.extend(batch_probs.cpu().numpy())
    return probs

In [None]:
# Score every description with the fine-tuned model.
# NOTE(review): `test` is not defined in the visible part of this notebook,
# and the training cell reads the 'name_description' column whereas this
# reads 'description' - confirm that this is the intended input.
text_preds = predict_texts(texts=test["description"].tolist())
# Last expression of the cell: shows the first 100 probability rows.
text_preds[:100]