In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import pytorch_lightning as pl
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torchmetrics


MODEL_NAME = 'cointegrated/rubert-tiny2'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TextDataset(Dataset):
    def __init__(
        self,
        texts: list[str],
        labels: list[str],
        tokenizer: AutoTokenizer,
        max_length: int = 512,
    ):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, 
            padding='max_length', 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors='pt',
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [7]:
class BERTClassifier(pl.LightningModule):
    def __init__(
        self,
        n_classes: int,
        steps_per_epoch=None,
        n_epochs=None,
        lr=2e-5,
    ):
        super().__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME).train()
        hidden_size = self.bert.config.hidden_size
        self.pre_classifier = nn.Linear(hidden_size, hidden_size, bias=True)
        self.classifier = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, n_classes, bias=True),
        )

        self.loss_fn = nn.CrossEntropyLoss()
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr

        self.train_accuracy = torchmetrics.Accuracy(
            task='multiclass',
            num_classes=n_classes,
            average='macro',
        )
        self.val_accuracy = torchmetrics.Accuracy(
            task='multiclass',
            num_classes=n_classes,
            average='macro',
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)[1]
        return self.classifier(outputs)

    def training_step(self, batch):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = self(input_ids, attention_mask)
        loss = self.loss_fn(outputs, labels)
        self.train_accuracy.update(outputs.softmax(dim=-1), labels)
        self.log('train_loss', loss)
        return loss

    def on_train_epoch_end(self):
        accuracy = self.train_accuracy.compute()
        self.log('train_accuracy', accuracy, on_epoch=True)
        print(f'\nEpoch {self.current_epoch} Train Accuracy: {accuracy:.4f}')
        self.train_accuracy.reset()

    def validation_step(self, batch):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = self(input_ids, attention_mask)
        loss = self.loss_fn(outputs, labels)
        self.val_accuracy.update(outputs.softmax(dim=-1), labels)
        self.log('val_loss', loss)
        return loss

    def on_validation_epoch_end(self):
        accuracy = self.val_accuracy.compute()
        self.log('val_accuracy', accuracy, on_epoch=True)
        print(f'\nEpoch {self.current_epoch} Validation Accuracy: {accuracy:.4f}')
        self.val_accuracy.reset()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer

In [8]:
def preprocess_data(
        texts: list[str],
        labels: list[str],
        max_length: int,
        batch_size: int,
    ):
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.1, random_state=42,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    return train_loader, val_loader, le

In [9]:
def train_model(
        texts,
        labels,
        num_classes,
        max_length=512,
        batch_size=16,
        n_epochs=5,
        lr=2e-5,
    ):
    train_loader, val_loader, label_encoder = preprocess_data(texts, labels, max_length, batch_size)

    steps_per_epoch = len(train_loader)
    model = BERTClassifier(
        n_classes=num_classes,
        steps_per_epoch=steps_per_epoch,
        n_epochs=n_epochs,
        lr=lr,
    )

    trainer = pl.Trainer(
        max_epochs=n_epochs, 
        devices=1 if torch.cuda.is_available() else None,
        accelerator= 'gpu' if torch.cuda.is_available() else 'cpu',
    )
    
    trainer.fit(model, train_loader, val_loader)

    return model, label_encoder

In [10]:
df = pd.read_csv('Вопрос ответ.csv')

texts = df['question'].to_list()
labels = (df['category'] + '///' + df['content']).to_list()

num_classes = len(set(labels))
model, label_encoder = train_model(texts, labels, num_classes)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
d:\GitHub\research\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\GitHub\research\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  9.78it/s]
Epoch 0 Validation Accuracy: 0.0000
                                                                           

d:\GitHub\research\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 95/95 [00:10<00:00,  9.31it/s, v_num=40]
Epoch 0 Validation Accuracy: 0.0285
Epoch 0: 100%|██████████| 95/95 [00:10<00:00,  8.95it/s, v_num=40]
Epoch 0 Train Accuracy: 0.0047
Epoch 1: 100%|██████████| 95/95 [00:10<00:00,  9.45it/s, v_num=40]
Epoch 1 Validation Accuracy: 0.0298
Epoch 1: 100%|██████████| 95/95 [00:10<00:00,  9.08it/s, v_num=40]
Epoch 1 Train Accuracy: 0.0077
Epoch 2: 100%|██████████| 95/95 [00:10<00:00,  9.47it/s, v_num=40]
Epoch 2 Validation Accuracy: 0.0323
Epoch 2: 100%|██████████| 95/95 [00:10<00:00,  9.10it/s, v_num=40]
Epoch 2 Train Accuracy: 0.0080
Epoch 3: 100%|██████████| 95/95 [00:09<00:00,  9.54it/s, v_num=40]
Epoch 3 Validation Accuracy: 0.0379
Epoch 3: 100%|██████████| 95/95 [00:10<00:00,  9.16it/s, v_num=40]
Epoch 3 Train Accuracy: 0.0088
Epoch 4: 100%|██████████| 95/95 [00:09<00:00,  9.56it/s, v_num=40]
Epoch 4 Validation Accuracy: 0.0413
Epoch 4: 100%|██████████| 95/95 [00:10<00:00,  9.18it/s, v_num=40]
Epoch 4 Train Accuracy: 0.

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 95/95 [00:11<00:00,  8.41it/s, v_num=40]


In [11]:
def predict_class(model, tokenizer, text, label_encoder, device='cpu'):
    inputs = tokenizer(
        text, 
        padding='max_length', 
        truncation=True, 
        max_length=64,
        return_tensors='pt'
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        predictions = torch.argmax(outputs, dim=-1)

    predicted_class_id = predictions.item()
    predicted_class_text = label_encoder.inverse_transform([predicted_class_id])[0]

    return predicted_class_text


text = 'Как трудоустроится совместителем?'
model.eval()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

predicted_class_text = predict_class(model, tokenizer, text, label_encoder, device)
print(f'Predicted class: {predicted_class_text}')

Predicted class: поддержка///Создайте, пожалуйста, обращение в ИТ поддержку на портале support
