In [23]:
import os
import torch
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
from textpruner import PipelinePruner, TransformerPruningConfig, VocabularyPruningConfig, GeneralConfig
import time
import warnings
warnings.filterwarnings('ignore')

Загружаем набор данных: собранные из Telegram вопросы и ответы на них, дополненные негативными примерами (negative samples). Метка `relevance` равна 1, если пара «вопрос — ответ» релевантна, и 0, если нет.

In [24]:
# Fetch the question/answer relevance corpus by its dataset identifier.
DATASET_NAME = 'Den4ikAI/russian_dialogues'
dataset = load_dataset(path=DATASET_NAME)

Using custom data configuration Den4ikAI--russian_dialogues-a0532d1b6de2e5c9
Reusing dataset json (/home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
# Turn the 'relevance' column into a class-label feature.
dataset = dataset.class_encode_column(column='relevance')

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-0536867201af999e.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-1967d52384a58767.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-95ada20d539d614a.arrow


In [26]:
# Carve a shuffled 5% hold-out out of the 'train' split (fixed seed) and keep
# only that hold-out for the rest of the notebook.
split_options = dict(
    test_size=0.05,
    shuffle=True,
    seed=42,
    # stratify_by_column='relevance',  # stratification is left disabled
)
dataset = dataset['train'].train_test_split(**split_options)['test']

Loading cached split indices for dataset at /home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-a0a52cb35c6b4681.arrow and /home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-8029ced1bb69a1fc.arrow


In [27]:
def _has_text_fields(example):
    """Return True only when both 'question' and 'answer' are exactly of type str."""
    return all(type(example[field]) is str for field in ('question', 'answer'))


# Drop rows whose question or answer is not a plain string.
dataset = dataset.filter(_has_text_fields)

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-b5ac06dc99303b16.arrow


In [28]:
# Checkpoint identifier shared by the classifier and its tokenizer.
MODEL_NAME = 'Den4ikAI/ruBert-base-qa-ranker'

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [29]:
# Keep the model on the CPU unconditionally. The previous guard moved it to the
# CPU only when CUDA was available, which never changed the outcome.
# NOTE(review): PyTorch dynamic quantization (used further down) targets CPU
# execution, so the baseline is evaluated on the CPU too — confirm this is intended.
model.to('cpu')
# Inference only: make sure dropout and similar layers are in evaluation mode.
model.eval()
print(model.device)

cpu


In [30]:
def tokenization(example):
    """Tokenize one question/answer pair into a fixed-length encoding.

    The input text is built by hand as
    ``'[CLS]' + question + '[RESPONSE_TOKEN]' + answer``; the tokenizer is told
    not to add special tokens of its own. Every example is truncated and padded
    to 512 tokens.

    Args:
        example: mapping with string values under ``'question'`` and ``'answer'``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the assembled text.
    """
    text = ''.join(('[CLS]', example['question'], '[RESPONSE_TOKEN]', example['answer']))
    encode_options = dict(
        max_length=512,
        padding='max_length',
        truncation=True,
        add_special_tokens=False,
    )
    return tokenizer(text, **encode_options)

In [31]:
# Cap the evaluation subset at the first N_EVAL_SAMPLES rows. The size is
# clamped so a hold-out smaller than the cap does not make `select` fail.
N_EVAL_SAMPLES = 5000
dataset = dataset.select(indices=range(min(N_EVAL_SAMPLES, len(dataset))))

In [32]:
# Tokenize example by example (no batching).
dataset = dataset.map(function=tokenization, batched=False)

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/json/Den4ikAI--russian_dialogues-a0532d1b6de2e5c9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-f241ebaf2d1353ad.arrow


In [33]:
# Expose only the model inputs and the label column, returned as torch tensors.
model_input_columns = ["input_ids", "token_type_ids", "attention_mask", "relevance"]
dataset.set_format(type="torch", columns=model_input_columns)

In [34]:
# Collate function used by the DataLoader to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [35]:
# Number of examples per batch; passed to the evaluation DataLoader.
batch_size = 64

In [36]:
# Deterministic evaluation loader: fixed order, no dropped tail batch,
# loading done in the main process.
loader_options = dict(
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(dataset, **loader_options)

In [37]:
def predict_with_model(model, dataloader, max_idx=None):
    """Run ``model`` over ``dataloader`` and collect labels and sigmoid scores.

    Args:
        model: module exposing ``.device`` and, when called with ``input_ids``,
            ``attention_mask`` and ``token_type_ids``, returning an object with
            a ``.logits`` tensor.
        dataloader: iterable of batches exposing ``relevance``, ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` attributes and a
            ``.to(device)`` method.
        max_idx: maximum number of batches to process. ``None`` (default)
            processes the whole dataloader.

    Returns:
        Tuple ``(facts, preds)`` of numpy arrays: the ``relevance`` values and
        the sigmoid of the logits, each concatenated over the processed batches.

    Raises:
        ValueError: if no batch was processed (empty dataloader or
            ``max_idx <= 0``).
    """
    facts = []
    preds = []

    # `is None` rather than truthiness, so that max_idx=0 is not mistaken for "no limit".
    total = len(dataloader) if max_idx is None else max_idx

    # Score in evaluation mode, then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        for idx, batch in tqdm(enumerate(dataloader), total=total):
            # Stop before the batch at index max_idx, so exactly max_idx batches
            # are scored, matching the progress-bar total.
            if max_idx is not None and idx >= max_idx:
                break

            facts.append(batch.relevance.cpu().numpy())
            batch = batch.to(model.device)

            with torch.no_grad():
                pred = model(
                    input_ids=batch.input_ids,
                    attention_mask=batch.attention_mask,
                    token_type_ids=batch.token_type_ids
                )
            preds.append(torch.sigmoid(pred.logits).cpu().numpy())
    finally:
        model.train(was_training)

    if not preds:
        raise ValueError('predict_with_model processed no batches')

    return np.concatenate(facts), np.concatenate(preds)


def evaluate_model(model, dev_dataloader):
    """Score ``model`` on ``dev_dataloader`` and return the ROC AUC.

    Predictions come from ``predict_with_model``; the first column of the
    returned scores is compared against the collected labels. The wall-clock
    duration of the prediction pass is printed so that different models can be
    compared on speed as well as quality.

    Args:
        model: model passed through to ``predict_with_model``.
        dev_dataloader: dataloader passed through to ``predict_with_model``.

    Returns:
        The value returned by ``roc_auc_score(facts, preds[:, 0])``.
    """
    # perf_counter is monotonic, so the measurement is immune to system clock changes.
    eval_start_time = time.perf_counter()
    facts, preds = predict_with_model(model, dev_dataloader)
    eval_duration_time = time.perf_counter() - eval_start_time

    # Show the timing message, which was previously built and then discarded.
    print('Eval time:  ' + str(eval_duration_time))

    return roc_auc_score(facts, preds[:, 0])

In [38]:
# Baseline: ROC AUC of the unmodified model on the evaluation loader.
roc_auc_score_orig = evaluate_model(model=model, dev_dataloader=test_dataloader)

  0%|          | 0/79 [00:00<?, ?it/s]

In [39]:
# Report the baseline score.
baseline_report = f'Dev Area Under ROC Curve is {roc_auc_score_orig} before quantization'
print(baseline_report)

Dev Area Under ROC Curve is 0.9739135049032972 before quantization


In [40]:
# Dynamically quantize every torch.nn.Linear layer of the CPU model to int8,
# then show the resulting module tree.
cpu_model = model.to('cpu')
layers_to_quantize = {torch.nn.Linear}
quantized_model_dino = torch.quantization.quantize_dynamic(
    cpu_model,
    qconfig_spec=layers_to_quantize,
    dtype=torch.qint8,
)
print(quantized_model_dino)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119548, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
            )
     

In [41]:
import os

In [42]:
def print_size_of_model(model):
    """Print the size of the model's serialized ``state_dict`` in megabytes.

    The state dict is written with ``torch.save`` to a file in a private
    temporary directory, so no file in the working directory is overwritten or
    deleted, and the temporary file is removed even if saving or measuring fails.

    Args:
        model: object exposing ``state_dict()``.

    Returns:
        None. The size (bytes divided by 1e6) is printed as ``Size (MB): <value>``.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (MB):', os.path.getsize(checkpoint_path)/1e6)

# Print the serialized size of the original model, then of the quantized one.
for candidate in (model.to('cpu'), quantized_model_dino):
    print_size_of_model(candidate)

Size (MB): 711.505197
Size (MB): 454.985029


In [43]:
# ROC AUC of the dynamically quantized model on the same evaluation loader.
roc_auc_score_quant = evaluate_model(model=quantized_model_dino, dev_dataloader=test_dataloader)

  0%|          | 0/79 [00:00<?, ?it/s]

In [44]:
# Report the score of the quantized model.
quantized_report = f'Dev Area Under ROC Curve is {roc_auc_score_quant } after quantization'
print(quantized_report)

Dev Area Under ROC Curve is 0.9645560873843629 after quantization
