In [1]:
import os
import torch
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

import warnings
warnings.filterwarnings('ignore')

In [28]:
# Load the 'Den4ikAI/russian_dialogues' dataset through `datasets.load_dataset`.
dataset = load_dataset('Den4ikAI/russian_dialogues')

Using custom data configuration Den4ikAI--russian_dialogues-2f0e674e933ff89a
Reusing dataset json (/root/.cache/huggingface/datasets/Den4ikAI___json/Den4ikAI--russian_dialogues-2f0e674e933ff89a/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
# Encode the 'relevance' column as a class column and expose it under the name
# 'labels', which is the column name the rest of the notebook reads.
# NOTE(review): this rebinds `dataset`; re-running the cell on the already-renamed
# dataset presumably fails because 'relevance' no longer exists -- confirm.
dataset = (
    dataset
    .class_encode_column('relevance')
    .rename_column('relevance', 'labels')
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/Den4ikAI___json/Den4ikAI--russian_dialogues-2f0e674e933ff89a/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-ecba1f789bb6e582.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/Den4ikAI___json/Den4ikAI--russian_dialogues-2f0e674e933ff89a/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-5fc2dd5f7c22d453.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/Den4ikAI___json/Den4ikAI--russian_dialogues-2f0e674e933ff89a/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-928b7c7c52b925b5.arrow


In [30]:
# Carve a shuffled, label-stratified 5% slice out of the 'train' split and keep
# only that slice; the remaining 95% is not used afterwards in this notebook.
split_options = {
    'test_size': 0.05,
    'shuffle': True,
    'stratify_by_column': 'labels',
    'seed': 42,
}
dataset = dataset['train'].train_test_split(**split_options)['test']

In [31]:
def _has_text_fields(example):
    """Return True only when both 'question' and 'answer' are exactly of type `str`."""
    return type(example['question']) is str and type(example['answer']) is str


# Drop rows whose question or answer is not a plain string, since `tokenization`
# concatenates both fields with string literals.
dataset = dataset.filter(_has_text_fields)

  0%|          | 0/124 [00:00<?, ?ba/s]

In [32]:
# Identifier of the baseline checkpoint; also passed to the benchmark further down.
MODEL_NAME = 'Den4ikAI/ruBert-base-qa-ranker'
# Tokenizer and sequence-classification model are both loaded from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [33]:
# Move the model to the GPU when CUDA is available; otherwise it stays where it was loaded.
if torch.cuda.is_available():
    model.to('cuda')
# Show which device the model ended up on.
print(model.device)

cuda:0


In [34]:
def tokenization(example):
    """Encode one question/answer pair with the global `tokenizer`.

    The model input is assembled by hand as
    ``'[CLS]' + question + '[RESPONSE_TOKEN]' + answer`` and the tokenizer is told
    not to add special tokens of its own.

    Args:
        example: mapping with string values under the keys 'question' and 'answer'.

    Returns:
        Whatever `tokenizer(...)` returns for the assembled text, truncated and
        padded to exactly 512 tokens.

    NOTE(review): because every example is already padded to 512 here, the
    DataCollatorWithPadding used by the DataLoaders presumably has nothing left
    to pad and every batch carries full-length sequences -- confirm whether
    dynamic padding was intended.
    """
    pair_text = ''.join(
        ('[CLS]', example['question'], '[RESPONSE_TOKEN]', example['answer'])
    )
    encode_options = {
        'max_length': 512,
        'padding': 'max_length',
        'truncation': True,
        'add_special_tokens': False,
    }
    return tokenizer(pair_text, **encode_options)

In [35]:
# Two disjoint row ranges of the filtered dataset:
#   rows 0..4999      -> evaluation subset
#   rows 5000..14999  -> subset later wrapped in `pruning_dataloader`
dataset_test = dataset.select(indices=range(5000))
dataset_pruning = dataset.select(indices=range(5000, 15000))

In [36]:
# Tokenize one example at a time (batched=False): `tokenization` concatenates the
# 'question' and 'answer' strings of a single example.
dataset_test = dataset_test.map(tokenization, batched=False)
dataset_pruning = dataset_pruning.map(tokenization, batched=False)

  0%|          | 0/5000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [37]:
# Have both tokenized subsets return torch tensors and expose only the columns
# that the model and the metric read.
torch_columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
for tokenized_split in (dataset_test, dataset_pruning):
    tokenized_split.set_format(type="torch", columns=list(torch_columns))

In [38]:
# Collate function handed to the DataLoaders; it pads batches using `tokenizer`.
# NOTE(review): `tokenization` already pads every example to 512 tokens, so this
# collator presumably has no padding left to do -- confirm.
data_collator = DataCollatorWithPadding(tokenizer)

In [55]:
# Batch size used by the test and pruning DataLoaders built in the next cell.
batch_size = 8

In [56]:
# Both loaders iterate in dataset order (no shuffling), keep the last partial
# batch, load in the main process and collate with `data_collator`.
loader_options = dict(
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

test_dataloader = DataLoader(dataset_test, **loader_options)
pruning_dataloader = DataLoader(dataset_pruning, **loader_options)

In [74]:
def predict_with_model(model, dataloader, max_idx=None):
    """Run `model` over `dataloader` and collect labels and sigmoid scores.

    Args:
        model: callable accepting `input_ids`, `attention_mask` and
            `token_type_ids` keyword arguments, returning an object with a
            `.logits` tensor, and exposing a `.device` attribute.
        dataloader: iterable of batches exposing `.labels`, `.input_ids`,
            `.attention_mask`, `.token_type_ids` and a `.to(device)` method.
        max_idx: index of the last batch to process (inclusive). `None`
            processes every batch.

    Returns:
        Tuple `(facts, preds)` of numpy arrays: the concatenated batch labels and
        the concatenated element-wise sigmoid of the batch logits.
    """
    # The loop below stops *after* handling batch number `max_idx`, i.e. it
    # processes `max_idx + 1` batches.  Report that count to the progress bar
    # (the previous `total=max_idx` was one short, and `max_idx=0` was mistaken
    # for "no limit").  A negative `max_idx` never matches an index, so every
    # batch is processed in that case as well.
    if max_idx is None or max_idx < 0:
        total = len(dataloader)
    else:
        total = max_idx + 1
        try:
            total = min(total, len(dataloader))
        except TypeError:
            # Dataloader without a length: keep the requested batch count.
            pass

    preds = []
    facts = []

    for idx, batch in tqdm(enumerate(dataloader), total=total):
        # Labels are collected before the batch is moved to the model's device.
        facts.append(batch.labels.cpu().numpy())
        batch = batch.to(model.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            pred = model(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                token_type_ids=batch.token_type_ids
            )
        preds.append(torch.sigmoid(pred.logits).cpu().numpy())

        if idx == max_idx:
            break

    facts = np.concatenate(facts)
    preds = np.concatenate(preds)

    return facts, preds


def evaluate_model(model, dev_dataloader):
    """Return the ROC AUC of `model` over all batches of `dev_dataloader`.

    Args:
        model: model forwarded to `predict_with_model`.
        dev_dataloader: dataloader forwarded to `predict_with_model`.

    Returns:
        The value computed by `sklearn.metrics.roc_auc_score` from the collected
        labels and column 0 of the collected sigmoid scores.
    """
    # Imported locally under an alias on purpose: cells of this notebook assign
    # the evaluation result to the global name `roc_auc_score`, which replaces
    # the sklearn function with a float.  Looking the metric up globally would
    # then fail with "'float' object is not callable" on the next evaluation.
    from sklearn.metrics import roc_auc_score as compute_roc_auc

    facts, preds = predict_with_model(model, dev_dataloader)
    # Column 0 of the sigmoid-activated logits is used as the score.
    return compute_roc_auc(facts, preds[:, 0])

In [17]:
# Score of the unpruned model on the evaluation subset.
# NOTE(review): this assignment rebinds the global name `roc_auc_score`, which was
# imported from sklearn.metrics at the top of the notebook; code that resolves
# that global name after this line gets this result instead of the function.
roc_auc_score = evaluate_model(model, test_dataloader)

  0%|          | 0/79 [00:00<?, ?it/s]

In [18]:
# Report the baseline score.
# NOTE(review): the message says "before quantization", while the visible part of
# the notebook prunes the model and the later message says "after pruning" --
# confirm the intended wording.
print(f'Dev Area Under ROC Curve is {roc_auc_score} before quantization')

Dev Area Under ROC Curve is 0.9695577994975315 before quantization


In [19]:
# Inference-only speed benchmark of the baseline checkpoint on CUDA, run over the
# 4 sequence lengths x 3 batch sizes listed below.
benchmark_args = PyTorchBenchmarkArguments(
    models=[MODEL_NAME],
    training=False,
    inference=True,
    sequence_lengths=[8,128,256,512],
    batch_sizes=[1,32,64],
    multi_process=False,
    cuda=True,
    speed=True,
)

benchmark = PyTorchBenchmark(benchmark_args)

In [81]:
!pip install py3nvml

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [20]:
# Execute the benchmark configured above; the result table is printed as output.
results = benchmark.run()

1 / 1

--------------------------------------------------------------------------------
          Model Name             Batch Size     Seq Length     Time in s   
--------------------------------------------------------------------------------
Den4ikAI/ruBert-base-qa-ranker       1               8             0.009     
Den4ikAI/ruBert-base-qa-ranker       1              128            0.009     
Den4ikAI/ruBert-base-qa-ranker       1              256            0.011     
Den4ikAI/ruBert-base-qa-ranker       1              512            0.019     
Den4ikAI/ruBert-base-qa-ranker       32              8             0.011     
Den4ikAI/ruBert-base-qa-ranker       32             128            0.135     
Den4ikAI/ruBert-base-qa-ranker       32             256            0.365     
Den4ikAI/ruBert-base-qa-ranker       32             512            0.765     
Den4ikAI/ruBert-base-qa-ranker       64              8             0.018     
Den4ikAI/ruBert-base-qa-ranker       64             1

In [26]:
# Raw timings of the baseline model, keyed by batch size and then sequence length.
results[0][MODEL_NAME]['result']

{1: {8: 0.009212053602095694,
  128: 0.009397806506603957,
  256: 0.010669440601486713,
  512: 0.01914765270194039},
 32: {8: 0.011300773499533534,
  128: 0.13459614559542388,
  256: 0.3646724847960286,
  512: 0.7645334110944532},
 64: {8: 0.018073088803794234,
  128: 0.34874521840829403,
  256: 0.7089315174962394,
  512: 1.5779933413024991}}

In [18]:
!pip install textpruner

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting textpruner
  Downloading textpruner-1.1.post2.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: textpruner
  Building wheel for textpruner (setup.py) ... [?25ldone
[?25h  Created wheel for textpruner: filename=textpruner-1.1.post2-py3-none-any.whl size=43882 sha256=e862d3864095d3a7e9c6bbd52394634583335e362d16e3357c730291d2db6614
  Stored in directory: /root/.cache/pip/wheels/02/05/93/7ed7efba7071091691dd50709f6683955d609ff28324ac8710
Successfully built textpruner
Installing collected packages: textpruner
Successfully installed textpruner-1.1.post2
[0m

In [48]:
from textpruner import PipelinePruner, TransformerPruningConfig

In [57]:
# Pruning targets: feed-forward size 2048 and 8 attention heads, reached with the
# 'iterative' method in 4 iterations.
# NOTE(review): presumably both targets apply per transformer layer -- confirm
# against the TextPruner documentation.
transformer_pruning_config = TransformerPruningConfig(
    target_ffn_size=2048,
    target_num_of_heads=8, 
    pruning_method='iterative',
    n_iters=4
)

# The pruner receives the in-memory baseline `model` and its `tokenizer`.
pruner = PipelinePruner(model, tokenizer, transformer_pruning_config=transformer_pruning_config)

In [58]:
# Run the pruning pipeline and save the result (save_model=True); the call
# returns the output directory.
# NOTE(review): `dataiter` holds only the 'question' texts of rows 5000..44999.
# The 'answer' texts and the rows 0..4999 used for evaluation are not included.
# If `dataiter` determines which vocabulary entries are kept (the output reports
# a new embedding size), tokens that occur only in answers may be dropped --
# confirm whether answers should be passed as well.
pruner.prune(dataloader=pruning_dataloader, dataiter=dataset['question'][5000:45000], save_model=True)

Calculating IS with loss: 100%|██████████| 1250/1250 [12:27<00:00,  1.67it/s]
Calculating IS with loss: 100%|██████████| 1250/1250 [12:25<00:00,  1.68it/s]
Calculating IS with loss: 100%|██████████| 1250/1250 [12:26<00:00,  1.67it/s]
Calculating IS with loss: 100%|██████████| 1250/1250 [12:26<00:00,  1.67it/s]
100%|██████████| 40000/40000 [00:05<00:00, 7662.45it/s]


New embedding size 29613 pruned vocab file has been saved to ./pruned_models/pruned_V29613H8.0F2048/vocab.txt. Reintialize the tokenizer!


'./pruned_models/pruned_V29613H8.0F2048'

In [60]:
# Display the configuration of the in-memory model after the pruning call.
model.config

BertConfig {
  "_name_or_path": "Den4ikAI/ruBert-base-qa-ranker",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "pruned_heads": {
    "0": [
      1,
      10,
      4,
      7
    ],
    "1": [
      0,
      9,
      11,
      6
    ],
    "2": [
      0,
      8,
      10,
      6
    ],
    "3": [
      1,
 

In [3]:
# Directory written by the pruning step; tokenizer and model are reloaded from it
# so that the pruned vocabulary and weights are used together.
MODEL_PATH = './pruned_models/pruned_V29613H8.0F2048/'
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# The freshly loaded model is a new object that has not been moved to the GPU.
# Move it when CUDA is available, as was done for the baseline model, so both
# evaluations run on the same device.
if torch.cuda.is_available():
    model.to('cuda')

In [68]:
# Rebuild the evaluation subset (rows 0..4999) and tokenize it again:
# `tokenization` reads the global `tokenizer`, which now refers to the tokenizer
# loaded from MODEL_PATH.
dataset_test = dataset.select(indices=range(5000))
dataset_test = dataset_test.map(tokenization, batched=False)

  0%|          | 0/5000 [00:00<?, ?ex/s]

In [69]:
# Return torch tensors and expose only the columns the model and the metric read.
dataset_test.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "labels"]
)

In [70]:
# Recreate the collator so that it uses the tokenizer loaded from MODEL_PATH.
data_collator = DataCollatorWithPadding(tokenizer)

In [71]:
# Batch size for evaluating the pruned model (the earlier cell used 8).
batch_size = 32

In [72]:
# Evaluation loader for the pruned model: dataset order, last partial batch kept,
# loading in the main process, batches built by `data_collator`.
test_dataloader = DataLoader(
    dataset_test, 
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator
)

In [75]:
# Score of the pruned model on the evaluation subset.
# NOTE(review): this assignment rebinds the global name `roc_auc_score` (imported
# from sklearn.metrics at the top of the notebook); code that resolves that global
# name after this line gets this result instead of the function.
roc_auc_score = evaluate_model(model, test_dataloader)

  0%|          | 0/157 [00:00<?, ?it/s]

In [77]:
# Report the score obtained with the pruned model.
print(f'Dev Area Under ROC Curve is {roc_auc_score} after pruning')

Dev Area Under ROC Curve is 0.8927088349674912 after pruning


In [78]:
def print_size_of_model(model):
    """Print the on-disk size, in megabytes, of `model`'s serialized state dict.

    The state dict is written with `torch.save` into a private temporary
    directory that is removed afterwards, even if saving or measuring fails.
    An existing file called "temp.p" in the working directory is therefore
    never overwritten or deleted.

    Args:
        model: object exposing `state_dict()` (e.g. a `torch.nn.Module`).
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        # Size is reported in units of 10**6 bytes.
        print('Size (MB):', os.path.getsize(checkpoint_path)/1e6)

# Size of the serialized state dict of the model currently bound to `model`.
print_size_of_model(model)

Size (MB): 321.895469


In [5]:
# Same inference speed benchmark as for the baseline, now pointed at the pruned
# checkpoint directory, with identical sequence lengths and batch sizes.
benchmark_args = PyTorchBenchmarkArguments(
    models=[MODEL_PATH],
    training=False,
    inference=True,
    sequence_lengths=[8,128,256,512],
    batch_sizes=[1,32,64],
    multi_process=False,
    cuda=True,
    speed=True,
)

benchmark = PyTorchBenchmark(benchmark_args)
# NOTE: rebinds `results`, replacing the baseline timings stored under that name.
results = benchmark.run()


1 / 1

--------------------------------------------------------------------------------
          Model Name             Batch Size     Seq Length     Time in s   
--------------------------------------------------------------------------------
./pruned_models/pruned_V29613H       1               8             0.013     
./pruned_models/pruned_V29613H       1              128            0.014     
./pruned_models/pruned_V29613H       1              256            0.013     
./pruned_models/pruned_V29613H       1              512            0.013     
./pruned_models/pruned_V29613H       32              8             0.015     
./pruned_models/pruned_V29613H       32             128             0.08     
./pruned_models/pruned_V29613H       32             256             0.21     
./pruned_models/pruned_V29613H       32             512            0.393     
./pruned_models/pruned_V29613H       64              8             0.014     
./pruned_models/pruned_V29613H       64             1