In [1]:
import os
import torch
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from datasets import load_dataset
#import datasets
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, TrainingArguments, Trainer
import evaluate

accuracy = evaluate.load("accuracy")
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = load_dataset('Den4ikAI/russian_dialogues')

In [3]:
dataset = dataset.class_encode_column('relevance')
dataset = dataset.rename_column('relevance', 'labels')

In [4]:
dataset = dataset['train'].train_test_split(
    test_size=0.20,
    shuffle=True,
    stratify_by_column='labels',
    seed=42
    )

In [5]:
dataset = dataset.filter(
    lambda example: type(example['question']) is str and type(example['answer']) is str
)

In [6]:
TEACHER_NAME = 'Den4ikAI/ruBert-base-qa-ranker'
teacher_tokenizer = AutoTokenizer.from_pretrained(TEACHER_NAME)
teacher_model = AutoModelForSequenceClassification.from_pretrained(TEACHER_NAME)

In [7]:
STUDENT_NAME = 'cointegrated/rubert-tiny2'
student_tokenizer = AutoTokenizer.from_pretrained(STUDENT_NAME)
student_model = AutoModelForSequenceClassification.from_pretrained(STUDENT_NAME)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [8]:
if torch.cuda.is_available():
    student_model.to('cuda')
    teacher_model.to('cuda')

print(student_model.device)
print(teacher_model.device)

cuda:0
cuda:0


In [9]:
def tokenization(example, tokenizer):
    return tokenizer(
        '[CLS]' + example['question'] + '[RESPONSE_TOKEN]' + example['answer'],
        max_length=512,
        padding='max_length',
        truncation=True,
        add_special_tokens=False
        )

In [10]:
dataset_test = dataset['test']
#dataset_train = dataset['train']
dataset_train = dataset['train'].select(indices=range(0, 10000))

In [11]:
dataset_train_teacher = dataset_train.map(lambda x: tokenization(x, teacher_tokenizer), batched=False)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [12]:
dataset_train_teacher.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "labels"]
)

In [13]:
data_collator = DataCollatorWithPadding(teacher_tokenizer)

In [14]:
batch_size = 16

In [15]:
train_dataloader = DataLoader(
    dataset_train_teacher, 
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator
)

In [16]:
def predict_with_model(model, dataloader, max_idx=None):
    preds = []
    facts = []

    for idx, batch in tqdm(enumerate(dataloader), total=max_idx if max_idx else len(dataloader)):
        facts.append(batch.labels.cpu().numpy())
        batch = batch.to(model.device)

        with torch.no_grad():
            pred = model(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                token_type_ids=batch.token_type_ids
            )
        preds.append(torch.sigmoid(pred.logits).cpu().numpy())
        
        if idx == max_idx:
            break

    facts = np.concatenate(facts)
    preds = np.concatenate(preds)

    return facts, preds

In [17]:
_, teacher_labels = predict_with_model(teacher_model, train_dataloader)

  0%|          | 0/625 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [18]:
teacher_labels = np.argmax(teacher_labels, 1)

In [19]:
dataset_train = dataset_train.remove_columns("labels").add_column("labels", teacher_labels)

Flattening the indices:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [20]:
dataset_train_student = dataset_train.map(lambda x: tokenization(x, student_tokenizer), batched=False)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [21]:
dataset_train_student.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "labels"]
)

In [22]:
lenght = (len(dataset_train_student)//100) *50

In [23]:
dataset_train_st = dataset_train_student.select(indices=range(0, lenght))
dataset_test_st = dataset_train_student.select(indices=range(lenght, lenght * 2))

In [24]:
training_args = TrainingArguments(

    output_dir="distillated_student",

    learning_rate=2e-5,

    per_device_train_batch_size=20,

    per_device_eval_batch_size=16,

    num_train_epochs=15,

    weight_decay=0.01,

    evaluation_strategy="epoch",

    save_strategy="epoch",

    load_best_model_at_end=True,

    push_to_hub=False,
    logging_steps = 1

)

In [25]:
def compute_metrics(eval_pred):

    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)

In [26]:
trainer = Trainer(

    model=student_model,

    args=training_args,

    train_dataset=dataset_train_st,

    eval_dataset=dataset_test_st,

    tokenizer=student_tokenizer,

    data_collator=data_collator,

    compute_metrics=compute_metrics,

)

In [27]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: question, answer. If question, answer are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5000
  Num Epochs = 15
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 40
  Gradient Accumulation steps = 1
  Total optimization steps = 1875
  Number of trainable parameters = 29194394


Epoch,Training Loss,Validation Loss,Accuracy
1,0.493,0.612679,0.67
2,0.5703,0.598592,0.6836
3,0.4334,0.596295,0.694
4,0.5163,0.60028,0.6972
5,0.4968,0.592021,0.7052
6,0.4867,0.599254,0.7022
7,0.4244,0.615119,0.7074
8,0.4302,0.619764,0.709
9,0.3812,0.627447,0.709
10,0.3835,0.645444,0.7092


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: question, answer. If question, answer are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 32
Saving model checkpoint to distillated_student/checkpoint-125
Configuration saved in distillated_student/checkpoint-125/config.json
Model weights saved in distillated_student/checkpoint-125/pytorch_model.bin
tokenizer config file saved in distillated_student/checkpoint-125/tokenizer_config.json
Special tokens file saved in distillated_student/checkpoint-125/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: question, answer. If question, answer are not expected by `BertForSequenceClassification.forward`,  you can safely ig

TrainOutput(global_step=1875, training_loss=0.4221291113297145, metrics={'train_runtime': 1086.0392, 'train_samples_per_second': 69.058, 'train_steps_per_second': 1.726, 'total_flos': 553065523200000.0, 'train_loss': 0.4221291113297145, 'epoch': 15.0})

In [44]:
trainer.save_model('great_distill_model')
student_tokenizer.save_pretrained('great_distill_model')

Saving model checkpoint to great_distill_model
Configuration saved in great_distill_model/config.json
Model weights saved in great_distill_model/pytorch_model.bin
tokenizer config file saved in great_distill_model/tokenizer_config.json
Special tokens file saved in great_distill_model/special_tokens_map.json
tokenizer config file saved in great_distill_model/tokenizer_config.json
Special tokens file saved in great_distill_model/special_tokens_map.json


('great_distill_model/tokenizer_config.json',
 'great_distill_model/special_tokens_map.json',
 'great_distill_model/vocab.txt',
 'great_distill_model/added_tokens.json',
 'great_distill_model/tokenizer.json')

## Замерим модели после дистилляции

In [45]:
def tokenization_teacher(example):
    return teacher_tokenizer(
        '[CLS]' + example['question'] + '[RESPONSE_TOKEN]' + example['answer'],
        max_length=512,
        padding='max_length',
        truncation=True,
        add_special_tokens=False
        )

In [46]:
def tokenization_student(example):
    return student_tokenizer(
        '[CLS]' + example['question'] + '[RESPONSE_TOKEN]' + example['answer'],
        max_length=512,
        padding='max_length',
        truncation=True,
        add_special_tokens=False
        )

In [47]:
dataset = load_dataset('Den4ikAI/russian_dialogues')
dataset = dataset.class_encode_column('relevance')
dataset = dataset['train'].train_test_split(
    test_size=0.05,
    shuffle=True,
    seed=42
    )

dataset = dataset['test']
dataset = dataset.filter(
    lambda example: type(example['question']) is str and type(example['answer']) is str
)

In [48]:
dataset = dataset.select(indices=range(5000))

In [49]:
dataset_teacher = dataset.map(tokenization_teacher, batched=False)

In [50]:
dataset_student = dataset.map(tokenization_student, batched=False)

In [51]:
dataset_teacher.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "relevance"]
)
dataset_student.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "relevance"]
)

In [52]:
data_collator_teacher = DataCollatorWithPadding(teacher_tokenizer)
data_collator_student = DataCollatorWithPadding(student_tokenizer)


In [53]:
batch_size = 64

In [54]:
test_dataloader_teacher = DataLoader(
    dataset_teacher, 
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator_teacher
)

In [55]:
test_dataloader_student = DataLoader(
    dataset_student, 
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator_student
)

In [64]:
import time
def predict_with_model(model, dataloader, max_idx=None):
    preds = []
    facts = []
    
    for idx, batch in tqdm(enumerate(dataloader), total=max_idx if max_idx else len(dataloader)):
        facts.append(batch.relevance.cpu().numpy())
        batch = batch.to(model.device)
        
        with torch.no_grad():
            pred = model(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                token_type_ids=batch.token_type_ids
            )
        preds.append(torch.sigmoid(pred.logits).cpu().numpy())

        if idx == max_idx:
            break

    facts = np.concatenate(facts)
    preds = np.concatenate(preds)

    return facts, preds


def evaluate_model(model, dev_dataloader):
    eval_start_time = time.time()
    facts, preds = predict_with_model(model, dev_dataloader)
    eval_end_time = time.time()
    eval_duration_time = eval_end_time - eval_start_time
    timer = 'Eval time:  ' +str(eval_duration_time)
    roc_score = roc_auc_score(facts, preds[:, 0])
    return roc_score, timer

In [65]:
roc_auc_score_teacher, timer_teacher = evaluate_model(teacher_model, test_dataloader_teacher)

  0%|          | 0/79 [00:00<?, ?it/s]

In [66]:
student_model = AutoModelForSequenceClassification.from_pretrained('great_distill_model').to('cuda')

loading configuration file great_distill_model/config.json
Model config BertConfig {
  "_name_or_path": "great_distill_model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "emb_size": 312,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "initializer_range": 0.02,
  "intermediate_size": 600,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 83828
}

loading weights file great_distill_model/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of Be

In [67]:
roc_auc_score_student, timer_student = evaluate_model(student_model, test_dataloader_student)

  0%|          | 0/79 [00:00<?, ?it/s]

In [68]:
print(f'Dev Area Under ROC Curve is {roc_auc_score_teacher} origignal and {roc_auc_score_student}  student')

Dev Area Under ROC Curve is 0.9739135049032971 origignal and 0.7153989430064729  student


In [69]:
print(f'inferense time of student is  {timer_student} and teacher is  {timer_teacher}')

inferense time of student is  Eval time:  4.6855950355529785 and teacher is  Eval time:  36.39859652519226
