In [1]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.quantization
import torch.nn.utils.prune as prune
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification, DistilBertConfig
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
import evaluate
import gc

# Отключение W&B для избежания запроса API-ключа
os.environ["WANDB_MODE"] = "offline"

# Загрузка модели и токенизатора для учителя
teacher_model_name = "DeepPavlov/rubert-base-cased"
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_name, num_labels=3)

  from .autonotebook import tqdm as notebook_tqdm





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
for i in teacher_model.parameters():
    i.requires_grad = False


for i in teacher_model.classifier.parameters():
    i.requires_grad = True

In [28]:
# Загрузка CSV в pandas DataFrame
train_df = pd.read_csv("./train.csv")
test_df = train_df.sample(10000)

# Создание датасета из DataFrame
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [29]:
# Функция токенизации
def tokenize_function(examples):
    return teacher_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Функция маппинга меток
def map_labels(examples):
    label_map = {"positive": 0, "negative": 1, "neutral": 2, "speech": -1, "skip": -1}
    examples["label"] = label_map[examples["label"]]
    return examples

# Токенизация и фильтрация
tokenized_datasets = dataset.map(tokenize_function, batched=True).map(map_labels)
tokenized_datasets = tokenized_datasets.filter(lambda example: example["label"] != -1)

Map: 100%|██████████| 10713/10713 [00:01<00:00, 8699.58 examples/s]
Map: 100%|██████████| 10000/10000 [00:01<00:00, 8846.78 examples/s]
Map: 100%|██████████| 10713/10713 [00:00<00:00, 12169.30 examples/s]
Map: 100%|██████████| 10000/10000 [00:01<00:00, 7875.25 examples/s]
Filter: 100%|██████████| 10713/10713 [00:02<00:00, 3622.72 examples/s]
Filter: 100%|██████████| 10000/10000 [00:02<00:00, 3678.31 examples/s]


In [30]:
# Ограничение размера датасета для скорости
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(min(10000, len(tokenized_datasets["train"]))))
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(min(2000, len(tokenized_datasets["test"]))))

In [31]:


# Функция для оценки модели
metric = evaluate.load("accuracy")
def evaluate_model(model, dataset):
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            per_device_eval_batch_size=4,  # Уменьшено
            do_eval=True,
            report_to="none",
            run_name=f"eval_{int(time.time())}"
        ),
        eval_dataset=dataset,
        compute_metrics=lambda eval_pred: metric.compute(
            predictions=np.argmax(eval_pred.predictions, axis=1),
            references=eval_pred.label_ids
        )
    )
    start_time = time.time()
    metrics = trainer.evaluate()
    inference_time = time.time() - start_time
    torch.cuda.empty_cache()
    gc.collect()
    return metrics["eval_accuracy"], inference_time

# Функция для получения размера модели
def get_model_size(model):
    torch.save(model.state_dict(), "temp.pt")
    size = os.path.getsize("temp.pt") / 1e6
    os.remove("temp.pt")
    return size

In [32]:

# Инициализация списков для графиков
sizes = []
accuracies = []
inference_times = []
stages = ["Original"]

# Оценка исходной модели (учителя)
teacher_model.eval()
original_size = get_model_size(teacher_model)
original_accuracy, original_time = evaluate_model(teacher_model, eval_dataset)
sizes.append(original_size)
accuracies.append(original_accuracy)
inference_times.append(original_time)
print(f"Original - Size: {original_size:.2f} MB, Accuracy: {original_accuracy:.4f}, Time: {original_time:.2f}s")

Original - Size: 711.49 MB, Accuracy: 0.4910, Time: 9.85s


In [34]:
# 3. Полноценная дистилляция

# Предварительное обучение учителя
training_args = TrainingArguments(
    output_dir="./teacher_results",
    per_device_train_batch_size=4,  # Уменьшено
    per_device_eval_batch_size=4,   # Уменьшено
    num_train_epochs=10,             # Уменьшено для отладки
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    run_name=f"teacher_{int(time.time())}",
    fp16=False,                     # Отключено для стабильности
    logging_steps=5,               # Логирование каждые 10 шагов
)

teacher_trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=lambda eval_pred: metric.compute(
        predictions=np.argmax(eval_pred.predictions, axis=1),
        references=eval_pred.label_ids
    )
)
teacher_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9411,0.968557,0.504
2,1.1819,0.961318,0.5135
3,0.8737,0.955981,0.5145
4,0.8948,0.949013,0.5265
5,0.9641,0.945119,0.526
6,0.9026,0.941795,0.532
7,0.8469,0.938893,0.5355
8,1.0775,0.937731,0.539
9,1.032,0.936761,0.54
10,0.9886,0.93638,0.5395


TrainOutput(global_step=22110, training_loss=0.9660715239989386, metrics={'train_runtime': 954.3234, 'train_samples_per_second': 92.652, 'train_steps_per_second': 23.168, 'total_flos': 5816122098877440.0, 'train_loss': 0.9660715239989386, 'epoch': 10.0})