# Домашнее задание № 8

### Задание 1 (10 баллов).
Это задание основано на этой тетрадке - https://github.com/mannefedov/compling_nlp_hse_course/blob/master/notebooks/transfer_learning_hg/Fine_tunining_pretrained_LMs_torch.ipynb

На датасете lenta_sample.ru дообучите две модели — modernbert-base (из семинара) и rumodernbert-base (https://huggingface.co/deepvk/RuModernBERT-base). Оцените разницу в качестве, сравнив поклассовые метрики (classification_report).

Для обеих моделей качество должно быть >0.10 по F-мере (если получаются нули, прогоните несколько экспериментов, изменяя параметры).
Также для обеих моделей попробуйте дообучать и модель целиком, и только последний слой.
Для RuModernBERT дополнительно сравните модель, которая использует первый вектор (CLS-токен, как в семинаре), с моделью, которая использует усреднённый вектор по всем hidden_state, которые выдаёт BERT.




In [1]:
import argparse
from pathlib import Path
from typing import Dict, List

import numpy as np
import torch
import torch.nn as nn
from datasets import Dataset, load_dataset
from sklearn.metrics import classification_report
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)


  Referenced from: <EA7F9DF5-8854-31D8-89D4-BD566CAF4DEA> /opt/anaconda3/envs/ml_env/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [2]:
def build_label_maps(dataset: Dataset):
    """Build the two-way mapping between topic names and integer class ids.

    The distinct values of the ``"topic"`` column are sorted, and each one
    receives its position in that sorted order as its id.

    Args:
        dataset: Object whose ``"topic"`` item is an iterable of topic names.

    Returns:
        A ``(label2id, id2label)`` tuple of dicts that are inverses of
        each other.
    """
    unique_topics = sorted(set(dataset["topic"]))
    id2label = dict(enumerate(unique_topics))
    label2id = {topic: idx for idx, topic in id2label.items()}
    return label2id, id2label


In [3]:
def preprocess(example: dict, label2id: Dict[str, int]):
    """Return the example's text together with its topic encoded as an id.

    Args:
        example: Record with ``"text"`` and ``"topic"`` keys.
        label2id: Mapping from topic name to integer class id.

    Returns:
        A dict with keys ``"text"`` (unchanged) and ``"label"`` (the id
        looked up for the example's topic).
    """
    text = example["text"]
    label_id = label2id[example["topic"]]
    return {"text": text, "label": label_id}

In [4]:
def tokenize_fn(examples: dict, tokenizer: AutoTokenizer):
    """Tokenize the ``"text"`` field, truncating sequences to 256 tokens.

    No padding is requested here, so the outputs keep their own lengths.

    Args:
        examples: Record (or batch of records) with a ``"text"`` key.
        tokenizer: Callable tokenizer applied to the text.

    Returns:
        Whatever the tokenizer call returns.
    """
    tokenizer_kwargs = {"truncation": True, "max_length": 256}
    return tokenizer(examples["text"], **tokenizer_kwargs)

In [5]:
class MeanPoolingBERT(nn.Module):
    """Encoder with a linear classification head over mean-pooled token states.

    The encoder's ``last_hidden_state`` is averaged over the positions
    selected by ``attention_mask`` and passed through one ``nn.Linear``
    layer to produce class logits.
    """

    def __init__(self, model_name: str, num_labels: int):
        """Load the pretrained encoder and create the classification head.

        Args:
            model_name: Checkpoint name or path given to
                ``AutoModel.from_pretrained``.
            num_labels: Number of output classes (width of the logits).
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Compute logits from the mask-weighted mean of the hidden states.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: 1 for real tokens, 0 for padding. When ``None``
                every position is averaged.
            labels: Optional class ids; when given, cross-entropy loss is
                computed against the logits.
            **kwargs: Accepted and ignored, so extra batch fields do not
                break the call.

        Returns:
            ``{"loss": loss_or_None, "logits": logits}``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Assumes hidden is (batch, seq_len, hidden_size) — the pooling
        # below sums over dim 1.
        hidden = outputs.last_hidden_state
        if attention_mask is None:
            # No mask supplied: treat every position as a real token.
            attention_mask = torch.ones(hidden.shape[:2], device=hidden.device)
        mask = attention_mask.unsqueeze(-1).float()
        # Clamp the token count so an all-zero mask row yields zeros
        # instead of NaN from 0 / 0.
        token_counts = mask.sum(1).clamp(min=1e-9)
        pooled = (hidden * mask).sum(1) / token_counts
        logits = self.classifier(pooled)
        loss = self.loss_fn(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}

In [6]:
def compute_metrics(eval_pred, id2label):
    """Print a per-class classification report and return the macro F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along axis 1.
        id2label: Mapping from class ids ``0 .. n-1`` to class names.

    Returns:
        ``{"macro_f1": <macro-averaged F1 over all classes>}``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, 1)

    # Pass the full id list explicitly. Without `labels`, the report infers
    # the classes from the data and raises ValueError when a class is
    # missing from both labels and predictions, because `target_names`
    # then has a different length.
    class_ids = list(range(len(id2label)))
    report_kwargs = {
        "labels": class_ids,
        "target_names": [id2label[i] for i in class_ids],
        "zero_division": 0,
    }

    print("\n" + classification_report(labels, preds, **report_kwargs))
    report = classification_report(
        labels, preds, output_dict=True, **report_kwargs
    )
    return {"macro_f1": report["macro avg"]["f1-score"]}

In [7]:
# Experiment configurations, keyed by a short tag.
#   name        - Hugging Face checkpoint to fine-tune
#   freeze      - True: freeze the pretrained encoder, train only the head
#   custom_head - None for the stock sequence-classification model (CLS
#                 pooling, as in the seminar), or a model class to use instead
MODELS = {
    "modern_full": {
        "name": "answerdotai/ModernBERT-base",
        "freeze": False,
        "custom_head": None,
    },
    "modern_cls_only": {
        "name": "answerdotai/ModernBERT-base",
        "freeze": True,
        "custom_head": None,
    },
    "rumodern_cls_full": {
        "name": "deepvk/RuModernBERT-base",
        "freeze": False,
        "custom_head": None,
    },
    # The task asks for head-only fine-tuning for both models; this is the
    # RuModernBERT counterpart of "modern_cls_only".
    "rumodern_cls_only": {
        "name": "deepvk/RuModernBERT-base",
        "freeze": True,
        "custom_head": None,
    },
    "rumodern_mean_full": {
        "name": "deepvk/RuModernBERT-base",
        "freeze": False,
        "custom_head": MeanPoolingBERT,
    },
}

In [8]:
def train_one(cfg: dict, dsets: dict, id2label: dict):
    """Fine-tune and evaluate one model configuration.

    Args:
        cfg: Configuration with keys ``"name"`` (checkpoint), ``"freeze"``
            (train only the head when true) and ``"custom_head"`` (``None``
            or a model class called as ``cls(model_name, num_labels=...)``).
        dsets: Mapping with ``"train"`` and ``"validation"`` datasets that
            contain ``"text"`` and ``"topic"`` columns.
        id2label: Mapping from class id to class name.

    Returns:
        The metrics dict returned by ``trainer.evaluate()`` after training.
    """
    model_name = cfg["name"]
    num_labels = len(id2label)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Tokenize both splits the same way; the raw string columns are dropped
    # afterwards.
    tokenized = {
        split: dsets[split].map(
            lambda batch: tokenize_fn(batch, tokenizer),
            batched=True,
            remove_columns=["text", "topic"],
        )
        for split in ("train", "validation")
    }

    if cfg["custom_head"]:
        model = cfg["custom_head"](model_name, num_labels=num_labels)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            trust_remote_code=True,
            id2label=id2label,
            label2id={v: k for k, v in id2label.items()},
        )

    if cfg["freeze"]:
        # Freeze the pretrained encoder so only the head is trained. Stock
        # models expose it as `base_model`; custom heads are expected to
        # keep it in `bert`, as MeanPoolingBERT does. Previously the flag
        # was silently ignored for custom heads.
        encoder = model.bert if cfg["custom_head"] else model.base_model
        for param in encoder.parameters():
            param.requires_grad = False

    training_args = TrainingArguments(
        output_dir=f"./results/{model_name.replace('/', '_')}",
        # `evaluation_strategy` is the deprecated spelling of this argument.
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_steps=50,
        save_strategy="no",
        # bf16 only on CUDA devices with compute capability >= 8.
        bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        # `tokenizer=` is deprecated in favour of `processing_class=`.
        processing_class=tokenizer,
        # Tokenization did not pad, so pad per batch here.
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=lambda p: compute_metrics(p, id2label),
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(f"Final macro-F1: {metrics['eval_macro_f1']:.4f}\n")
    # Return the metrics so callers can compare runs; printing alone lost them.
    return metrics


In [None]:
import pandas as pd

# Load the zipped CSV and wrap it in a Hugging Face Dataset.
df = pd.read_csv('lenta_40k.csv.zip', compression="zip")
raw = Dataset.from_pandas(df)

# Encode topic names as integer labels.
label2id, id2label = build_label_maps(raw)
ds = raw.map(lambda ex: preprocess(ex, label2id))

# Fixed seed: without it every run draws a different 80/20 split, so
# metrics from separate runs of the notebook are not comparable.
dsets = ds.train_test_split(test_size=0.2, seed=42)
dsets["validation"] = dsets.pop("test")

# Train and evaluate every configuration on the same split.
for tag, cfg in MODELS.items():
    print("=" * 80)
    print(f"-> {tag}")
    train_one(cfg, dsets, id2label)

Map:   0%|          | 0/44356 [00:00<?, ? examples/s]

-> modern_full


Map:   0%|          | 0/35484 [00:00<?, ? examples/s]

Map:   0%|          | 0/8872 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
