In [None]:
!pip -q install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
!pip -q install accelerate==0.25.0 bertopic==0.15.0 faiss-cpu==1.7.4 \
          langchain==0.0.348 langchainhub==0.1.14 sentence-transformers==2.2.2 \
          sentencepiece==0.1.99 transformers==4.24.0 tqdm packaging==24.2
!pip -q install -U datasets


import os
print("Перезапускаю ядро, жди 5 секунд…")
os.kill(os.getpid(), 9)

[31mERROR: Cannot install accelerate, accelerate==0.25.0, langchain, langchain-core==0.0.13, packaging==24.2 and transformers==4.24.0 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:

import os, random, typing as tp, numpy as np, torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import (DistilBertTokenizerFast, DistilBertForSequenceClassification,
                          get_linear_schedule_with_warmup)
from sklearn.metrics import f1_score, classification_report
from tqdm.auto import tqdm

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", DEVICE)


Using: cuda


In [2]:

class CustomDataset(Dataset):
    """
    Обёртка вокруг HF-датасета, которая сразу токенизирует текст.
    Возвращает dict с input_ids, attention_mask, label.
    """
    def __init__(self, hf_split, tokenizer, max_len: int = 256):
        self.texts  = hf_split["text"]
        self.labels = hf_split["label"]
        self.tok    = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx) -> tp.Dict[str, tp.Any]:
        enc = self.tok(self.texts[idx],
                       truncation=True,
                       padding="max_length",
                       max_length=self.max_len,
                       return_tensors="pt")
        sample = {k: v.squeeze() for k, v in enc.items()}
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [6]:

class ModelTrainer:
    def __init__(self,
                 model_name: str = "distilbert-base-uncased",
                 batch_size: int = 16,
                 lr: float = 2e-5,
                 weight_decay: float = 0.01,
                 epochs: int = 2,
                 max_len: int = 256,
                 out_dir: str = "/content/model_checkpoints"):

        self.batch_size, self.lr, self.weight_decay = batch_size, lr, weight_decay
        self.epochs, self.max_len, self.out_dir = epochs, max_len, out_dir

        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
        self.model.to(DEVICE)

        self.apply_data_parallel()
        self.configure_optimizer()
        total_steps = 1
        self.scheduler = None

        os.makedirs(self.out_dir, exist_ok=True)



    def apply_data_parallel(self):
        if torch.cuda.device_count() > 1:
            print(f"⏩ DataParallel on {torch.cuda.device_count()} GPUs")
            self.model = nn.DataParallel(self.model)


    def configure_optimizer(self):
        no_decay = ["bias", "LayerNorm.weight"]
        params = [
            {"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             "weight_decay": self.weight_decay},
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
        self.optimizer = torch.optim.AdamW(params, lr=self.lr)


    def load_data(self, split: str):
        dataset = load_dataset(
            "imdb",
            split=split,
            cache_dir="/content/hf_cache",
            trust_remote_code=True,
            download_mode="force_redownload"
        )
        return dataset



    def train(self, train_split):
        ds = CustomDataset(train_split, self.tokenizer, self.max_len)
        loader = DataLoader(ds, batch_size=self.batch_size, shuffle=True)

        total_steps = len(loader) * self.epochs
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=int(0.1*total_steps),
                                                         num_training_steps=total_steps)

        self.model.train()
        for epoch in range(1, self.epochs + 1):
            ep_loss = 0.0
            for batch in tqdm(loader, desc=f"Epoch {epoch}", leave=False):
                batch = {k: v.to(DEVICE) for k, v in batch.items()}
                outputs = self.model(**batch)
                loss = outputs.loss

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()

                ep_loss += loss.item()

            avg_loss = ep_loss / len(loader)
            print(f"🟢 Epoch {epoch}/{self.epochs} | loss: {avg_loss:.4f}")

    # ------------------------------------------------------------

    @torch.no_grad()
    def validate(self, valid_split):
        ds = CustomDataset(valid_split, self.tokenizer, self.max_len)
        loader = DataLoader(ds, batch_size=self.batch_size)

        self.model.eval()
        preds, labels = [], []

        for batch in tqdm(loader, desc="Valid", leave=False):
            lbl = batch["labels"].numpy()
            batch = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}

            logits = self.model(**batch).logits
            pred = torch.argmax(logits, dim=1).cpu().numpy()

            preds.extend(pred); labels.extend(lbl)

        return {"valid_labels": labels, "valid_preds": preds}


    def compute_metrics_report(self, labels, predictions):
        f1 = f1_score(labels, predictions)
        print("F1-macro:", round(f1, 4))
        print(classification_report(labels, predictions, digits=4))
        return f1


    def save_model(self, tag="best"):
        mdl = self.model.module if isinstance(self.model, nn.DataParallel) else self.model
        path = os.path.join(self.out_dir, f"distilbert_{tag}.pt")
        torch.save(mdl.state_dict(), path)
        print("💾 Saved to", path)


    def run_experiment(self):
        print("🔹 Loading data …")
        train_split = self.load_data("train")
        valid_split = self.load_data("test")

        print("🔹 Start training …")
        self.train(train_split)

        print("🔹 Validate …")
        res = self.validate(valid_split)

        print("🔹 Metrics …")
        self.compute_metrics_report(res["valid_labels"], res["valid_preds"])

        self.save_model()


In [4]:

trainer = ModelTrainer(
    batch_size = 16,
    lr         = 2e-5,
    epochs     = 2,
    max_len    = 256
)
trainer.run_experiment()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassi

🔹 Loading data …


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

🔹 Start training …


Epoch 1:   0%|          | 0/1563 [00:00<?, ?it/s]

🟢 Epoch 1/2 | loss: 0.3079


Epoch 2:   0%|          | 0/1563 [00:00<?, ?it/s]

🟢 Epoch 2/2 | loss: 0.1509
🔹 Validate …


Valid:   0%|          | 0/1563 [00:00<?, ?it/s]

🔹 Metrics …


AttributeError: 'float' object has no attribute 'round'

In [7]:
res = trainer.validate(trainer.load_data("test"))  # заново получаем предсказания (быстро)
trainer.compute_metrics_report(res["valid_labels"], res["valid_preds"])
trainer.save_model()

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Valid:   0%|          | 0/1563 [00:00<?, ?it/s]

AttributeError: 'float' object has no attribute 'round'

In [8]:
import types

def compute_metrics_report_fixed(self, labels, predictions):
    from sklearn.metrics import f1_score, classification_report

    f1 = f1_score(labels, predictions)
    print(f"F1-macro: {f1:.4f}")
    print(classification_report(labels, predictions, digits=4))
    return f1

trainer.compute_metrics_report = types.MethodType(compute_metrics_report_fixed, trainer)


In [9]:
res = trainer.validate(trainer.load_data("test"))
trainer.compute_metrics_report(res["valid_labels"], res["valid_preds"])


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Valid:   0%|          | 0/1563 [00:00<?, ?it/s]

F1-macro: 0.9149
              precision    recall  f1-score   support

           0     0.9248    0.9007    0.9126     12500
           1     0.9032    0.9268    0.9149     12500

    accuracy                         0.9138     25000
   macro avg     0.9140    0.9138    0.9137     25000
weighted avg     0.9140    0.9138    0.9137     25000



0.9148700939745716

In [10]:
trainer.save_model("imdb_final")


💾 Saved to /content/model_checkpoints/distilbert_imdb_final.pt


In [11]:

from transformers import TextClassificationPipeline

pipe = TextClassificationPipeline(
    model = trainer.model.cpu(),
    tokenizer = trainer.tokenizer,
    return_all_scores = False
)

pipe("This movie was absolutely fantastic, loved every minute!")




[{'label': 'LABEL_1', 'score': 0.9954558610916138}]