In [None]:
!pip install transformers torch sacremoses scikit-learn matplotlib seaborn tqdm --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    classification_report, roc_curve
)
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise everything runs on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Checkpoint identifier passed to the tokenizer and model loaders; change the model here.
MODEL_NAME = "allegro/herbert-base-cased"


In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable, sized collection of input texts.
        labels: Indexable collection of labels aligned with ``texts`` by position.
        tokenizer: Callable invoked with the text plus ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments; its result
            must provide ``.items()``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer fields for sample ``idx`` plus a float ``labels`` tensor."""
        tokenized = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        # squeeze(0) drops a size-1 leading axis (presumably the batch axis that
        # return_tensors="pt" adds) so samples can later be stacked into batches.
        sample = {}
        for field, tensor in tokenized.items():
            sample[field] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [5]:
def load_and_split_data(data_path, tokenizer, max_len=64, test_size=0.2, random_state=42, batch_size=16):
    """Read a labelled CSV, split it with stratification, and wrap both parts in DataLoaders.

    Args:
        data_path: Path to a CSV file containing a ``Text`` and a ``Class`` column.
        tokenizer: Tokenizer handed to ``TextDataset``.
        max_len: Sequence length forwarded to ``TextDataset``.
        test_size: Held-out share, forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. It is not applied to
            the training DataLoader's shuffling.
        batch_size: Batch size used by both loaders. Defaults to 16, the value
            that used to be hard-coded.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.

    Raises:
        KeyError: If the CSV lacks the ``Text`` or ``Class`` column.
    """
    df = pd.read_csv(data_path)

    # Fail early with a readable message instead of a bare KeyError deep inside the split.
    missing = [col for col in ('Text', 'Class') if col not in df.columns]
    if missing:
        raise KeyError(f"{data_path} is missing required column(s): {missing}")

    # Stratify on the label so both parts keep the class proportions of the full file.
    train_df, test_df = train_test_split(df, test_size=test_size, stratify=df['Class'], random_state=random_state)

    train_dataset = TextDataset(train_df['Text'].tolist(), train_df['Class'].tolist(), tokenizer, max_len)
    test_dataset = TextDataset(test_df['Text'].tolist(), test_df['Class'].tolist(), tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, test_loader

In [None]:
class TransformerClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear head with one output."""

    def __init__(self, model_name):
        """Load the encoder ``model_name`` and size the head to its ``config.hidden_size``."""
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.3)
        hidden_size = self.transformer.config.hidden_size
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits with the trailing size-1 axis removed.

        The head reads ``last_hidden_state`` at index 0 of the second axis
        (presumably the first token of each sequence; confirm for the encoder used).
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        first_position = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_position))
        return logits.squeeze(-1)

In [None]:
def _forward_batch(model, batch):
    """Move one batch's tensors to ``DEVICE`` and return ``(logits, labels)``."""
    input_ids = batch['input_ids'].to(DEVICE)
    attention_mask = batch['attention_mask'].to(DEVICE)
    labels = batch['labels'].to(DEVICE)
    return model(input_ids, attention_mask), labels


def train_and_evaluate(train_loader, test_loader, model_name=MODEL_NAME, save_dir="results/herbert-base-cased",
                       epochs=6, patience=2, lr=2e-5):
    """Fine-tune a ``TransformerClassifier``, then report metrics and save plots.

    Training uses AdamW, ``BCEWithLogitsLoss`` and a linear schedule without
    warmup. After every epoch the loss on ``test_loader`` is computed; the
    weights with the lowest such loss are written to ``<save_dir>/best_model.pt``
    and training stops once the loss has failed to improve ``patience`` epochs
    in a row. The best checkpoint is then reloaded and evaluated on
    ``test_loader`` with a 0.5 probability threshold.

    NOTE: ``test_loader`` drives both checkpoint selection / early stopping and
    the final metrics, so the reported numbers are measured on the same data
    the model was selected on. Use a separate validation split for unbiased
    figures.

    Args:
        train_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` entries.
        test_loader: Batches of the same structure, used for validation and the
            final evaluation.
        model_name: Checkpoint name passed to ``TransformerClassifier``.
        save_dir: Directory (created if needed) for ``best_model.pt``,
            ``confusion_matrix.png`` and ``roc_curve.png``.
        epochs: Maximum number of epochs; also sizes the learning-rate schedule.
        patience: Consecutive non-improving epochs tolerated before stopping.
        lr: AdamW learning rate.

    Returns:
        None. Metrics are printed and the plots are written to ``save_dir``.
    """
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, "best_model.pt")

    model = TransformerClassifier(model_name).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    # The schedule length is derived from the same `epochs` value that drives the loop.
    num_training_steps = len(train_loader) * epochs
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    best_val_loss = float("inf")
    patience_counter = 0
    # Tracks whether THIS run wrote a checkpoint, so a stale file left in
    # save_dir by an earlier run is never loaded by mistake.
    checkpoint_saved = False

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            logits, labels = _forward_batch(model, batch)
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_epoch_loss:.4f}")

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in test_loader:
                logits, labels = _forward_batch(model, batch)
                val_loss += criterion(logits, labels).item()

        avg_val_loss = val_loss / len(test_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("⏹️ Early stopping triggered.")
                break

    if checkpoint_saved:
        # map_location keeps the load working when the tensors were saved from another device.
        model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    else:
        # Happens when no epoch improved on +inf (e.g. NaN loss) or epochs == 0.
        print("No checkpoint was saved during this run; evaluating the current weights.")
    model.eval()
    all_preds, all_probs, all_labels = [], [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            logits, labels = _forward_batch(model, batch)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int()

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")
    print(classification_report(all_labels, all_preds, zero_division=0))

    # confusion_matrix puts true labels on rows and predictions on columns.
    cm = confusion_matrix(all_labels, all_preds)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title("Confusion Matrix for HerBERT - The Best Variant")
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    fig.savefig(os.path.join(save_dir, "confusion_matrix.png"))
    plt.close(fig)

    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    ax.set_title("ROC Curve for HerBERT - The Best Variant")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend()
    fig.savefig(os.path.join(save_dir, "roc_curve.png"))
    plt.close(fig)

In [8]:
def run_pipeline_herbert_singlefile(data_path, max_len=256, save_dir="results/herbert-base-cased"):
    """Tokenize, split, train and evaluate on a single CSV file using ``MODEL_NAME``.

    Args:
        data_path: CSV file passed to ``load_and_split_data``.
        max_len: Token length passed to ``load_and_split_data``.
        save_dir: Output directory passed to ``train_and_evaluate``. The default
            matches the previous implicit location. Pass a distinct directory per
            dataset, otherwise successive runs overwrite each other's checkpoint
            and plots.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_loader, test_loader = load_and_split_data(data_path, tokenizer, max_len)
    train_and_evaluate(train_loader, test_loader, model_name=MODEL_NAME, save_dir=save_dir)


In [14]:
from google.colab import files

# This cell sits above the training cells, so on a fresh top-to-bottom run the
# checkpoint does not exist yet; skip the download instead of raising.
best_model_path = '/content/results/herbert-base-cased/best_model.pt'
if os.path.exists(best_model_path):
    files.download(best_model_path)
else:
    print(f"Checkpoint not found at {best_model_path}; run the training pipeline first.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Full train/evaluate run on BAN-PL_raw.csv with the pipeline's default max_len.
run_pipeline_herbert_singlefile(data_path="BAN-PL_raw.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/556k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]


Epoch 1:   0%|          | 0/1200 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 1/1200 [00:02<46:31,  2.33s/it][A
Epoch 1:   0%|          | 2/1200 [00:02<26:21,  1.32s/it][A
Epoch 1:   0%|          | 3/1200 [00:03<19:50,  1.01it/s][A
Epoch 1:   0%|          | 4/1200 [00:04<16:46,  1.19it/s][A
Epoch 1:   0%|          | 5/1200 [00:04<15:11,  1.31it/s][A
Epoch 1:   0%|          | 6/1200 [00:05<14:09,  1.41it/s][A
Epoch 1:   1%|          | 7/1200 [00:06<13:31,  1.47it/s][A
Epoch 1:   1%|          | 8/1200 [00:06<13:03,  1.52it/s][A
Epoch 1:   1%|          | 9/1200 [00:07<12:48,  1.55it/s][A
Epoch 1:   1%|          | 10/1200 [00:07<12:38,  1.57it/s][A
Epoch 1:   1%|          | 11/1200 [00:08<12:31,  1.58it/s][A
Epoch 1:   1%|          | 12/1200 [00:09<12:24,  1.60it/s][A
Epoch 1:   1%|          | 13/1200 [00:09<12:21,  1.60it/s][A
Epoch 1:   1%|          | 14/1200 [00:10<12:17,  1.61it/s][A
Epoch 1:   1%|▏         | 15/1200 [00:10<12:14,  1.61it/s][A
Epoch 1:   1%|▏         |

Epoch 1 Loss: 0.2814
Validation Loss: 0.1990


Epoch 2: 100%|██████████| 1200/1200 [12:49<00:00,  1.56it/s]


Epoch 2 Loss: 0.1560
Validation Loss: 0.2086


Epoch 3: 100%|██████████| 1200/1200 [12:49<00:00,  1.56it/s]


Epoch 3 Loss: 0.0895
Validation Loss: 0.2192
⏹️ Early stopping triggered.


Evaluating: 100%|██████████| 300/300 [01:01<00:00,  4.87it/s]


Accuracy:  0.9192
Precision: 0.9076
Recall:    0.9333
F1-score:  0.9203
ROC AUC:   0.9765
              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92      2400
         1.0       0.91      0.93      0.92      2400

    accuracy                           0.92      4800
   macro avg       0.92      0.92      0.92      4800
weighted avg       0.92      0.92      0.92      4800



In [10]:
# Full train/evaluate run on BAN-PL_light.csv with the pipeline's default max_len.
run_pipeline_herbert_singlefile(data_path="BAN-PL_light.csv")

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]


Epoch 1:   0%|          | 0/1199 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 1/1199 [00:01<37:22,  1.87s/it][A
Epoch 1:   0%|          | 2/1199 [00:02<22:36,  1.13s/it][A
Epoch 1:   0%|          | 3/1199 [00:03<17:47,  1.12it/s][A
Epoch 1:   0%|          | 4/1199 [00:03<15:32,  1.28it/s][A
Epoch 1:   0%|          | 5/1199 [00:04<14:15,  1.40it/s][A
Epoch 1:   1%|          | 6/1199 [00:04<13:35,  1.46it/s][A
Epoch 1:   1%|          | 7/1199 [00:05<13:02,  1.52it/s][A
Epoch 1:   1%|          | 8/1199 [00:06<12:40,  1.57it/s][A
Epoch 1:   1%|          | 9/1199 [00:06<12:25,  1.60it/s][A
Epoch 1:   1%|          | 10/1199 [00:07<12:15,  1.62it/s][A
Epoch 1:   1%|          | 11/1199 [00:07<12:10,  1.63it/s][A
Epoch 1:   1%|          | 12/1199 [00:08<12:05,  1.64it/s][A
Epoch 1:   1%|          | 13/1199 [00:09<12:02,  1.64it/s][A
Epoch 1:   1%|          | 14/1199 [00:09<12:03,  1.64it/s][A
Epoch 1:   1%|▏         | 15/1199 [00:10<12:01,  1.64it/s][A
Epoch 1:   1%|▏         |

Epoch 1 Loss: 0.3279
Validation Loss: 0.2693


Epoch 2: 100%|██████████| 1199/1199 [13:22<00:00,  1.49it/s]


Epoch 2 Loss: 0.2011
Validation Loss: 0.2959


Epoch 3: 100%|██████████| 1199/1199 [13:21<00:00,  1.50it/s]


Epoch 3 Loss: 0.1262
Validation Loss: 0.2881
⏹️ Early stopping triggered.


Evaluating: 100%|██████████| 300/300 [01:05<00:00,  4.58it/s]


Accuracy:  0.8972
Precision: 0.8901
Recall:    0.9061
F1-score:  0.8980
ROC AUC:   0.9572
              precision    recall  f1-score   support

         0.0       0.90      0.89      0.90      2399
         1.0       0.89      0.91      0.90      2396

    accuracy                           0.90      4795
   macro avg       0.90      0.90      0.90      4795
weighted avg       0.90      0.90      0.90      4795



In [13]:
# Full train/evaluate run on BAN-PL_full.csv with the pipeline's default max_len.
run_pipeline_herbert_singlefile(data_path="BAN-PL_full.csv")

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1: 100%|██████████| 1197/1197 [13:13<00:00,  1.51it/s]


Epoch 1 Loss: 0.3618
Validation Loss: 0.2966


Epoch 2: 100%|██████████| 1197/1197 [13:15<00:00,  1.51it/s]


Epoch 2 Loss: 0.2605
Validation Loss: 0.2952


Epoch 3: 100%|██████████| 1197/1197 [13:15<00:00,  1.50it/s]


Epoch 3 Loss: 0.2036
Validation Loss: 0.3170


Epoch 4: 100%|██████████| 1197/1197 [13:16<00:00,  1.50it/s]


Epoch 4 Loss: 0.1584
Validation Loss: 0.3447
⏹️ Early stopping triggered.


Evaluating: 100%|██████████| 300/300 [01:04<00:00,  4.63it/s]


Accuracy:  0.8723
Precision: 0.8371
Recall:    0.9248
F1-score:  0.8787
ROC AUC:   0.9525
              precision    recall  f1-score   support

         0.0       0.92      0.82      0.87      2391
         1.0       0.84      0.92      0.88      2394

    accuracy                           0.87      4785
   macro avg       0.88      0.87      0.87      4785
weighted avg       0.88      0.87      0.87      4785

