In [None]:
!pip install transformers torch scikit-learn matplotlib seaborn tqdm --quiet

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    classification_report, roc_curve
)
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Checkpoint name passed to both the tokenizer and the encoder loader.
MODEL_NAME = "xlm-roberta-base"

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes a single text on access and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text and the padding/truncation
            options; its result must expose ``.items()`` yielding tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Nothing is tokenized up front; the work happens lazily in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples, i.e. the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer tensors for sample ``idx`` plus a float ``labels`` tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        sample = {}
        for name, tensor in encoded.items():
            # squeeze(0) removes the leading dimension when it has size 1
            # (presumably the batch axis the tokenizer adds for a single text).
            sample[name] = tensor.squeeze(0)

        # The label is stored as float32 (dtype=torch.float).
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [None]:
def load_and_split_data(data_path, tokenizer, max_len=64, test_size=0.2, random_state=42, batch_size=16):
    """Read a CSV, make a stratified train/test split and wrap both parts in DataLoaders.

    Args:
        data_path: Path of a CSV file containing a ``Text`` and a ``Class`` column.
        tokenizer: Tokenizer handed to ``TextDataset``.
        max_len: Maximum token length handed to ``TextDataset``.
        test_size: Fraction (or count) of rows for the test split, as accepted
            by ``train_test_split``.
        random_state: Seed for the split.
        batch_size: Batch size used by both loaders (default 16).

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.

    Raises:
        KeyError: If the CSV lacks the ``Text`` or ``Class`` column.
    """
    df = pd.read_csv(data_path)

    # Fail early with a readable message instead of a bare column lookup error.
    missing = [col for col in ('Text', 'Class') if col not in df.columns]
    if missing:
        raise KeyError(f"{data_path} is missing required column(s): {missing}")

    # Stratify on the label so both splits keep the class proportions.
    train_df, test_df = train_test_split(
        df, test_size=test_size, stratify=df['Class'], random_state=random_state
    )

    train_dataset = TextDataset(train_df['Text'].tolist(), train_df['Class'].tolist(), tokenizer, max_len)
    test_dataset = TextDataset(test_df['Text'].tolist(), test_df['Class'].tolist(), tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, test_loader

In [None]:
class TransformerClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear head producing one logit per sample."""

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and build the classification head."""
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        self.transformer = encoder
        self.dropout = nn.Dropout(0.3)
        # The head maps the encoder's hidden size to a single output unit.
        self.classifier = nn.Linear(encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the logits with the trailing size-1 dimension squeezed away."""
        hidden_states = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # Index 0 along the second axis is used as the pooled representation.
        first_token = hidden_states[:, 0]
        logits = self.classifier(self.dropout(first_token))
        return logits.squeeze(-1)

In [None]:
def _unpack_batch(batch):
    """Move one batch's input ids, attention mask and labels onto DEVICE."""
    return (
        batch['input_ids'].to(DEVICE),
        batch['attention_mask'].to(DEVICE),
        batch['labels'].to(DEVICE),
    )


def train_and_evaluate(train_loader, test_loader, model_name="xlm-roberta-base", save_dir="results/xlm-roberta",
                       epochs=6, patience=2, lr=2e-5, val_loader=None):
    """Fine-tune a TransformerClassifier with early stopping and report test-set metrics.

    The model is trained with AdamW, a linear learning-rate schedule and
    BCEWithLogitsLoss. After each epoch the loss on the selection loader is
    computed; the weights with the lowest loss are saved to
    ``<save_dir>/best_model.pt`` and reloaded before the final evaluation.
    A confusion matrix and a ROC curve are written to ``save_dir`` as PNG files.

    Args:
        train_loader: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        test_loader: DataLoader used for the final evaluation.
        model_name: Checkpoint name passed to ``TransformerClassifier``.
        save_dir: Directory for the checkpoint and the plots (created if missing).
        epochs: Maximum number of training epochs; also sizes the LR schedule.
        patience: Number of consecutive non-improving epochs before stopping.
        lr: Learning rate for AdamW.
        val_loader: Optional DataLoader used for early stopping and checkpoint
            selection. When ``None`` the test loader is used, as before.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``roc_auc`` computed on ``test_loader``.
    """
    os.makedirs(save_dir, exist_ok=True)
    best_model_path = os.path.join(save_dir, "best_model.pt")

    # NOTE(review): with val_loader=None the test set also drives early stopping
    # and checkpoint selection, so the reported test metrics are optimistic.
    # Pass a separate val_loader for an unbiased estimate.
    selection_loader = test_loader if val_loader is None else val_loader

    model = TransformerClassifier(model_name).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    # The schedule length is derived from the same `epochs` value as the loop
    # below so the two cannot drift apart.
    num_training_steps = len(train_loader) * epochs
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    best_val_loss = float("inf")
    patience_counter = 0
    checkpoint_saved = False  # True once this run has written best_model.pt

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids, attention_mask, labels = _unpack_batch(batch)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_epoch_loss:.4f}")

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in selection_loader:
                input_ids, attention_mask, labels = _unpack_batch(batch)
                logits = model(input_ids, attention_mask)
                val_loss += criterion(logits, labels).item()

        avg_val_loss = val_loss / len(selection_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("⏹️ Early stopping triggered.")
                break

    if checkpoint_saved:
        # map_location keeps the load working when the checkpoint tensors were
        # saved from a different device than the current DEVICE.
        model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
    else:
        # No epoch improved on the initial best loss (e.g. NaN loss or epochs=0):
        # do not load a possibly stale file left in save_dir by an earlier run.
        print("No checkpoint was saved during this run; evaluating the current weights.")

    model.eval()
    all_preds, all_probs, all_labels = [], [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids, attention_mask, labels = _unpack_batch(batch)

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int()  # 0.5 decision threshold on the sigmoid output

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")
    print(classification_report(all_labels, all_preds, zero_division=0))

    # Confusion matrix: rows are true labels, columns are predicted labels.
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title("Confusion Matrix for RoBERTa - The Best Variant")
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.savefig(os.path.join(save_dir, "confusion_matrix.png"))
    plt.close()

    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    plt.figure()
    plt.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.title("ROC Curve for RoBERTa - The Best Variant")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.savefig(os.path.join(save_dir, "roc_curve.png"))
    plt.close()

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "roc_auc": auc,
    }

In [None]:
def run_pipeline_xlmr_singlefile(data_path, max_len=256, save_dir="results/xlm-roberta"):
    """Run the whole pipeline on one CSV: tokenizer, data split, training, evaluation.

    Args:
        data_path: CSV file passed to ``load_and_split_data``.
        max_len: Maximum token length used when tokenizing.
        save_dir: Directory handed to ``train_and_evaluate`` for the checkpoint
            and plots. Give each dataset its own directory; with the default,
            consecutive runs overwrite each other's artifacts.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_loader, test_loader = load_and_split_data(data_path, tokenizer, max_len)
    train_and_evaluate(train_loader, test_loader, model_name=MODEL_NAME, save_dir=save_dir)

In [13]:
from google.colab import files

# Download the best-model checkpoint (presumably the file train_and_evaluate
# writes under its default save_dir when the working directory is /content).
# The existence check lets this cell run before any training has produced the
# file, so a top-to-bottom run does not fail here.
best_model_path = '/content/results/xlm-roberta/best_model.pt'
if os.path.exists(best_model_path):
    files.download(best_model_path)
else:
    print(f"Checkpoint not found at {best_model_path}; run a training cell first.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Train and evaluate on BAN-PL_raw.csv with the pipeline defaults.
run_pipeline_xlmr_singlefile(data_path="BAN-PL_raw.csv")

Epoch 1: 100%|██████████| 1200/1200 [14:24<00:00,  1.39it/s]


Epoch 1 Loss: 0.3340
Validation Loss: 0.2266


Epoch 2: 100%|██████████| 1200/1200 [14:23<00:00,  1.39it/s]


Epoch 2 Loss: 0.2048
Validation Loss: 0.2235


Epoch 3: 100%|██████████| 1200/1200 [14:23<00:00,  1.39it/s]


Epoch 3 Loss: 0.1440
Validation Loss: 0.2349


Epoch 4: 100%|██████████| 1200/1200 [14:22<00:00,  1.39it/s]


Epoch 4 Loss: 0.0972
Validation Loss: 0.2636
⏹️ Early stopping triggered.


Evaluating: 100%|██████████| 300/300 [01:00<00:00,  4.96it/s]


Accuracy:  0.9137
Precision: 0.9020
Recall:    0.9283
F1-score:  0.9150
ROC AUC:   0.9733
              precision    recall  f1-score   support

         0.0       0.93      0.90      0.91      2400
         1.0       0.90      0.93      0.91      2400

    accuracy                           0.91      4800
   macro avg       0.91      0.91      0.91      4800
weighted avg       0.91      0.91      0.91      4800



In [9]:
# Train and evaluate on BAN-PL_light.csv with the pipeline defaults.
run_pipeline_xlmr_singlefile(data_path="BAN-PL_light.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1199/1199 [13:52<00:00,  1.44it/s]


Epoch 1 Loss: 0.4003
Validation Loss: 0.3144


Epoch 2: 100%|██████████| 1199/1199 [13:59<00:00,  1.43it/s]


Epoch 2 Loss: 0.2695
Validation Loss: 0.2843


Epoch 3: 100%|██████████| 1199/1199 [14:00<00:00,  1.43it/s]


Epoch 3 Loss: 0.2015
Validation Loss: 0.3179


Epoch 4: 100%|██████████| 1199/1199 [13:59<00:00,  1.43it/s]


Epoch 4 Loss: 0.1450
Validation Loss: 0.3629
⏹️ Early stopping triggered.


Evaluating: 100%|██████████| 300/300 [00:56<00:00,  5.27it/s]


Accuracy:  0.8903
Precision: 0.9006
Recall:    0.8773
F1-score:  0.8888
ROC AUC:   0.9556
              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89      2399
         1.0       0.90      0.88      0.89      2396

    accuracy                           0.89      4795
   macro avg       0.89      0.89      0.89      4795
weighted avg       0.89      0.89      0.89      4795



In [12]:
# Train and evaluate on BAN-PL_full.csv with the pipeline defaults.
run_pipeline_xlmr_singlefile(data_path="BAN-PL_full.csv")

Epoch 1: 100%|██████████| 1197/1197 [13:49<00:00,  1.44it/s]


Epoch 1 Loss: 0.4445
Validation Loss: 0.3671


Epoch 2: 100%|██████████| 1197/1197 [13:54<00:00,  1.43it/s]


Epoch 2 Loss: 0.3083
Validation Loss: 0.3144


Epoch 3: 100%|██████████| 1197/1197 [13:53<00:00,  1.44it/s]


Epoch 3 Loss: 0.2532
Validation Loss: 0.3236


Epoch 4: 100%|██████████| 1197/1197 [13:53<00:00,  1.44it/s]


Epoch 4 Loss: 0.2025
Validation Loss: 0.3437
⏹️ Early stopping triggered.


Evaluating: 100%|██████████| 300/300 [00:56<00:00,  5.34it/s]


Accuracy:  0.8644
Precision: 0.8598
Recall:    0.8709
F1-score:  0.8653
ROC AUC:   0.9412
              precision    recall  f1-score   support

         0.0       0.87      0.86      0.86      2391
         1.0       0.86      0.87      0.87      2394

    accuracy                           0.86      4785
   macro avg       0.86      0.86      0.86      4785
weighted avg       0.86      0.86      0.86      4785

