In [1]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, f1_score

# Disable W&B and detect GPU
os.environ["WANDB_DISABLED"] = "true"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# =========================
# 1. LOAD & PREPARE DATA
# =========================
df = pd.read_csv('/kaggle/input/memo-health-trigger-dataset/Memo_Dataset.csv')
df = df[['Question', 'Question_eng', 'Trigger']]
df['Trigger'] = df['Trigger'].astype(int)

# Quick run: sample only 100 from each class
min_count = min(5000, df['Trigger'].value_counts().min())  
df_balanced = df.groupby('Trigger', group_keys=False).apply(
    lambda x: x.sample(min_count, random_state=42)
).reset_index(drop=True)

# Use only the Arabic question, keep text as-is
df_balanced = df_balanced[['Question', 'Trigger']].rename(
    columns={'Question': 'text', 'Trigger': 'label'}
)

# Train-test split
train_df, test_df = train_test_split(
    df_balanced,
    test_size=0.2,
    random_state=42,
    stratify=df_balanced['label']
)

print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")

# ======================
# 2. DATASET CLASS
# ======================
class TriggerDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze() for k, v in encoded.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ================================
# 3. LOAD TOKENIZER & MODEL
# ================================
model_name = "UBC-NLP/MARBERT"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    problem_type="single_label_classification"
).to(device)

train_dataset = TriggerDataset(train_df, tokenizer)
test_dataset = TriggerDataset(test_df, tokenizer)

# ================
# 4. METRICS
# ================
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="binary")
    }

# ==========================
# 5. TRAINING CONFIGURATION
# ==========================
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    save_total_limit=2,
    seed=42,
    optim="adamw_torch",
    report_to=[]
)

# ================
# 6. TRAINER
# ================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# ================
# 7. TRAIN + EVAL
# ================
trainer.train()
final_metrics = trainer.evaluate()

print("Final evaluation:", final_metrics)


2025-12-03 04:03:00.534958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764734580.719562      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764734580.771842      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using device: cuda


  df_balanced = df.groupby('Trigger', group_keys=False).apply(


Training samples: 8000, Test samples: 2000


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5882,0.567309,0.716,0.725073
2,0.471,0.681548,0.716,0.730806
3,0.3873,1.135443,0.6895,0.718112
4,0.2376,1.495145,0.6825,0.672173
5,0.1404,1.770351,0.6645,0.688631


Final evaluation: {'eval_loss': 0.6815475821495056, 'eval_accuracy': 0.716, 'eval_f1': 0.7308056872037914, 'eval_runtime': 14.8055, 'eval_samples_per_second': 135.085, 'eval_steps_per_second': 16.886, 'epoch': 5.0}


In [2]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import numpy as np
import torch

# ============
# 1. GET PREDICTIONS
# ============
predictions_output = trainer.predict(test_dataset)

logits = predictions_output.predictions
labels = predictions_output.label_ids
preds = np.argmax(logits, axis=1)

# Softmax probabilities
probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
pred_conf = probs[np.arange(len(preds)), preds]

# ============
# 2. BASIC METRICS
# ============
acc = accuracy_score(labels, preds)
precision = precision_score(labels, preds)
recall = recall_score(labels, preds)
f1 = f1_score(labels, preds)

# ============
# 3. CONFUSION MATRIX
# ============
cm = confusion_matrix(labels, preds)
tn, fp, fn, tp = cm.ravel()

# ============
# 4. CLASSIFICATION REPORT
# ============
cls_report = classification_report(labels, preds, target_names=["Non-trigger (0)", "Trigger (1)"])

# ============
# 5. CONFIDENCE ANALYSIS
# ============
correct_conf = pred_conf[preds == labels].mean()
incorrect_conf = pred_conf[preds != labels].mean()

# ============
# 6. PRINT EVERYTHING CLEANLY
# ============

print(f"\nDataset: {len(labels)} samples")
print(f"Class 0 samples: {np.sum(labels == 0)}")
print(f"Class 1 samples: {np.sum(labels == 1)}\n")

print("METRICS:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}\n")

print("CONFUSION MATRIX:")
print(f"{'':16}Predicted 0  Predicted 1")
print(f"Actual 0{cm[0]}")
print(f"Actual 1{cm[1]}\n")

print("CLASSIFICATION REPORT:")
print(cls_report)

print("CONFIDENCE ANALYSIS:")
print(f"Average confidence (correct predictions):   {correct_conf:.4f}")
print(f"Average confidence (incorrect predictions): {incorrect_conf:.4f}")



Dataset: 2000 samples
Class 0 samples: 1000
Class 1 samples: 1000

METRICS:
Accuracy:  0.7160
Precision: 0.6946
Recall:    0.7710
F1 Score:  0.7308

CONFUSION MATRIX:
                Predicted 0  Predicted 1
Actual 0[661 339]
Actual 1[229 771]

CLASSIFICATION REPORT:
                 precision    recall  f1-score   support

Non-trigger (0)       0.74      0.66      0.70      1000
    Trigger (1)       0.69      0.77      0.73      1000

       accuracy                           0.72      2000
      macro avg       0.72      0.72      0.72      2000
   weighted avg       0.72      0.72      0.72      2000

CONFIDENCE ANALYSIS:
Average confidence (correct predictions):   0.8887
Average confidence (incorrect predictions): 0.8438
