In [1]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm

# ======================================================
# 1. LOAD + BALANCE DATA
# ======================================================
# Read the raw dataset and keep only the two columns used downstream.
# .copy() makes the selection an independent frame so that the dtype
# assignment below never writes into a view of the original data.
df = pd.read_csv('/kaggle/input/memo-health-trigger-dataset/Memo_Dataset.csv')
df = df[['Question', 'Trigger']].copy()
df['Trigger'] = df['Trigger'].astype(int)

# Undersample every class to the same size: the size of the rarest class,
# capped at 5000 rows per class.
min_count = min(5000, df['Trigger'].value_counts().min())

# Iterating a groupby yields each class's complete sub-frame (grouping column
# included) in sorted key order. Sampling each sub-frame and concatenating
# avoids DataFrameGroupBy.apply operating on the grouping column, which is
# deprecated, while still drawing each class with random_state=42.
df_balanced = pd.concat(
    [group.sample(min_count, random_state=42) for _, group in df.groupby('Trigger')]
).reset_index(drop=True)

# Column names expected by TriggerDataset.
df_balanced = df_balanced.rename(columns={'Question': 'text', 'Trigger': 'label'})

# Stratified 80/20 split so both partitions keep the balanced class ratio.
train_df, test_df = train_test_split(
    df_balanced,
    test_size=0.2,
    stratify=df_balanced['label'],
    random_state=42
)

print("Train:", len(train_df), " Test:", len(test_df))

# ======================================================
# 2. DATASET CLASS
# ======================================================
class TriggerDataset(Dataset):
    """Map-style dataset that tokenizes one text at a time on access.

    Args:
        df: DataFrame with a "text" column and a "label" column.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=max_len, return_tensors="pt")`` and returning a
            mapping of tensors that each carry a leading batch axis of size 1.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        # Materialise the columns as plain lists so lookups are positional
        # and independent of the DataFrame's index.
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for example ``idx`` plus a "labels" tensor.

        Each tokenizer tensor has its leading batch axis removed; "labels" is
        a scalar ``torch.long`` tensor.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever it has length 1, producing
        # 0-d tensors.
        item = {k: v.squeeze(0) for k, v in encoded.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ======================================================
# 3. LOAD MARBERT
# ======================================================
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained checkpoint used for both the tokenizer and the encoder.
model_name = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)
base_model = base_model.to(device)
base_model.eval()

# Freeze MARBERT: disable gradients on every encoder parameter at once.
base_model.requires_grad_(False)

# Optional: unfreeze last 1-2 layers
unfreeze_last = True
if unfreeze_last:
    # Re-enable gradients on the final two transformer layers only.
    for top_layer in base_model.encoder.layer[-2:]:
        top_layer.requires_grad_(True)

# ======================================================
# 4. MLP CLASSIFIER ON TOP
# ======================================================
class CLS_MLP(nn.Module):
    """Two-layer MLP classifier applied to the first-token embedding of an encoder.

    Args:
        base_model: Callable encoder invoked with ``input_ids`` and
            ``attention_mask`` keyword arguments; its result must expose
            ``last_hidden_state``.
        hidden_size: Width of the encoder's hidden states.
        mlp_hidden: Width of the MLP's hidden layer.
        num_labels: Number of output classes.
        dropout: Dropout probability applied after the hidden activation.
    """

    def __init__(self, base_model, hidden_size=768, mlp_hidden=384, num_labels=2, dropout=0.2):
        super().__init__()
        self.base = base_model

        # Layers are created in the order they are applied.
        project = nn.Linear(hidden_size, mlp_hidden)
        activation = nn.ReLU()
        regularise = nn.Dropout(dropout)
        classify = nn.Linear(mlp_hidden, num_labels)
        self.mlp = nn.Sequential(project, activation, regularise, classify)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``logits``, or ``(loss, logits)`` when ``labels`` is given.

        The loss is the cross-entropy between the logits and ``labels``.
        """
        encoder_out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis holds the CLS token.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        logits = self.mlp(first_token)

        if labels is None:
            return logits

        criterion = nn.CrossEntropyLoss()
        return criterion(logits, labels), logits

# ======================================================
# 5. TRAINING CONFIG
# ======================================================
# Seed PyTorch's generators so classifier-head initialisation and the
# DataLoader shuffle order are repeatable, matching the fixed
# random_state=42 already used for sampling and splitting the data.
torch.manual_seed(42)

train_dataset = TriggerDataset(train_df, tokenizer)
test_dataset = TriggerDataset(test_df, tokenizer)

# Shuffle only the training data; evaluation order is left untouched.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

model = CLS_MLP(base_model).to(device)

# Hand the optimizer only the parameters that can receive gradients
# (the classifier head and any unfrozen encoder layers); frozen encoder
# weights never get a gradient and do not need optimizer bookkeeping.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-4)

num_epochs = 5

# ======================================================
# 6. TRAIN LOOP
# ======================================================
for epoch in range(num_epochs):
    # Switch to training mode at the start of every epoch.
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress:
        # Move only the tensors the model's forward() accepts; the batch may
        # carry other tokenizer outputs that are not used here.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
        )

        # Standard step: clear stale gradients, forward, backward, update.
        optimizer.zero_grad()
        loss, logits = model(input_ids, attention_mask, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

# ======================================================
# 7. EVALUATION
# ======================================================
# Evaluation mode, with gradient tracking off for the whole pass.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Without labels the model returns logits only; the predicted class
        # is the index of the largest logit in each row.
        preds = model(input_ids, attention_mask).argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Binary-classification metrics over the full test set.
acc = accuracy_score(all_labels, all_preds)
prec = precision_score(all_labels, all_preds)
rec = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
cm = confusion_matrix(all_labels, all_preds)
report = classification_report(all_labels, all_preds)

print("\n=== FINAL METRICS ===")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")

print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


  df_balanced = df.groupby('Trigger', group_keys=False).apply(


Train: 8000  Test: 2000


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-12-03 06:42:28.971822: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764744149.153484      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764744149.214007      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Epoch 1/5:   0%|          | 0/500 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

Epoch 1/5: 100%|██████████| 500/500 [01:24<00:00,  5.88it/s]


Epoch 1 Loss: 0.6067


Epoch 2/5: 100%|██████████| 500/500 [01:24<00:00,  5.92it/s]


Epoch 2 Loss: 0.5737


Epoch 3/5: 100%|██████████| 500/500 [01:24<00:00,  5.90it/s]


Epoch 3 Loss: 0.5502


Epoch 4/5: 100%|██████████| 500/500 [01:24<00:00,  5.90it/s]


Epoch 4 Loss: 0.5340


Epoch 5/5: 100%|██████████| 500/500 [01:24<00:00,  5.90it/s]


Epoch 5 Loss: 0.5325

=== FINAL METRICS ===
Accuracy:  0.7105
Precision: 0.6908
Recall:    0.7620
F1 Score:  0.7247

Confusion Matrix:
 [[659 341]
 [238 762]]

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.66      0.69      1000
           1       0.69      0.76      0.72      1000

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000

