In [1]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score,
    recall_score, classification_report,
    confusion_matrix
)
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# ======================================================
# 1. LOAD + BALANCE DATA
# ======================================================
# Load the raw dataset and keep only the question text and its trigger label.
df = pd.read_csv('/kaggle/input/memo-health-trigger-dataset/Memo_Dataset.csv')
# Drop incomplete rows up front: a missing label makes astype(int) raise, and
# a missing question is a NaN float rather than text to embed.
df = df[['Question', 'Trigger']].dropna()
df['Trigger'] = df['Trigger'].astype(int)

# Down-sample every class to the same size (capped at 5000 rows per class).
min_count = min(5000, df['Trigger'].value_counts().min())

# Sample each class explicitly instead of using groupby(...).apply(...):
# apply() on a frame that still contains the grouping column emits a
# deprecation warning in recent pandas. Each class is sampled with the same
# seed, and ignore_index=True yields a fresh 0..n-1 index.
df_balanced = pd.concat(
    [
        class_rows.sample(min_count, random_state=42)
        for _, class_rows in df.groupby('Trigger')
    ],
    ignore_index=True,
)

df_balanced = df_balanced.rename(columns={'Question': 'text', 'Trigger': 'label'})

# Stratified 80/20 split so both partitions keep the balanced class ratio.
train_df, test_df = train_test_split(
    df_balanced,
    test_size=0.2,
    stratify=df_balanced['label'],
    random_state=42
)

print("Train:", len(train_df), " Test:", len(test_df))

# ======================================================
# 2. LOAD MARBERT
# ======================================================
# Checkpoint used as a fixed feature extractor.
model_name = "UBC-NLP/MARBERT"

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder, move it to the chosen device and switch to eval mode.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

# Freeze MARBERT: no parameter should track gradients.
for weight in model.parameters():
    weight.requires_grad = False

# ======================================================
# 3. FUNCTION TO EXTRACT CLS EMBEDDINGS
# ======================================================
def get_cls_embeddings(texts, batch_size=32, max_length=256):
    """Encode texts with the module-level model and return first-token vectors.

    The texts are tokenized in batches with the module-level ``tokenizer``,
    run through the module-level ``model`` on ``device`` with gradients
    disabled, and the last-layer hidden state at position 0 (the [CLS]
    position for BERT-style tokenizers) is collected for every text.

    Args:
        texts: Sequence of strings to embed (e.g. a list of questions).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text. For empty input
        the result has shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # np.vstack rejects an empty list, so return a correctly shaped empty
    # matrix instead. float32 matches a model loaded with default settings.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]

        # Pad to the longest text in the batch and truncate at max_length so
        # the batch forms one rectangular tensor.
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoded)
            # Keep only the position-0 vector and move it off the device right
            # away, so device memory holds one batch at a time.
            cls_vec = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        embeddings.append(cls_vec)

    return np.vstack(embeddings)

# ======================================================
# 4. EMBEDDINGS FOR TRAIN + TEST
# ======================================================
# Pull texts and labels out of the frames first, then embed each split.
train_texts = train_df["text"].tolist()
test_texts = test_df["text"].tolist()
y_train = train_df["label"].values
y_test = test_df["label"].values

print("Extracting TRAIN embeddings...")
X_train = get_cls_embeddings(train_texts)

print("Extracting TEST embeddings...")
X_test = get_cls_embeddings(test_texts)

# Quick check: one row per sample in each split.
print("Shapes:", X_train.shape, X_test.shape)

# ======================================================
# 5. TRAIN SVM CLASSIFIER
# ======================================================
# Linear SVM trained on the frozen embeddings. The solver is seeded with the
# same value (42) used for sampling and splitting, so repeated runs give the
# same classifier.
clf = LinearSVC(C=1.0, random_state=42)
clf.fit(X_train, y_train)

# ======================================================
# 6. EVALUATE
# ======================================================
# Predict on the held-out split and score against the true labels.
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("\n=== FINAL RESULTS ===")
# Labels carry their own padding so the values line up in one column.
for metric_label, metric_value in (
    ("Accuracy:  ", acc),
    ("Precision: ", prec),
    ("Recall:    ", rec),
    ("F1 Score:  ", f1),
):
    print(f"{metric_label}{metric_value:.4f}")

print("\nConfusion Matrix:")
print(cm)

# Per-class precision/recall/F1 breakdown.
print("\nDetailed Report:")
print(classification_report(y_test, preds))


  df_balanced = df.groupby('Trigger', group_keys=False).apply(


Train: 8000  Test: 2000


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-12-03 06:34:30.818672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764743671.005223      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764743671.062877      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Extracting TRAIN embeddings...


  0%|          | 1/250 [00:00<01:29,  2.77it/s]

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

100%|██████████| 250/250 [00:13<00:00, 19.02it/s]


Extracting TEST embeddings...


100%|██████████| 63/63 [00:03<00:00, 18.37it/s]


Shapes: (8000, 768) (2000, 768)

=== FINAL RESULTS ===
Accuracy:  0.6760
Precision: 0.6667
Recall:    0.7040
F1 Score:  0.6848

Confusion Matrix:
[[648 352]
 [296 704]]

Detailed Report:
              precision    recall  f1-score   support

           0       0.69      0.65      0.67      1000
           1       0.67      0.70      0.68      1000

    accuracy                           0.68      2000
   macro avg       0.68      0.68      0.68      2000
weighted avg       0.68      0.68      0.68      2000



