In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
modernBert = AutoModel.from_pretrained(model_id).to(device)
modernBert.eval()

def extract_embeddings(texts, model, tokenizer, method="cls", batch_size=16, max_length=512):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Extracting ({method})"):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
        with torch.no_grad():
            output = model(**enc).last_hidden_state

        if method == "cls":
            batch_emb = output[:, 0, :]
        elif method == "mean":
            batch_emb = output.mean(dim=1)
        elif method == "max":
            batch_emb = output.max(dim=1).values
        elif method == "cls+mean":
            batch_emb = torch.cat([output[:, 0, :], output.mean(dim=1)], dim=1)
        else:
            raise ValueError("Unsupported pooling method")

        embeddings.extend(batch_emb.cpu().numpy())
    return np.array(embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

In [2]:
def evaluate_split(train_csv, val_csv, label_col, pooling="cls+mean"):
    df_train = pd.read_csv(train_csv)
    df_val = pd.read_csv(val_csv)

    X_train = extract_embeddings(df_train["response"].tolist(), modernBert, tokenizer, method=pooling)
    X_val = extract_embeddings(df_val["response"].tolist(), modernBert, tokenizer, method=pooling)

    y_train = df_train[label_col].values
    y_val = df_val[label_col].values

    clf = LogisticRegression(max_iter=2000, class_weight="balanced")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    f1 = f1_score(y_val, y_pred, average="macro")
    acc = accuracy_score(y_val, y_pred)

    print(f"{label_col} [{pooling}] → Acc: {acc:.4f}, Macro F1: {f1:.4f}")
    return acc, f1


In [4]:
tasks = [
    "mistake_identification",
    "mistake_location",
    "providing_guidance",
    "actionability",
]
poolings = ["cls", "mean", "max", "cls+mean"]
results = {}

for task_name in tasks:
  for pooling in poolings:
    print(f"\n Evaluating {task_name}")
    acc, f1 = evaluate_split(
        train_csv=f"{task_name}_train.csv",
        val_csv=f"{task_name}_val.csv",
        label_col=task_name,
        pooling=pooling
    )
    results[task_name] = (acc, f1)



 Evaluating mistake_identification


Extracting (cls): 100%|██████████| 124/124 [00:18<00:00,  6.78it/s]
Extracting (cls): 100%|██████████| 31/31 [00:02<00:00, 10.78it/s]


mistake_identification [cls] → Acc: 0.8012, Macro F1: 0.6265

 Evaluating mistake_identification


Extracting (mean): 100%|██████████| 124/124 [00:10<00:00, 11.33it/s]
Extracting (mean): 100%|██████████| 31/31 [00:02<00:00, 10.83it/s]


mistake_identification [mean] → Acc: 0.8235, Macro F1: 0.6558

 Evaluating mistake_identification


Extracting (max): 100%|██████████| 124/124 [00:10<00:00, 11.31it/s]
Extracting (max): 100%|██████████| 31/31 [00:02<00:00, 10.81it/s]


mistake_identification [max] → Acc: 0.8114, Macro F1: 0.5829

 Evaluating mistake_identification


Extracting (cls+mean): 100%|██████████| 124/124 [00:10<00:00, 11.31it/s]
Extracting (cls+mean): 100%|██████████| 31/31 [00:02<00:00, 10.81it/s]


mistake_identification [cls+mean] → Acc: 0.8337, Macro F1: 0.6391

 Evaluating mistake_location


Extracting (cls): 100%|██████████| 124/124 [00:11<00:00, 11.01it/s]
Extracting (cls): 100%|██████████| 32/32 [00:02<00:00, 10.89it/s]


mistake_location [cls] → Acc: 0.6446, Macro F1: 0.5087

 Evaluating mistake_location


Extracting (mean): 100%|██████████| 124/124 [00:11<00:00, 11.00it/s]
Extracting (mean): 100%|██████████| 32/32 [00:02<00:00, 11.00it/s]


mistake_location [mean] → Acc: 0.5944, Macro F1: 0.4743

 Evaluating mistake_location


Extracting (max): 100%|██████████| 124/124 [00:11<00:00, 10.98it/s]
Extracting (max): 100%|██████████| 32/32 [00:02<00:00, 11.00it/s]


mistake_location [max] → Acc: 0.6024, Macro F1: 0.4689

 Evaluating mistake_location


Extracting (cls+mean): 100%|██████████| 124/124 [00:11<00:00, 11.01it/s]
Extracting (cls+mean): 100%|██████████| 32/32 [00:02<00:00, 11.00it/s]


mistake_location [cls+mean] → Acc: 0.6245, Macro F1: 0.4560

 Evaluating providing_guidance


Extracting (cls): 100%|██████████| 124/124 [00:10<00:00, 11.53it/s]
Extracting (cls): 100%|██████████| 31/31 [00:03<00:00,  7.96it/s]


providing_guidance [cls] → Acc: 0.5242, Macro F1: 0.4809

 Evaluating providing_guidance


Extracting (mean): 100%|██████████| 124/124 [00:10<00:00, 11.51it/s]
Extracting (mean): 100%|██████████| 31/31 [00:03<00:00,  7.97it/s]


providing_guidance [mean] → Acc: 0.5000, Macro F1: 0.4686

 Evaluating providing_guidance


Extracting (max): 100%|██████████| 124/124 [00:10<00:00, 11.53it/s]
Extracting (max): 100%|██████████| 31/31 [00:03<00:00,  7.98it/s]


providing_guidance [max] → Acc: 0.4960, Macro F1: 0.4493

 Evaluating providing_guidance


Extracting (cls+mean): 100%|██████████| 124/124 [00:10<00:00, 11.52it/s]
Extracting (cls+mean): 100%|██████████| 31/31 [00:03<00:00,  7.97it/s]


providing_guidance [cls+mean] → Acc: 0.5282, Macro F1: 0.4674

 Evaluating actionability


Extracting (cls): 100%|██████████| 124/124 [00:11<00:00, 10.44it/s]
Extracting (cls): 100%|██████████| 31/31 [00:02<00:00, 10.92it/s]


actionability [cls] → Acc: 0.5899, Macro F1: 0.5358

 Evaluating actionability


Extracting (mean): 100%|██████████| 124/124 [00:11<00:00, 10.42it/s]
Extracting (mean): 100%|██████████| 31/31 [00:02<00:00, 10.89it/s]


actionability [mean] → Acc: 0.6222, Macro F1: 0.5665

 Evaluating actionability


Extracting (max): 100%|██████████| 124/124 [00:11<00:00, 10.41it/s]
Extracting (max): 100%|██████████| 31/31 [00:02<00:00, 10.91it/s]


actionability [max] → Acc: 0.5899, Macro F1: 0.5313

 Evaluating actionability


Extracting (cls+mean): 100%|██████████| 124/124 [00:11<00:00, 10.42it/s]
Extracting (cls+mean): 100%|██████████| 31/31 [00:02<00:00, 10.91it/s]


actionability [cls+mean] → Acc: 0.5899, Macro F1: 0.5158
