In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# Dataset loaded from the working directory.
df = pd.read_csv('df_eda.csv')

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder are loaded from the same Hugging Face checkpoint id.
model_id = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
modernBert = AutoModel.from_pretrained(model_id)

def extract_embeddings(texts, model, tokenizer, method="cls", batch_size=16, max_length=512):
    """Encode ``texts`` with ``model`` and pool every sequence into one vector.

    Texts are tokenized batch by batch (padded to the longest item in the
    batch, truncated to ``max_length`` tokens), run through the model without
    gradient tracking, and the final hidden states are pooled per ``method``.

    Padding positions are excluded from the "mean" and "max" statistics via the
    tokenizer's attention mask, so a text's embedding does not depend on which
    other texts happen to share its batch.

    Args:
        texts: Sequence of strings; sliced as ``texts[i:i + batch_size]``.
        model: Callable whose output exposes ``last_hidden_state`` with shape
            (batch, seq_len, hidden). It is put in eval mode and moved to the
            module-level ``device``.
        tokenizer: Callable returning a mapping of tensors for a batch of
            texts. If it provides no ``attention_mask`` entry, every position
            is treated as a real token.
        method: Pooling strategy: "cls" (first token), "mean", "max", or
            "cls+mean" (first token concatenated with the mean).
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        ``np.ndarray`` with one row per text. Rows have ``hidden`` columns for
        "cls", "mean" and "max", and ``2 * hidden`` columns for "cls+mean".
        An empty array is returned when ``texts`` is empty.

    Raises:
        ValueError: If ``method`` is not one of the supported strategies.
    """
    # Fail before any (expensive) forward pass when the method is invalid.
    if method not in ("cls", "mean", "max", "cls+mean"):
        raise ValueError(f"Unknown method '{method}'. Choose from: 'cls', 'mean', 'max', 'cls+mean'.")

    model.eval()
    model.to(device)
    chunks = []

    for i in tqdm(range(0, len(texts), batch_size), desc=f"Extracting BERT embeddings ({method})"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            hidden_states = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)

        if method == "cls":
            batch_embeddings = hidden_states[:, 0, :]  # first token of every sequence
        else:
            mask = inputs.get("attention_mask")
            if mask is None:
                mask = torch.ones(hidden_states.shape[:2], dtype=torch.bool, device=hidden_states.device)
            # (batch, seq_len, 1) so it broadcasts over the hidden dimension.
            keep = mask.to(torch.bool).unsqueeze(-1)

            if method == "max":
                # Padding is pushed to -inf so it can never win the max.
                batch_embeddings = hidden_states.masked_fill(~keep, float("-inf")).max(dim=1).values
            else:
                # Average over real tokens only; clamp guards against division by zero.
                token_counts = keep.sum(dim=1).clamp(min=1)
                mean = (hidden_states * keep).sum(dim=1) / token_counts
                if method == "mean":
                    batch_embeddings = mean
                else:  # "cls+mean"
                    batch_embeddings = torch.cat([hidden_states[:, 0, :], mean], dim=1)

        chunks.append(batch_embeddings.cpu().numpy())

    if not chunks:
        return np.array([])
    return np.concatenate(chunks, axis=0)


# Encode the responses once per pooling strategy and store each result in its
# own column; list(...) turns the 2-D array into one vector per row.
response_texts = df['response'].tolist()
for pooling in ("cls", "mean", "max", "cls+mean"):
    df[f'response_embeddings_{pooling}'] = list(
        extract_embeddings(response_texts, model=modernBert, tokenizer=tokenizer, method=pooling)
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Extracting BERT embeddings (cls): 100%|██████████| 155/155 [00:23<00:00,  6.73it/s]
Extracting BERT embeddings (mean): 100%|██████████| 155/155 [00:14<00:00, 11.01it/s]
Extracting BERT embeddings (max): 100%|██████████| 155/155 [00:14<00:00, 11.02it/s]
Extracting BERT embeddings (cls+mean): 100%|██████████| 155/155 [00:14<00:00, 11.03it/s]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np

def evaluate_embeddings(X, y, name, n_splits=5, random_state=42):
    """Cross-validate a logistic-regression probe on a feature matrix.

    For every fold of a shuffled stratified K-fold split, a class-balanced
    ``LogisticRegression`` is fitted on the training part and scored on the
    held-out part. The mean macro-F1 is printed, prefixed by ``name``.

    Args:
        X: Feature matrix with one row per sample. Anything ``np.asarray``
            accepts (array, nested list, DataFrame) is allowed.
        y: Labels, one per row of ``X``.
        name: Label used in the printed summary line.
        n_splits: Number of stratified folds.
        random_state: Seed used for both the fold shuffling and the classifier.

    Returns:
        Tuple ``(avg_acc, avg_f1)``: accuracy and macro-F1 averaged over folds.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of samples.
    """
    # Positional indexing with the fold index arrays below requires ndarrays;
    # plain lists or pandas objects would fail or select the wrong axis.
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}."
        )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    f1_scores = []
    acc_scores = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # A fresh classifier per fold so no state carries over between folds.
        clf = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=random_state)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc_scores.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    avg_acc = np.mean(acc_scores)
    avg_f1 = np.mean(f1_scores)
    print(f"{name} - Avg Macro F1: {avg_f1:.4f}")
    return avg_acc, avg_f1

# Stack each per-row embedding column back into a 2-D feature matrix.
X_bert_cls, X_bert_mean, X_bert_max, X_bert_cls_mean = (
    np.array(df[f'response_embeddings_{suffix}'].tolist())
    for suffix in ('cls', 'mean', 'max', 'cls+mean')
)

# Label columns, paired in order with the task names used in the report.
target_columns = ['mistake_identification', 'mistake_location', 'providing_guidance', 'actionability']
targets = dict(zip(
    (
        "Task 1 (Mistake Identification)",
        "Task 2 (Mistake Location)",
        "Task 3 (Providing Guidance)",
        "Task 4 (Actionability)",
    ),
    (df[column].values for column in target_columns),
))

# Display name -> feature matrix for every pooling strategy.
embeddings = dict(zip(
    ("BERT [CLS]", "BERT Mean", "BERT Max", "BERT [CLS]+Mean"),
    (X_bert_cls, X_bert_mean, X_bert_max, X_bert_cls_mean),
))

# results[embedding name][task name] == [mean accuracy, mean macro-F1]
results = {}
for emb_name, X_emb in embeddings.items():
    print(f"\n===== {emb_name} =====")
    results[emb_name] = {}
    for task_name, y in targets.items():
        acc, f1 = evaluate_embeddings(X_emb, y, task_name)
        results[emb_name].update({task_name: [acc, f1]})


===== BERT [CLS] =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.6094
Task 2 (Mistake Location) - Avg Macro F1: 0.5073
Task 3 (Providing Guidance) - Avg Macro F1: 0.5195
Task 4 (Actionability) - Avg Macro F1: 0.5597

===== BERT Mean =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.6183
Task 2 (Mistake Location) - Avg Macro F1: 0.5135
Task 3 (Providing Guidance) - Avg Macro F1: 0.5251
Task 4 (Actionability) - Avg Macro F1: 0.5805

===== BERT Max =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.6009
Task 2 (Mistake Location) - Avg Macro F1: 0.4804
Task 3 (Providing Guidance) - Avg Macro F1: 0.4882
Task 4 (Actionability) - Avg Macro F1: 0.5358

===== BERT [CLS]+Mean =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.6156
Task 2 (Mistake Location) - Avg Macro F1: 0.5120
Task 3 (Providing Guidance) - Avg Macro F1: 0.5298
Task 4 (Actionability) - Avg Macro F1: 0.5628
