ModernBert feature extractor

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

df = pd.read_csv('data/df_eda.csv')
label_map = {"No": 0, "To some extent": 1, "Yes": 2}
for metric in ["mistake_identification", "mistake_location", "providing_guidance", "actionability"]:
    df[metric + "_label"] = df[metric].map(label_map)
target_columns = ['mistake_identification_label', 'mistake_location_label', 'providing_guidance_label', 'actionability_label']

model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
modernBert = AutoModel.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def extract_embeddings(texts, model, tokenizer, method="cls", batch_size=16, max_length=512):
    model.eval()
    model.to(device)
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc=f"Extracting BERT embeddings ({method})"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            hidden_states = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)

        if method == "cls":
            batch_embeddings = hidden_states[:, 0, :]  # [CLS] token
        elif method == "mean":
            batch_embeddings = hidden_states.mean(dim=1)
        elif method == "max":
            batch_embeddings = hidden_states.max(dim=1).values
        elif method == "cls+mean":
            cls = hidden_states[:, 0, :]
            mean = hidden_states.mean(dim=1)
            batch_embeddings = torch.cat([cls, mean], dim=1)
        else:
            raise ValueError(f"Unknown method '{method}'. Choose from: 'cls', 'mean', 'max', 'cls+mean'.")

        embeddings.extend(batch_embeddings.cpu().numpy())

    return np.array(embeddings)


# df['response_embeddings_cls'] = list(extract_embeddings(df['response'].tolist(), model=modernBert, tokenizer=tokenizer, method="cls"))
df['response_embeddings_mean'] = list(extract_embeddings(df['response'].tolist(), model=modernBert, tokenizer=tokenizer, method="mean"))
# df['response_embeddings_max'] = list(extract_embeddings(df['response'].tolist(), model=modernBert, tokenizer=tokenizer, method="max"))
# df['response_embeddings_cls+mean'] = list(extract_embeddings(df['response'].tolist(), model=modernBert, tokenizer=tokenizer, method="cls+mean"))

# X_bert_cls = np.array(df['response_embeddings_cls'].tolist())
X_bert_mean = np.array(df['response_embeddings_mean'].tolist())
# X_bert_max = np.array(df['response_embeddings_max'].tolist())
# X_bert_cls_mean = np.array(df['response_embeddings_cls+mean'].tolist())

y_task1 = df[target_columns[0]].values
y_task2 = df[target_columns[1]].values
y_task3 = df[target_columns[2]].values
y_task4 = df[target_columns[3]].values


Extracting BERT embeddings (mean): 100%|██████████| 155/155 [00:18<00:00,  8.23it/s]


KeyError: 'mistake_identification_label'

In [None]:
import numpy as np
# np.save('data/embeddings/X_bert_cls.npy', X_bert_cls)
np.save('data/embeddings/X_bert_mean.npy', X_bert_mean)
# np.save('data/embeddings/X_bert_max.npy', X_bert_max)
# np.save('data/embeddings/X_bert_cls_mean.npy', X_bert_cls_mean)

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionPooling(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attention_vector = nn.Parameter(torch.randn(hidden_size))

    def forward(self, hidden_states, mask):
        # hidden_states: (batch, seq_len, hidden_size)
        # mask: (batch, seq_len) — 1 for real tokens, 0 for padding
        self.attention_vector = self.attention_vector.to(hidden_states.device)
        # Compute token-level attention scores: (batch, seq_len)
        scores = torch.matmul(hidden_states, self.attention_vector)
        scores = scores.masked_fill(mask == 0, -1e9)  # Mask out padding tokens
        weights = F.softmax(scores, dim=1)

        # Apply weights to hidden states: (batch, hidden_size)
        weighted_output = torch.sum(hidden_states * weights.unsqueeze(-1), dim=1)
        return weighted_output

model = AutoModel.from_pretrained(model_id)
attention_pooling = AttentionPooling(hidden_size=model.config.hidden_size).to(device)

def extract_embeddings_attention(texts, model, tokenizer, batch_size=16, max_length=512):
    model.eval()
    model.to(device)
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc=f"Extracting BERT embeddings (attention pooling)"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            hidden_states = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)
            attention_mask = inputs['attention_mask']
            batch_embeddings = attention_pooling(hidden_states, attention_mask)

        embeddings.extend(batch_embeddings.cpu().numpy())

    return np.array(embeddings)

df['response_embeddings_attention'] = list(extract_embeddings_attention(df['response'].tolist(), model=model, tokenizer=tokenizer))
X_bert_attention = np.array(df['response_embeddings_attention'].tolist())
np.save('data/embeddings/X_bert_attention.npy', X_bert_attention)

Extracting BERT embeddings (attention pooling): 100%|██████████| 150/150 [00:16<00:00,  9.32it/s]


In [10]:
df.to_csv('data/df_embeddings.csv', index=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np

def evaluate_embeddings(X, y, name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    acc_scores = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        acc_scores.append(acc)
        f1 = f1_score(y_test, y_pred, average='macro')
        f1_scores.append(f1)

    avg_acc = np.mean(acc_scores)
    avg_f1 = np.mean(f1_scores)
    print(f"{name} - Avg Macro F1: {avg_f1:.4f}")
    return avg_acc, avg_f1

# Load embeddings from dataframe
X_bert_cls = np.array(df['response_embeddings_cls'].tolist())
X_bert_mean = np.array(df['response_embeddings_mean'].tolist())
X_bert_max = np.array(df['response_embeddings_max'].tolist())
X_bert_cls_mean = np.array(df['response_embeddings_cls+mean'].tolist())
X_bert_attention = np.array(df['response_embeddings_attention'].tolist())

# Load targets
targets = {
    "Task 1 (Mistake Identification)": df[target_columns[0]].values,
    "Task 2 (Mistake Location)": df[target_columns[1]].values,
    "Task 3 (Providing Guidance)": df[target_columns[2]].values,
    "Task 4 (Actionability)": df[target_columns[3]].values
}

# Embeddings dictionary
embeddings = {
    "BERT [CLS]": X_bert_cls,
    "BERT Mean": X_bert_mean,
    "BERT Max": X_bert_max,
    "BERT [CLS]+Mean": X_bert_cls_mean,
    "BERT Attention-Pooled": X_bert_attention
}

# Evaluate each embedding type across all tasks
results = {}
for emb_name, X_emb in embeddings.items():
    print(f"\n===== {emb_name} =====")
    results[emb_name] = {}
    for task_name, y in targets.items():
        acc, f1 = evaluate_embeddings(X_emb, y, task_name)
        results[emb_name][task_name] = [acc,f1]


===== BERT [CLS] =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.5818
Task 2 (Mistake Location) - Avg Macro F1: 0.4696
Task 3 (Providing Guidance) - Avg Macro F1: 0.4928
Task 4 (Actionability) - Avg Macro F1: 0.5420

===== BERT Mean =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.6000
Task 2 (Mistake Location) - Avg Macro F1: 0.5042
Task 3 (Providing Guidance) - Avg Macro F1: 0.4999
Task 4 (Actionability) - Avg Macro F1: 0.5844

===== BERT Max =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.5540
Task 2 (Mistake Location) - Avg Macro F1: 0.4603
Task 3 (Providing Guidance) - Avg Macro F1: 0.4568


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Task 4 (Actionability) - Avg Macro F1: 0.5240

===== BERT [CLS]+Mean =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.6067
Task 2 (Mistake Location) - Avg Macro F1: 0.5068
Task 3 (Providing Guidance) - Avg Macro F1: 0.5046
Task 4 (Actionability) - Avg Macro F1: 0.5823

===== BERT Attention-Pooled =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.5233
Task 2 (Mistake Location) - Avg Macro F1: 0.4380
Task 3 (Providing Guidance) - Avg Macro F1: 0.4500
Task 4 (Actionability) - Avg Macro F1: 0.4872


{'BERT [CLS]': {'Task 1 (Mistake Identification)': [0.7870833333333334,
   0.5818010135553957],
  'Task 2 (Mistake Location)': [0.5870833333333334, 0.4695766949091082],
  'Task 3 (Providing Guidance)': [0.5345833333333334, 0.4928147675305268],
  'Task 4 (Actionability)': [0.5975, 0.5419969882383128]},
 'BERT Mean': {'Task 1 (Mistake Identification)': [0.8012500000000001,
   0.600008934220882],
  'Task 2 (Mistake Location)': [0.6224999999999999, 0.5042206193009685],
  'Task 3 (Providing Guidance)': [0.5483333333333333, 0.49993909881447396],
  'Task 4 (Actionability)': [0.64125, 0.5843853493663623]},
 'BERT Max': {'Task 1 (Mistake Identification)': [0.79125, 0.5540090329313057],
  'Task 2 (Mistake Location)': [0.5879166666666668, 0.46031579693058544],
  'Task 3 (Providing Guidance)': [0.5104166666666666, 0.456823232036338],
  'Task 4 (Actionability)': [0.5912499999999999, 0.5240188662769023]},
 'BERT [CLS]+Mean': {'Task 1 (Mistake Identification)': [0.8233333333333335,
   0.6067062211791

from above => mean+cls best for all

below => mean is the best

In [18]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def evaluate_embeddings_with_lda_cv(X, y, name, n_components=2):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores, acc_scores = [], []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Build a pipeline: scale -> LDA -> Logistic Regression
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("lda", LinearDiscriminantAnalysis(n_components=n_components)),
            ("clf", LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        f1 = f1_score(y_test, y_pred, average="macro")
        acc = accuracy_score(y_test, y_pred)
        f1_scores.append(f1)
        acc_scores.append(acc)

    avg_f1 = np.mean(f1_scores)
    avg_acc = np.mean(acc_scores)
    print(f"{name} - Avg Accuracy: {avg_acc:.4f}, Avg Macro F1: {avg_f1:.4f}")
    return avg_acc, avg_f1


# Evaluate each embedding type across all tasks
results = {}
for emb_name, X_emb in embeddings.items():
    print(f"\n===== {emb_name} =====")
    results[emb_name] = {}
    for task_name, y in targets.items():
        acc, f1 = evaluate_embeddings_with_lda_cv(X_emb, y, task_name)
        results[emb_name][task_name] = [acc,f1]


===== BERT [CLS] =====
Task 1 (Mistake Identification) - Avg Accuracy: 0.7096, Avg Macro F1: 0.5253
Task 2 (Mistake Location) - Avg Accuracy: 0.5462, Avg Macro F1: 0.4502
Task 3 (Providing Guidance) - Avg Accuracy: 0.4950, Avg Macro F1: 0.4579
Task 4 (Actionability) - Avg Accuracy: 0.5646, Avg Macro F1: 0.5115

===== BERT Mean =====
Task 1 (Mistake Identification) - Avg Accuracy: 0.7329, Avg Macro F1: 0.5527
Task 2 (Mistake Location) - Avg Accuracy: 0.5733, Avg Macro F1: 0.4769
Task 3 (Providing Guidance) - Avg Accuracy: 0.5075, Avg Macro F1: 0.4666
Task 4 (Actionability) - Avg Accuracy: 0.5867, Avg Macro F1: 0.5360

===== BERT Max =====
Task 1 (Mistake Identification) - Avg Accuracy: 0.7100, Avg Macro F1: 0.5182
Task 2 (Mistake Location) - Avg Accuracy: 0.5388, Avg Macro F1: 0.4465
Task 3 (Providing Guidance) - Avg Accuracy: 0.5037, Avg Macro F1: 0.4649
Task 4 (Actionability) - Avg Accuracy: 0.5675, Avg Macro F1: 0.5198

===== BERT [CLS]+Mean =====
Task 1 (Mistake Identification) - A

In [19]:
def evaluate_embeddings_mlp(X, y, name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    acc_scores = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = MLPClassifier(
            hidden_layer_sizes=(256, 128),
            max_iter=500,
            early_stopping=True,
            learning_rate='adaptive',
            random_state=42
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        acc_scores.append(acc)
        f1 = f1_score(y_test, y_pred, average='macro')
        f1_scores.append(f1)

    avg_acc = np.mean(acc_scores)
    avg_f1 = np.mean(f1_scores)
    print(f"{name} - Avg Macro F1: {avg_f1:.4f}, Avg Accuracy: {avg_acc:.4f}")
    return avg_acc, avg_f1

# Evaluate all embeddings and tasks
results_mlp = {}
for emb_name, X_emb in embeddings.items():
    print(f"\n===== {emb_name} (MLP) =====")
    results_mlp[emb_name] = {}
    for task_name, y in targets.items():
        acc, f1 = evaluate_embeddings_mlp(X_emb, y, task_name)
        results_mlp[emb_name][task_name] = [acc, f1]


===== BERT [CLS] (MLP) =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.5518, Avg Accuracy: 0.8650
Task 2 (Mistake Location) - Avg Macro F1: 0.4283, Avg Accuracy: 0.7079
Task 3 (Providing Guidance) - Avg Macro F1: 0.4411, Avg Accuracy: 0.6375
Task 4 (Actionability) - Avg Macro F1: 0.4636, Avg Accuracy: 0.6521

===== BERT Mean (MLP) =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.5498, Avg Accuracy: 0.8642
Task 2 (Mistake Location) - Avg Macro F1: 0.4668, Avg Accuracy: 0.7029
Task 3 (Providing Guidance) - Avg Macro F1: 0.4423, Avg Accuracy: 0.6338
Task 4 (Actionability) - Avg Macro F1: 0.5387, Avg Accuracy: 0.6929

===== BERT Max (MLP) =====
Task 1 (Mistake Identification) - Avg Macro F1: 0.5487, Avg Accuracy: 0.8692
Task 2 (Mistake Location) - Avg Macro F1: 0.4296, Avg Accuracy: 0.7100
Task 3 (Providing Guidance) - Avg Macro F1: 0.4296, Avg Accuracy: 0.6396
Task 4 (Actionability) - Avg Macro F1: 0.4728, Avg Accuracy: 0.6604

===== BERT [CLS]+Mean (MLP) =====
Task 1 (Mis

improved accuracy compared to log reg
cls+mean seems to be the best

current best for actionability

## Try adding last student utterance 

In [20]:
# Load ModernBERT model
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Create concatenated input: (LSU + [SEP] + Response)
df["input_text"] = df["last_student_utterance"] + " [SEP] " + df["response"]

In [None]:
def extract_cls_embeddings(texts, batch_size=16):
    embeddings = []
    truncated_examples = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting CLS Embeddings"):
        batch = texts[i:i+batch_size]

        encoded = tokenizer(
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            return_overflowing_tokens=True,
            return_length=True
        )
        lengths = encoded.pop("length") 
        encoded.pop("overflow_to_sample_mapping", None)
      
        for j, length in enumerate(lengths[:len(batch)]):  # ensure match
            if length >= 512:
                truncated_examples.append(batch[j])

        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            outputs = model(**encoded)
            cls = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.extend(cls)

    # Print truncated examples summary
    print(f"\n {len(truncated_examples)} out of {len(texts)} texts were truncated.")
    print("Sample truncated examples:\n")
    for example in truncated_examples[:5]:  # print first 5
        print("-" * 80)
        print(example[:500] + ("..." if len(example) > 500 else ""))  # print first 500 chars
        print()

    return np.array(embeddings)


# Extract CLS embeddings
X_lsu_bert_cls = extract_cls_embeddings(df["input_text"].tolist())

Extracting CLS Embeddings: 100%|██████████| 150/150 [01:21<00:00,  1.85it/s]


⚠️ 144 out of 2400 texts were truncated.
🔎 Sample truncated examples:

--------------------------------------------------------------------------------
 75 [SEP] That's almost right, but remember to multiply the numbers step by step, so 5 times 4 equals 20, and then add the 5 times 10 which is 50, giving us a total of 70. Let's try that together next time!

--------------------------------------------------------------------------------
 Yes 30*10 is 1300. [SEP] Tutor response: That's correct, 30 multiplied by 10 equals 300, not 1300.


### Assistant:
Tutor response (maximum one sentence):

--------------------------------------------------------------------------------
 is that a 20 [SEP] That's okay! Remember, there are actually 1,000 milliliters in one liter—so think of it like adding three zeros to 1 for the conversion.

--------------------------------------------------------------------------------
 We can work this out by taking the cost of the wife's ring (which is twice the c




In [24]:
def evaluate_embeddings_mlp(X, y, name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    acc_scores = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = MLPClassifier(
            hidden_layer_sizes=(256, 128),
            max_iter=500,
            early_stopping=True,
            learning_rate='adaptive',
            random_state=42
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        acc_scores.append(acc)
        f1_scores.append(f1)

    return np.mean(acc_scores), np.mean(f1_scores)

results = {}
for task_name, y in targets.items():
    acc, f1 = evaluate_embeddings_mlp(X_lsu_bert_cls, y, task_name)
    results[task_name] = (acc, f1)

print("\nFinal Results:")
for task, (acc, f1) in results.items():
    print(f"{task}: Accuracy = {acc:.4f}, Macro F1 = {f1:.4f}")


Final Results:
Task 1 (Mistake Identification): Accuracy = 0.8071, Macro F1 = 0.3804
Task 2 (Mistake Location): Accuracy = 0.6750, Macro F1 = 0.4141
Task 3 (Providing Guidance): Accuracy = 0.5979, Macro F1 = 0.4003
Task 4 (Actionability): Accuracy = 0.6250, Macro F1 = 0.4502


<!-- Task 1 (Mistake Identification) -  Avg Accuracy: 0.8650, Avg Macro F1: 0.5518,
Task 2 (Mistake Location) -             Avg Accuracy: 0.7079, Avg Macro F1: 0.4283
Task 3 (Providing Guidance) -           Avg Accuracy: 0.6375, Avg Macro F1: 0.4411, 
Task 4 (Actionability) -                Avg Accuracy: 0.6521, Avg Macro F1: 0.4636,  -->
using only responses embedding and cls method


In [29]:
np.save('data/embeddings/X_lsu_bert_cls.npy', X_lsu_bert_cls)