In [None]:
from datasets import load_dataset
# Load the 'OpenLLM-Ro/ro_arc_challenge' dataset through `datasets.load_dataset`.
# NOTE(review): trust_remote_code=True presumably allows the dataset
# repository's own loading script to execute locally — confirm the source is trusted.
dataset = load_dataset('OpenLLM-Ro/ro_arc_challenge', trust_remote_code=True)

def preprocess_splits(dataset):
    """Shuffle each split and keep only the four-option (A-D) questions.

    For each of the "train", "validation" and "test" splits the rows are
    shuffled with a fixed seed (``random_state=1``) and re-indexed, then every
    row whose answer is 'E' or that has a non-null ``option_e`` is removed and
    the ``option_e`` column is dropped.

    Args:
        dataset: Mapping from split name to an object exposing ``to_pandas()``
            that returns a DataFrame with ``answer`` and ``option_e`` columns.

    Returns:
        dict with keys "train", "validation", "test" (in that order) mapping
        to the filtered DataFrames. The index is the post-shuffle position, so
        it has gaps where rows were filtered out.
    """

    def _prepare(split_name):
        # Shuffle deterministically, then number rows by shuffled position.
        shuffled = (
            dataset[split_name]
            .to_pandas()
            .sample(frac=1, random_state=1)
            .reset_index(drop=True)
        )
        # One combined mask instead of two chained filters.
        keep = (shuffled['answer'] != 'E') & shuffled['option_e'].isnull()
        # Non-mutating drop: avoids modifying a filtered slice in place,
        # which is what triggers pandas' SettingWithCopyWarning.
        return shuffled.loc[keep].drop(columns=["option_e"])

    return {name: _prepare(name) for name in ("train", "validation", "test")}

# Build the shuffled train/validation/test frames with the option-E rows removed.
splits = preprocess_splits(dataset)
# Bare expression so the notebook displays the resulting dict as the cell output.
splits

In [3]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Root directory that run_all_classifiers scans for per-model embedding folders.
MODEL_ROOT = "roarc_embeddings"
# Embedding strategies; each one names a sub-directory inside a model folder.
strategies = ["classical-avg", "classical-last", "echo-avg", "summary-avg"]
# Short key -> model identifier.
# NOTE(review): not referenced by the cells visible here; presumably the keys
# match the folder names under MODEL_ROOT — confirm.
model_names = {
    "llama3": "Llama-3.1-8B-Instruct",
    "rollama": "RoLlama3.1-8b-Instruct",
    "mgpt": "mGPT-1.3B-romanian",
    "llmic": "faur-ai/LLMic"
}

In [None]:
def load_embeddings(path):
    """Read an embeddings file, dispatching on the file extension.

    Args:
        path: File path ending in ".pt" or ".npy".

    Returns:
        For ".pt": whatever ``torch.load`` yields for that file.
        For ".npy": a ``torch.Tensor`` built from the loaded NumPy array.

    Raises:
        ValueError: if the path has neither of the supported extensions.
    """
    is_torch_file = path.endswith(".pt")
    is_numpy_file = path.endswith(".npy")

    if not (is_torch_file or is_numpy_file):
        raise ValueError("Unsupported file format")

    if is_torch_file:
        # NOTE(review): torch.load is pickle-based; only use it on trusted files.
        return torch.load(path)

    array = np.load(path)
    return torch.tensor(array)


In [40]:
from sklearn.metrics import classification_report, confusion_matrix
def train_mlp_classifier(X_train, y_train, X_val, y_val, input_dim, device,
                         epochs=100, lr=1e-3, num_classes=4, seed=None):
    """Train a small MLP on precomputed embeddings and score it on validation data.

    The network is ``input_dim -> 128 -> 64 -> num_classes`` with ReLU
    activations, optimised with Adam on cross-entropy loss using full-batch
    gradient steps (the whole training set is one batch per epoch).

    Args:
        X_train, X_val: Feature matrices (tensor, NumPy array or nested list)
            convertible to float32 tensors.
        y_train, y_val: Integer class labels convertible to int64 tensors.
        input_dim: Number of input features per example.
        device: torch device the model and data are moved to.
        epochs: Number of full-batch optimisation steps (default 100).
        lr: Adam learning rate (default 1e-3).
        num_classes: Size of the output layer (default 4).
        seed: If not None, passed to ``torch.manual_seed`` before the model is
            built so weight initialisation is reproducible. Note that this
            sets torch's global RNG state.

    Returns:
        Tuple ``(model, acc, f1)``: the trained model (left in eval mode),
        validation accuracy and validation macro-F1.
    """
    import torch.nn as nn
    import torch.optim as optim

    class MLP(nn.Module):
        """Two-hidden-layer perceptron producing one logit per class."""

        def __init__(self, input_dim, num_classes):
            super().__init__()
            self.fc = nn.Sequential(
                nn.Linear(input_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Linear(64, num_classes),
            )

        def forward(self, x):
            return self.fc(x)

    if seed is not None:
        # Seed before constructing the model: layer weights are drawn at init.
        torch.manual_seed(seed)

    model = MLP(input_dim, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # as_tensor accepts tensors, arrays and lists alike; unlike torch.tensor it
    # does not warn (or force a copy) when the input is already a tensor.
    X_train = torch.as_tensor(X_train, dtype=torch.float32, device=device)
    y_train = torch.as_tensor(y_train, dtype=torch.long, device=device)
    X_val = torch.as_tensor(X_val, dtype=torch.float32, device=device)
    y_val = torch.as_tensor(y_val, dtype=torch.long, device=device)

    # The mode never changes during training, so set it once outside the loop.
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = criterion(model(X_train), y_train)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_preds = torch.argmax(model(X_val), dim=1).cpu().numpy()

    val_true = y_val.cpu().numpy()
    acc = accuracy_score(val_true, val_preds)
    f1 = f1_score(val_true, val_preds, average="macro")

    return model, acc, f1

def answer_to_index(ans):
    """Map an answer letter to its class index (A=0, B=1, C=2, D=3).

    Surrounding whitespace and letter case are ignored. Anything that is not
    one of the four letters yields -1.
    """
    letter = ans.strip().upper()
    index_by_letter = {symbol: position for position, symbol in enumerate("ABCD")}
    if letter in index_by_letter:
        return index_by_letter[letter]
    return -1

def _plot_confusion_matrix(cm, model_name, strategy, labels=("A", "B", "C", "D")):
    """Draw one annotated confusion-matrix heatmap for a model/strategy pair."""
    # Imported lazily so the plotting stack is only needed when plots are requested.
    import matplotlib.pyplot as plt
    import seaborn as sns

    tick_labels = list(labels)
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
                xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix: {model_name} / {strategy}")
    fig.tight_layout()
    plt.show()
    # Many figures may be produced in one run; release each after showing it.
    plt.close(fig)


def run_all_classifiers(root_dir, splits, plot=False):
    """Train and evaluate one MLP per (model folder, strategy) embedding set.

    Every sub-directory of ``root_dir`` is treated as a model. For each entry
    of the module-level ``strategies`` list, the train/validation/test
    embeddings are read from
    ``<root_dir>/<model>/<strategy>/<split>/embeddings.npy``, an MLP is fitted
    with ``train_mlp_classifier`` and then scored on the test split.

    Args:
        root_dir: Directory containing one sub-directory per model.
        splits: Mapping with "train", "validation" and "test" entries, each
            exposing an ``answer`` column of answer letters.
        plot: If True, show a confusion-matrix heatmap for every successful
            run (requires seaborn and matplotlib). Defaults to False.

    Returns:
        dict keyed by ``(model_name, strategy)``. Each value holds "val_acc",
        "val_f1", "test_acc", "test_f1", "confusion_matrix" (nested list) and
        "classification_report" (text). Runs that fail or whose embeddings do
        not line up with the labels are reported on stdout and omitted.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    class_names = ["A", "B", "C", "D"]
    class_ids = list(range(len(class_names)))
    split_names = ("train", "validation", "test")

    # Labels do not depend on the model or strategy, so encode them once.
    labels = {
        split: [answer_to_index(a) for a in splits[split]['answer']]
        for split in split_names
    }

    results = {}

    # Sorted for a deterministic run order; os.listdir order is arbitrary.
    for model_name in sorted(os.listdir(root_dir)):
        model_path = os.path.join(root_dir, model_name)
        if not os.path.isdir(model_path):
            # Stray files (e.g. .DS_Store) are not model folders.
            continue

        for strategy in strategies:
            try:
                print(f"Running MLP for: {model_name} / {strategy}")

                features = {
                    split: load_embeddings(
                        os.path.join(model_path, strategy, split, "embeddings.npy")
                    )
                    for split in split_names
                }

                # Check all three splits, so a misaligned test set is reported
                # as a mismatch rather than as a metric error later on.
                if any(len(features[s]) != len(labels[s]) for s in split_names):
                    print("Length mismatch, skipping.")
                    continue

                model, acc, f1 = train_mlp_classifier(
                    features["train"], labels["train"],
                    features["validation"], labels["validation"],
                    features["train"].shape[1], device,
                )

                model.eval()
                with torch.no_grad():
                    # as_tensor: the embeddings are already tensors, so
                    # torch.tensor would warn and copy.
                    X_test = torch.as_tensor(
                        features["test"], dtype=torch.float32, device=device
                    )
                    test_preds = torch.argmax(model(X_test), dim=1).cpu().numpy()

                y_test = labels["test"]
                test_acc = accuracy_score(y_test, test_preds)
                test_f1 = f1_score(y_test, test_preds, average="macro")
                # Explicit labels keep the matrix 4x4 and keep the report
                # valid even when a class never occurs in y_test/test_preds.
                test_cm = confusion_matrix(y_test, test_preds, labels=class_ids)
                test_report = classification_report(
                    y_test, test_preds,
                    labels=class_ids, target_names=class_names, zero_division=0,
                )

                results[(model_name, strategy)] = {
                    "val_acc": acc,
                    "val_f1": f1,
                    "test_acc": test_acc,
                    "test_f1": test_f1,
                    "confusion_matrix": test_cm.tolist(),
                    "classification_report": test_report
                }

                if plot:
                    _plot_confusion_matrix(test_cm, model_name, strategy, class_names)

            except Exception as e:
                # Best effort: one broken embedding set must not abort the sweep.
                print(f"Failed for {model_name}/{strategy}: {e}")
                continue

    return results

In [None]:
# Train and evaluate one MLP per (model, strategy) embedding set found under MODEL_ROOT.
results = run_all_classifiers(MODEL_ROOT, splits)

In [42]:
import pandas as pd
# Flatten the (model, strategy) -> scores mapping into one table row per run,
# with every metric expressed as a percentage rounded to two decimals.
metric_columns = {
    "Val Acc": "val_acc",
    "Val F1": "val_f1",
    "Test Acc": "test_acc",
    "Test F1": "test_f1",
}
records = []
for (model, strategy), scores in results.items():
    row = {"Model": model, "Strategy": strategy}
    for column, key in metric_columns.items():
        row[column] = round(scores[key] * 100, 2)
    records.append(row)

# Explicit columns: if every run failed, `records` is empty and the frame would
# otherwise have no columns, making the sort below raise KeyError.
df = pd.DataFrame(records, columns=["Model", "Strategy", *metric_columns])
# Best test accuracy first; ties broken by test macro-F1.
df_sorted = df.sort_values(by=["Test Acc", "Test F1"], ascending=False)
df_sorted

Unnamed: 0,Model,Strategy,Val Acc,Val F1,Test Acc,Test F1
1,llama3,classical-last,34.8,33.58,36.08,35.37
13,rollama,classical-last,33.78,32.73,34.11,34.1
15,rollama,summary-avg,38.51,37.5,29.64,29.08
14,rollama,echo-avg,29.39,28.69,28.09,27.41
9,mgpt,classical-last,28.72,28.46,27.66,27.53
2,llama3,echo-avg,28.38,27.2,26.89,26.76
7,llmic,summary-avg,25.34,11.62,26.8,11.35
6,llmic,echo-avg,26.01,22.29,26.55,22.57
5,llmic,classical-last,24.32,9.78,26.55,10.49
8,mgpt,classical-avg,27.03,25.99,24.91,23.08
