In [1]:
import json, os
import pandas as pd
from pathlib import Path

# Where Trainer saves logs (we used these output_dir paths)
paths = {
    "DistilBERT": "outputs/distilbert_sst2_small",
    "BERT": "outputs/bert_sst2_small",
    "RoBERTa": "outputs/roberta_sst2_small",
}

def read_eval_metrics(out_dir: str):
    """
    Try to read evaluation results from the Trainer's eval logs if present;
    otherwise, fallback to a simple 'metrics.json' pattern if you saved one.
    """
    out = {"eval_accuracy": None, "eval_loss": None}
    # common HF files to check
    cand = [
        Path(out_dir) / "eval_results.json",
        Path(out_dir) / "all_results.json",
    ]
    for p in cand:
        if p.exists():
            with open(p, "r") as f:
                data = json.load(f)
            out["eval_accuracy"] = data.get("eval_accuracy")
            out["eval_loss"] = data.get("eval_loss")
            return out
    # if none found, return Nones
    return out


In [2]:
rows = []
for name, d in paths.items():
    metrics = read_eval_metrics(d)
    rows.append({
        "model": name,
        "dataset": "SST-2 (subset)",
        "eval_accuracy": metrics["eval_accuracy"],
        "eval_loss": metrics["eval_loss"],
        "output_dir": d,
    })
df = pd.DataFrame(rows)
df


Unnamed: 0,model,dataset,eval_accuracy,eval_loss,output_dir
0,DistilBERT,SST-2 (subset),,,outputs/distilbert_sst2_small
1,BERT,SST-2 (subset),,,outputs/bert_sst2_small
2,RoBERTa,SST-2 (subset),,,outputs/roberta_sst2_small


In [3]:
# OPTIONAL fallback: recompute eval accuracy for DistilBERT quickly
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np, evaluate

raw = load_dataset("glue", "sst2")
valid_small = raw["validation"].select(range(500))

tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tok_fn(batch): return tok(batch["sentence"], truncation=True, padding="max_length", max_length=128)
valid_tok = valid_small.map(tok_fn, batched=True, remove_columns=valid_small.column_names)
valid_tok = valid_tok.add_column("labels", valid_small["label"])

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

args = TrainingArguments(
    output_dir="outputs/distilbert_eval_only",
    per_device_eval_batch_size=32,
    report_to="none",
)

metric = evaluate.load("accuracy")
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": metric.compute(predictions=preds, references=labels)["accuracy"]}

trainer = Trainer(model=model, args=args, eval_dataset=valid_tok, tokenizer=tok, compute_metrics=compute_metrics)
res = trainer.evaluate()
res


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 500/500 [00:00<00:00, 8866.03 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args, eval_dataset=valid_tok, tokenizer=tok, compute_metrics=compute_metrics)


{'eval_loss': 0.6963725686073303,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.47,
 'eval_runtime': 40.1777,
 'eval_samples_per_second': 12.445,
 'eval_steps_per_second': 0.398}

In [4]:
out_csv = "model_comparison_results.csv"
df.to_csv(out_csv, index=False)
out_csv, df.shape


('model_comparison_results.csv', (3, 5))