In [1]:
from sacrebleu.metrics import BLEU, CHRF, TER
from datasets import load_dataset

# Load the 'train' split of the dataset that holds both the source text
# and the stored model outputs.
ds = load_dataset("billingsmoore/temp", split="train")

# === Evaluation data ===
# The 'en' column is used as the reference text and 'base_predictions'
# as the system output to be scored against it.
references, predictions = ds["en"], ds["base_predictions"]

# Bare expression: lets the notebook render the dataset summary.
ds

Dataset({
    features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'small_predictions', 'base_predictions'],
    num_rows: 100000
})

In [2]:
import json

# Number of examples scored together in a single corpus_score call.
batch_size = 500

# === Metrics ===
# Scorers built with their default settings; the same instances are
# reused for every batch.
bleu_metric, chrf_metric, ter_metric = BLEU(), CHRF(), TER()

# === Output containers ===
# Each list receives one score per batch, in batch order.
bleu_scores, chrf_scores, ter_scores = [], [], []

# === Process in batches ===
# Ceiling division: a trailing partial batch still counts as one batch.
num_batches = -(-len(predictions) // batch_size)

# Walk the predictions in consecutive windows of `batch_size`;
# `batch_num` is the 1-based position of the window, `i` its start offset.
for batch_num, i in enumerate(range(0, len(predictions), batch_size), start=1):
    stop = i + batch_size
    batch_preds, batch_refs = predictions[i:stop], references[i:stop]

    # Score this window as its own small corpus. The references are
    # wrapped in a list because a single reference stream is supplied.
    bleu = bleu_metric.corpus_score(batch_preds, [batch_refs]).score
    chrf = chrf_metric.corpus_score(batch_preds, [batch_refs]).score
    ter = ter_metric.corpus_score(batch_preds, [batch_refs]).score

    # Keep the per-batch values for the summary written afterwards.
    bleu_scores.append(bleu)
    chrf_scores.append(chrf)
    ter_scores.append(ter)

    # Progress report for this batch, rounded to two decimals.
    print(f"Batch {batch_num}/{num_batches}")
    print(f"  BLEU: {bleu:.2f}")
    print(f"  chrF: {chrf:.2f}")
    print(f"  TER: {ter:.2f}")

# === Save results ===
def _mean_or_none(scores):
    """Return the arithmetic mean of ``scores``, or None if the list is empty.

    Guards against ZeroDivisionError when no batch was scored (for example
    when there are no predictions); None is serialized as JSON ``null``.
    """
    return sum(scores) / len(scores) if scores else None


# The "average_*" entries are unweighted means of the per-batch scores
# collected above. They are not a single corpus-level score computed over
# all examples at once.
results = {
    "batch_bleu_scores": bleu_scores,
    "batch_chrf_scores": chrf_scores,
    "batch_ter_scores": ter_scores,
    "average_bleu": _mean_or_none(bleu_scores),
    "average_chrf": _mean_or_none(chrf_scores),
    "average_ter": _mean_or_none(ter_scores),
    "batch_size": batch_size,
    "num_batches": num_batches,
    "num_examples": len(predictions)
}

# Explicit encoding so the written file does not depend on the platform's
# default text encoding.
with open("base_batchwise_metrics.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

Batch 1/200
  BLEU: 19.16
  chrF: 41.84
  TER: 74.35
Batch 2/200
  BLEU: 17.45
  chrF: 41.14
  TER: 76.13
Batch 3/200
  BLEU: 19.65
  chrF: 41.62
  TER: 74.61
Batch 4/200
  BLEU: 18.18
  chrF: 41.03
  TER: 74.15
Batch 5/200
  BLEU: 18.28
  chrF: 41.58
  TER: 75.48
Batch 6/200
  BLEU: 19.38
  chrF: 42.83
  TER: 73.55
Batch 7/200
  BLEU: 18.12
  chrF: 40.50
  TER: 75.25
Batch 8/200
  BLEU: 18.34
  chrF: 40.83
  TER: 74.37
Batch 9/200
  BLEU: 19.01
  chrF: 41.18
  TER: 75.27
Batch 10/200
  BLEU: 17.50
  chrF: 40.34
  TER: 76.90
Batch 11/200
  BLEU: 17.57
  chrF: 40.94
  TER: 75.70
Batch 12/200
  BLEU: 19.54
  chrF: 42.08
  TER: 73.93
Batch 13/200
  BLEU: 19.61
  chrF: 42.36
  TER: 74.19
Batch 14/200
  BLEU: 18.14
  chrF: 41.36
  TER: 75.74
Batch 15/200
  BLEU: 19.36
  chrF: 42.01
  TER: 74.64
Batch 16/200
  BLEU: 19.34
  chrF: 41.70
  TER: 75.01
Batch 17/200
  BLEU: 17.61
  chrF: 41.16
  TER: 76.36
Batch 18/200
  BLEU: 16.84
  chrF: 39.80
  TER: 75.98
Batch 19/200
  BLEU: 19.38
  chrF: 42