In [1]:
from sacrebleu.metrics import BLEU, CHRF, TER
from datasets import load_dataset

# Load the 'train' split of the dataset that holds both the source text and
# the stored model outputs.
ds = load_dataset('billingsmoore/temp', split='train')

# === Example data ===
# The 'en' column is used as the reference side and 'small_predictions' as the
# hypotheses to be scored; both names are consumed by the scoring cell.
references, predictions = ds['en'], ds['small_predictions']

# Bare trailing expression so the notebook renders the dataset summary.
ds

Dataset({
    features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'small_predictions', 'base_predictions'],
    num_rows: 100000
})

In [2]:
import json

# Number of (prediction, reference) pairs scored together in one call.
batch_size = 500

# === Metrics ===
bleu_metric = BLEU()
chrf_metric = CHRF()
ter_metric = TER()

# === Output containers ===
# One entry per batch, in batch order.
bleu_scores = []
chrf_scores = []
ter_scores = []

# Predictions and references are sliced in lockstep below, so they must align
# one-to-one. Fail fast with a clear message instead of scoring shifted pairs.
if len(predictions) != len(references):
    raise ValueError(
        f"predictions ({len(predictions)}) and references ({len(references)}) "
        "must have the same length"
    )


def _mean_or_none(scores):
    """Return the arithmetic mean of ``scores``, or ``None`` if it is empty.

    ``None`` serialises to JSON ``null``, so an empty dataset still produces a
    valid results file instead of raising ``ZeroDivisionError``.
    """
    return sum(scores) / len(scores) if scores else None


# === Process in batches ===
# Ceiling division: a trailing partial batch still counts as a batch.
num_batches = (len(predictions) + batch_size - 1) // batch_size

for i in range(0, len(predictions), batch_size):
    batch_num = i // batch_size + 1
    batch_preds = predictions[i:i + batch_size]
    batch_refs = references[i:i + batch_size]

    # Compute corpus-level metrics on the batch.
    # The references are wrapped in a list because a single reference stream
    # is supplied for the hypotheses.
    bleu = bleu_metric.corpus_score(batch_preds, [batch_refs]).score
    chrf = chrf_metric.corpus_score(batch_preds, [batch_refs]).score
    ter = ter_metric.corpus_score(batch_preds, [batch_refs]).score

    # Store scores
    bleu_scores.append(bleu)
    chrf_scores.append(chrf)
    ter_scores.append(ter)

    # Print per-batch scores
    print(f"Batch {batch_num}/{num_batches}")
    print(f"  BLEU: {bleu:.2f}")
    print(f"  chrF: {chrf:.2f}")
    print(f"  TER: {ter:.2f}")

# === Save results ===
# NOTE: the "average_*" values are unweighted means of the per-batch scores.
# That is not, in general, the same number as a single corpus_score call over
# all examples, and a short final batch weighs as much as a full one.
results = {
    "batch_bleu_scores": bleu_scores,
    "batch_chrf_scores": chrf_scores,
    "batch_ter_scores": ter_scores,
    "average_bleu": _mean_or_none(bleu_scores),
    "average_chrf": _mean_or_none(chrf_scores),
    "average_ter": _mean_or_none(ter_scores),
    "batch_size": batch_size,
    "num_batches": num_batches,
    "num_examples": len(predictions)
}

# Explicit encoding keeps the output independent of the platform default.
with open("batchwise_metrics.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

Batch 1/200
  BLEU: 14.59
  chrF: 38.15
  TER: 78.09
Batch 2/200
  BLEU: 14.38
  chrF: 38.83
  TER: 77.68
Batch 3/200
  BLEU: 15.97
  chrF: 39.18
  TER: 76.30
Batch 4/200
  BLEU: 15.60
  chrF: 38.45
  TER: 77.11
Batch 5/200
  BLEU: 15.23
  chrF: 39.72
  TER: 77.36
Batch 6/200
  BLEU: 16.02
  chrF: 40.09
  TER: 76.88
Batch 7/200
  BLEU: 15.68
  chrF: 38.69
  TER: 77.15
Batch 8/200
  BLEU: 15.55
  chrF: 39.06
  TER: 77.21
Batch 9/200
  BLEU: 15.72
  chrF: 38.71
  TER: 77.81
Batch 10/200
  BLEU: 14.23
  chrF: 38.00
  TER: 79.53
Batch 11/200
  BLEU: 14.62
  chrF: 38.50
  TER: 77.58
Batch 12/200
  BLEU: 16.44
  chrF: 39.49
  TER: 77.22
Batch 13/200
  BLEU: 16.92
  chrF: 39.91
  TER: 77.97
Batch 14/200
  BLEU: 14.90
  chrF: 38.58
  TER: 78.92
Batch 15/200
  BLEU: 16.16
  chrF: 39.56
  TER: 77.84
Batch 16/200
  BLEU: 16.63
  chrF: 39.35
  TER: 77.62
Batch 17/200
  BLEU: 14.93
  chrF: 39.45
  TER: 78.06
Batch 18/200
  BLEU: 13.53
  chrF: 37.09
  TER: 79.61
Batch 19/200
  BLEU: 16.35
  chrF: 39