# Perplexity

In [None]:
import math

def calculate_perplexity(log_probs):
    """
    Return the perplexity of a sequence from its per-token log probabilities.

    `log_probs` holds one natural-log (base e) probability per token.
    Perplexity is the exponential of the mean negative log-likelihood.
    An empty sequence raises ZeroDivisionError because the mean is undefined.
    """
    token_count = len(log_probs)
    # Mean negative log-likelihood over the sequence.
    mean_nll = -sum(log_probs) / token_count
    return math.exp(mean_nll)




In [17]:
# Worked example with natural-log probabilities for a three-token sequence:
# If log_probs = [-1.2, -0.5, -0.3], then
# sum(log_probs) = -2.0, N = 3,
# avg_neg_log_likelihood = -(-2.0)/3 = 0.666...,
# perplexity = exp(0.666...) ≈ 1.95.
log_probs = [-1.2, -0.5, -0.3]
calculate_perplexity(log_probs)

1.9477340410546757

# BLEU: Implementation 

In [25]:
import math
from collections import Counter

def compute_bleu(candidate, references, max_n=4):
    """
    Unsmoothed sentence-level BLEU of `candidate` against `references`.

    Both inputs are whitespace-tokenized as-is (no case folding). For each
    order 1..max_n the clipped n-gram precision is computed; the score is the
    uniform geometric mean of those precisions times the brevity penalty.

    If any order has zero precision the geometric mean is 0. An empty
    `references` list raises ValueError, and an empty candidate scored against
    a non-empty reference raises ZeroDivisionError in the brevity penalty.
    """
    hyp = candidate.split()
    refs = [ref.split() for ref in references]
    hyp_len = len(hyp)

    def count_ngrams(tokens, order):
        # Multiset of all contiguous `order`-grams in `tokens`.
        return Counter(tuple(tokens[k:k + order]) for k in range(len(tokens) - order + 1))

    precisions = []
    for order in range(1, max_n + 1):
        hyp_counts = count_ngrams(hyp, order)

        # Per n-gram, the largest count observed in any single reference.
        ceiling = Counter()
        for ref in refs:
            for gram, seen in count_ngrams(ref, order).items():
                if seen > ceiling[gram]:
                    ceiling[gram] = seen

        # Clip each candidate count by the reference ceiling.
        matched = sum(min(seen, ceiling[gram]) for gram, seen in hyp_counts.items())
        attempted = sum(hyp_counts.values())
        precisions.append(matched / max(1, attempted))

    # Geometric mean collapses to 0 as soon as one order has no matches.
    if min(precisions) > 0:
        log_total = sum(math.log(p) for p in precisions)
        geo_mean = math.exp(log_total / max_n)
    else:
        geo_mean = 0

    # Brevity penalty against the reference length closest to the candidate
    # (ties go to the shorter reference).
    nearest = min((len(ref) for ref in refs),
                  key=lambda length: (abs(length - hyp_len), length))
    if hyp_len < nearest:
        bp = math.exp(1 - nearest / hyp_len)
    else:
        bp = 1

    return bp * geo_mean

# Example: a five-token candidate scored against two references.
# It is shorter than the closest reference (6 tokens), so the brevity penalty applies.
candidate = "the cat is on mat"
references = ["the cat is on the mat", "there is a cat on the mat"]

print(f"BLEU score: {compute_bleu(candidate, references):.4f}")

BLEU score: 0.5789


In [30]:
# Example: no bigram of this candidate occurs in either reference, so the
# unsmoothed geometric mean (and therefore BLEU) is 0.
candidate = "cat is on mat"
references = ["the cat sits on the mat", "there is a cat on the mat"]

print(f"BLEU score: {compute_bleu(candidate, references):.4f}")

BLEU score: 0.0000


Because you’re computing sentence-level BLEU without smoothing, the score collapses to 0 whenever any higher-order n-gram precision is 0. That happens a lot:

Short candidates: if len(candidate_tokens) < n, there are no n-grams for that n ⇒ precision for that order = 0 ⇒ geometric mean = 0.

Paraphrases / wording changes: candidate has no 3-gram/4-gram overlaps with the references ⇒ some p_n = 0 ⇒ overall BLEU = 0.

Case / tokenization mismatches: “The” vs “the”, punctuation differences, etc., reduce overlaps and can zero out higher-order precisions.

Single reference: fewer chances to match n-grams, increasing the odds of zeros.

(Less likely) Empty candidate: your BP step will actually raise a division-by-zero error (not just return 0), because you divide by len(candidate_tokens).

In your code, this line enforces the collapse:
```
if min(precisions) > 0:
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_n)
else:
    geo_mean = 0

```

If any precision is 0 (very common for 3/4-grams), geo_mean is forced to 0.

**Quick fixes:**
1- Add smoothing (essential at sentence level)
Replace the geometric-mean block with a smoothed version (Chen & Cherry–style epsilon smoothing is the simplest):
```
# Geometric mean of precisions with simple smoothing
eps = 1e-9
precisions_smoothed = [p if p > 0 else eps for p in precisions]
geo_mean = math.exp(sum(math.log(p) for p in precisions_smoothed) / max_n)
```
2- Lower max_n or use weights

For short sentences, consider BLEU-2 (bigrams) or weight lower orders more heavily.

3- Normalize tokens

Lowercase and strip punctuation consistently before n-gram counting.

4- Use multiple references

Increases chance of higher-order overlaps.

5- Prefer corpus BLEU for reporting

Aggregate n-gram counts over many sentences; zeros become much rarer.

In [33]:
import math
from collections import Counter

def compute_bleu(candidate, references, max_n=4, smooth=True, eps=1e-9):
    """
    Sentence-level BLEU with optional epsilon smoothing.

    Args:
        candidate: Hypothesis sentence; lowercased and split on whitespace.
        references: Iterable of reference sentences, tokenized the same way.
        max_n: Highest n-gram order; orders 1..max_n are weighted uniformly.
        smooth: When true, zero precisions are replaced by `eps` before the
            geometric mean so a single missing order does not zero the score.
        eps: Replacement value used for zero precisions when smoothing.

    Returns:
        Brevity penalty times the geometric mean of the clipped n-gram
        precisions. Returns 0.0 when the candidate has no tokens or when no
        references are supplied (there is nothing to match or to take a
        reference length from).
    """
    # Tokenize
    candidate_tokens = candidate.lower().split()
    references_tokens = [ref.lower().split() for ref in references]

    # Guard degenerate inputs up front: an empty candidate would divide by
    # zero in the brevity penalty, and an empty reference list would make
    # min() below raise ValueError.
    c = len(candidate_tokens)
    if c == 0 or not references_tokens:
        return 0.0

    precisions = []
    for n in range(1, max_n + 1):
        cand_ngrams = Counter(tuple(candidate_tokens[i:i + n]) for i in range(c - n + 1))

        # Per n-gram, the largest count seen in any single reference.
        max_ref_ngrams = Counter()
        for ref in references_tokens:
            ref_ngrams = Counter(tuple(ref[i:i + n]) for i in range(len(ref) - n + 1))
            for ng, count in ref_ngrams.items():
                if count > max_ref_ngrams[ng]:
                    max_ref_ngrams[ng] = count

        # Clip candidate counts so repeated n-grams cannot be over-credited.
        match = sum(min(count, max_ref_ngrams[ng]) for ng, count in cand_ngrams.items())
        total = sum(cand_ngrams.values())
        precisions.append(match / total if total > 0 else 0.0)

    # Geometric mean (with optional smoothing)
    if smooth:
        precisions = [p if p > 0 else eps for p in precisions]
    if any(p <= 0 for p in precisions):
        # Unsmoothed zero precision (or a non-positive eps) collapses the mean.
        geo_mean = 0.0
    else:
        geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_n)

    # Brevity penalty against the closest reference length (ties -> shorter).
    ref_lens = [len(ref) for ref in references_tokens]
    closest_ref_len = min(ref_lens, key=lambda r: (abs(r - c), r))
    bp = math.exp(1 - closest_ref_len / c) if c < closest_ref_len else 1.0

    return bp * geo_mean


In [35]:
# Example: the candidate shares no 3-gram or 4-gram with either reference.
# Epsilon smoothing (eps=1e-9) keeps the score non-zero, but it is so small
# (on the order of 1e-5) that it still prints as 0.0000 at four decimals.
candidate = "the cat is on mat"
references = ["the cat sits on the mat", "there is a cat on the mat"]

print(f"BLEU score: {compute_bleu(candidate, references):.4f}")

BLEU score: 0.0000


# Production-grade implementation of BLEU

The implementation above is not suitable for real evaluations. It’s missing a few things practitioners rely on:

- Smoothing for zero precisions (critical at sentence level)
- Configurable n-gram weights (e.g., BLEU-1, BLEU-2, BLEU-4)
- Robust tokenization / case handling
- Corpus-level BLEU (micro-averaging counts across a set)
- Clear choice of effective reference length for BP (closest/shortest/average)
- Safe handling when the candidate is shorter than n (no n-grams)

Below is a more production-leaning drop-in that adds these, while staying minimal.



In [27]:
import math
from collections import Counter
from typing import Iterable, List, Sequence, Tuple, Callable, Optional
import pandas as pd

# ---------------- Utility Functions ---------------- #
def simple_tokenize(s: str) -> List[str]:
    """Lowercase `s` and split it on runs of whitespace."""
    lowered = s.lower()
    return lowered.split()

def ngrams(tokens: Sequence[str], n: int) -> Iterable[Tuple[str, ...]]:
    """Lazily yield every contiguous window of `n` tokens as a tuple.

    Yields nothing when `tokens` has fewer than `n` items.
    """
    window_count = len(tokens) - n + 1
    start = 0
    while start < window_count:
        yield tuple(tokens[start:start + n])
        start += 1

def modified_precision(candidate_tokens: List[str],
                       references_tokens: List[List[str]],
                       n: int) -> Tuple[int, int]:
    """Return (clipped matches, total candidate n-grams) for order `n`.

    Each candidate n-gram count is clipped by the largest count of that
    n-gram in any single reference. Returns (0, 0) when the candidate has
    no n-grams of this order.
    """
    hyp_counts = Counter(ngrams(candidate_tokens, n))
    if len(hyp_counts) == 0:
        return 0, 0

    # Counter union keeps, per n-gram, the maximum count across references.
    ceiling = Counter()
    for ref in references_tokens:
        ceiling |= Counter(ngrams(ref, n))

    # Counter intersection keeps, per n-gram, min(candidate, ceiling).
    matched = sum((hyp_counts & ceiling).values())
    attempted = sum(hyp_counts.values())
    return matched, attempted

def brevity_penalty(candidate_len: int, ref_lens: List[int], strategy: str = "closest") -> float:
    """BLEU brevity penalty for a candidate of `candidate_len` tokens.

    `strategy` picks the effective reference length: "shortest", "average",
    or (for any other value) the length closest to the candidate, with ties
    going to the shorter reference. Returns 0.0 for an empty candidate,
    1.0 when the candidate is longer than the effective reference length,
    and exp(1 - r / c) otherwise.
    """
    if candidate_len == 0:
        return 0.0

    if strategy == "average":
        effective_len = sum(ref_lens) / len(ref_lens)
    elif strategy == "shortest":
        effective_len = min(ref_lens)
    else:
        # Closest length; the second key element breaks ties toward the shorter one.
        def distance(length):
            return (abs(length - candidate_len), length)

        effective_len = min(ref_lens, key=distance)

    return 1.0 if candidate_len > effective_len else math.exp(1 - (effective_len / candidate_len))

def chen_cherry_smoothing(p_list: List[float], method: int = 1) -> List[float]:
    """Floor non-positive precisions at 1e-9 when `method` is 1.

    Any other `method` returns `p_list` itself, unchanged.
    """
    if method != 1:
        return p_list

    floor = 1e-9
    smoothed = []
    for p in p_list:
        smoothed.append(p if p > 0 else floor)
    return smoothed

# ---------------- BLEU Functions ---------------- #
def compute_sentence_bleu(candidate: str,
                          references: Sequence[str],
                          max_n: int = 4,
                          weights: Optional[Sequence[float]] = None,
                          tokenizer: Callable[[str], List[str]] = simple_tokenize,
                          smooth: Optional[int] = 1,
                          bp_strategy: str = "closest") -> float:
    """Sentence-level BLEU with configurable weights, tokenizer and smoothing.

    Args:
        candidate: Hypothesis sentence.
        references: Reference sentences for this candidate.
        max_n: Highest n-gram order evaluated (orders 1..max_n).
        weights: Per-order weights, paired positionally with orders 1..max_n;
            defaults to uniform 1/max_n. Orders without a weight, or with a
            weight of 0, do not influence the score.
        tokenizer: Callable turning a sentence into a token list.
        smooth: Passed as `method` to `chen_cherry_smoothing`; `None`
            disables smoothing.
        bp_strategy: Effective-reference-length strategy forwarded to
            `brevity_penalty`.

    Returns:
        Brevity penalty times the weighted geometric mean of the modified
        n-gram precisions; 0.0 when a weighted order has zero precision.
    """
    if weights is None:
        weights = [1.0 / max_n] * max_n

    cand = tokenizer(candidate)
    refs = [tokenizer(r) for r in references]

    precisions = []
    for n in range(1, max_n + 1):
        match, total = modified_precision(cand, refs, n)
        # No n-grams of this order (candidate shorter than n) counts as 0.
        precisions.append((match / total) if total > 0 else 0.0)

    if smooth is not None:
        precisions = chen_cherry_smoothing(precisions, method=smooth)

    # Only orders that carry weight may gate the score: a zero-weight order
    # contributes nothing to the weighted log-sum, so its zero precision must
    # not force the whole score to 0.
    weighted = [(w, p) for w, p in zip(weights, precisions) if w != 0]
    if any(p <= 0 for _, p in weighted):
        geo_mean = 0.0
    else:
        geo_mean = math.exp(sum(w * math.log(p) for w, p in weighted))

    bp = brevity_penalty(len(cand), [len(r) for r in refs], strategy=bp_strategy)
    return bp * geo_mean

def compute_corpus_bleu(candidates: Sequence[str],
                        list_of_references: Sequence[Sequence[str]],
                        max_n: int = 4,
                        weights: Optional[Sequence[float]] = None,
                        tokenizer: Callable[[str], List[str]] = simple_tokenize,
                        bp_strategy: str = "closest") -> float:
    """Corpus-level BLEU: n-gram counts are pooled over all sentence pairs.

    Args:
        candidates: Hypothesis sentences.
        list_of_references: For each candidate, its reference sentences.
            Candidates and reference groups are paired positionally with
            `zip`, so surplus items on either side are ignored.
        max_n: Highest n-gram order evaluated (orders 1..max_n).
        weights: Per-order weights; defaults to uniform 1/max_n.
        tokenizer: Callable turning a sentence into a token list.
        bp_strategy: "shortest", "average", or (any other value) closest
            reference length, chosen per sentence and summed over the corpus.

    Returns:
        Brevity penalty times the weighted geometric mean of the pooled
        precisions. Zero precisions are always floored at 1e-9, so the
        geometric mean itself is never exactly 0; the result is 0.0 only
        when the candidates contain no tokens at all.
    """
    if weights is None:
        weights = [1.0 / max_n] * max_n

    total_match = [0] * max_n
    total_count = [0] * max_n
    cand_len_total = 0
    ref_len_total = 0

    for cand_str, refs_strs in zip(candidates, list_of_references):
        cand = tokenizer(cand_str)
        refs = [tokenizer(r) for r in refs_strs]

        cand_len_total += len(cand)

        # Effective reference length for this sentence.
        if bp_strategy == "shortest":
            ref_len_total += min(len(r) for r in refs)
        elif bp_strategy == "average":
            ref_len_total += sum(len(r) for r in refs) / len(refs)
        else:
            # Closest length; ties go to the shorter reference.
            ref_len_total += min((len(r) for r in refs),
                                 key=lambda rl: (abs(rl - len(cand)), rl))

        for n in range(1, max_n + 1):
            m, t = modified_precision(cand, refs, n)
            total_match[n - 1] += m
            total_count[n - 1] += t

    # Pooled precision per order, floored so every log() below is defined.
    floor = 1e-9
    precisions = []
    for m, t in zip(total_match, total_count):
        p = (m / t) if t > 0 else 0.0
        precisions.append(p if p > 0 else floor)

    geo_mean = math.exp(sum(w * math.log(p) for w, p in zip(weights, precisions)))

    # Corpus-level brevity penalty from the summed lengths.
    if cand_len_total == 0:
        bp = 0.0
    elif cand_len_total < ref_len_total:
        bp = math.exp(1 - ref_len_total / cand_len_total)
    else:
        bp = 1.0

    return bp * geo_mean

# ---------------- Examples & Table ---------------- #
# Accumulates one dict per example; turned into a DataFrame at the end.
rows = list()


def add_row(name, score, **kwargs):
    """Append a result record for `name` with its BLEU `score` to `rows`.

    Extra keyword arguments become additional columns of the record.
    """
    record = {"Example": name, "BLEU": score}
    record.update(kwargs)
    rows.append(record)

# 1) Simple BLEU-1
# Unigram precision only, no smoothing: 5 of the 6 candidate tokens match.
candidate = "The cat sat on the mat"
references = ["The cat is on the mat"]
score_bleu1 = compute_sentence_bleu(candidate, references, max_n=1, weights=[1.0], smooth=None)
add_row("Sentence BLEU-1 (no smoothing)", score_bleu1, candidate=candidate, references=references)

# 2) Short sentence BLEU-4
# A one-token candidate has no 2-, 3- or 4-grams, so unsmoothed BLEU-4 is 0;
# with smoothing those orders are floored at a tiny epsilon instead.
candidate2 = "hello"
references2 = ["hello world"]
score_no_smooth = compute_sentence_bleu(candidate2, references2, max_n=4, smooth=None)
score_smooth = compute_sentence_bleu(candidate2, references2, max_n=4, smooth=1)
add_row("Short sentence BLEU-4 (no smoothing)", score_no_smooth, candidate=candidate2, references=references2)
add_row("Short sentence BLEU-4 (with smoothing=1)", score_smooth, candidate=candidate2, references=references2)

# 3) Multiple references
# Candidate n-gram counts are clipped by the maximum count in any one reference.
candidate3 = "the cat is on mat"
references3 = ["the cat is on the mat", "there is a cat on the mat"]
score_multi_ref = compute_sentence_bleu(candidate3, references3, max_n=4, smooth=1)
add_row("Sentence BLEU-4 with multiple references", score_multi_ref, candidate=candidate3, references=references3)

# 4) Brevity penalty strategies
# NOTE: with a single reference, "closest", "shortest" and "average" all pick
# the same reference length, so these three rows come out identical.
candidate4 = "the cat is on mat"
references4 = ["the cat is definitely on the mat"]
for strategy in ["closest", "shortest", "average"]:
    score = compute_sentence_bleu(candidate4, references4, max_n=4, smooth=1, bp_strategy=strategy)
    add_row(f"BP strategy: {strategy}", score, candidate=candidate4, references=references4)

# 5) Custom weights
# Same sentence pair as (3), scored as BLEU-2 and as uniform-weight BLEU-4.
score_bleu2 = compute_sentence_bleu(candidate3, references3, max_n=2, weights=[0.5, 0.5], smooth=1)
score_bleu4 = compute_sentence_bleu(candidate3, references3, max_n=4, weights=[0.25]*4, smooth=1)
add_row("Custom weights BLEU-2", score_bleu2, candidate=candidate3, references=references3)
add_row("Custom weights BLEU-4", score_bleu4, candidate=candidate3, references=references3)

# 6) Corpus BLEU
# One reference group per candidate, in the same order as `candidates`.
candidates = ["the cat is on mat", "there is a dog in the park", "I like pizza"]
references_list = [
    ["the cat is on the mat", "there is a cat on the mat"],
    ["there is a dog at the park", "a dog is in the park"],
    ["I love pizza", "I like pizza a lot"]
]
corpus_score = compute_corpus_bleu(candidates, references_list, max_n=4)
add_row("Corpus BLEU-4 (3 sentences)", corpus_score)

# 7) Tokenization effect
# The same sentence pair scored with and without lowercasing.
def case_sensitive_tokenize(s: str) -> List[str]:
    """Split on whitespace without changing case."""
    return s.split()

cand_cs = "The Cat Is On The Mat"
refs_cs = ["the cat is on the mat"]
score_case_insensitive = compute_sentence_bleu(cand_cs, refs_cs, tokenizer=simple_tokenize, max_n=4, smooth=1)
score_case_sensitive = compute_sentence_bleu(cand_cs, refs_cs, tokenizer=case_sensitive_tokenize, max_n=4, smooth=1)
add_row("Case-insensitive tokenizer", score_case_insensitive, candidate=cand_cs, references=refs_cs)
add_row("Case-sensitive tokenizer", score_case_sensitive, candidate=cand_cs, references=refs_cs)

# Convert to DataFrame
df = pd.DataFrame(rows)
print(df)


                                     Example          BLEU  \
0             Sentence BLEU-1 (no smoothing)  8.333333e-01   
1       Short sentence BLEU-4 (no smoothing)  0.000000e+00   
2   Short sentence BLEU-4 (with smoothing=1)  6.541924e-08   
3   Sentence BLEU-4 with multiple references  5.789301e-01   
4                       BP strategy: closest  2.408487e-03   
5                      BP strategy: shortest  2.408487e-03   
6                       BP strategy: average  2.408487e-03   
7                      Custom weights BLEU-2  7.090416e-01   
8                      Custom weights BLEU-4  5.789301e-01   
9                Corpus BLEU-4 (3 sentences)  6.136894e-01   
10                Case-insensitive tokenizer  1.000000e+00   
11                  Case-sensitive tokenizer  1.000000e-09   

                 candidate                                         references  
0   The cat sat on the mat                            [The cat is on the mat]  
1                    hello       

The above script:
- Implements sentence- and corpus-level BLEU.
- Runs several test scenarios (BLEU-1, BLEU-4 with/without smoothing, multiple refs, brevity penalties, custom weights, corpus BLEU, tokenization effects).
- Collects everything into a pandas DataFrame for inspection.

# ROUGE: simple implementation

In [36]:
from collections import Counter

def compute_rouge_1(candidate, reference):
    """Unigram ROUGE between a candidate and a single reference.

    Both strings are split on whitespace as-is. Returns a dict with
    "precision", "recall" and "f1"; a side with no tokens is treated as
    length 1 for the division, which yields 0 rather than an error.
    """
    hyp_tokens = candidate.split()
    ref_tokens = reference.split()

    # Counter intersection keeps min(count_in_candidate, count_in_reference).
    shared = Counter(hyp_tokens) & Counter(ref_tokens)
    overlap = sum(shared.values())

    precision = overlap / max(1, len(hyp_tokens))
    recall = overlap / max(1, len(ref_tokens))

    # Harmonic mean, defined as 0 when there is no overlap at all.
    f1_score = 0
    if precision + recall != 0:
        f1_score = 2 * precision * recall / (precision + recall)

    return {"precision": precision, "recall": recall, "f1": f1_score}

# Example usage: 5 of the 6 candidate tokens also occur in the 7-token
# reference, giving precision 5/6 and recall 5/7.
candidate = "the cat sat on the mat"
reference = "the cat is sitting on the mat"

scores = compute_rouge_1(candidate, reference)
print(f"ROUGE-1 Precision: {scores['precision']:.2f}")
print(f"ROUGE-1 Recall: {scores['recall']:.2f}")
print(f"ROUGE-1 F1 Score: {scores['f1']:.2f}")

ROUGE-1 Precision: 0.83
ROUGE-1 Recall: 0.71
ROUGE-1 F1 Score: 0.77


# Extension to ROUGE-2, and ROUGE-L

In [None]:
import math
import re
from collections import Counter
from typing import Dict, List, Tuple, Iterable, Sequence, Optional

# ---------------- Tokenization ---------------- #
def tokenize(s: str, lowercase: bool = True, strip_punct: bool = True) -> List[str]:
    """Split `s` on whitespace after optional normalization.

    With `lowercase` the text is lowercased first; with `strip_punct` every
    character that is neither a word character nor whitespace is removed.
    """
    text = s.lower() if lowercase else s
    if strip_punct:
        text = re.sub(r"[^\w\s]", "", text)
    return text.split()

def ngrams_from_tokens(tokens: Sequence[str], n: int) -> Iterable[Tuple[str, ...]]:
    """Lazily yield each contiguous run of `n` tokens as a tuple.

    Nothing is yielded when `tokens` is shorter than `n`.
    """
    last_start = len(tokens) - n
    yield from (tuple(tokens[start:start + n]) for start in range(last_start + 1))

# ---------------- BLEU (existing helpers, optional) ---------------- #
def modified_precision(candidate_tokens: List[str],
                       references_tokens: List[List[str]],
                       n: int) -> Tuple[int, int]:
    """Return (clipped matches, total candidate n-grams) for order `n`.

    A candidate n-gram is credited at most as many times as it appears in
    the single reference that contains it most often. Returns (0, 0) when
    the candidate yields no n-grams of this order.
    """
    hyp_counts = Counter(ngrams_from_tokens(candidate_tokens, n))
    if not hyp_counts:
        return 0, 0

    # Highest per-reference count seen for each n-gram.
    ceiling = Counter()
    for ref in references_tokens:
        for gram, seen in Counter(ngrams_from_tokens(ref, n)).items():
            ceiling[gram] = max(ceiling[gram], seen)

    matched = 0
    attempted = 0
    for gram, seen in hyp_counts.items():
        attempted += seen
        matched += min(seen, ceiling[gram])
    return matched, attempted

# ---------------- ROUGE-N (N = 1 or 2) ---------------- #
def _rouge_n_pair(candidate_tokens: List[str], reference_tokens: List[str], n: int) -> Dict[str, float]:
    """ROUGE-N precision/recall/F1 for one candidate against one reference.

    Two sides that both lack n-grams of order `n` count as a perfect match;
    if only one side lacks them, all three scores are 0.
    """
    hyp = Counter(ngrams_from_tokens(candidate_tokens, n))
    ref = Counter(ngrams_from_tokens(reference_tokens, n))

    if not hyp and not ref:
        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}  # both empty
    if not hyp or not ref:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Counter intersection keeps min(candidate count, reference count).
    shared = sum((hyp & ref).values())

    # Both counters are non-empty here, so the totals are positive.
    precision = shared / sum(hyp.values())
    recall = shared / sum(ref.values())

    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return {"precision": precision, "recall": recall, "f1": f1}

def compute_rouge_n(candidate: str,
                    references: Sequence[str],
                    n: int = 1,
                    aggregator: str = "max",
                    lowercase: bool = True,
                    strip_punct: bool = True) -> Dict[str, float]:
    """
    ROUGE-N of one candidate against several references.

    aggregator='avg' -> mean precision/recall/F1 across references
    any other value  -> scores of the reference with the highest F1 ('max')
    An empty `references` sequence yields all-zero scores.
    """
    hyp_tokens = tokenize(candidate, lowercase=lowercase, strip_punct=strip_punct)
    tokenized_refs = [tokenize(ref, lowercase=lowercase, strip_punct=strip_punct)
                      for ref in references]

    per_reference = [_rouge_n_pair(hyp_tokens, ref_tokens, n) for ref_tokens in tokenized_refs]
    if len(per_reference) == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    if aggregator != "avg":
        # 'max': keep the scores of the best-matching reference.
        return max(per_reference, key=lambda entry: entry["f1"])

    count = len(per_reference)
    return {
        "precision": sum(entry["precision"] for entry in per_reference) / count,
        "recall": sum(entry["recall"] for entry in per_reference) / count,
        "f1": sum(entry["f1"] for entry in per_reference) / count,
    }

# ---------------- ROUGE-L (Longest Common Subsequence) ---------------- #
def _lcs_length(a: List[str], b: List[str]) -> int:
    # Classic DP for LCS length, O(len(a)*len(b)) time, O(min) space possible; here simple 2D for clarity.
    m, n = len(a), len(b)
    dp = [[0]*(n+1) for _ in range(m+1)]
    for i in range(m):
        for j in range(n):
            if a[i] == b[j]:
                dp[i+1][j+1] = dp[i][j] + 1
            else:
                dp[i+1][j+1] = max(dp[i+1][j], dp[i][j+1])
    return dp[m][n]

# using recursing approach for longest common sequence problem
def _lcs_length_recursive_dp(a,b):
    m, n = len(a), len(b)
    def dp(i,j):
        if i==m or j == n:
            return 0
        return 1+ dp(i+1,j+1) if a[i]==b[j] else max(dp(i+1,j), dp(i,j+1))
    return dp(0,0)


def _rouge_l_pair(candidate_tokens: List[str], reference_tokens: List[str]) -> Dict[str, float]:
    """ROUGE-L precision/recall/F1 for one candidate against one reference.

    Two empty token lists count as a perfect match; if only one side is
    empty, all three scores are 0.
    """
    hyp_len = len(candidate_tokens)
    ref_len = len(reference_tokens)

    if hyp_len == 0 and ref_len == 0:
        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}
    if hyp_len == 0 or ref_len == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Iterative LCS; _lcs_length_recursive_dp is the recursive alternative.
    common = _lcs_length(candidate_tokens, reference_tokens)

    # Both lengths are positive past the guards above.
    p = common / hyp_len
    r = common / ref_len
    if (p + r) > 0:
        f1 = 2 * p * r / (p + r)
    else:
        f1 = 0.0
    return {"precision": p, "recall": r, "f1": f1}

def compute_rouge_l(candidate: str,
                    references: Sequence[str],
                    aggregator: str = "max",
                    lowercase: bool = True,
                    strip_punct: bool = True) -> Dict[str, float]:
    """
    ROUGE-L of one candidate against several references.

    aggregator='avg' -> mean precision/recall/F1 across references
    any other value  -> scores of the reference with the highest F1 ('max')
    An empty `references` sequence yields all-zero scores.
    """
    hyp_tokens = tokenize(candidate, lowercase=lowercase, strip_punct=strip_punct)
    tokenized_refs = [tokenize(ref, lowercase=lowercase, strip_punct=strip_punct)
                      for ref in references]

    per_reference = [_rouge_l_pair(hyp_tokens, ref_tokens) for ref_tokens in tokenized_refs]
    if len(per_reference) == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    if aggregator != "avg":
        # 'max': keep the scores of the best-matching reference.
        return max(per_reference, key=lambda entry: entry["f1"])

    count = len(per_reference)
    return {
        "precision": sum(entry["precision"] for entry in per_reference) / count,
        "recall": sum(entry["recall"] for entry in per_reference) / count,
        "f1": sum(entry["f1"] for entry in per_reference) / count,
    }

# ---------------- Convenience wrapper: compute all ---------------- #
def compute_rouge_all(candidate: str,
                      references: Sequence[str],
                      aggregator: str = "max",
                      lowercase: bool = True,
                      strip_punct: bool = True) -> Dict[str, Dict[str, float]]:
    """
    Compute ROUGE-1, ROUGE-2 and ROUGE-L with shared settings.

    Returns a dict keyed "rouge1", "rouge2" and "rougeL"; each value is a
    {"precision": ..., "recall": ..., "f1": ...} dict.
    """
    # The same aggregation and tokenization options apply to every metric.
    options = dict(aggregator=aggregator, lowercase=lowercase, strip_punct=strip_punct)
    return {
        "rouge1": compute_rouge_n(candidate, references, n=1, **options),
        "rouge2": compute_rouge_n(candidate, references, n=2, **options),
        "rougeL": compute_rouge_l(candidate, references, **options),
    }

# ---------------- Example usage ---------------- #
# Demo: score one candidate against two references, first keeping the
# best-matching reference per metric, then averaging over both references.
if __name__ == "__main__":
    cand = "Alice won the contest yesterday"
    refs = [
        "Alice won the contest",
        "The contest was won by Alice"
    ]

    # aggregator="max": report the reference with the highest F1 for each metric.
    scores = compute_rouge_all(cand, refs, aggregator="max")
    print("ROUGE-1:", scores["rouge1"])
    print("ROUGE-2:", scores["rouge2"])
    print("ROUGE-L:", scores["rougeL"])

    # Average over references instead of max:
    scores_avg = compute_rouge_all(cand, refs, aggregator="avg")
    print("\n(Average over refs)")
    print("ROUGE-1:", scores_avg["rouge1"])
    print("ROUGE-2:", scores_avg["rouge2"])
    print("ROUGE-L:", scores_avg["rougeL"])


ROUGE-1: {'precision': 0.8, 'recall': 1.0, 'f1': 0.888888888888889}
ROUGE-2: {'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}
ROUGE-L: {'precision': 0.8, 'recall': 1.0, 'f1': 0.888888888888889}

(Average over refs)
ROUGE-1: {'precision': 0.8, 'recall': 0.8333333333333333, 'f1': 0.8080808080808081}
ROUGE-2: {'precision': 0.5, 'recall': 0.6, 'f1': 0.5396825396825397}
ROUGE-L: {'precision': 0.6000000000000001, 'recall': 0.6666666666666666, 'f1': 0.6262626262626263}
