In [7]:
from rouge_score import rouge_scorer

reference = "A panda eats bamboo leaves in the forest."
generated = "The panda is eating leaves in the jungle."

# Create the scorer with more ROUGE variants
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

scores = scorer.score(reference, generated)

# Print results
for metric, score in scores.items():
    print(f"{metric.upper()}:")
    print(f"  Precision: {score.precision:.3f}")
    print(f"  Recall:    {score.recall:.3f}")
    print(f"  F1 Score:  {score.fmeasure:.3f}\n")

ROUGE1:
  Precision: 0.625
  Recall:    0.625
  F1 Score:  0.625

ROUGE2:
  Precision: 0.286
  Recall:    0.286
  F1 Score:  0.286

ROUGEL:
  Precision: 0.625
  Recall:    0.625
  F1 Score:  0.625



In [8]:
from itertools import combinations  # Import combinations to generate skip-bigrams (pairs of tokens)

# Function to generate all skip-bigrams from a list of tokens
def get_skip_bigrams(tokens):
    return set(combinations(tokens, 2))  # Create all 2-token combinations where order matters

# Function to compute ROUGE-S score
def rouge_s(reference: str, candidate: str):
    ref_tokens = reference.lower().split()      # Convert reference text to lowercase and split into words
    cand_tokens = candidate.lower().split()     # Do the same for candidate text

    ref_skip_bigrams = get_skip_bigrams(ref_tokens)     # Get all skip-bigrams from the reference
    cand_skip_bigrams = get_skip_bigrams(cand_tokens)   # Get all skip-bigrams from the candidate

    overlap = ref_skip_bigrams & cand_skip_bigrams      # Find common skip-bigrams using set intersection
    num_overlap = len(overlap)                          # Count how many skip-bigrams overlap

    # Calculate precision: overlapping / total skip-bigrams in candidate
    precision = num_overlap / len(cand_skip_bigrams) if cand_skip_bigrams else 0

    # Calculate recall: overlapping / total skip-bigrams in reference
    recall = num_overlap / len(ref_skip_bigrams) if ref_skip_bigrams else 0

    # Calculate F1-score using harmonic mean of precision and recall
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0

    # Return results rounded to 3 decimal places along with debug info
    return {
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3),
        "num_overlap": num_overlap,
        "total_candidate": len(cand_skip_bigrams),
        "total_reference": len(ref_skip_bigrams)
    }

# ----------- Test Example -----------

reference = "A panda eats bamboo leaves in the forest"   # Reference summary
generated = "The panda is eating leaves in the jungle"    # LLM-generated summary

score = rouge_s(reference, generated)     # Run ROUGE-S calculation
print("ROUGE-S Score:")                   # Display label
print(score)                              # Print the resulting precision, recall, f1, and overlap counts


ROUGE-S Score:
{'precision': 0.222, 'recall': 0.214, 'f1': 0.218, 'num_overlap': 6, 'total_candidate': 27, 'total_reference': 28}


In [2]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Obtaining dependency information for absl-py from https://files.pythonhosted.org/packages/f6/d4/349f7f4bd5ea92dab34f5bb0fe31775ef6c311427a14d5a5b31ecb442341/absl_py-2.2.2-py3-none-any.whl.metadata
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting nltk (from rouge-score)
  Obtaining dependency information for nltk from https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl.metadata
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting j


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
