BLEU (Bilingual Evaluation Understudy): Measures the overlap of n-grams between the generated and reference text.


In [3]:
%pip install nltk


Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached click-8.1.7-py3-none-any.whl (97 kB)
Installing collected packages: click, nltk
Successfully installed click-8.1.7 nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Whitespace-tokenised example pair. The reference is wrapped in an outer list
# because the scorer is handed a collection of references.
reference = ["The cat is on the mat".split()]
candidate = "The dog was on the mat".split()

# One smoothing callable shared by both scores.
smoothing = SmoothingFunction().method1

# The weights tuple has one entry per n-gram order (1..4):
#   (1, 0, 0, 0)     -> all weight on unigrams
#   (0.5, 0.5, 0, 0) -> weight split evenly between unigrams and bigrams
bleu_1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothing)
bleu_2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)

print(f"BLEU-1 (Unigram Precision): {round(bleu_1, 3)}")
print(f"BLEU-2 (Bigram Precision): {round(bleu_2, 3)}")

# Distinct unigrams and adjacent-token pairs on each side. Sets are
# case-sensitive, so "The" and "the" are treated as different tokens.
reference_tokens = reference[0]
unigrams_reference = set(reference_tokens)
unigrams_candidate = set(candidate)
bigram_reference = set(zip(reference_tokens, reference_tokens[1:]))
bigram_candidate = set(zip(candidate, candidate[1:]))

# N-grams that occur in both the reference and the candidate.
unigram_overlap = unigrams_reference.intersection(unigrams_candidate)
bigram_overlap = bigram_reference.intersection(bigram_candidate)

print(f"Unigram Overlap: {unigram_overlap}")
print(f"Bigram Overlap: {bigram_overlap}")


BLEU-1 (Unigram Precision): 0.667
BLEU-2 (Bigram Precision): 0.516
Unigram Overlap: {'The', 'mat', 'on', 'the'}
Bigram Overlap: {('on', 'the'), ('the', 'mat')}


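ROUGE (Recall-Oriented Understudy for Gisting Evaluation): ROUGE-1, ROUGE-2, and ROUGE-L measure unigram overlap, bigram overlap, and the longest common subsequence between the generated and reference text.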

In [22]:
%pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24971 sha256=00be6254b6a9001c7168d7861a5f6dbdf13c105f7ec39bc46b832ea36927c464
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\1e\19\43\8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
from rouge_score import rouge_scorer

# Example sentence pair (plain strings; the scorer tokenises them itself).
reference = "The cat is on the mat"
candidate = "The cat sat on the mat"

# (metric key, heading) pairs in display order. Headings after the first carry
# a leading newline so the sections are separated by a blank line.
report_sections = [
    ('rouge1', "ROUGE-1 (Unigram Overlap):"),
    ('rouge2', "\nROUGE-2 (Bigram Overlap):"),
    ('rougeL', "\nROUGE-L (Longest Common Subsequence):"),
]

# Build the scorer for exactly the metrics reported below, with stemming on.
scorer = rouge_scorer.RougeScorer([rouge_key for rouge_key, _ in report_sections], use_stemmer=True)

# Reference is passed first, candidate second.
scores = scorer.score(reference, candidate)

# Print precision / recall / F1 for each metric, rounded to three decimals.
for rouge_key, rouge_heading in report_sections:
    rouge_result = scores[rouge_key]
    print(rouge_heading)
    print(f"Precision: {round(rouge_result.precision, 3)}")
    print(f"Recall: {round(rouge_result.recall, 3)}")
    print(f"F1-Score: {round(rouge_result.fmeasure, 3)}")


ROUGE-1 (Unigram Overlap):
Precision: 0.833
Recall: 0.833
F1-Score: 0.833

ROUGE-2 (Bigram Overlap):
Precision: 0.6
Recall: 0.6
F1-Score: 0.6

ROUGE-L (Longest Common Subsequence):
Precision: 0.833
Recall: 0.833
F1-Score: 0.833


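METEOR (Metric for Evaluation of Translation with Explicit ORdering): Aligns words between the generated and reference text using exact, stem, and synonym matches, then combines precision and recall with a penalty for fragmented word order.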

In [25]:
import nltk
from nltk.translate.meteor_score import meteor_score

# METEOR's synonym matching reads the WordNet corpus, which is a separate
# download from the nltk package itself. Fetch it only when it is missing so
# this cell also runs on a fresh kernel / fresh environment.
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet", quiet=True)

# Reference and candidate texts (tokenized as lists of words)
reference = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
candidate = ["A", "fast", "brown", "fox", "leaped", "over", "a", "sleeping", "dog"]

# Calculate METEOR score: the first argument is a list of tokenized references
# (here just one), the second is the tokenized candidate.
score = meteor_score([reference], candidate)
print(f"METEOR score: {round(score, 3)}")



METEOR score: 0.654


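TER (Translation Edit Rate): The number of word-level edits needed to turn the generated text into the reference, divided by the reference length (lower is better).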

In [26]:
# NOTE(review): translate-toolkit is not imported by any cell visible in this
# notebook; the TER cell below computes its score with nltk.edit_distance.
# Confirm whether this install is still needed.
%pip install translate-toolkit


Collecting translate-toolkit
  Downloading translate_toolkit-3.14.1-py3-none-any.whl.metadata (12 kB)
Collecting lxml>=4.6.3 (from translate-toolkit)
  Downloading lxml-5.3.0-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Downloading translate_toolkit-3.14.1-py3-none-any.whl (745 kB)
   ---------------------------------------- 0.0/745.2 kB ? eta -:--:--
   --------------------------------------- 745.2/745.2 kB 32.2 MB/s eta 0:00:00
Downloading lxml-5.3.0-cp311-cp311-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 3.8/3.8 MB 28.4 MB/s eta 0:00:00
Installing collected packages: lxml, translate-toolkit
Successfully installed lxml-5.3.0 translate-toolkit-3.14.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
from nltk.util import ngrams
from typing import List

def calculate_ter(reference: str, hypothesis: str) -> float:
    """
    Calculate a simplified Translation Edit Rate (TER).

    Both strings are split on whitespace and the word-level edit distance
    (insertions, deletions, substitutions) between the token lists is divided
    by the number of reference tokens. Unlike the full TER definition, no
    block-shift operation is considered, and punctuation stays attached to
    the neighbouring word because tokenization is a plain ``str.split()``.

    Arguments:
        reference: The reference translation (ground truth).
        hypothesis: The hypothesis (candidate) translation.
    Returns:
        TER score as a float (lower is better). When the reference has no
        tokens the score is 0.0 if the hypothesis is empty too, and 1.0
        otherwise, instead of raising ZeroDivisionError.
    """
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()

    # An empty (or whitespace-only) reference would make the denominator zero.
    # Follow the usual convention: nothing to match against an empty
    # hypothesis is a perfect score, anything else is a full error.
    if not ref_tokens:
        return 1.0 if hyp_tokens else 0.0

    # Imported locally so the function works even when the cell that defines
    # it is the only one that has been run.
    import nltk

    # Word-level Levenshtein distance between the two token sequences.
    edit_distance = nltk.edit_distance(ref_tokens, hyp_tokens)

    # TER = Edit Distance / Reference Length
    return edit_distance / len(ref_tokens)


# Example sentence pair; the trailing full stop stays attached to "dog."
# because the scorer splits on whitespace only.
reference = "The quick brown fox jumps over the lazy dog."
hypothesis = "A fast brown fox leaped over a sleeping dog."

# Score the pair and report it rounded to three decimals.
ter_score = calculate_ter(reference=reference, hypothesis=hypothesis)
print(f"TER score: {round(ter_score, 3)}")



TER score: 0.556
