The functions below compute [BLEU, METEOR, ROUGE, and CIDEr](https://encord.com/blog/vision-language-models-guide/#h3) scores for a list of predicted captions against the corresponding list of ground-truth captions (one reference caption per prediction, in the same order).

In [2]:
!pip install nltk rouge-score
!git clone https://github.com/salaniz/pycocoevalcap
%cd pycocoevalcap
!pip install -e .

fatal: destination path 'pycocoevalcap' already exists and is not an empty directory.
/content/pycocoevalcap
Obtaining file:///content/pycocoevalcap
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: pycocoevalcap
  Running setup.py develop for pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [3]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider

# To download the necessary datasets for nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer models. Recent NLTK releases look them
# up under 'punkt_tab', older ones under 'punkt'; nltk is installed unpinned, so
# fetch both. nltk.download() skips packages that are already up to date.
nltk.download('punkt')  # This is required for word_tokenize
nltk.download('punkt_tab')

def calculate_bleu(references, hypotheses):
    """
    Corpus-level BLEU of predicted captions against one reference caption each.

    Args:
        references: iterable of ground-truth caption strings.
        hypotheses: iterable of predicted caption strings, in the same order.

    Returns:
        Whatever ``corpus_bleu`` returns for the tokenized inputs, computed with
        ``SmoothingFunction().method1`` as the smoothing function.
    """
    smoothing = SmoothingFunction().method1

    # corpus_bleu takes a *list of references* per hypothesis; here each
    # hypothesis has exactly one, so wrap every tokenized reference in a list.
    reference_groups = []
    for reference in references:
        reference_groups.append([word_tokenize(reference)])

    hypothesis_tokens = list(map(word_tokenize, hypotheses))

    return corpus_bleu(
        reference_groups,
        hypothesis_tokens,
        smoothing_function=smoothing,
    )

def calculate_meteor(references, hypotheses):
    """
    Calculate the mean sentence-level METEOR score over aligned caption pairs.

    Args:
        references: iterable of ground-truth caption strings.
        hypotheses: iterable of predicted caption strings; ``hypotheses[i]`` is
            scored against ``references[i]``.

    Returns:
        The arithmetic mean of ``meteor_score([reference], hypothesis)`` over all
        pairs, or ``0.0`` when there are no pairs.

    Raises:
        ValueError: if the number of references and hypotheses differ.
    """
    references_tokenized = [word_tokenize(ref) for ref in references]
    hypotheses_tokenized = [word_tokenize(hyp) for hyp in hypotheses]

    # zip() would silently drop the unmatched tail and skew the mean, so reject
    # misaligned inputs explicitly.
    if len(references_tokenized) != len(hypotheses_tokenized):
        raise ValueError(
            "references and hypotheses must have the same length "
            f"(got {len(references_tokenized)} and {len(hypotheses_tokenized)})"
        )

    # Nothing to average: avoid dividing by zero.
    if not hypotheses_tokenized:
        return 0.0

    total = sum(
        meteor_score([ref], hyp)
        for ref, hyp in zip(references_tokenized, hypotheses_tokenized)
    )
    return total / len(hypotheses_tokenized)

def calculate_rouge(references, hypotheses):
    """
    Calculate mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over caption pairs.

    Args:
        references: iterable of ground-truth caption strings.
        hypotheses: iterable of predicted caption strings; ``hypotheses[i]`` is
            scored against ``references[i]``.

    Returns:
        dict mapping ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` to the mean
        ``fmeasure`` over all pairs. Every value is ``0.0`` when there are no pairs.

    Raises:
        ValueError: if the number of references and hypotheses differ.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']

    # Materialize so generators are accepted and lengths can be compared.
    references = list(references)
    hypotheses = list(hypotheses)

    # zip() would silently drop the unmatched tail and skew the mean, so reject
    # misaligned inputs explicitly.
    if len(references) != len(hypotheses):
        raise ValueError(
            "references and hypotheses must have the same length "
            f"(got {len(references)} and {len(hypotheses)})"
        )

    scores = {key: 0.0 for key in rouge_types}
    # Nothing to average: avoid dividing by zero.
    if not hypotheses:
        return scores

    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    for ref, hyp in zip(references, hypotheses):
        # The reference is passed first and the prediction second.
        score = scorer.score(ref, hyp)
        for key in scores:
            scores[key] += score[key].fmeasure

    pair_count = len(hypotheses)
    return {key: total / pair_count for key, total in scores.items()}

def calculate_cider(references, hypotheses):
    """
    CIDEr score of predicted captions against one reference caption each.

    Args:
        references: iterable of ground-truth caption strings.
        hypotheses: iterable of predicted caption strings, in the same order.

    Returns:
        The first element of the pair returned by ``Cider().compute_score``;
        the second element is discarded.
    """
    cider = Cider()

    # The scorer takes two dicts keyed by the same ids, each mapping an id to a
    # list of caption strings; the position in the input serves as the id.
    predictions_by_id = {}
    for idx, caption in enumerate(hypotheses):
        predictions_by_id[idx] = [caption]

    ground_truth_by_id = {}
    for idx, caption in enumerate(references):
        ground_truth_by_id[idx] = [caption]

    corpus_score, _unused = cider.compute_score(ground_truth_by_id, predictions_by_id)
    return corpus_score

# Example data: one ground-truth caption per predicted caption, in the same order.
references = [
    "A dog sitting on a park bench.",
    "Two people walking down the street."
]
hypotheses = [
    "A dog is on a bench in the park.",
    "Two persons are walking on the road."
]

# Compute scores
# NOTE: the METEOR result is deliberately not stored in a variable named
# `meteor_score`. That name is the NLTK function called inside calculate_meteor;
# rebinding it to a float would make any later calculate_meteor call fail with
# "TypeError: 'float' object is not callable".
bleu_score = calculate_bleu(references, hypotheses)
meteor_avg = calculate_meteor(references, hypotheses)
rouge_scores = calculate_rouge(references, hypotheses)
cider_score = calculate_cider(references, hypotheses)

print("BLEU Score:", bleu_score)
print("METEOR Score:", meteor_avg)
print("ROUGE Scores:", rouge_scores)
print("CIDEr Score:", cider_score)

BLEU Score: 0.04617747988825104
METEOR Score: 0.48989932627121613
ROUGE Scores: {'rouge1': 0.6057692307692307, 'rouge2': 0.14285714285714288, 'rougeL': 0.5432692307692308}
CIDEr Score: 1.5078986568615875


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
