In [2]:
# Install
!pip install pycocoevalcap

# Simple usage
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
import json
import os
import tempfile
import pandas as pd



In [4]:


def evaluate_with_pycocoevalcap(eval_dir="eval_result_json"):
    """Score every model's captions found in ``eval_dir`` with COCOEvalCap.

    Each ``*.json`` file in ``eval_dir`` is treated as one model's output. The
    model name is the file name with ``local_evaluation_`` and ``.json``
    removed. Every entry of a file must provide ``image_name``,
    ``ground_truth`` (an iterable of reference caption strings) and
    ``model_results`` (a mapping whose first value is the generated caption).

    Reference captions for an image are taken from the first file (in sorted
    file-name order) in which that ``image_name`` appears. Generated captions
    containing "failed" or "error" (case-insensitive) are treated as failed
    generations and left out of the evaluation.

    Args:
        eval_dir: Directory containing the per-model JSON files.

    Returns:
        A tuple ``(evaluation_results, coco_gt)``. ``evaluation_results`` maps
        each model name to a copy of ``COCOEvalCap.eval`` (an empty dict for a
        model that has no usable captions). ``coco_gt`` is the ``COCO`` object
        holding the reference captions.
    """
    all_annotations = []
    all_results = {}
    image_mapping = {}  # image_name -> integer image_id shared by all models

    # Sorted so that image ids, annotation ids and the model order do not
    # depend on the arbitrary order returned by os.listdir.
    for filename in sorted(os.listdir(eval_dir)):
        if not filename.endswith('.json'):
            continue

        model_name = filename.replace('local_evaluation_', '').replace('.json', '')

        with open(os.path.join(eval_dir, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)

        model_results = []

        for image_data in data.values():
            image_name = image_data['image_name']

            # First time this image is seen: assign the next id and register
            # its reference captions exactly once.
            if image_name not in image_mapping:
                image_mapping[image_name] = len(image_mapping)
                for gt_caption in image_data['ground_truth']:
                    all_annotations.append({
                        "image_id": image_mapping[image_name],
                        "id": len(all_annotations),
                        "caption": gt_caption.strip()
                    })

            current_image_id = image_mapping[image_name]

            # Only the first value of `model_results` is used. An empty
            # mapping or a non-string value counts as "no caption".
            generated_caption = next(iter(image_data['model_results'].values()), None)
            if not isinstance(generated_caption, str):
                continue

            lowered = generated_caption.lower()
            if "failed" not in lowered and "error" not in lowered:
                model_results.append({
                    "image_id": current_image_id,
                    "caption": generated_caption.strip()
                })

        all_results[model_name] = model_results

    # Create COCO ground truth object
    coco_gt_data = {
        'annotations': all_annotations,
        'images': [{'id': img_id} for img_id in range(len(image_mapping))],
        'info': {'description': 'Image Captioning Evaluation'},
        'licenses': [],
        'type': 'captions'
    }

    # Initialize COCO ground truth from the in-memory dataset
    coco_gt = COCO()
    coco_gt.dataset = coco_gt_data
    coco_gt.createIndex()

    # Evaluate each model
    evaluation_results = {}

    for model_name, model_results in all_results.items():
        if not model_results:
            # loadRes inspects the first entry of the result list, so an empty
            # list cannot be loaded; record "no scores" instead of crashing.
            evaluation_results[model_name] = {}
            print(f"⚠️ {model_name} has no usable captions, skipping evaluation")
            continue

        # delete=False lets loadRes reopen the file by name after it is
        # closed; the finally clause removes it so runs do not leak files.
        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
        try:
            with temp_file:
                json.dump(model_results, temp_file)
            coco_res = coco_gt.loadRes(temp_file.name)
        finally:
            os.remove(temp_file.name)

        coco_eval = COCOEvalCap(coco_gt, coco_res)

        # Score only the images this model actually captioned. By default the
        # evaluator uses every ground-truth image id, which fails as soon as a
        # caption was filtered out above or a model file lacks an image.
        coco_eval.params['image_id'] = coco_res.getImgIds()

        coco_eval.evaluate()

        evaluation_results[model_name] = coco_eval.eval.copy()

        print(f"✅ {model_name} evaluation complete")

    return evaluation_results, coco_gt


In [5]:

def display_pycocoevalcap_results(evaluation_results):
    """Print a banner and tabulate the caption metrics of each model.

    Args:
        evaluation_results: Mapping of model name to a dict of metric scores
            keyed by ``Bleu_1`` .. ``Bleu_4``, ``METEOR``, ``ROUGE_L``,
            ``CIDEr`` and ``SPICE``. Models whose score dict is empty are
            omitted; a metric missing from a dict is reported as 0.

    Returns:
        A pandas DataFrame with one row per model: the upper-cased model name
        followed by each metric formatted as a four-decimal string.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("EVALUATION RESULTS USING COCO")
    print(banner)

    # (column label in the table, key in the COCOEvalCap score dict)
    metric_columns = (
        ('BLEU-1', 'Bleu_1'),
        ('BLEU-2', 'Bleu_2'),
        ('BLEU-3', 'Bleu_3'),
        ('BLEU-4', 'Bleu_4'),
        ('METEOR', 'METEOR'),
        ('ROUGE-L', 'ROUGE_L'),
        ('CIDEr', 'CIDEr'),
        ('SPICE', 'SPICE'),
    )

    rows = []
    for name, metrics in evaluation_results.items():
        if not metrics:
            # Nothing was scored for this model, so leave it out of the table.
            continue
        record = {'Model': name.upper()}
        for column, key in metric_columns:
            record[column] = f"{metrics.get(key, 0):.4f}"
        rows.append(record)

    return pd.DataFrame(rows)


In [7]:
# Directory holding the per-model evaluation JSON files. Set the
# EVAL_RESULT_DIR environment variable to run on another machine; the default
# is the original local path.
eval_dir = os.environ.get(
    "EVAL_RESULT_DIR",
    "/home/duyle/Documents/Case-Study3/eval_result_json",
)

results, coco_gt = evaluate_with_pycocoevalcap(eval_dir)

summary_df = display_pycocoevalcap_results(results)

# Last expression of the cell, so the notebook renders the table.
summary_df

creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 176, 'reflen': 131, 'guess': [176, 166, 156, 146], 'correct': [110, 57, 24, 8]}
ratio: 1.3435114503714236
Bleu_1: 0.625
Bleu_2: 0.463
Bleu_3: 0.321
Bleu_4: 0.206
computing METEOR score...


PTBTokenizer tokenized 606 tokens at 21907.05 tokens per second.
PTBTokenizer tokenized 200 tokens at 8423.68 tokens per second.


METEOR: 0.297
computing Rouge score...
ROUGE_L: 0.475
computing CIDEr score...
CIDEr: 0.540
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 1.127 s
SPICE: 0.228
✅ git evaluation complete
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 606 tokens at 16777.06 tokens per second.
PTBTokenizer tokenized 125 tokens at 4262.32 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 114, 'reflen': 108, 'guess': [114, 104, 94, 84], 'correct': [57, 21, 2, 0]}
ratio: 1.0555555555457818
Bleu_1: 0.500
Bleu_2: 0.318
Bleu_3: 0.129
Bleu_4: 0.000
computing METEOR score...
METEOR: 0.145
computing Rouge score...
ROUGE_L: 0.364
computing CIDEr score...
CIDEr: 0.377
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 785.1 ms
SPICE: 0.100
✅ blip2 evaluation complete
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 606 tokens at 21824.99 tokens per second.
PTBTokenizer tokenized 282 tokens at 8921.69 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 257, 'reflen': 130, 'guess': [257, 247, 237, 227], 'correct': [91, 18, 3, 1]}
ratio: 1.97692307690787
Bleu_1: 0.354
Bleu_2: 0.161
Bleu_3: 0.069
Bleu_4: 0.035
computing METEOR score...
METEOR: 0.188
computing Rouge score...
ROUGE_L: 0.313
computing CIDEr score...
CIDEr: 0.121
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 666.0 ms
SPICE: 0.119
✅ vit_gpt2 evaluation complete
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 606 tokens at 18801.09 tokens per second.
PTBTokenizer tokenized 249 tokens at 9169.46 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 236, 'reflen': 131, 'guess': [236, 226, 216, 206], 'correct': [126, 59, 24, 9]}
ratio: 1.8015267175435
Bleu_1: 0.534
Bleu_2: 0.373
Bleu_3: 0.249
Bleu_4: 0.161
computing METEOR score...
METEOR: 0.283
computing Rouge score...
ROUGE_L: 0.401
computing CIDEr score...
CIDEr: 0.152
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 630.6 ms
SPICE: 0.232
✅ blip evaluation complete

EVALUATION RESULTS USING COCO


Unnamed: 0,Model,BLEU-1,BLEU-2,BLEU-3,BLEU-4,METEOR,ROUGE-L,CIDEr,SPICE
0,GIT,0.625,0.4633,0.3208,0.2062,0.2972,0.4749,0.5401,0.2283
1,BLIP2,0.5,0.3177,0.129,0.0,0.1454,0.3642,0.3771,0.0999
2,VIT_GPT2,0.3541,0.1606,0.0689,0.0346,0.1884,0.3127,0.1211,0.1192
3,BLIP,0.5339,0.3733,0.2493,0.1613,0.2827,0.4012,0.1519,0.2324
