In [1]:
# Install
!pip install pycocoevalcap

# Imports used by the evaluation helpers below
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
import json
import os
import tempfile




In [None]:


def _collect_caption_data(eval_dir):
    """Read every ``*.json`` file in ``eval_dir`` into COCO-style records.

    Each file is expected to hold a dict whose values are dicts with the keys
    ``image_name``, ``ground_truth`` (iterable of reference captions) and
    ``model_results`` (dict; its first value is taken as the generated caption).

    Returns:
        tuple: ``(annotations, results_by_model, image_ids)`` where
        ``annotations`` is the list of ground-truth caption dicts,
        ``results_by_model`` maps model name -> list of prediction dicts, and
        ``image_ids`` maps image name -> integer image id.
    """
    annotations = []
    results_by_model = {}
    image_ids = {}

    # Sorted so that image ids, annotation ids and the model order do not
    # depend on the arbitrary order in which os.listdir returns entries.
    for filename in sorted(os.listdir(eval_dir)):
        if not filename.endswith('.json'):
            continue

        model_name = filename.replace('local_evaluation_', '').replace('.json', '')

        with open(os.path.join(eval_dir, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)

        predictions = []
        for image_data in data.values():
            image_name = image_data['image_name']

            # Register the image and its references the first time it is seen,
            # so ground truth is added only once even if several model files
            # mention the same image.
            if image_name not in image_ids:
                new_image_id = len(image_ids)
                image_ids[image_name] = new_image_id

                references = image_data['ground_truth']
                if isinstance(references, str):
                    # A bare string would otherwise be iterated per character.
                    references = [references]
                for gt_caption in references:
                    annotations.append({
                        "image_id": new_image_id,
                        "id": len(annotations),
                        "caption": gt_caption.strip()
                    })

            # First value of model_results is the caption; None when the dict
            # is empty (previously an IndexError).
            generated_caption = next(iter(image_data['model_results'].values()), None)
            if not isinstance(generated_caption, str):
                continue

            # Skip failed predictions. This is a substring heuristic: any
            # caption containing "failed" or "error" is dropped.
            lowered = generated_caption.lower()
            if "failed" in lowered or "error" in lowered:
                continue

            predictions.append({
                "image_id": image_ids[image_name],
                "caption": generated_caption.strip()
            })

        results_by_model[model_name] = predictions

    return annotations, results_by_model, image_ids


def evaluate_with_pycocoevalcap(eval_dir="eval_json"):
    """
    Score every model's captions in ``eval_dir`` with pycocoevalcap.

    Args:
        eval_dir: Directory containing one JSON evaluation file per model. The
            model name is the file name with ``local_evaluation_`` and
            ``.json`` removed.

    Returns:
        tuple: ``(evaluation_results, coco_gt)``. ``evaluation_results`` maps
        model name -> copy of ``COCOEvalCap.eval`` (an empty dict if scoring
        that model raised); models without any valid prediction are omitted.
        ``coco_gt`` is the ground-truth ``COCO`` object built from the files.
    """

    print("🚀 Loading evaluation data...")

    all_annotations, all_results, image_mapping = _collect_caption_data(eval_dir)

    print(f"📊 Loaded {len(image_mapping)} images with {len(all_annotations)} ground truth captions")

    # Build the ground-truth COCO object in memory and index it.
    coco_gt = COCO()
    coco_gt.dataset = {
        'annotations': all_annotations,
        'images': [{'id': img_id} for img_id in range(len(image_mapping))],
        'info': {'description': 'Image Captioning Evaluation'},
        'licenses': [],
        'type': 'captions'
    }
    coco_gt.createIndex()

    print("✅ Ground truth COCO object created")

    evaluation_results = {}

    for model_name, model_results in all_results.items():
        print(f"\n🔍 Evaluating {model_name.upper()}...")

        if not model_results:
            print(f"❌ No valid predictions for {model_name}")
            continue

        # The predictions are handed to loadRes as a JSON file path. The file
        # is created before the try block so the finally clause can always
        # remove it, even if writing it fails.
        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
        temp_filename = temp_file.name

        try:
            with temp_file:
                json.dump(model_results, temp_file)

            coco_res = coco_gt.loadRes(temp_filename)
            coco_eval = COCOEvalCap(coco_gt, coco_res)

            # By default every ground-truth image is scored. Restrict scoring
            # to the images this model actually captioned; otherwise a single
            # skipped prediction makes the whole evaluation fail.
            scored_image_ids = coco_res.getImgIds()
            coco_eval.params['image_id'] = scored_image_ids
            if len(scored_image_ids) < len(image_mapping):
                print(f"⚠️ Scoring {model_name} on {len(scored_image_ids)}/{len(image_mapping)} images "
                      f"(no valid prediction for the rest)")

            coco_eval.evaluate()

            evaluation_results[model_name] = coco_eval.eval.copy()

            print(f"✅ {model_name} evaluation complete")

        except Exception as e:
            # Best effort: one failing model must not abort the others.
            print(f"❌ Error evaluating {model_name}: {e}")
            evaluation_results[model_name] = {}

        finally:
            os.unlink(temp_filename)

    return evaluation_results, coco_gt

def display_pycocoevalcap_results(evaluation_results):
    """Print a comparison table and the best model per metric.

    Args:
        evaluation_results: Mapping of model name -> dict of metric scores
            (keys such as ``Bleu_1`` ... ``Bleu_4``, ``METEOR``, ``ROUGE_L``,
            ``CIDEr``, ``SPICE``). Models whose score dict is empty are
            treated as failed and left out of both the table and the ranking.

    Returns:
        pandas.DataFrame with one row per scored model (scores formatted as
        4-decimal strings), or ``None`` when no model has any scores.
    """
    # Imported here because the notebook's import cell does not import pandas.
    import pandas as pd

    print("\n" + "="*80)
    print("🏆 OFFICIAL COCO EVALUATION RESULTS")
    print("="*80)

    # (column label, key in the score dict); a missing metric is shown as 0.
    columns = [
        ('BLEU-1', 'Bleu_1'),
        ('BLEU-2', 'Bleu_2'),
        ('BLEU-3', 'Bleu_3'),
        ('BLEU-4', 'Bleu_4'),
        ('METEOR', 'METEOR'),
        ('ROUGE-L', 'ROUGE_L'),
        ('CIDEr', 'CIDEr'),
        ('SPICE', 'SPICE'),
    ]

    # Only models that produced scores take part in the table and the ranking.
    scored = {name: scores for name, scores in evaluation_results.items() if scores}

    table_data = []
    for model_name, scores in scored.items():
        row = {'Model': model_name.upper()}
        for label, key in columns:
            row[label] = f"{scores.get(key, 0):.4f}"
        table_data.append(row)

    if not table_data:
        return None

    df = pd.DataFrame(table_data)
    print(df.to_string(index=False))

    print("\nBEST PERFORMING MODELS:")
    print("-" * 50)

    headline = [
        ('Bleu_4', 'BLEU-4'),
        ('METEOR', 'METEOR'),
        ('ROUGE_L', 'ROUGE-L'),
        ('CIDEr', 'CIDEr'),
        ('SPICE', 'SPICE'),
    ]
    for metric, name in headline:
        # Rank over scored models only, so a failed model (empty dict, implicit
        # score 0) can never be reported as "best" when it ties a real 0.0.
        best_model = max(scored, key=lambda m: scored[m].get(metric, 0))
        best_score = scored[best_model].get(metric, 0)
        print(f"{name:10}: {best_model.upper():10} ({best_score:.4f})")

    return df

# Main execution
def run_official_evaluation(eval_dir="eval_json",
                            output_json='official_coco_evaluation.json',
                            output_csv='official_evaluation_summary.csv'):
    """Run the evaluation, display the results and save them to disk.

    Args:
        eval_dir: Directory passed to ``evaluate_with_pycocoevalcap``.
        output_json: Path the raw per-model scores are written to as JSON.
        output_csv: Path the summary table is written to as CSV. Only written
            when ``display_pycocoevalcap_results`` returns a table.

    Returns:
        tuple: ``(results, summary_df)`` — the per-model score dict and the
        summary DataFrame (``None`` if there was nothing to tabulate).
    """

    print("🚀 Starting Official COCO Evaluation...")

    # The ground-truth COCO object is returned as well but not needed here.
    results, _coco_gt = evaluate_with_pycocoevalcap(eval_dir)

    summary_df = display_pycocoevalcap_results(results)

    with open(output_json, 'w') as f:
        json.dump(results, f, indent=2)

    csv_written = summary_df is not None
    if csv_written:
        summary_df.to_csv(output_csv, index=False)

    # Report only the files that were actually written.
    print("\n💾 Results saved to:")
    print(f"  - {output_json}")
    if csv_written:
        print(f"  - {output_csv}")

    return results, summary_df

# Run the evaluation by calling run_official_evaluation() in the next cell


In [3]:
# Run the full pipeline with its defaults and bind the two returned values:
# the per-model score dict and the summary table (None if nothing was tabulated).
evaluation_results, summary_df = run_official_evaluation()

🚀 Starting Official COCO Evaluation...
🚀 Loading evaluation data...
📊 Loaded 10 images with 50 ground truth captions
creating index...
index created!
✅ Ground truth COCO object created

🔍 Evaluating GIT...
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 606 tokens at 17305.81 tokens per second.
PTBTokenizer tokenized 200 tokens at 7120.86 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 176, 'reflen': 131, 'guess': [176, 166, 156, 146], 'correct': [110, 57, 24, 8]}
ratio: 1.3435114503714236
Bleu_1: 0.625
Bleu_2: 0.463
Bleu_3: 0.321
Bleu_4: 0.206
computing METEOR score...
METEOR: 0.297
computing Rouge score...
ROUGE_L: 0.475
computing CIDEr score...
CIDEr: 0.540
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.8

SPICE evaluation took: 6.728 s
SPICE: 0.228
✅ git evaluation complete

🔍 Evaluating BLIP2...
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 114, 'reflen': 108, 'guess': [114, 104, 94, 84], 'correct': [57, 21, 2, 0]}
ratio: 1.0555555555457818
Bleu_1: 0.500
Bleu_2: 0.318
Bleu_3: 0.129
Bleu_4: 0.000
computing METEOR score...


PTBTokenizer tokenized 606 tokens at 23283.11 tokens per second.
PTBTokenizer tokenized 125 tokens at 4718.24 tokens per second.


METEOR: 0.145
computing Rouge score...
ROUGE_L: 0.364
computing CIDEr score...
CIDEr: 0.377
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.1 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.8

SPICE evaluation took: 5.281 s
SPICE: 0.100
✅ blip2 evaluation complete

🔍 Evaluating VIT_GPT2...
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 257, 'reflen': 130, 'guess': [257, 247, 237, 227], 'correct': [91, 18, 3, 1]}
ratio: 1.97692307690787
Bleu_1: 0.354
Bleu_2: 0.161
Bleu_3: 0.069
Bleu_4: 0.035
computing METEOR score...


PTBTokenizer tokenized 606 tokens at 23453.47 tokens per second.
PTBTokenizer tokenized 282 tokens at 11247.01 tokens per second.


METEOR: 0.188
computing Rouge score...
ROUGE_L: 0.313
computing CIDEr score...
CIDEr: 0.121
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [1.0

SPICE evaluation took: 8.198 s
SPICE: 0.119
✅ vit_gpt2 evaluation complete

🔍 Evaluating BLIP...
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 606 tokens at 19295.67 tokens per second.
PTBTokenizer tokenized 249 tokens at 8149.45 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 236, 'reflen': 131, 'guess': [236, 226, 216, 206], 'correct': [126, 59, 24, 9]}
ratio: 1.8015267175435
Bleu_1: 0.534
Bleu_2: 0.373
Bleu_3: 0.249
Bleu_4: 0.161
computing METEOR score...
METEOR: 0.283
computing Rouge score...
ROUGE_L: 0.401
computing CIDEr score...
CIDEr: 0.152
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.9

SPICE evaluation took: 7.754 s
SPICE: 0.232
✅ blip evaluation complete

🏆 OFFICIAL COCO EVALUATION RESULTS
   Model BLEU-1 BLEU-2 BLEU-3 BLEU-4 METEOR ROUGE-L  CIDEr  SPICE
     GIT 0.6250 0.4633 0.3208 0.2062 0.2972  0.4749 0.5401 0.2283
   BLIP2 0.5000 0.3177 0.1290 0.0000 0.1454  0.3642 0.3771 0.0999
VIT_GPT2 0.3541 0.1606 0.0689 0.0346 0.1884  0.3127 0.1211 0.1192
    BLIP 0.5339 0.3733 0.2493 0.1613 0.2827  0.4012 0.1519 0.2324

🥇 BEST PERFORMING MODELS:
--------------------------------------------------
BLEU-4    : GIT        (0.2062)
METEOR    : GIT        (0.2972)
ROUGE-L   : GIT        (0.4749)
CIDEr     : GIT        (0.5401)
SPICE     : BLIP       (0.2324)

💾 Results saved to:
  - official_coco_evaluation.json
  - official_evaluation_summary.csv
