In [3]:
# Mount Google Drive so that the /content/drive/MyDrive/... paths used by the
# evaluation cell below resolve inside this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# one of these usually works (try in this order)
%pip install pycocoevalcap
# or
%pip install git+https://github.com/salaniz/pycocoevalcap
# or (original repo layout)
%pip install git+https://github.com/tylin/coco-caption.git#subdirectory=pycocoevalcap

Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycocoevalcap
Successfully installed pycocoevalcap-1.2
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-rez25ney
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-rez25ney
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/tylin/coco-caption.git#subdirectory=pycocoevalcap
  Cloning https://github.com/tylin/coco-caption.git to /tmp/pip-req-build-qpqot9oz
  Running command git clone --filter=blob:none --quie

Evaluating Results

In [22]:
import json
from pathlib import Path
from collections import defaultdict

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# Ground-truth caption annotations; read as raw JSON (for its "images" list)
# and also handed to pycocotools' COCO() loader.
ANN_PATH = "/content/drive/MyDrive/data/nocap_val_4500_captions.json"
# JSON collection of per-image records; each carries a "file_name" plus the
# "baseline" and "hybrid_best_caption" captions that are compared below.
PAIR_FILE = "/content/drive/MyDrive/data/baseline_vs_hybrid.json"
# Destinations of the COCO-style prediction files written by
# save_preds_coco_style (one file per captioning system).
OUT_BASELINE = "/content/drive/MyDrive/data/results_coca_baseline.json"
OUT_HYBRID   = "/content/drive/MyDrive/data/results_coca_hybrid.json"

# ---------- helpers ----------
def save_preds_coco_style(pairs, file2id, key, out_path):
    """Write one caption per known image as a COCO-style results JSON file.

    Args:
        pairs: Iterable of dict records; each must have a "file_name" entry and
            may have a caption stored under ``key``.
        file2id: Mapping from file name to ground-truth image id.
        key: Name of the record field holding the caption to export.
        out_path: Destination path of the JSON file.

    Returns:
        ``out_path``, unchanged.

    Records whose file name is absent from ``file2id`` are skipped and counted
    in a warning. A missing or empty caption is exported as "".
    """
    records = []
    skipped = 0
    for entry in pairs:
        name = entry["file_name"]
        if name in file2id:
            text = entry.get(key) or ""
            records.append(dict(image_id=file2id[name], caption=text.strip()))
        else:
            skipped += 1

    if skipped > 0:
        print(f"[WARN] {skipped} predictions had file_names not found in GT; skipped.")

    with open(out_path, "w") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"[OK] wrote {len(records)} predictions -> {out_path}")
    return out_path

def read_pred_ids(res_path):
    """Load a predictions JSON file.

    Args:
        res_path: Path to a JSON file holding a list of records, each with an
            "image_id" entry.

    Returns:
        A tuple ``(image_ids, records)``: the "image_id" of every record in
        file order (duplicates kept), and the parsed records themselves.
    """
    with open(res_path, "r") as handle:
        records = json.load(handle)

    image_ids = []
    for record in records:
        image_ids.append(record["image_id"])
    return image_ids, records

def eval_file_on_ids(coco, res_path, img_ids, label="ALL"):
    """Score one predictions file against the ground truth on a fixed image set.

    Args:
        coco: Ground-truth annotation object providing ``loadRes`` (a
            ``pycocotools.coco.COCO`` instance in this notebook).
        res_path: Path to a COCO-style predictions JSON file.
        img_ids: Image ids to score; passed to the evaluator unchanged.
        label: Short tag shown in the printed summary header.

    Returns:
        The evaluator's ``eval`` mapping of metric name -> score.
    """
    cocoRes = coco.loadRes(res_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    # Restrict scoring to the caller's ids so that different prediction files
    # are compared on exactly the same images.
    cocoEval.params["image_id"] = img_ids
    cocoEval.evaluate()

    print(f"\n=== Results [{label}] for {Path(res_path).name} ===")
    # pycocoevalcap registers BLEU under "Bleu_N" (see its own log lines); the
    # former "BLEU_N" spelling never matched, so BLEU was silently left out of
    # this summary.
    order = ["CIDEr", "SPICE", "Bleu_4", "METEOR", "ROUGE_L", "Bleu_3", "Bleu_2", "Bleu_1"]
    for m in order:
        # Scorers that did not run simply have no entry; skip them.
        if m in cocoEval.eval:
            print(f"{m:8s}: {cocoEval.eval[m]:.3f}")
    return cocoEval.eval

# ---------- 1) load data ----------
# Ground-truth annotations first, then the baseline/hybrid caption pairs.
with open(ANN_PATH, "r") as f:
    ann = json.loads(f.read())
with open(PAIR_FILE, "r") as f:
    pairs = json.loads(f.read())

# Lookup tables built from the GT image list:
#   file2id   : file_name -> image_id
#   id2domain : image_id  -> domain label ("unknown" when the image has none)
file2id = dict((img["file_name"], img["id"]) for img in ann["images"])
id2domain = dict((img["id"], img.get("domain", "unknown")) for img in ann["images"])

# ---------- 2) write COCO-style preds ----------
# One predictions file per system, both derived from the same caption pairs.
save_preds_coco_style(pairs=pairs, file2id=file2id, key="baseline", out_path=OUT_BASELINE)
save_preds_coco_style(pairs=pairs, file2id=file2id, key="hybrid_best_caption", out_path=OUT_HYBRID)

# ---------- 3) compute intersection of IDs ----------
# Read both prediction files back from disk and collect the ids they cover.
gt_ids = {img["id"] for img in ann["images"]}
base_ids, base_data = read_pred_ids(OUT_BASELINE)
hybr_ids, hybr_data = read_pred_ids(OUT_HYBRID)
base_ids, hybr_ids = set(base_ids), set(hybr_ids)

# Only images present in the GT and in both prediction files are comparable.
common_ids = gt_ids.intersection(base_ids, hybr_ids)

# Ids outside the GT ("extra") and GT ids without a prediction ("missing").
extra_in_base = len(base_ids.difference(gt_ids))
extra_in_hybr = len(hybr_ids.difference(gt_ids))
missing_base  = len(gt_ids.difference(base_ids))
missing_hybr  = len(gt_ids.difference(hybr_ids))

print("\n--- Coverage Report ---")
print(f"GT images         : {len(gt_ids)}")
print(f"Baseline preds    : {len(base_ids)}")
print(f"Hybrid preds      : {len(hybr_ids)}")
print(f"Intersection used : {len(common_ids)}")
print(f"Baseline extra(not in GT): {extra_in_base}, missing(from GT): {missing_base}")
print(f"Hybrid   extra(not in GT): {extra_in_hybr}, missing(from GT): {missing_hybr}")

# Freeze the intersection as a sorted list so every run scores ids in the same order.
common_ids = sorted(common_ids)

# ---------- 4) evaluate overall on intersection ----------
# Baseline is scored first, then hybrid, both on the shared id list.
coco = COCO(ANN_PATH)
overall_baseline, overall_hybrid = (
    eval_file_on_ids(coco, res_path, common_ids, label="ALL∩")
    for res_path in (OUT_BASELINE, OUT_HYBRID)
)

# ---------- 5) per-split evaluation on intersection ----------
# Group every GT image id under its domain label.
splits = defaultdict(list)
for img_id, dom in id2domain.items():
    splits[dom].append(img_id)

# Report the expected labels first, in a fixed order, then any other label that
# actually occurs in the annotations, so a split is never dropped just because
# its label is spelled differently from the ones listed here.
preferred_order = ["in-domain", "near-domain", "out-of-domain", "out-domain", "unknown"]
domain_order = [dom for dom in preferred_order if dom in splits]
domain_order += sorted((dom for dom in splits if dom not in preferred_order), key=str)

# Loop-invariant: build the membership set once instead of once per split.
common_id_set = set(common_ids)

# domain label -> {"baseline": metrics, "hybrid": metrics}; kept so the
# per-split scores remain available after the printed summaries scroll by.
split_results = {}
for dom in domain_order:
    split_ids = sorted(common_id_set.intersection(splits[dom]))
    if not split_ids:
        # No image of this domain is covered by both prediction files.
        continue
    split_results[dom] = {
        "baseline": eval_file_on_ids(coco, OUT_BASELINE, split_ids, label=f"{dom}∩"),
        "hybrid": eval_file_on_ids(coco, OUT_HYBRID, split_ids, label=f"{dom}∩"),
    }


[OK] wrote 4000 predictions -> /content/drive/MyDrive/data/results_coca_baseline.json
[OK] wrote 4000 predictions -> /content/drive/MyDrive/data/results_coca_hybrid.json

--- Coverage Report ---
GT images         : 4500
Baseline preds    : 4000
Hybrid preds      : 4000
Intersection used : 4000
Baseline extra(not in GT): 0, missing(from GT): 500
Hybrid   extra(not in GT): 0, missing(from GT): 500
loading annotations into memory...
Done (t=0.15s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 41984, 'reflen': 41380, 'guess': [41984, 37984, 33984, 29984], 'correct': [34465, 20709, 10677, 4949]}
ratio: 1.014596423392919
Bleu_1: 0.821
Bleu_2: 0.669
Bleu_3: 0.520
Bleu_4: 0.390
computing METEOR score...
METEOR: 0.290
computing Rouge score...
ROUGE_L: 0.578
computing CIDEr score...
CIDEr: 1.064
computing SPICE score...
SPICE: 0.148

=== Results [ALL∩] for

In [None]:
# Persist the overall baseline metrics in a sibling "*_metrics.json" file.
# OUT_BASELINE holds the COCO-style predictions that loadRes/read_pred_ids
# expect, so writing the metrics dict there would destroy that file.
baseline_metrics_path = Path(OUT_BASELINE).with_name(Path(OUT_BASELINE).stem + "_metrics.json")
with open(baseline_metrics_path, "w") as f:
    json.dump(overall_baseline, f, ensure_ascii=False, indent=2)
print(f"[OK] wrote metrics -> {baseline_metrics_path}")

In [None]:
# Persist the overall hybrid metrics in a sibling "*_metrics.json" file.
# OUT_HYBRID holds the COCO-style predictions that loadRes/read_pred_ids
# expect, so writing the metrics dict there would destroy that file.
hybrid_metrics_path = Path(OUT_HYBRID).with_name(Path(OUT_HYBRID).stem + "_metrics.json")
with open(hybrid_metrics_path, "w") as f:
    json.dump(overall_hybrid, f, ensure_ascii=False, indent=2)
print(f"[OK] wrote metrics -> {hybrid_metrics_path}")