In [2]:
# Colab-only: mount Google Drive at /content/drive; the data paths configured
# further down in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! pip install open_clip_torch matplotlib

Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-3.2.0


In [4]:
import open_clip
import torch
import torch.nn.functional as F
from PIL import Image
import json, os
from pathlib import Path
from tqdm import tqdm
import numpy as np

In [5]:
# --- Device / helpers ---
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Flag that is True only when running on CUDA.
amp_enabled = device == "cuda"

In [7]:
# --- Setup (paths) ---
# Single root for every dataset artifact; change it here to relocate the data.
DATA_DIR = "/content/drive/MyDrive/data"

# Annotation file (not referenced by the cells visible in this section).
ANN_PATH = f"{DATA_DIR}/nocap_val_4500_captions.json"
# Directory holding the images; each item's file_name is resolved against it.
IMG_DIR = f"{DATA_DIR}/selected_images"
# Input: previously generated captions, one record per image.
CAPTION_JSON = f"{DATA_DIR}/captions.json"
# Output: the same records enriched with per-caption CLIP scores.
CAPTION_CLIPSCORE_JSON = f"{DATA_DIR}/captions_with_clipscores.json"
# Hybrid-scored captions file (not referenced by the cells visible in this section).
CAPTION_HYBRID_JSON = f"{DATA_DIR}/captions_hybrid_scored.json"

In [8]:
# Pick a CLIP backbone + weights (common options shown below)
# CLIP_MODEL is the architecture name and CLIP_CKPT the pretrained-weights tag;
# both are passed to open_clip.create_model_and_transforms when the model is loaded.
CLIP_MODEL   = "ViT-B-32"
CLIP_CKPT    = "openai"  # e.g., "openai" or "laion2b_s34b_b79k"

In [9]:
# ===== Load CLIP (NOT CoCa) =====
# Build the model together with its image preprocessing; the second returned
# value is discarded because only clip_preprocess is used for scoring.
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    CLIP_MODEL,
    pretrained=CLIP_CKPT,
)

# Move the weights to the selected device, then switch to inference mode.
# eval() stays the cell's last expression, so the module summary is still displayed.
clip_model = clip_model.to(device)
clip_model.eval()

open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]



CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [10]:
# ===== Helpers =====
def clipscore_image_text(pil_img, captions, batch_size=8):
    """
    Returns cosine similarities (list of floats) between one image and N captions.

    Args:
        pil_img: PIL image; it is run through `clip_preprocess` before encoding.
        captions: a single caption string or an iterable of caption strings.
        batch_size: number of captions tokenized and encoded per forward pass;
            must be at least 1.

    Returns:
        One float per caption, in caption order (each in roughly [-1, 1]).
        An empty list when there are no captions.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    # A non-positive step would either make range() raise an opaque error (0)
    # or silently yield no batches at all (negative), returning [] for real input.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string would otherwise be sliced into character chunks below and
    # scored chunk-by-chunk; treat it as one caption. list() also accepts any iterable.
    captions = [captions] if isinstance(captions, str) else list(captions)
    if not captions:
        return []  # nothing to score, so skip the image forward pass entirely

    sims = []
    with torch.no_grad():
        # Encode image once; it is reused for every caption batch.
        img = clip_preprocess(pil_img).unsqueeze(0).to(device)
        image_features = clip_model.encode_image(img)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # Encode text (batched)
        for start in range(0, len(captions), batch_size):
            batch_caps = captions[start:start + batch_size]
            tok = open_clip.tokenize(batch_caps).to(device)
            text_features = clip_model.encode_text(tok)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            # cosine similarity = dot product since both are L2-normalized
            sim = (image_features @ text_features.T).squeeze(0)  # shape [batch]
            sims.extend(sim.tolist())
    return sims


In [20]:
# ===== Read previous captions =====
# Each record is expected to carry a "file_name" and a list of "captions".
with open(CAPTION_JSON, "r", encoding="utf-8") as f:
    items = json.load(f)

results = []
missing, failed = 0, 0
no_captions = 0  # records skipped because they carry no captions
for it in tqdm(items, desc="Scoring with CLIP"):
    fname = it["file_name"]
    fpath = Path(IMG_DIR) / fname
    if not fpath.exists():
        missing += 1
        continue

    caps = it.get("captions", [])
    if not caps:
        no_captions += 1
        continue

    try:
        # Context-manage the opened file itself so its handle is released;
        # convert() returns an independent copy that stays usable afterwards.
        with Image.open(fpath) as src:
            pil = src.convert("RGB")
        scores = clipscore_image_text(pil, caps, batch_size=8)
    except Exception as e:
        # Best-effort run: keep going, but leave a trace of what went wrong.
        failed += 1
        tqdm.write(f"[failed] {fname}: {type(e).__name__}: {e}")
        continue

    # Find best caption (highest cosine similarity to the image)
    best_idx   = int(np.argmax(scores))
    best_cap   = caps[best_idx]
    best_score = float(scores[best_idx])

    results.append({
        "file_name": fname,
        "captions": caps,
        "clipscores": [float(s) for s in scores],  # cosine similarity per caption
        "best_index": best_idx,
        "best_caption": best_cap,
        "best_score": best_score
    })

# Only reported when it happened, so a clean run prints nothing extra.
if no_captions:
    print(f"Skipped {no_captions} items without captions")

Scoring with CLIP: 100%|██████████| 4000/4000 [1:36:56<00:00,  1.45s/it]


In [21]:
# ===== Save =====
# Write UTF-8 explicitly: ensure_ascii=False emits raw non-ASCII characters,
# which a non-UTF-8 platform default encoding could fail to encode.
with open(CAPTION_CLIPSCORE_JSON, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(results)} items -> {CAPTION_CLIPSCORE_JSON}")
print(f"Missing images: {missing} | Failed during scoring: {failed}")

# (Optional) quick peek at the first scored record
if results:
    print("\nExample:")
    print(json.dumps(results[0], indent=2, ensure_ascii=False))

Saved 4000 items -> /content/drive/MyDrive/data/captions_with_clipscores.json
Missing images: 0 | Failed during scoring: 0

Example:
{
  "file_name": "0013ea2087020901.jpg",
  "captions": [
    "this photo is one of the first photos i have of my great - great - great great great great great great great great great great great great great",
    "a little boy that is standing up with a bat .",
    "1 9 5 0 - 0 4 - 0 1 - baby - in - front - of - house - 0 1 . jpg",
    "an old black and white photo of a little boy ."
  ],
  "clipscores": [
    0.23179174959659576,
    0.2581939101219177,
    0.28768110275268555,
    0.31926512718200684
  ],
  "best_index": 3,
  "best_caption": "an old black and white photo of a little boy .",
  "best_score": 0.31926512718200684
}
