In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install open_clip_torch pillow torch torchvision

Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-3.2.0


In [None]:
!pip -q install open_clip_torch pillow tqdm

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import json, numpy as np, torch, torch.nn.functional as F
import open_clip
from tqdm import tqdm
from pathlib import Path

# === PATHS (EDIT) ===
JSON_IN  = "/content/drive/MyDrive/data/captions_with_clipscores.json"
IMG_DIR  = "/content/drive/MyDrive/data/images"
JSON_OUT = "/content/drive/MyDrive/data/captions_with_logp.json"
ALPHA    = 1.0

VARIANTS = [
    ("coca_ViT-B-32", "laion2b_s13b_b90k"),
    ("coca_ViT-B-32", "mscoco_finetuned_laion2b_s13b_b90k"),
    ("coca_ViT-L-14", "laion2b_s13b_b90k"),
    ("coca_ViT-L-14", "mscoco_finetuned_laion2b_s13b_b90k"),
]

device = "cuda" if torch.cuda.is_available() else "cpu"

# Safety: ensure default stays float32
torch.set_default_dtype(torch.float32)
try:
    torch.set_float32_matmul_precision("medium")
except Exception:
    pass

def zscore(x):
    x = np.asarray(x, float)
    return (x - x.mean()) / (x.std() + 1e-8)S

In [9]:
# ---------- Load ALL variants once, move to device + float32 ----------
models, preprocesses, tokenizers = [], [], []
print("Loading CoCa variants → device:", device)
for name, ckpt in VARIANTS:
    m, _, pp = open_clip.create_model_and_transforms(name, pretrained=ckpt, device=device)
    # Force full float32 (avoids mixed-dtype LayerNorm on CPU)
    m.to(device=device, dtype=torch.float32)
    m.eval()
    tok = open_clip.get_tokenizer(name)
    models.append(m); preprocesses.append(pp); tokenizers.append(tok)
    print(f"  ✓ {name} [{ckpt}] dtype={next(m.parameters()).dtype}, device={next(m.parameters()).device}")


Loading CoCa variants → device: cuda


open_clip_pytorch_model.bin:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

  ✓ coca_ViT-B-32 [laion2b_s13b_b90k] dtype=torch.float32, device=cuda:0


open_clip_pytorch_model.bin:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

  ✓ coca_ViT-B-32 [mscoco_finetuned_laion2b_s13b_b90k] dtype=torch.float32, device=cuda:0


open_clip_pytorch_model.bin:   0%|          | 0.00/2.55G [00:00<?, ?B/s]

  ✓ coca_ViT-L-14 [laion2b_s13b_b90k] dtype=torch.float32, device=cuda:0


open_clip_pytorch_model.bin:   0%|          | 0.00/2.55G [00:00<?, ?B/s]

  ✓ coca_ViT-L-14 [mscoco_finetuned_laion2b_s13b_b90k] dtype=torch.float32, device=cuda:0


Generating log logP(c∣I) for one Image

In [None]:
import torch, torch.nn.functional as F
from PIL import Image

# Assumes you already have:
# models       = [m0, m1, m2, m3]
# preprocesses = [pp0, pp1, pp2, pp3]
# tokenizers   = [tok0, tok1, tok2, tok3]
# device = "cuda" if torch.cuda.is_available() else "cpu"

@torch.no_grad()
def coca_logp(model, preprocess, tokenizer, image_path, caption, device="cuda"):
    # --- image tensor (float32) ---
    img = preprocess(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device=device, dtype=torch.float32)

    # --- tokens ---
    toks = tokenizer([caption]).to(device)        # [1, L]
    inp, targ = toks[:, :-1], toks[:, 1:]         # teacher forcing shift

    # 1) Preferred: model returns caption logits directly
    logits = None
    try:
        out = model(img, toks, return_loss=False)
        if isinstance(out, dict) and "caption_logits" in out:
            logits = out["caption_logits"]        # [1, L-1, V]
    except Exception:
        pass

    # Helper: get image tokens [B, N, D] for the decoder
    def get_image_tokens():
        # Try explicit tokens API
        try:
            # Some open_clip CoCa builds support return_tokens=True
            x = model.encode_image(img, return_tokens=True)  # [B, N, D]
            if x.dim() == 3:
                return x
        except Exception:
            pass
        # Try visual module
        if hasattr(model, "visual"):
            try:
                v = model.visual(img)  # might be [B, N, D] or [B, D]
                if v.dim() == 3:
                    return v
                if v.dim() == 2:
                    return v.unsqueeze(1)  # [B, 1, D]
            except Exception:
                pass
        # Fallback: pooled embedding
        v = model.encode_image(img)
        if v.dim() == 3:
            return v
        if v.dim() == 2:
            return v.unsqueeze(1)          # [B, 1, D]
        raise RuntimeError("Could not obtain image tokens")

    # 2) Fallback: run decoder with EMBEDDINGS (not token IDs)
    if logits is None and hasattr(model, "text_decoder"):
        image_tokens = get_image_tokens()          # [1, N, D]

        # Locate token & positional embeddings
        tok_emb = getattr(model, "token_embedding", None)
        if tok_emb is None and hasattr(model, "text"):
            tok_emb = getattr(model.text, "token_embedding", None)

        pos_emb = getattr(model, "positional_embedding", None)
        if pos_emb is None and hasattr(model, "text"):
            pos_emb = getattr(model.text, "positional_embedding", None)

        if tok_emb is None or pos_emb is None:
            raise RuntimeError("Missing token/positional embeddings for CoCa build.")

        text_embs = tok_emb(inp)                   # [1, L-1, D]
        # Add positional embeddings
        if pos_emb.dim() == 2:                     # [context, D]
            text_embs = text_embs + pos_emb[: text_embs.shape[1]].unsqueeze(0).to(device)
        else:                                      # [1, context, D]
            text_embs = text_embs + pos_emb[:, : text_embs.shape[1], :].to(device)

        # Ensure correct ranks for attention: [B, T, D] and [B, N, D]
        if text_embs.dim() != 3:
            text_embs = text_embs.unsqueeze(0)
        if image_tokens.dim() == 2:
            image_tokens = image_tokens.unsqueeze(1)

        logits = model.text_decoder(image_tokens, text_embs)  # [1, L-1, V]

    if logits is None:
        raise RuntimeError("Could not obtain caption logits from CoCa.")

    logits = logits.to(dtype=torch.float32)

    # --- token log-probs ---
    log_probs  = F.log_softmax(logits, dim=-1)
    token_logp = torch.gather(log_probs, -1, targ.unsqueeze(-1)).squeeze(-1)  # [1, L-1]
    nonpad     = (targ != 0).float()
    logp_sum   = (token_logp * nonpad).sum(1)                                 # [1]
    T          = nonpad.sum(1).clamp_min(1)
    logp_mean  = (logp_sum / T).item()
    return logp_mean, logp_sum.item()

def score_four_captions(models, preprocesses, tokenizers, image_path, captions, device="cuda"):
    assert len(models) == len(preprocesses) == len(tokenizers) == 4
    assert len(captions) == 4
    # Make sure models are on the right device & fp32 once
    for m in models:
        m.to(device=device, dtype=torch.float32).eval()

    lpm_list, lps_list = [], []
    for i, cap in enumerate(captions):
        lpm, lps = coca_logp(models[i], preprocesses[i], tokenizers[i], image_path, cap, device=device)
        lpm_list.append(lpm); lps_list.append(lps)
    return {"logp_mean": lpm_list, "logp_sum": lps_list}

# -------------------------
# Example usage (fill paths/captions)
# -------------------------
image_path = "/content/drive/MyDrive/data/images/0032257bf3cd56d0.jpg"
captions = [
  "an elephant in the national park in south india stock photo - 2 2 2 2 2 2 2 2",
  "an elephant with a seat on its back .",
  "elephant in chiang mai , northern thailand",
  "an elephant is standing in the dirt with a saddle on it 's back ."
]

scores = score_four_captions(models, preprocesses, tokenizers, image_path, captions, device=device)
print("logp_mean:", [round(x, 4) for x in scores["logp_mean"]])
print("logp_sum :", [round(x, 4) for x in scores["logp_sum"]])


logp_mean: [-9.8956, -7.1789, -13.7126, -8.7052]
logp_sum : [-207.8078, -71.789, -109.7005, -139.283]


Generating log logP(c∣I) for every captions

In [22]:
# ===== Read previous captions =====
with open(JSON_IN, "r") as f:
    items = json.load(f)

results = []
missing, failed = 0, 0
for it in tqdm(items, desc="Scoring with CLIP"):
    fname = it["file_name"]
    imgpath = Path(IMG_DIR) / fname
    if not imgpath.exists():
        missing += 1
        continue

    caps = it.get("captions", [])
    if not caps:
        continue

    try:
        with Image.open(imgpath).convert("RGB") as pil:
            # scores = clipscore_image_text(pil, caps, batch_size=8)
            ss=score_four_captions(models, preprocesses, tokenizers, imgpath, caps, device=device)
    except Exception as e:
        failed += 1
        continue

    # Find best caption
    # best_idx   = int(np.argmax(scores))
    # best_cap   = caps[best_idx]
    # best_score = float(scores[best_idx])

    results.append({
        "file_name": fname,
        "captions": caps,
        "logp_mean": scores["logp_mean"],
        "logp_sum": scores["logp_sum"],
    })

Scoring with CLIP: 100%|██████████| 4000/4000 [1:02:03<00:00,  1.07it/s]


In [23]:
with open(JSON_OUT, "w") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("Wrote ->", JSON_OUT)

Wrote -> /content/drive/MyDrive/data/captions_with_logp.json


In [7]:
import json, numpy as np

# --- inputs/outputs ---
JSON_IN_WITH_CLIP = "/content/drive/MyDrive/data/captions_with_clipscores.json"   # original items (has clipscores)
JSON_IN_WITH_LOGP = "/content/drive/MyDrive/data/captions_with_logp.json"         # your 'results' list saved
JSON_OUT_HYBRID   = "/content/drive/MyDrive/data/captions_with_hybrid_2.json"

ALPHA = 2  # try 0.5, 1.0, 2.0 on a small val split

def zscore(x):
    x = np.asarray(x, float)
    return (x - x.mean()) / (x.std() + 1e-8)

# (optional) if you want min-max instead of z-score:
def minmax(x):
    x = np.asarray(x, float)
    mn, mx = x.min(), x.max()
    return (x - mn) / (mx - mn + 1e-8)

# --- load ---
with open(JSON_IN_WITH_CLIP, "r") as f:
    items = json.load(f)
with open(JSON_IN_WITH_LOGP, "r") as f:
    scored = json.load(f)

# index scored by file_name
scored_by_name = {e["file_name"]: e for e in scored}

hybrid = []
skipped = 0

for it in items:
    fname = it["file_name"]
    caps  = it.get("captions", [])
    clips = it.get("clipscores", None)
    sc    = scored_by_name.get(fname)

    if sc is None or clips is None or not caps:
        skipped += 1
        continue

    # sanity: lengths must match
    if len(caps) != len(sc["logp_mean"]) or len(caps) != len(clips):
        skipped += 1
        continue

    # ---- normalize per image (choose ONE: zscore OR minmax) ----
    L = zscore(sc["logp_mean"])   # CoCa faithfulness/fluency
    S = zscore(clips)             # CLIP alignment
    # L = minmax(sc["logp_mean"]); S = minmax(clips)  # <- alternative

    # ---- combine ----
    hybrid_scores = (L + ALPHA * S).tolist()
    best_idx = int(np.argmax(hybrid_scores))

    hybrid.append({
        "file_name": fname,
        "captions": caps,
        "clipscores": clips,
        "logp_mean": sc["logp_mean"],
        "logp_sum": sc["logp_sum"],
        "z_logp_mean": L.tolist(),
        "z_clipscores": S.tolist(),
        "alpha": ALPHA,
        "hybrid_scores": hybrid_scores,
        "hybrid_best_index": best_idx,
        "hybrid_best_caption": caps[best_idx],
        "hybrid_best_score": float(hybrid_scores[best_idx]),
    })

with open(JSON_OUT_HYBRID, "w") as f:
    json.dump(hybrid, f, ensure_ascii=False, indent=2)

print(f"Wrote -> {JSON_OUT_HYBRID} | images processed: {len(hybrid)} | skipped: {skipped}")
# peek
print(json.dumps(hybrid[0], indent=2)[:800], "...\n")


Wrote -> /content/drive/MyDrive/data/captions_with_hybrid_2.json | images processed: 4000 | skipped: 0
{
  "file_name": "0013ea2087020901.jpg",
  "captions": [
    "this photo is one of the first photos i have of my great - great - great great great great great great great great great great great great great",
    "a little boy that is standing up with a bat .",
    "1 9 5 0 - 0 4 - 0 1 - baby - in - front - of - house - 0 1 . jpg",
    "an old black and white photo of a little boy ."
  ],
  "clipscores": [
    0.23179174959659576,
    0.2581939101219177,
    0.28768110275268555,
    0.31926512718200684
  ],
  "logp_mean": [
    -9.895607948303223,
    -7.178898811340332,
    -13.71255874633789,
    -8.705184936523438
  ],
  "logp_sum": [
    -207.80776977539062,
    -71.78898620605469,
    -109.70046997070312,
    -139.282958984375
  ],
  "z_logp_mean": [
    -0.009328389546995839,
    1.1 ...



Creating one file with baseline caption and hybrid best caption

In [15]:
!pip -q install open_clip_torch pillow tqdm

from tqdm import tqdm

JSON_IN_HYBRID   = "/content/drive/MyDrive/data/captions_with_hybrid.json"
JSON_OUT_HYBRID2 = "/content/drive/MyDrive/data/baseline_vs_hybrid.json"

# ===== Read previous captions =====
with open(JSON_IN_HYBRID, "r") as f:
    items = json.load(f)

results = []
missing, failed = 0, 0
for it in tqdm(items, desc="Scoring with CLIP"):
    fname = it["file_name"]

    baseline = it.get("captions", [])

    results.append({
        "file_name": fname,
        "baseline": baseline[3],
        "hybrid_best_caption": it["hybrid_best_caption"]
    })

Scoring with CLIP: 100%|██████████| 4000/4000 [00:00<00:00, 596226.45it/s]


In [16]:
with open(JSON_OUT_HYBRID2, "w") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Wrote -> {JSON_OUT_HYBRID} | images processed: {len(results)} | skipped: {skipped}")
# peek
print(json.dumps(results[0], indent=2)[:800], "...\n")

Wrote -> /content/drive/MyDrive/data/captions_with_hybrid_2.json | images processed: 4000 | skipped: 0
{
  "file_name": "0013ea2087020901.jpg",
  "baseline": "an old black and white photo of a little boy .",
  "hybrid_best_caption": "an old black and white photo of a little boy ."
} ...

