In [None]:
# Mount Google Drive at /content/drive so the data paths used by later cells resolve.
# Colab-only: google.colab is not available in other Jupyter environments.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install open_clip_torch matplotlib

Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-3.2.0


In [None]:
import open_clip
import torch
import torch.nn.functional as F
from PIL import Image
import json, os
from pathlib import Path
from tqdm import tqdm
import numpy as np

In [None]:
# --- Device / helpers ---
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Autocast (mixed precision) is only turned on when running on CUDA.
amp_enabled = device == "cuda"

In [None]:
# --- Setup (paths) ---
# All inputs and outputs live under the mounted Google Drive data folder.

# nocaps annotation JSON; its "images" list determines which files get captioned.
ANN_PATH = "/content/drive/MyDrive/data/nocap_val_4500_captions.json"
# Directory holding the image files; each entry's "file_name" is resolved against it.
IMG_DIR = "/content/drive/MyDrive/data/images"
# Output file that receives the generated captions.
CAPTION_JSON = "/content/drive/MyDrive/data/captions.json"
# NOTE(review): presumably the captions augmented with CLIP scores; not used in the cells shown here — confirm.
CAPTION_CLIPSCORE_JSON = "/content/drive/MyDrive/data/captions_with_clipscores.json"
# NOTE(review): presumably the captions with a hybrid score; not used in the cells shown here — confirm.
CAPTION_HYBRID_JSON = "/content/drive/MyDrive/data/captions_hybrid_scored.json"

In [None]:
def clean_caption(tokens):
    """Turn generated token ids into a plain, whitespace-normalized caption.

    The text returned by ``open_clip.decode`` still carries the special
    markers. Everything from the first ``<end_of_text>`` onward is discarded,
    every ``<start_of_text>`` marker is removed, and any run of whitespace
    (including leading/trailing) is collapsed to a single space.

    Args:
        tokens: Token ids for one caption, in whatever form
            ``open_clip.decode`` accepts.

    Returns:
        The cleaned caption string (possibly empty).
    """
    decoded = open_clip.decode(tokens)
    # Keep only the text in front of the first end marker.
    body, _, _ = decoded.partition("<end_of_text>")
    # split() with no argument drops leading/trailing whitespace as well.
    words = body.replace("<start_of_text>", "").split()
    return " ".join(words)

# --- Model variants you requested ---
# Each architecture is paired with each pretrained tag, architecture-major:
# (B-32, laion2b), (B-32, mscoco), (L-14, laion2b), (L-14, mscoco).
VARIANTS = [
    (arch, tag)
    for arch in ("coca_ViT-B-32", "coca_ViT-L-14")
    for tag in ("laion2b_s13b_b90k", "mscoco_finetuned_laion2b_s13b_b90k")
]

# --- Load all models & transforms once (to avoid reloading per image) ---
# `models` holds one (model_name, ckpt, model, transform) tuple per variant,
# in the same order as VARIANTS.
models = []
for model_name, ckpt in VARIANTS:
    # The second returned value is not needed here and is discarded.
    model, _, transform = open_clip.create_model_and_transforms(
        model_name, pretrained=ckpt
    )
    # Move to the target device and switch to evaluation mode.
    model = model.to(device).eval()
    loaded_entry = (model_name, ckpt, model, transform)
    models.append(loaded_entry)

print(f"Loaded {len(models)} CoCa variants on {device}.")

open_clip_pytorch_model.bin:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

open_clip_pytorch_model.bin:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

open_clip_pytorch_model.bin:   0%|          | 0.00/2.55G [00:00<?, ?B/s]

open_clip_pytorch_model.bin:   0%|          | 0.00/2.55G [00:00<?, ?B/s]

Loaded 4 CoCa variants on cpu.


In [None]:
# --- Read nocaps json ---
# Upper bound on how many images are captioned; set to None to process all of them.
LIMIT = 1000

# Decode as UTF-8 explicitly instead of relying on the platform's locale default.
with open(ANN_PATH, "r", encoding="utf-8") as f:
    nocaps = json.load(f)

# A missing "images" key yields an empty list rather than raising KeyError.
images = nocaps.get("images", [])
if LIMIT is not None:
    images = images[:LIMIT]

print(f"Will caption {len(images)} image(s).")

Will caption 1000 image(s).


In [None]:
# --- Generate captions ---
# `results` gets one entry per successfully captioned image:
#   {"file_name": <str>, "captions": [<one caption per entry in `models`, same order>]}
results = []
missing_files = 0  # listed in the annotations but not found under IMG_DIR
failed = 0         # skipped because loading the image or generating a caption raised

for img_info in tqdm(images, desc="Captioning"):
    fname = img_info["file_name"]
    fpath = Path(IMG_DIR) / fname
    if not fpath.exists():
        missing_files += 1
        continue

    captions = []
    try:
        # Manage the source file explicitly; only the in-memory RGB copy is
        # needed once the conversion is done.
        with Image.open(fpath) as raw:
            pil = raw.convert("RGB")

        for (model_name, ckpt, model, transform) in models:
            # Each variant brings its own preprocessing transform.
            pixel = transform(pil).unsqueeze(0).to(device)
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast;
            # it is a no-op here unless amp_enabled is True (CUDA only).
            with torch.no_grad(), torch.autocast(device_type=device, enabled=amp_enabled):
                tokens = model.generate(pixel)
            cap = clean_caption(tokens[0])
            captions.append(cap)
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(cap)
    except Exception as e:
        failed += 1
        # Keep going, but report which image was skipped and why instead of
        # silently discarding the error.
        tqdm.write(f"[skipped] {fname}: {type(e).__name__}: {e}")
        continue

    results.append({
        "file_name": fname,
        "captions": captions
    })

  with torch.no_grad(), torch.cuda.amp.autocast(enabled=amp_enabled):


this photo is one of the first photos i have of my great - great - great great great great great great great great great great great great great
a little boy that is standing up with a bat .
1 9 5 0 - 0 4 - 0 1 - baby - in - front - of - house - 0 1 . jpg


Captioning:   0%|          | 1/1000 [02:55<48:43:57, 175.61s/it]

an old black and white photo of a little boy .
an elephant in the national park in south india stock photo - 2 2 2 2 2 2 2 2
an elephant with a seat on its back .
elephant in chiang mai , northern thailand


Captioning:   0%|          | 2/1000 [04:48<38:23:39, 138.50s/it]

an elephant is standing in the dirt with a saddle on it 's back .
photo of a collection of coffee and other coffee products . photo is part of a collection of coffee and other coffee products . photo is part of a
a table topped with an open air blender and a container of food .
energy drinks , caffeine , caffeine , caffeine , caffeine , energy drinks , energy drinks , caffeine , caffeine , caffeine , caffeine , caffeine , caffeine ,


Captioning:   0%|          | 3/1000 [07:36<42:08:38, 152.18s/it]

a table topped with cans of coffee and other drinks .
how to store and care for fresh , organic food in a small space .
a table topped with lots of green and orange vegetables .
fresh green parsley on the shelf in the supermarket stock fotó - 1 1 9 4 4 3 1 6 4


Captioning:   0%|          | 4/1000 [09:58<41:01:55, 148.31s/it]

a close up of a bunch of green leafy vegetables .
blue jay on a tree in the spring stock photo - 2 3 2 2 2 2 2 2
a bird is perched on a branch in the forest .
blue jay on nest with eggs


Captioning:   0%|          | 5/1000 [11:48<37:07:03, 134.30s/it]

a blue jay sitting on a tree branch in a tree .
wall art - painting - the old string set up for the new year 2 0 1 2 by jenny armitage
several brown vases of various sizes and colors .
violins in a row


Captioning:   1%|          | 6/1000 [13:31<34:11:03, 123.81s/it]

a row of violins hanging on the wall .
1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
a white cake decorated with flowers and blue and pink icing .
japanese wedding cake with figurines on the top . jpg


Captioning:   1%|          | 7/1000 [15:37<34:17:56, 124.35s/it]

a three tiered white cake decorated with little figurines .
2 0 1 2 world paracycling champs day one
a close - up of a motorcycle with a side mirror .
photo of wheelchair racer in action .


Captioning:   1%|          | 8/1000 [17:17<32:11:18, 116.81s/it]

a man riding a bicycle down the middle of the street .
new york city , united states - 2 0 1 2 : an old white wall with an open front door in an alley in new york city .
an old cabinet and microwave sitting on a sidewalk .
a garage in the lower east side of new york city .


Captioning:   1%|          | 9/1000 [19:30<33:30:28, 121.72s/it]

a garage that has been torn down on the side of the street .
2 0 1 2 - world 's first high - performance jet - ski is ready for action
a man on a surfboard in the water .
jetski rental and tours in cape town , south africa


Captioning:   1%|          | 10/1000 [21:17<32:15:37, 117.31s/it]

a man on a jet ski in the water .
the world 's best photos of carshow and carshow - flickr ...
a white and black bus and some people and a building
bugatti veyron spotted in paris


Captioning:   1%|          | 11/1000 [22:58<30:46:37, 112.03s/it]

a man standing next to a silver car on a trailer .
my new work space . i love my new work space . i love my new work space . i love my new work space . i love my
a computer monitor sitting on top of a wooden desk .
my desk at work . i 'm not sure what i 'd do with it , but it 's pretty cool .


Captioning:   1%|          | 12/1000 [25:32<34:15:40, 124.84s/it]

a desk with a computer , laptop , and wine glass .
photo - the 2 0 1 2 college world series is set for may 1 2 - 1 4 in oklahoma city . photo provided < strong ></
a baseball player is swinging a bat at a ball .
unc wilmington baseball vs . georgia southern


Captioning:   1%|▏         | 13/1000 [27:29<33:35:07, 122.50s/it]

a man swinging a baseball bat on a field .
photo - a u . s . marine and a u . s . marine from the 1 st light armored reconnaissance company , 1 st marine division
two men are working on a piece of wood .
canadian army personnel on a military vehicle . photo : canadian armed forces . ( cnw group / canadian armed forces ) ( cnw group / canadian


Captioning:   1%|▏         | 14/1000 [30:08<36:36:03, 133.63s/it]

two men standing on top of an armored vehicle .
2 0 1 2 world of outlaws top 1 0 cars
a group of motorcycles parked next to each other .
kyosho ultima rb 6 1 / 1 0 4 wd off - road competition electric buggy kit


Captioning:   2%|▏         | 15/1000 [32:28<37:06:24, 135.62s/it]

a red and black toy car and a red and white toy car on the floor .
two snow leopards on top of a rock in a park . stock photo - 1 1 2 2 2 2 2 2 2
a flock of birds sitting on top of a large rock .
snow leopards at the san diego zoo safari park


Captioning:   2%|▏         | 16/1000 [34:31<35:59:27, 131.67s/it]

a pair of snow leopard laying on top of a rock ledge .
new 2 0 1 5 new design new design new design new design new design new design new design new design new design new design new design new design
a bicycle with a basket on the front and a blender on the back .
test : cube reaction gtc 2 0 1 4


Captioning:   2%|▏         | 17/1000 [36:35<35:20:14, 129.41s/it]

a person riding a bike on a green floor .
the st . regis singapore is one of the top 1 0 best luxury hotels in the world for 2 0 1 5 .
an outdoor area that has several tables and chairs set up in the middle of the room .
the base sukhumvit 7 7 by compass hospitality


Captioning:   2%|▏         | 18/1000 [38:40<34:58:14, 128.20s/it]

there is a lot of furniture on the deck .
image of a black and silver camera with a red and white light spot on the top of the body and a red and white light spot on the
a person is on a bike on a subway train .
rent manfrotto 5 0 0 fluid video head and 7 0 - 2 0 0 f / 2 . 8 lens


Captioning:   2%|▏         | 19/1000 [41:19<37:27:42, 137.47s/it]

a camera on top of a tripod with a camera attached to it .
a white flower with a white flower in front of a white background stock photo - 1 3 2 2 2 2 2 2
a pair of shoes that have a cat in them .
white easter lily flower on a pile of silver coins stock fotó - 1 1 9 7 4 3 1 8 0


Captioning:   2%|▏         | 20/1000 [43:47<38:12:29, 140.36s/it]

a white flower sitting on top of coins .
2 0 1 1 1 1 2 2 - 1 1 2 2 1 1 - u 1 8 u 1 8 u 1 8 u 1 8 u
a soccer player is trying to control the ball .
soccer player kicking the soccer ball on the field


Captioning:   2%|▏         | 20/1000 [45:30<37:09:36, 136.51s/it]


KeyboardInterrupt: 

In [None]:
# --- Save ---
# ensure_ascii=False writes non-ASCII caption text verbatim, so the file
# encoding is pinned to UTF-8 rather than left to the platform's locale default
# (which could produce non-UTF-8 JSON or raise UnicodeEncodeError).
with open(CAPTION_JSON, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nDone. Saved {len(results)} items to: {CAPTION_JSON}")
print(f"Missing files: {missing_files} | Failed during generation: {failed}")
