In [2]:
!pip install numpy pillow torch torchvision tqdm

Collecting numpy
  Downloading numpy-2.3.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pillow
  Downloading pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting torch
  Downloading torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting filelock (from torch)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading numpy-2.3.3-cp313-cp313-macosx_14_0_arm64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
import json
import re
import string
from collections import Counter
from tqdm import tqdm

import numpy as np
from PIL import Image

import torch
import torch.nn as nn
from torchvision import models, transforms


In [4]:
# Input and output locations, all relative to the notebook's working directory.
DATA_JSON = "../dataset_rsicd.json"   # annotation JSON, one folder above the notebook
IMAGE_DIR = "../RSICD_images/"        # folder holding the image files, one folder above the notebook
OUTPUT_DIR = "../preprocessed/"       # every preprocessed artifact is written here, alongside the dataset

# Words occurring fewer than MIN_WORD_FREQ times across all captions are left
# out of the vocabulary (and are therefore encoded as <unk>).
MIN_WORD_FREQ = 5
# Fixed length of every encoded caption, counting the <start> and <end> tokens;
# shorter captions are padded with <pad>, longer ones are truncated.
MAX_SEQ_LEN = 20


In [5]:
import os

# Verify that the dataset inputs exist before doing any work. The feature
# extraction loop silently skips images it cannot find, so a wrong path would
# otherwise produce empty outputs instead of an error.
path_checks = [(path, os.path.exists(path)) for path in (DATA_JSON, IMAGE_DIR)]
for _, path_exists in path_checks:
    print(path_exists)   # should print True

missing_paths = [path for path, path_exists in path_checks if not path_exists]
if missing_paths:
    raise FileNotFoundError(f"Required dataset path(s) not found: {missing_paths}")

True
True


In [6]:
# Load the annotation file. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open(DATA_JSON, "r", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"Loaded {len(dataset['images'])} image entries")
print("Example entry keys:", dataset['images'][0].keys())

Loaded 10921 image entries
Example entry keys: dict_keys(['filename', 'imgid', 'sentences', 'split', 'sentids'])


In [7]:
import re, string

# Deletion table covering every character in string.punctuation. Building a
# regex character class from the raw punctuation string is fragile: inside
# "[...]" the sequence "\]" is parsed as an escaped "]", so the backslash
# itself was never removed.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_caption(caption):
    """Normalise a raw caption string for vocabulary building.

    The caption is lower-cased, every ASCII punctuation character
    (``string.punctuation``) is removed, and leading/trailing whitespace
    is stripped. Interior whitespace is left untouched.

    Args:
        caption: Raw caption text.

    Returns:
        The cleaned caption string.
    """
    return caption.lower().translate(_PUNCTUATION_TABLE).strip()

# Flatten every sentence of every image entry into a single list of cleaned
# caption strings (tokens are re-joined with spaces before cleaning).
all_captions = [
    clean_caption(" ".join(sentence['tokens']))
    for entry in dataset['images']
    for sentence in entry['sentences']
]

print("Sample cleaned caption:", all_captions[0])
print("Total captions:", len(all_captions))

Sample cleaned caption: many planes are parked next to a long building in an airport
Total captions: 54605


In [10]:
# Count how often each whitespace-delimited word occurs across all cleaned captions.
counter = Counter(token for caption in all_captions for token in caption.split())

# Only words seen at least MIN_WORD_FREQ times make it into the vocabulary.
words = [token for token, freq in counter.items() if freq >= MIN_WORD_FREQ]

# Indices 0-3 are reserved for the special tokens; retained words are numbered from 4.
word_map = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}
word_map.update((token, index) for index, token in enumerate(words, start=4))

# Inverse lookup: index -> word.
rev_word_map = dict(zip(word_map.values(), word_map.keys()))

print("Vocab size:", len(word_map))

# The output folder must exist before anything is written into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Persist both lookup tables. JSON object keys are always strings, so the
# integer keys of rev_word_map are stored as strings in idx_to_word.json.
vocab_files = (("word_to_idx.json", word_map), ("idx_to_word.json", rev_word_map))
for file_name, mapping in vocab_files:
    with open(os.path.join(OUTPUT_DIR, file_name), "w") as f:
        json.dump(mapping, f)



Vocab size: 1304


In [11]:
# Maps each image filename to a list of fixed-length token-id sequences,
# one sequence per caption of that image.
captions_dict = {}

# Look the special-token ids up once instead of on every caption.
start_id, end_id = word_map["<start>"], word_map["<end>"]
pad_id, unk_id = word_map["<pad>"], word_map["<unk>"]

for img_entry in dataset['images']:
    encoded_captions = []
    for sent in img_entry['sentences']:
        cleaned_words = clean_caption(" ".join(sent['tokens'])).split()
        # Words missing from the vocabulary are encoded as <unk>.
        word_ids = [word_map.get(word, unk_id) for word in cleaned_words]

        # Truncate so that <start> + words + <end> never exceeds MAX_SEQ_LEN,
        # then right-pad with <pad> up to exactly MAX_SEQ_LEN.
        sequence = [start_id, *word_ids[:MAX_SEQ_LEN-2], end_id]
        sequence += [pad_id] * (MAX_SEQ_LEN - len(sequence))

        encoded_captions.append(sequence)

    captions_dict[img_entry['filename']] = encoded_captions

# Write the encoded captions to disk.
with open(os.path.join(OUTPUT_DIR, "captions_tokens.json"), "w") as f:
    json.dump(captions_dict, f)

print("Saved tokenized captions.")


Saved tokenized captions.


In [12]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load ImageNet-pretrained ResNet-50. The `pretrained=True` flag is deprecated
# in torchvision; IMAGENET1K_V1 is the checkpoint that flag used to load.
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
modules = list(resnet.children())[:-1]  # remove last FC layer
resnet = nn.Sequential(*modules)
resnet = resnet.to(device)
resnet.eval()  # inference mode: the network is used only as a fixed feature extractor

# Preprocessing applied to every image: resize to 224x224, convert to a tensor,
# and normalise each channel with the mean/std values given below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


Using device: cpu




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/aishwaryshree/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████████████████████████████████| 97.8M/97.8M [00:11<00:00, 8.63MB/s]


In [13]:
# image_map: filename -> row index into the saved feature matrix.
image_map = {}
features = []
missing_images = []

with torch.no_grad():  # no gradients needed for feature extraction
    for img_entry in tqdm(dataset['images'], total=len(dataset['images'])):
        img_id = img_entry['filename']
        img_path = os.path.join(IMAGE_DIR, img_id)

        if not os.path.exists(img_path):
            # Remember skipped files so they can be reported instead of vanishing silently.
            missing_images.append(img_id)
            continue

        # The context manager closes the underlying file handle once the
        # pixels have been converted to a tensor.
        with Image.open(img_path) as img:
            img_tensor = transform(img.convert("RGB")).unsqueeze(0).to(device)

        feat = resnet(img_tensor).squeeze().cpu().numpy()

        # Use the row position inside `features`, not the dataset index:
        # the two diverge as soon as one image is skipped above.
        image_map[img_id] = len(features)
        features.append(feat)

features = np.array(features)

if missing_images:
    print(f"Skipped {len(missing_images)} missing image(s), e.g. {missing_images[:5]}")

# Save the feature matrix and the filename -> row mapping.
np.save(os.path.join(OUTPUT_DIR, "image_features.npy"), features)
with open(os.path.join(OUTPUT_DIR, "image_map.json"), "w") as f:
    json.dump(image_map, f)

print("Saved image features and mapping.")


100%|█████████████████████████████████████| 10921/10921 [05:22<00:00, 33.91it/s]

Saved image features and mapping.





In [14]:
# Group image filenames by the split recorded on each dataset entry.
splits = {split_name: [] for split_name in ("train", "val", "test")}

for entry in dataset['images']:
    splits[entry['split']].append(entry['filename'])

# Write one JSON file per split: train.json, val.json, test.json.
for split_name, filenames in splits.items():
    with open(os.path.join(OUTPUT_DIR, f"{split_name}.json"), "w") as f:
        json.dump(filenames, f)

print("Saved dataset splits.")


Saved dataset splits.
