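Sample a subset of images with their reference captions from the Mini COCO2014 dataset and split them into train/validation/test caption files. Dataset: https://www.kaggle.com/datasets/nagasai524/mini-coco2014-dataset-for-image-captioning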

In [1]:
import os 
import sys 
import json 
import numpy as np 
import pandas 
import random 
from collections import defaultdict
from nltk.tokenize import word_tokenize

In [2]:
# Load the caption annotations: a list of {'image_id', 'id', 'caption'} records.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform-dependent default encoding of open().
with open('captions.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
data[:2]

[{'image_id': 299675,
  'id': 328,
  'caption': 'A white square kitchen with tile floor that needs repairs '},
 {'image_id': 513461,
  'id': 572,
  'caption': 'A surfer, a woman, and a child walk on the beach.'}]

In [3]:
# Order the records by image_id so that all captions of one image are adjacent.
sorted_data = list(data)
sorted_data.sort(key=lambda record: record['image_id'])
print('\n'.join(map(str, sorted_data[:10])))

{'image_id': 9, 'id': 661611, 'caption': 'Closeup of bins of food that include broccoli and bread.'}
{'image_id': 9, 'id': 661977, 'caption': 'A meal is presented in brightly colored plastic trays.'}
{'image_id': 9, 'id': 663627, 'caption': 'there are containers filled with different kinds of foods'}
{'image_id': 9, 'id': 666765, 'caption': 'Colorful dishes holding meat, vegetables, fruit, and bread.'}
{'image_id': 9, 'id': 667602, 'caption': 'A bunch of trays that have different food.'}
{'image_id': 61, 'id': 444409, 'caption': 'They are brave for riding in the jungle on those elephants.'}
{'image_id': 61, 'id': 446671, 'caption': 'SOME PEOPLE IN THE WOODS RIDING TWO ELEPHANTS'}
{'image_id': 61, 'id': 452062, 'caption': 'Some people who are riding on top of elephants.'}
{'image_id': 61, 'id': 452272, 'caption': 'there are people riding elephants in the middle of a forest'}
{'image_id': 61, 'id': 455584, 'caption': 'Several elephants in the jungle carrying people on their backs'}


In [4]:
# Group the reference captions by image: {image_id: [tokenized caption, ...]}.
# The records are already sorted by image_id; at most five captions are kept
# per image because some images come with more than five.
my_data = defaultdict(list)
for record in sorted_data:
    references = my_data[record['image_id']]
    if len(references) != 5:
        tokens = word_tokenize(record['caption'])
        references.append(' '.join(tokens))

my_data[9]

['Closeup of bins of food that include broccoli and bread .',
 'A meal is presented in brightly colored plastic trays .',
 'there are containers filled with different kinds of foods',
 'Colorful dishes holding meat , vegetables , fruit , and bread .',
 'A bunch of trays that have different food .']

In [5]:
# Sanity check: total number of kept captions next to 5 * (number of images);
# the two values match when every image has exactly five captions.
cnt = sum(map(len, my_data.values()))
cnt, len(my_data) * 5

(93915, 93915)

In [6]:
# Randomly choose a fixed-size subset of images for the image captioning project.
# A dedicated, seeded generator makes the selection identical on every
# Restart & Run All, so captions_coco.txt and everything derived from it is
# reproducible; it also leaves the global `random` state untouched.
image_ids = list(my_data.keys())
n = 8200  # number of images to sample
random_img_ids = random.Random(42).sample(image_ids, n)

# Store the captions of every sampled image in captions_coco.txt: a header row
# followed by one `image,"caption"` row per caption.
with open('captions_coco.txt', 'w') as file:
    file.write('image,caption\n')
    for img_id in random_img_ids:
        # File name: image id left-padded with zeros to 12 characters.
        image_name = 'COCO_train2014_' + str(img_id).rjust(12, '0') + '.jpg'
        for caption in my_data[img_id]:
            file.write(f'{image_name},"{caption}"\n')

In [7]:
from sklearn.model_selection import train_test_split

# Captions file: a header row followed by `image,caption` rows
CAPTIONS_FILE = 'captions_coco.txt'
with open(CAPTIONS_FILE, 'r') as f:
    captions_data = list(f)[1:]  # drop the header row

### Split dataset to 3 datasets: train (0.8), validation (0.1), test (0.1)

# Group the captions by image: everything before the first comma is the image
# name, the remainder of the row is the caption.
image_captions_dict = defaultdict(list)
for row in captions_data:
    image_name, caption_text = row.strip().split(',', 1)
    image_captions_dict[image_name].append(caption_text)

# Each image appears exactly once as a key
unique_image_ids = [*image_captions_dict]

# Split the images (not the individual captions): hold out 20% first,
# then cut the hold-out in half for validation and test.
train_imgs, val_test_imgs = train_test_split(
    unique_image_ids, test_size=0.2, random_state=42
)
val_imgs, test_imgs = train_test_split(
    val_test_imgs, test_size=0.5, random_state=42
)

# Function to get captions and image IDs for each set
def get_captions_and_ids(image_ids_subset, captions_by_image=None):
    """Flatten the captions of the given images into two parallel lists.

    Args:
        image_ids_subset: Iterable of image identifiers to collect captions for.
        captions_by_image: Optional mapping from image identifier to its list
            of captions. When omitted, the notebook-level
            ``image_captions_dict`` is used.

    Returns:
        A ``(subset_captions, subset_image_ids)`` tuple of equal-length lists
        where ``subset_image_ids[i]`` is the image that ``subset_captions[i]``
        belongs to. Captions follow the order of ``image_ids_subset`` and,
        within one image, the order stored in the mapping.
    """
    if captions_by_image is None:
        # Backward-compatible default: read the mapping built at notebook level.
        captions_by_image = image_captions_dict

    subset_captions = []
    subset_image_ids = []
    for img_id in image_ids_subset:
        captions = captions_by_image[img_id]
        subset_captions.extend(captions)
        # Repeat the id once per caption so both lists stay aligned.
        subset_image_ids.extend([img_id] * len(captions))
    return subset_captions, subset_image_ids

# Flatten each split into parallel caption / image-id lists
train_captions, train_image_ids = get_captions_and_ids(train_imgs)
val_captions, val_image_ids = get_captions_and_ids(val_imgs)
test_captions, test_image_ids = get_captions_and_ids(test_imgs)

# Write train.txt, val.txt and test.txt: one `image_id,caption` line per caption
split_outputs = (
    ('train.txt', train_image_ids, train_captions),
    ('val.txt', val_image_ids, val_captions),
    ('test.txt', test_image_ids, test_captions),
)
for split_path, split_ids, split_captions in split_outputs:
    with open(split_path, 'w') as out:
        out.writelines(
            f"{image_id},{caption}\n"
            for image_id, caption in zip(split_ids, split_captions)
        )
