# Connecting google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Downloading Dataset

In [2]:
!wget "https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip"
!unzip -q flickr8k.zip -d ./flickr8k
!rm flickr8k.zip
!echo "Downloaded Flickr8k dataset successfully."

--2025-05-05 13:40:52--  https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/753516996/d7c62b13-1e50-40ea-8fae-f34a44b1695f?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250505%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250505T134052Z&X-Amz-Expires=300&X-Amz-Signature=9668bc20fd03d1d81da287645e4cad113bc91e8690545a7cb29b8c0b74015761&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dflickr8k.zip&response-content-type=application%2Foctet-stream [following]
--2025-05-05 13:40:52--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/753516996/d7c62b13-1e50-40ea-8fae-f34a44b1695f?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credent

# Importing neccessary librarys

In [3]:
from os import listdir
import numpy as np
from tqdm import tqdm
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Load Descriptions

In [5]:
with open('/content/flickr8k/captions.txt', 'r') as f:
  next(f) # continue the header
  captions_doc = f.read()

In [None]:
captions_doc.split('\n')[0]

'1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .'

# Mapping the image_id with it's Captions

In [6]:
def load_descption_to_dictionary(descriptions):
  map = {}
  for line in tqdm(descriptions.split("\n")):
    # description shpuld have atleast 2 words
    if len(line) < 2:
      continue

    # extract image_id, image descripton
    image_id, image_desc = line.split(",")[0], line.split(",")[1:]

    # extract image_id not it's extension
    image_id = image_id.split(".")[0]

    # Convert the caption list to a string by joining its elements
    caption = ' '.join(image_desc)

    # Create a list if the image ID is not already in the mapping dictionary
    if image_id not in map:
      map[image_id] = []

    # store the image_id and it's caption
    map[image_id].append(caption)

  return map

In [7]:
captions_dict = load_descption_to_dictionary(captions_doc)
captions_dict["1000268201_693b08cb0e"], len(captions_dict)

100%|██████████| 40456/40456 [00:00<00:00, 691738.19it/s]


(['A child in a pink dress is climbing up a set of stairs in an entry way .',
  'A girl going into a wooden building .',
  'A little girl climbing into a wooden playhouse .',
  'A little girl climbing the stairs to her playhouse .',
  'A little girl in a pink dress going into a wooden cabin .'],
 8091)

# Cleaning Captions

In [8]:
def clean_captions(captions_dict):
  for image, captions in tqdm(captions_dict.items()):
    for i in range(len(captions)):
      # Load captions one by one
      caption = captions[i]

      # Convert the caption to lowercase
      caption = caption.lower()

      # delete digits, special chars, etc.,
      caption = caption.replace('[^A-Za-z]', '')

      # delete additional spaces
      caption = caption.replace('\s+', ' ')

      # add start and end tags to the caption and remove word < length 2
      caption = '<startseq> ' + " ".join([word for word in caption.split() if len(word)>1]) + ' <endseq>'
      captions[i] = caption

  return

In [9]:
print("Before Cleaning")
captions_dict["1000268201_693b08cb0e"]

Before Cleaning


['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [10]:
clean_captions(captions_dict)
print("\nAfter Cleaning")
captions_dict["1000268201_693b08cb0e"]

100%|██████████| 8091/8091 [00:00<00:00, 50422.13it/s]


After Cleaning





['<startseq> child in pink dress is climbing up set of stairs in an entry way <endseq>',
 '<startseq> girl going into wooden building <endseq>',
 '<startseq> little girl climbing into wooden playhouse <endseq>',
 '<startseq> little girl climbing the stairs to her playhouse <endseq>',
 '<startseq> little girl in pink dress going into wooden cabin <endseq>']

# Creating list of captions

In [11]:
def create_caption_list(captions_dict):
  all_captions = []
  for imageId in tqdm(captions_dict):
    for caption in captions_dict[imageId]:
      all_captions.append(caption)
  return all_captions

In [12]:
all_captions = create_caption_list(captions_dict)
all_captions[:10], len(all_captions)

100%|██████████| 8091/8091 [00:00<00:00, 1017574.62it/s]


(['<startseq> child in pink dress is climbing up set of stairs in an entry way <endseq>',
  '<startseq> girl going into wooden building <endseq>',
  '<startseq> little girl climbing into wooden playhouse <endseq>',
  '<startseq> little girl climbing the stairs to her playhouse <endseq>',
  '<startseq> little girl in pink dress going into wooden cabin <endseq>',
  '<startseq> black dog and spotted dog are fighting <endseq>',
  '<startseq> black dog and tri-colored dog playing with each other on the road <endseq>',
  '<startseq> black dog and white dog with brown spots are staring at each other in the street <endseq>',
  '<startseq> two dogs of different breeds looking at each other on the road <endseq>',
  '<startseq> two dogs on pavement moving toward each other <endseq>'],
 40455)

# Create Tokenizer

In [13]:
def create_tokenizer(all_captions):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(all_captions)
  # print(tokenizer.word_index)
  return tokenizer

In [14]:
tokenizer = create_tokenizer(all_captions)

In [15]:
vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(caption.split())for caption in all_captions)

vocab_size, max_len

(8485, 35)

# Train Test Split

In [16]:
imageIds = list(captions_dict.keys())
split = int(len(imageIds) * 0.90)
train = imageIds[:split]
test = imageIds[split:]

# Save

In [18]:
import os
os.chdir('/content/drive/MyDrive/Image-Captioning')

In [19]:
with open("Preprocessing/tokenizer.pkl", "wb") as f:
  pickle.dump(tokenizer, f)

with open("Preprocessing/captions_dict.pkl", "wb") as f:
  pickle.dump(captions_dict, f)

with open("Preprocessing/all_captions.pkl", "wb") as f:
  pickle.dump(all_captions, f)