In [2]:
from collections import defaultdict

def get_captions():
    cap_file = open("Flickr8k_text/Flickr8k.token.txt", "r")
    captions = defaultdict(list)
    for line in cap_file.readlines():
        img, caption = line.split("\t", maxsplit=1)
        captions[img.split('.')[0]].append(caption.strip())
    cap_file.close()
    return dict(captions)

In [3]:
captions = get_captions()
len(captions.keys())

8092

In [4]:
import re
for idx, caption_list in captions.items():
    for i in range(5):
        caption_list[i] = re.sub(r'[^\w\s]','',caption_list[i])
        caption_list[i] = caption_list[i].split()
        caption_list[i] = [word.lower() for word in caption_list[i]]
        caption_list[i] = [word for word in caption_list[i] if word.isalpha() and len(word) > 1]
        caption_list[i] =  ' '.join(caption_list[i])

In [5]:
original_vocabulary = set()
for idx, caption_list in captions.items():
    for caption in caption_list:
        original_vocabulary.update(caption.split())
print(f"Original vocabulary size: {len(original_vocabulary)}")

Original vocabulary size: 8763


In [6]:
train_captions = {}
with open("Flickr8k_text/Flickr_8k.trainImages.txt", "r") as f:
    for line in f.readlines():
        train_captions[line.split('.')[0]] = captions[line.split('.')[0]]
dev_captions = {}
with open("Flickr8k_text/Flickr_8k.devImages.txt", "r") as f:
    for line in f.readlines():
        dev_captions[line.split('.')[0]] = captions[line.split('.')[0]]
test_captions = {}
with open("Flickr8k_text/Flickr_8k.testImages.txt", "r") as f:
    for line in f.readlines():
        test_captions[line.split('.')[0]] = captions[line.split('.')[0]]

In [7]:
for img, caption_list in train_captions.items():
    for i in range(5):
        caption_list[i] = "startseq " + caption_list[i] + " endseq"

In [8]:
from collections import Counter
vocab_count = Counter()
for idx, caption_list in train_captions.items():
    for caption in caption_list:
        vocab_count.update(caption.split())
word_threshold = 10
vocab = [word for word in vocab_count if vocab_count[word] >= word_threshold]
len(vocab)

1651

In [9]:
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model

model = InceptionV3(weights='imagenet')
model = Model(model.input, model.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [10]:
import numpy as np
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing import image

encoded_train_images = {}
encoded_test_images = {}
cnt = 0

for img in train_captions:
    img_vector = image.load_img(f"Flicker8k_Dataset/{img}.jpg", target_size=(299, 299))
    img_vector = image.img_to_array(img_vector)
    img_vector = np.expand_dims(img_vector, axis=0)
    img_vector = preprocess_input(img_vector)
    img_vector = model.predict(img_vector)
    img_vector = np.reshape(img_vector, img_vector.shape[1])
    encoded_train_images[img] = img_vector
    cnt+=1
    if cnt%10 == 0:
        print(f"{cnt} train images encoded")

cnt = 0
for img in test_captions:
    img_vector = image.load_img(f"Flicker8k_Dataset/{img}.jpg", target_size=(299, 299))
    img_vector = image.img_to_array(img_vector)
    img_vector = np.expand_dims(img_vector, axis=0)
    img_vector = preprocess_input(img_vector)
    img_vector = model.predict(img_vector)
    img_vector = np.reshape(img_vector, img_vector.shape[1])
    encoded_test_images[img] = img_vector
    cnt+=1
    if cnt%10 == 0:
        print(f"{cnt} test images encoded")

10 train images encoded
20 train images encoded
30 train images encoded
40 train images encoded
50 train images encoded
60 train images encoded
70 train images encoded
80 train images encoded
90 train images encoded
100 train images encoded
110 train images encoded
120 train images encoded
130 train images encoded
140 train images encoded
150 train images encoded
160 train images encoded
170 train images encoded
180 train images encoded
190 train images encoded
200 train images encoded
210 train images encoded
220 train images encoded
230 train images encoded
240 train images encoded
250 train images encoded
260 train images encoded
270 train images encoded
280 train images encoded
290 train images encoded
300 train images encoded
310 train images encoded
320 train images encoded
330 train images encoded
340 train images encoded
350 train images encoded
360 train images encoded
370 train images encoded
380 train images encoded
390 train images encoded
400 train images encoded
410 train

In [11]:
import pickle

with open("encoded_train_images.pkl", "wb") as f:
    pickle.dump(encoded_train_images, f)

with open("encoded_test_images.pkl", "wb") as f:
    pickle.dump(encoded_test_images, f)