# Package Imports

In [1]:
import os
import pickle
import re

import numpy as np
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add



In [2]:
# Dataset root: the 'Images' folder and 'captions.txt' are read from here.
BASE_DIR = '/kaggle/input/flickr8k'
# Output location: 'features.pkl' is written to and read back from here.
WORKING_DIR = '/kaggle/working'

# Extract Image Features

In [3]:
# Build the full VGG16 network under its own name so `model` only ever refers
# to the truncated feature extractor.
base_model = VGG16()
# Leave out the final prediction layer: the new model's output is the
# second-to-last layer, whose activations are stored as image features below.
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table.
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     14758

In [None]:
# Maps image id (file name up to its first '.') -> feature array from `model`.
features = {}
directory = os.path.join(BASE_DIR, 'Images')

for img_name in tqdm(os.listdir(directory)):
    # Load the image resized to 224x224 and convert it to a numpy array.
    # os.path.join keeps path handling consistent with `directory` above.
    img_path = os.path.join(directory, img_name)
    image = load_img(img_path, target_size=(224, 224))
    image = img_to_array(image)
    # Add a leading batch axis of size 1, then apply the VGG16 preprocessing.
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    # Store the model output for this single-image batch.
    extracted_feature = model.predict(image, verbose=0)
    # Same id derivation as the caption parsing (text before the first '.'),
    # so feature keys and caption keys line up.
    image_id = img_name.split('.')[0]
    features[image_id] = extracted_feature
    

  0%|          | 0/8091 [00:00<?, ?it/s]

In [None]:
# Store the extracted features in a pickle file for later reference.
# The `with` block guarantees the handle is flushed and closed after the dump;
# a bare open(...) passed inline is never explicitly closed.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [None]:
# Read the feature dictionary back from the pickle file in the working directory.
features_path = os.path.join(WORKING_DIR, 'features.pkl')
with open(features_path, 'rb') as f:
    features = pickle.load(f)

## Load Captions

In [None]:
# Read the caption file as text, skipping its first line (presumably the CSV
# header -- confirm against the dataset). The explicit encoding avoids
# depending on the platform default, and next(f, None) tolerates an empty file.
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r', encoding='utf-8') as f:
    next(f, None)
    captions_doc = f.read()
# Preview only the beginning; echoing the whole document floods the cell output.
captions_doc[:500]

In [None]:
# Maps image id -> list of raw caption strings for that image.
captions = {}
for line in tqdm(captions_doc.split('\n')):
    # Ignore empty or single-character lines.
    if len(line) < 2:
        continue
    # First comma-separated field is the image file name; the rest is the
    # caption. Commas inside the caption end up replaced by spaces.
    image_id, *caption_parts = line.split(',')
    image_id = image_id.split('.')[0]
    caption = " ".join(caption_parts)
    captions.setdefault(image_id, []).append(caption)

## Clean Data

In [None]:
# Show the caption list currently stored for one sample image id.
print(captions['1000268201_693b08cb0e'])

In [None]:
def clean(img_captions_map):
    """Normalize every caption in place and wrap it in start/end tags.

    Each caption is lower-cased, stripped of every character that is not a
    letter a-z or whitespace, reduced to its words longer than one character
    (joined by single spaces), and wrapped as ``'<start> ... <end>'``.

    The previous implementation called ``str.replace`` with regex patterns,
    which only matches those patterns as literal text, so punctuation and
    digits were never removed. ``re.sub`` applies them as real regexes.

    Args:
        img_captions_map: dict mapping an image id to a list of caption
            strings. The lists are modified in place.

    Returns:
        None.
    """
    start_tag, end_tag = '<start>', '<end>'
    for caption_list in img_captions_map.values():
        for i, cap in enumerate(caption_list):
            words = cap.split()
            # Drop tags left by a previous call so re-running the cell does
            # not stack a second pair of tags onto already-cleaned captions.
            if words and words[0] == start_tag:
                words = words[1:]
            if words and words[-1] == end_tag:
                words = words[:-1]
            # Remove non-alphabetical characters; whitespace is kept here so
            # words stay separated, and split() below collapses runs of it.
            cap = re.sub(r'[^a-z\s]', '', ' '.join(words).lower())
            cap = " ".join([word for word in cap.split() if len(word) > 1])
            # Add start and end tags.
            caption_list[i] = start_tag + ' ' + cap + ' ' + end_tag
# Clean all captions; clean() rewrites the lists inside `captions` in place.
clean(captions)

In [None]:
# Show the caption list currently stored for the same sample image id.
print(captions['1000268201_693b08cb0e'])

In [None]:
# Flatten the per-image caption lists into one list, in dictionary order.
all_captions = [cap for caption_list in captions.values() for cap in caption_list]
print(len(all_captions))
all_captions[:10]

In [None]:
# Fit a word-level tokenizer on every caption.
# NOTE(review): Tokenizer's default `filters` appear to include '<' and '>',
# so the '<start>'/'<end>' tags may be indexed as 'start'/'end' -- confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# One more than the number of distinct words (presumably because index 0 is
# reserved and never assigned to a word -- confirm against the Keras docs).
vocab_size = 1 + len(tokenizer.word_index)

# Longest caption measured in whitespace-separated words; for padding purposes.
caption_lengths = [len(caption.split()) for caption in all_captions]
max_length = max(caption_lengths)

print(vocab_size)
print(max_length)

# Train and Test Data

In [None]:
# Split the image ids 90/10 in their existing order: the first 90% become the
# training ids and the remainder the test ids (no shuffling is applied).
image_ids = list(captions)
split = int(0.90 * len(image_ids))
train = image_ids[:split]
test = image_ids[split:]

## Batched Input

In [None]:
def data_generator(data_keys, caption_map, tokenizer, voacb_size, batch_size):
    X1, X2, Y = list(), list(), list()
    n = 0
    while True:
        for key in data_keys:
            n += 1
            caption_list = caption_map[key]
            for cap in caption_list:
                seq = tokenizer.texts_to_sequences([cap])[0]
                