In [None]:
import numpy as np
import pandas as pd
from PIL import Image
import pickle
import string
import os
from numpy import array
import glob
from pickle import dump, load
from time import time
from tqdm import tqdm

import torch
import tensorflow as tf
import torchvision.transforms as transforms
import keras
from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras.preprocessing import sequence, image
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector, Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras import Input, layers, optimizers

import matplotlib.pyplot as plt
%matplotlib inline

import random
!pip install -U nltk
!pip install nltk==3.5 
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('wordnet')

In [None]:
# Read the three Flickr8k split files. Each one is split into lines, and the
# entries are used further down as image names / keys into the caption table.
# `with` closes every handle as soon as its content has been read (the handles
# were previously left open for the lifetime of the kernel).
with open("/kaggle/input/flickr-8k/Flickr8k_text/Flickr8k_text/Flickr_8k.trainImages.txt") as train_f:
    train_image_paths = train_f.read().splitlines()

with open("/kaggle/input/flickr-8k/Flickr8k_text/Flickr8k_text/Flickr_8k.testImages.txt") as test_f:
    test_image_paths = test_f.read().splitlines()

with open("/kaggle/input/flickr-8k/Flickr8k_text/Flickr8k_text/Flickr_8k.valImages.txt") as val_f:
    val_image_paths = val_f.read().splitlines()

# Display one entry as a quick sanity check.
val_image_paths[0]

In [None]:
# Read the raw caption file; `with` guarantees the handle is closed.
with open("/kaggle/input/flickr-8k/Flickr8k_text/Flickr8k_text/Flickr8k.token.txt") as cap_f:
    captns = cap_f.read().splitlines()

# Map each image key to the list of its raw captions.
captions = {}

for cap in captns:
    # Split on the first tab only, so a caption that itself contains a tab
    # does not break the two-way unpacking. A line without any tab still
    # raises ValueError, which surfaces malformed input.
    img, temp = cap.split("\t", 1)
    # The last two characters of the first field are dropped to obtain the key
    # (presumably a "#<n>" caption-number suffix -- TODO confirm against the file).
    captions.setdefault(img[:-2], []).append(temp)

# Translation table that deletes every character in string.punctuation.
table = str.maketrans('', '', string.punctuation)

# Normalise each caption in place. Per token: lower-case it, strip punctuation,
# then keep it only if it is longer than one character and purely alphabetic
# (this drops leftovers such as a dangling 's' or 'a' and anything with digits).
for caption_list in captions.values():
    for position, raw_caption in enumerate(caption_list):
        stripped = (token.lower().translate(table) for token in raw_caption.split())
        kept = [token for token in stripped if len(token) > 1 and token.isalpha()]
        # Store the cleaned caption back as a single space-separated string.
        caption_list[position] = ' '.join(kept)

In [None]:
# Restrict the caption table to each split. The caption lists are the very
# same list objects stored in `captions` (no copy is made).
train_cap = {name: captions[name] for name in train_image_paths}
test_cap = {name: captions[name] for name in test_image_paths}
val_cap = {name: captions[name] for name in val_image_paths}

# Display the captions of one validation image as a sanity check.
val_cap['2090545563_a4e66ec76b.jpg']

In [None]:
# Wrap every training caption in the 'startseq' / 'endseq' marker tokens.
# The check makes the cell idempotent: re-running it no longer wraps captions
# that already carry both markers a second time.
for caption_list in train_cap.values():
    for i, caption in enumerate(caption_list):
        already_wrapped = caption.startswith('startseq ') and caption.endswith(' endseq')
        if not already_wrapped:
            caption_list[i] = 'startseq ' + caption + ' endseq'

In [None]:
# val_cap, test_cap and train_cap are dictionaries whose keys are image names and whose values are the list of captions for the corresponding image.

## Code for creating vision embeddings

In [None]:
# # Load the inception v3 model
# model = InceptionV3(weights='imagenet')
# # Create a new model, by removing the last layer (output layer) from the inception v3
# model_new = Model(model.input, model.layers[-2].output)

In [None]:
# def encode(img):
#     img = Image.open(img)
#     img = img.resize((299,299))
#     img = np.asarray(img)
#     img = np.expand_dims(img, axis=0)
#     img = preprocess_input(img)
#     img_enc = model_new.predict(img)
#     img_enc = np.reshape(img_enc, img_enc.shape[1])
#     return img_enc

In [None]:
# train_encoding = {}
# for img in tqdm(train_image_paths):
#     train_encoding[img] = encode("/kaggle/input/flickr-8k/Flicker8k_Images/Flicker8k_Images/" + img)

In [None]:
# val_encoding = {}
# for img in tqdm(val_image_paths):
#     val_encoding[img] = encode("/kaggle/input/flickr-8k/Flicker8k_Images/Flicker8k_Images/" + img)

In [None]:
# test_encoding = {}
# for img in tqdm(test_image_paths):
#     test_encoding[img] = encode("/kaggle/input/flickr-8k/Flicker8k_Images/Flicker8k_Images/" + img)

In [None]:
# with open('/kaggle/working/train_vis_embeddings.pickle', 'wb') as handle:
#     pickle.dump(train_encoding, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('/kaggle/working/val_vis_embeddings.pickle', 'wb') as handle:
#     pickle.dump(val_encoding, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('/kaggle/working/test_vis_embeddings.pickle', 'wb') as handle:
#     pickle.dump(test_encoding, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Reading vision embeddings

In [None]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed image encodings for each split, read from the attached dataset.
train_encoding = _read_pickle('/kaggle/input/vr-project-embeddings/train_vis_embeddings.pickle')
val_encoding = _read_pickle('/kaggle/input/vr-project-embeddings/val_vis_embeddings.pickle')
test_encoding = _read_pickle('/kaggle/input/vr-project-embeddings/test_vis_embeddings.pickle')

## Word embeddings

In [None]:
# Vocabulary size of captions

# Distinct whitespace-separated tokens seen in the training captions ...
vocabulary = set()
for cap_list in train_cap.values():
    for sentence in cap_list:
        vocabulary.update(sentence.split())
print('Train Vocabulary Size: %d' % len(vocabulary))

# ... then extended with the tokens of the validation and test captions.
for split_captions in (val_cap, test_cap):
    for cap_list in split_captions.values():
        for sentence in cap_list:
            vocabulary.update(sentence.split())
print('Total Vocabulary Size: %d' % len(vocabulary))

In [None]:
# Every token of every training caption, in caption order.
words_list = [
    token
    for values in train_cap.values()
    for caption in values
    for token in caption.split(" ")
]

word_count_threshold = 10

# Number of occurrences of each token across the training captions.
word_count = {}
for word in words_list:
    word_count[word] = word_count.get(word, 0) + 1

# modified_vocab keeps only the words that occur at least
# word_count_threshold times in the captions of the train images.
modified_vocab = [word for word, freq in word_count.items() if freq >= word_count_threshold]
len(modified_vocab)

In [None]:
# Number the retained words from 1 upwards, in vocabulary order, and build the
# lookup in both directions. No word is assigned index 0.
indexToWord = dict(enumerate(modified_vocab, start=1))
wordToIndex = {word: idx for idx, word in indexToWord.items()}

# One more than the number of indexed words, so that index 0 is also counted.
vocab_size = len(indexToWord) + 1

In [None]:
# Find the maximum number of words in any of the train captions.

def max_caption_length(captions):
    """Return the token count of the longest caption.

    Args:
        captions: dict mapping an image name to a list of caption strings.

    Returns:
        The largest number of whitespace-separated tokens found in any caption.

    Raises:
        ValueError: if ``captions`` holds no caption at all (``max`` of an
            empty sequence).
    """
    return max(
        len(caption.split())
        for caption_list in captions.values()
        for caption in caption_list
    )

# determine the maximum sequence length
# The helper has its own name so that binding the integer result to
# `max_length` does not overwrite the function that computed it.
max_length = max_caption_length(train_cap)
print('Max Description Length: %d' % max_length)

In [None]:
# data generator, intended to be used in a call to model.fit_generator()
def dataGenerator(imageCaptions, images, wordToIndex, max_length, num_images_per_batch, num_classes=None):
    """Yield training batches of (image encoding, caption prefix) -> next word.

    Every caption is encoded with ``wordToIndex`` (words missing from the
    mapping are skipped) and expanded into one sample per prefix: the prefix is
    padded to ``max_length`` and the word that follows it is one-hot encoded.

    Args:
        imageCaptions: dict mapping an image name to its list of captions.
        images: mapping from image name to that image's encoding.
        wordToIndex: dict mapping a word to its integer index.
        max_length: length every caption prefix is padded to.
        num_images_per_batch: number of images whose samples form one batch.
            The number of samples per batch therefore varies.
        num_classes: width of the one-hot target. Defaults to the
            notebook-level ``vocab_size`` when omitted.

    Yields:
        ``[[X1, X2], y]`` where ``X1`` stacks the image encodings, ``X2`` the
        padded prefixes and ``y`` the one-hot next words.

    Raises:
        ValueError: if ``imageCaptions`` is empty or ``num_images_per_batch``
            is smaller than 1. Either case would otherwise loop forever
            without ever yielding a batch.
    """
    if num_images_per_batch < 1:
        raise ValueError("num_images_per_batch must be at least 1")
    if not imageCaptions:
        raise ValueError("imageCaptions is empty; there is nothing to generate")
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level global.
        num_classes = vocab_size

    X1, X2, y = list(), list(), list()
    n = 0
    # loop forever over the images; `n` is only reset when a batch is yielded,
    # so images left over at the end of one pass count towards the next batch
    while True:
        for img, imgCapList in imageCaptions.items():
            n += 1
            imgEnC = images[img]
            for cap in imgCapList:
                # encode the sequence
                seq = [wordToIndex[word] for word in cap.split(' ') if word in wordToIndex]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = tf.keras.preprocessing.sequence.pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=num_classes)[0]
                    X1.append(imgEnC)
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == num_images_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n = 0

In [None]:
# Load Glove vectors

complete_word_embeddings = {}

# Each line is split on whitespace: the first field is the word, the remaining
# fields are parsed as its float32 vector. The `with` block closes the file
# even if parsing a line raises.
with open('/kaggle/input/glove6b200d/glove.6B.200d.txt', encoding="utf-8") as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        complete_word_embeddings[word] = coefs

print('Found %s word vectors.' % len(complete_word_embeddings))

In [None]:
word_embedding_dimension = 200

# One row per word index. Rows stay all-zero for index 0 and for every word
# that has no entry in the loaded GloVe table.
req_word_embeddings = np.zeros((vocab_size, word_embedding_dimension))

for token, row in wordToIndex.items():
    vector = complete_word_embeddings.get(token)
    if vector is None:
        continue
    req_word_embeddings[row] = vector

## Model

In [None]:
# NOTE(review): the order in which these layers are created may affect their
# position in model.layers; keep the statement order unchanged unless the
# cells that consume the model have been checked.

# Image branch: a 2048-value feature vector per image, passed through dropout
# and projected to 256 units.
imgInput = Input(shape=(2048,))
imgDropout = Dropout(0.5)(imgInput)
imgDense = Dense(256, activation='relu')(imgDropout)
# Text branch: a caption prefix of max_length word indices. mask_zero=True makes
# the Embedding treat index 0 (the padding value) as masked. The cells that
# build the Model load pretrained vectors into this Embedding layer.
wordInput = Input(shape=(max_length,))
wordEmb = Embedding(vocab_size, word_embedding_dimension, mask_zero=True)(wordInput)
wordDropout = Dropout(0.5)(wordEmb)
wordNN = LSTM(256)(wordDropout)
# Merge: element-wise sum of the two 256-unit branches, followed by a dense
# layer and a softmax with one output per word index (vocab_size outputs).
combineImgWord = keras.layers.add([imgDense, wordNN])
combineDense = Dense(256, activation='relu')(combineImgWord)
outputs = Dense(vocab_size, activation='softmax')(combineDense)


In [None]:
model = Model(inputs=[imgInput, wordInput], outputs=outputs)

# Find the Embedding layer by type instead of by a hard-coded position in
# model.layers, so the lookup keeps working if layers are added or reordered.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
# Initialise it with the pretrained word vectors and freeze it. `trainable`
# is set before compile() so that the compiled model honours it.
embedding_layer.set_weights([req_word_embeddings])
embedding_layer.trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

## Training

In [None]:
# Training schedule. In the visible notebook these values are only referenced
# by the commented-out training loop.
epochs = 20
# Batch size counted in images (passed to dataGenerator in that loop).
number_imgs_per_batch = 3
# Generator steps per epoch; the integer division drops any remainder.
steps = len(train_cap) // number_imgs_per_batch

In [None]:
# for i in range(epochs):
#     generator = dataGenerator(train_cap, train_encoding, wordToIndex, max_length, number_imgs_per_batch)
#     model.fit_generator(generator, epochs = 1, steps_per_epoch = steps, verbose = 1)
#     model.save('/kaggle/working/model_weights/model_' + str(i) + '.h5')

In [None]:
# Rebuild the model from the graph tensors defined above.
model = Model(inputs=[imgInput, wordInput], outputs=outputs)

# Find the Embedding layer by type instead of by a hard-coded position in
# model.layers, so the lookup keeps working if layers are added or reordered.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([req_word_embeddings])
embedding_layer.trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Restore the weights
model.load_weights('/kaggle/input/model-weights/model_19.h5')

## Inference

In [None]:
def greedySearch(img, model, max_length = 34):
    """Generate a caption for one image by greedy decoding.

    Starting from 'startseq', the caption so far is encoded with the
    notebook-level ``wordToIndex``, padded to ``max_length`` and passed to
    ``model.predict`` together with ``img``. The highest-scoring index is
    mapped back through ``indexToWord`` and appended. Decoding stops when
    'endseq' is predicted, when the predicted index has no word, or after
    ``max_length`` steps.

    Args:
        img: image input forwarded unchanged to ``model.predict`` (the callers
            in this notebook pass an array reshaped to (1, 2048)).
        model: object exposing ``predict([img, padded_sequence], verbose=0)``.
        max_length: maximum number of decoding steps and the length the word
            input is padded to. NOTE(review): this presumably has to equal the
            sequence length the model was built with -- confirm that 34
            matches it.

    Returns:
        The generated caption as a space-separated string, without the
        'startseq' and 'endseq' markers.
    """
    pred_caption = 'startseq'
    for _ in range(max_length):
        token_ids = [wordToIndex[w] for w in pred_caption.split() if w in wordToIndex]
        padded = tf.keras.preprocessing.sequence.pad_sequences([token_ids], maxlen=max_length)
        pred_index = int(np.argmax(model.predict([img, padded], verbose=0)))
        # Word indices start at 1, so an argmax of 0 (the padding index) has
        # no entry in indexToWord. Treat that as the end of the caption
        # instead of raising KeyError.
        word = indexToWord.get(pred_index)
        if word is None or word == 'endseq':
            break
        pred_caption += ' ' + word
    # Drop the leading 'startseq'; 'endseq' is never appended.
    return ' '.join(pred_caption.split()[1:])
    

In [None]:
# Caption a single training image: the one at position 512 of the encoding table.
imgName = list(train_encoding.keys())[512]
imgEncoding = train_encoding[imgName].reshape((1,2048))

# Show the picture itself ...
image_file = "/kaggle/input/flickr-8k/Flicker8k_Images/Flicker8k_Images/" + imgName
picture = plt.imread(image_file)
plt.imshow(picture)
plt.show()

# ... followed by the caption produced by greedy decoding.
predicted_caption = greedySearch(imgEncoding, model)
print("Greedy search output:", predicted_caption)

In [None]:
def evaluate_model(image_names, image_embeddings, image_cap, model, seq):
    """Average sentence-level BLEU and METEOR of greedy captions over a set of images.

    For every image the caption predicted by ``greedySearch`` is scored
    against all reference captions of that image.

    Args:
        image_names: iterable of image names to evaluate.
        image_embeddings: mapping from image name to an encoding that can be
            reshaped to (1, 2048).
        image_cap: mapping from image name to its list of reference captions.
        model: captioning model handed to ``greedySearch``.
        seq: when true, the first and last token of every reference are
            dropped (for references wrapped in 'startseq' / 'endseq').

    Returns:
        ``(mean_bleu, mean_meteor)``. ``(0.0, 0.0)`` when ``image_names`` is
        empty; this previously raised ZeroDivisionError.
    """
    b_scores = 0.0
    m_scores = 0.0
    count = 0
    for image_name in tqdm(image_names):
        img_end = image_embeddings[image_name].reshape((1, 2048))
        prediction = greedySearch(img_end, model)

        # References in the two shapes the metrics are called with below:
        # token lists for BLEU and plain strings for METEOR.
        reference_tokens = []
        for reference in image_cap[image_name]:
            tokens = reference.split()
            if seq:
                tokens = tokens[1:-1]
            reference_tokens.append(tokens)
        reference_sentences = [' '.join(tokens) for tokens in reference_tokens]

        b_scores += sentence_bleu(reference_tokens, prediction.split())
        # NOTE(review): meteor_score receives plain strings here. The notebook
        # installs nltk==3.5; newer nltk releases presumably require
        # pre-tokenized input -- confirm before upgrading.
        m_scores += meteor_score(reference_sentences, prediction)
        count += 1

    if count == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, 0.0
    return b_scores/count, m_scores/count

In [None]:
# Score the test split. seq=False because only the train_cap captions were
# wrapped in 'startseq' / 'endseq'; the test references are used as they are.
b, m = evaluate_model(test_image_paths, test_encoding, test_cap, model, False)

In [None]:
# Mean sentence-level BLEU over the evaluated test images.
print("BLEU score = ", b)

In [None]:
# Mean METEOR over the evaluated test images.
print("METEOR score = ", m)