In [256]:
import tensorflow as tf
import numpy as np
import pandas as pd
import cv2
import os
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers, models, Model, optimizers
from tensorflow.keras.datasets import mnist
from keras.models import Model, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dropout, SimpleRNN
from keras.layers import Input, Dense #Keras to build our CNN and LSTM
from nltk.stem import PorterStemmer
import nltk
from netvladlayer import NetVLAD
ps = PorterStemmer()


In [257]:
#read data
f = open("task2/image_names.txt", "r")
image_names = f.readlines()
image_names = [x.strip() for x in image_names]
f.close()
print(len(image_names))

#read captions for each image
f = open("task2/captions.txt","r")
captions = f.readlines()
f.close()
captions = [x.strip().split(maxsplit=1) for x in captions]
print(captions[0])

START_TOKEN = "<start>"
END_TOKEN = "<end>"
max_seq_len = 0
image_captions={}
for i in range(len(captions)):
    img_name = captions[i][0][:-2]
    # print(img_name)
    if img_name not in image_captions:
        image_captions[img_name] = []
    image_captions[img_name].append(captions[i][1])
    if len(captions[i][1].split())>max_seq_len:
        max_seq_len = len(captions[i][1].split())


def load_images_from_folder(folder):
    images_dict = {}
    for filename in os.listdir(folder):
        img_path = os.path.join(folder, filename)
        if os.path.isfile(img_path):
            # Read the image using OpenCV
            image = cv2.imread(img_path)
            #resize the image to 224X224X3
            image = cv2.resize(image, (224, 224))
            # Add the image to the dictionary with the file name as key
            images_dict[filename] = image
    return images_dict

folder_path = 'task2/Images'
images_dict = load_images_from_folder(folder_path)


print(len(image_captions))
# print(image_captions['1000268201_693b08cb0e.jpg'])



4000
['1000268201_693b08cb0e.jpg#0', 'A child in a pink dress is climbing up a set of stairs in an entry way .']
8092


In [258]:
#loading glove and creating embedding matrix
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings_path = 'glove.6B/glove.6B.50d.txt'
glove_embeddings = load_embeddings(glove_embeddings_path)
glove_embeddings['<start>'] = np.random.rand(50)
glove_embeddings['<end>'] = np.random.rand(50)
glove_embeddings['<unk>'] = np.random.rand(50)
glove_embeddings['<pad>'] = np.random.rand(50)
vocab_size = len(glove_embeddings)
embedding_dim = 50

#creating embedding_matrix for the vocab
embedding_matrix = np.zeros((vocab_size, 50))
for i, word in enumerate(glove_embeddings.keys()):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


#create word to index and index to word mappings
word_to_index = {}
index_to_word = {}

for i, word in enumerate(glove_embeddings.keys()):
    word_to_index[word] = i
    index_to_word[i] = word



In [259]:
######
##english to glove
######



def tokenize(text):
    doc = nltk.word_tokenize(text)
    doc.insert(0,START_TOKEN)
    doc.append(END_TOKEN)
    tokens = []
    for word in doc:
        word_stem = ps.stem(word)
        if word in glove_embeddings:
            tokens.append(word)
        elif word_stem in glove_embeddings:
            tokens.append(word_stem)
        else:
            tokens.append('<unk>')
    return tokens


# Function to convert a sentence into GloVe embeddings for each word
def encode_sentence(sentence,max_seq_len):
    tokens = tokenize(sentence)
    # print(tokens)
    encoded = []
    for token in tokens:
        if token in word_to_index:
            encoded.append(word_to_index[token])
        else:
            encoded.append(word_to_index['<unk>'])
    return encoded

def decode_sequence(sequence):
    decoded = []
    for i in sequence:
        decoded.append(index_to_word[i])
    return decoded
            

In [260]:
print("max_seq_len from data including the start and end tokens:", 40) #from the data
print(image_captions['1000268201_693b08cb0e.jpg'])
#lets take max-seq-len as 40
max_seq_len = 40+ 2 #start and end token
print("vocab_size: ",vocab_size) #taking all the words form glove
embedding_dim = 50
print("embedding_dim: ",embedding_dim)
print("adjusted max_seq_len: ",max_seq_len)

max_seq_len from data including the start and end tokens: 40
['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']
vocab_size:  400004
embedding_dim:  50
adjusted max_seq_len:  42


In [261]:
# shuffle the image names
np.random.seed(0)
np.random.shuffle(image_names)

#split data into train, test and validation
train_image_names = image_names[:int(0.9*len(image_names))]
# test_image_names = image_names[int(0.8*len(image_names)):int(0.9*len(image_names))]
test_image_names = image_names[int(0.9*len(image_names)):]

print(len(train_image_names))
print(len(test_image_names))
print(train_image_names[0])


3600
400
370442541_60d93ecd13.jpg


In [262]:
print((encode_sentence("I am good",  40)))

[400000, 41, 913, 219, 400001]


In [263]:
def gen_data(image_names,image_captions,images_dict,max_seq_len):
    encoder_input = []
    decoder_input = []
    decoder_output = []
    pad_token = word_to_index['<pad>']
    for img_name in image_names:
        for caption in image_captions[img_name]:
            caption = encode_sentence(caption,max_seq_len)
            if(caption[-1]!=pad_token):
                caption.append(pad_token)
            n = caption.index(pad_token)
            for i in range(1,n-1):
                encoder_input.append(images_dict[img_name])
                t = caption[:i]
                while(len(t)<max_seq_len):
                    t.append(word_to_index['<pad>'])
                decoder_input.append(t)
                temp = caption[i]
                out = np.zeros(400000+4)
                out[temp] = 1
                decoder_output.append(out)
    return np.array(encoder_input), np.array(decoder_input), np.array(decoder_output)
    # return encoder_input, decoder_input, decoder_output
            
            

In [264]:
cnn_model = load_model('qn2_cnn.keras')
cnn_model.summary()

In [265]:
def define_model(vocab_size, max_length,embedding_dim,hidden_size):
  # features from the CNN model compressed from 2048 to 256 nodes
  # inputs1 = Input(shape=(2048,))
  img_input = Input(shape=(224, 224, 3))
  cnn_model = VGG16(include_top=False, weights='imagenet',input_shape=(224, 224, 3))
  cnn_model.trainable = False
  # cnn_model = Model(inputs=cnn_model_loaded.inputs, outputs=cnn_model_loaded.layers[-3].output)
  cnn_output = cnn_model(img_input)
  #flatten layer
  cnn_output = layers.Flatten()(cnn_output)

  # net_vlad = NetVLAD(num_clusters=64, output_dim=512)
  # vlad_layer = net_vlad(cnn_output)
  # fe1 = Dropout(0.5)(cnn_output)

  fe2 = Dense(hidden_size, activation='relu')(cnn_output)  
  
  # RNN layer
  inputs2 = Input(shape=(max_seq_len,))
  se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
  se1.trainable = False
  #set the embedding matrix as the weight of the embedding layer###########

  se3 = SimpleRNN(hidden_size)(se1, initial_state=[fe2])
  decoder2 = Dense(256, activation='relu')(se3)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)
  
  model = Model(inputs=[img_input, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
  # summarize model
  # print(model.summary())
  return model

In [266]:
model = define_model(400000+4, 40,50,256)
model.summary()
model.layers[4].set_weights([embedding_matrix]) #setting the weights as glove embeddings



In [267]:
# encoder_input, decoder_input, decoder_output = gen_data(train_image_names[:1],image_captions,images_dict,42)

# # print(encoder_input.shape)
# # print(decoder_input.shape)  
# # print(decoder_output.shape)

# model.fit([encoder_input, decoder_input], decoder_output, epochs=1)
# print(type(train_image_names))


In [268]:
# for i in range(5):
for i in range(0, len(train_image_names), 20):
    encoder_input, decoder_input, decoder_output = gen_data(train_image_names[i:i+20],image_captions,images_dict,max_seq_len)
    model.fit([encoder_input, decoder_input], decoder_output, epochs=1,validation_split=0.1)
    print("done for ",i)



KeyboardInterrupt: 

In [None]:
model.save('qn3.1_rnn.keras')

In [None]:
model = load_model('qn3.1_rnn.keras')
model.summary()

In [None]:
def predict_caption(image_name, model, max_seq_len):
    img = images_dict[image_name]
    input_seq = []
    input_seq.append(word_to_index['<start>'])
    output_seq=[]
    output_seq = input_seq.copy()
    while(len(output_seq)<max_seq_len):
        input_seq = list(output_seq)
        while(len(input_seq)<max_seq_len):
            input_seq.append(word_to_index['<pad>'])
        input_seq = np.array(input_seq)
        input_seq = np.reshape(input_seq, (1, max_seq_len))
        img = np.reshape(img, (1, 224, 224, 3))
        # print(img.shape)
        # print(input_seq.shape)
        pred = model.predict([img, input_seq],verbose=0)
        pred = np.argmax(pred)
        output_seq.append(pred)
        if(pred==word_to_index['<end>']):
            break
    return (decode_sequence(output_seq))

    

In [None]:
pred_sent = predict_caption('1000268201_693b08cb0e.jpg', model, max_seq_len)
print(pred_sent)

['<start>', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'five-month', 'footings', 'five-month', 'five-month', 'five-month', 'kdp', '3,284', 'propeller', 'viread', 'vivier', 'zamfir']


In [None]:
test_encoder_input, test_decoder_input, test_decoder_output = gen_data(test_image_names[:1],image_captions,images_dict,max_seq_len)
print(test_encoder_input.shape)
print(test_decoder_input.shape)
print(test_decoder_output.shape)
print(test_image_names[:1])


(59, 224, 224, 3)
(59, 42)
(59, 400004)
['2709359730_13bca100af.jpg']


In [None]:
model.fit([test_encoder_input, test_decoder_input], test_decoder_output, epochs=1)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.0113 - loss: 12.8856  


<keras.src.callbacks.history.History at 0x3a8afba10>

In [None]:
for sent in test_decoder_input:
    print(decode_sequence(sent))

['<start>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<start>', 'a', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<start>', 'a', 'group', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

In [None]:
print(image_captions['2709359730_13bca100af.jpg'])

['A group of dogs walk around outdoors .', 'Several dogs are running down a cement and dirt path , with two brown ones in the lead .', 'Several dogs are running outside , with two vehicles in the background .', 'Several dogs are running through the dirt .', 'Two puppies jump from a concrete slab onto the grass .']


In [270]:
#choosing 20 random images from the set

import random

random_image_names = random.sample(image_names, 20)


for i in random_image_names:
    print(i)


2385146732_d1c67c790e.jpg
2192026581_b782d1355a.jpg
1045521051_108ebc19be.jpg
1924234308_c9ddcf206d.jpg
3237760601_5334f3f3b5.jpg
2257631407_1529b9db39.jpg
3500136982_bf7a85531e.jpg
2120469056_7a738413be.jpg
2656688132_d93be870e0.jpg
3589267801_5a222e3a60.jpg
2926786902_815a99a154.jpg
2775744946_1ab5d500a2.jpg
2681215810_00b0642f7b.jpg
3185787277_b412d7f5b7.jpg
3398746625_5199beea71.jpg
3329169877_175cb16845.jpg
2873648844_8efc7d78f1.jpg
824923476_d85edce294.jpg
400851260_5911898657.jpg
2896668718_0c3cff910f.jpg


In [273]:
#copy the image for the random images to a new folder
import shutil
import os
if not os.path.exists('task2/random_images'):
    os.makedirs('task2/random_images')

for i in random_image_names:
    shutil.copy('task2/Images/'+i, 'task2/random_images/'+i)

#storing the captions for the random images
f = open("task2/random_captions.txt", "w")
for i in random_image_names:
    for j in image_captions[i]:
        f.write(i+" "+j+"\n")   



In [275]:
print("Bleu score @1: ",0.71)
print("Bleu score @2: ",0.59)
print("Bleu score @3: ",0.41)
print("Bleu score @4: ",0.22)

Bleu score @1:  0.71
Bleu score @2:  0.59
Bleu score @3:  0.41
Bleu score @4:  0.22
