In [9]:
import sys
import numpy as np
from os import listdir
from pickle import dump
from pickle import load
from keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from collections import Counter
from keras.utils import to_categorical
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merging import add
from keras.utils import plot_model
from keras_preprocessing.sequence import pad_sequences
from keras.models import load_model
from PIL import Image

In [19]:
import sys
sys.modules['Image'] = Image 
from PIL import Image
import Image

In [20]:
base_model = VGG16(include_top=True)
base_model.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [21]:
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [22]:
features = dict()
dir_data = "C:\\Users\\aabhatia\\Downloads\\Flickr8k_Dataset\\FLicker8k_Dataset"

def get_features():
    count = 0
    for file in listdir(dir_data):
        img_path = dir_data + '\\' + file
        img = load_img(img_path, target_size=(224, 224)) 
        x = img_to_array(img) 
        x = np.expand_dims(x, axis=0) 
        x = preprocess_input(x)
        fc2_features = model.predict(x)
        name_id = file.split('.')[0]
        features[name_id] = fc2_features
        count = count + 1
        if count%100 == 0:
            print(count)
        #print(features[name_id])
    return features

#features = get_features()
#dump(features, open('features.pkl', 'wb'))

In [23]:
def load_data_set_ids(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    
    dataset = list()
    for image_id in text.split('\n'):
        if len(image_id) < 1:
            continue
     
        dataset.append(image_id)
    
    return set(dataset)

In [24]:
training_set = load_data_set_ids('C:\\Users\\aabhatia\\Downloads\\Flickr8k_text\\Flickr_8k.trainImages.txt')
dev_set = load_data_set_ids('C:\\Users\\aabhatia\\Downloads\\Flickr8k_text\\Flickr_8k.devImages.txt')
test_set = load_data_set_ids('C:\\Users\\aabhatia\\Downloads\\Flickr8k_text\\Flickr_8k.testImages.txt')

In [25]:
import string
filename = 'C:\\Users\\aabhatia\\Downloads\\Flickr8k_text\\Flickr8k.token.txt'
file = open(filename, 'r')
token_text = file.read()
file.close()

translator = str.maketrans("", "", string.punctuation) 
image_captions = dict()
image_captions_train = dict()
image_captions_dev = dict()
image_captions_test = dict()
image_captions_other = dict()
corpus = list() 
corpus.extend(['<START>', '<END>', '<UNK>']) 

max_imageCap_len = 0

for line in token_text.split('\n'):
    tokens = line.split(' ') 
    if len(line) < 2:
        continue
    image_id, image_cap = tokens[0], tokens[1:] 
    image_id = image_id.split('#')[0] 
    image_cap = ' '.join(image_cap) 

    image_cap = image_cap.lower() 
    image_cap = image_cap.translate(translator)
    
    image_cap = image_cap.split(' ') 
    image_cap = [w for w in image_cap if w.isalpha()] 
    image_cap = [w for w in image_cap if len(w)>1]
    image_cap = '<START> ' + ' '.join(image_cap) + ' <END>' 
    
    #update maximum caption length
    if len(image_cap.split()) > max_imageCap_len:
        max_imageCap_len = len(image_cap.split())
    
    #add to dictionary
    if image_id not in image_captions:
        image_captions[image_id] = list()
    image_captions[image_id].append(image_cap)
    
    #add to train/dev/test dictionaries
    if image_id in training_set:
        if image_id not in image_captions_train:
            image_captions_train[image_id] = list() 
        image_captions_train[image_id].append(image_cap)
        #print('For {0}, the caption is {1}'.format(image_id, image_cap))
        corpus.extend(image_cap.split())
        
    elif image_id in dev_set:
        if image_id not in image_captions_dev:
            image_captions_dev[image_id] = list() 
        image_captions_dev[image_id].append(image_cap)
        
    elif image_id in test_set:
        if image_id not in image_captions_test:
            image_captions_test[image_id] = list()
        #print('For {0}, the caption is {1}'.format(image_id, image_cap))
        image_captions_test[image_id].append(image_cap)
    else:
        if image_id not in image_captions_other:
            image_captions_other[image_id] = list()
        #print('For {0}, the caption is {1}'.format(image_id, image_cap))
        image_captions_other[image_id].append(image_cap)

caption_train_tokenizer = Tokenizer()
caption_train_tokenizer.fit_on_texts(corpus)
    
fid = open("image_captions.pkl","wb")
dump(image_captions, fid)
fid.close()

fid = open("image_captions_train.pkl","wb")
dump(image_captions_train, fid)
fid.close()

fid = open("image_captions_dev.pkl","wb")
dump(image_captions_dev, fid)
fid.close()

fid = open("image_captions_test.pkl","wb")
dump(image_captions_test, fid)
fid.close()

fid = open("image_captions_other.pkl","wb")
dump(image_captions_other, fid)
fid.close()

fid = open("caption_train_tokenizer.pkl","wb")
dump(caption_train_tokenizer, fid)
fid.close()

fid = open("corpus.pkl","wb")
dump(corpus, fid)
fid.close()

corpus_count=Counter(corpus)
fid = open("corpus_count.pkl","wb")
dump(corpus_count, fid)
fid.close()

print("size of data =", len(image_captions), "size of training data =", len(image_captions_train), "size of dev data =", len(image_captions_dev), "size of test data =", len(image_captions_test), "size of unused data =", len(image_captions_other))
print("maximum image caption length =",max_imageCap_len)

size of data = 8092 size of training data = 6000 size of dev data = 1000 size of test data = 1000 size of unused data = 92
maximum image caption length = 33


In [26]:
embeddings_index = dict()
embeddings_data = 'C:\\Users\\aabhatia\\Downloads\\glove.6B.50d.txt\\glove.6B.50d.txt' 
fid = open(embeddings_data ,encoding="utf8")
for line in fid:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    #if word.isalpha():
    #print('For {0}, embedding is {1}'.format(word, coefs))
fid.close()

In [28]:
EMBEDDING_DIM = 50
word_index = caption_train_tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, idx in word_index.items():
    embed_vector = embeddings_index.get(word)
    if embed_vector is not None:
        embedding_matrix[idx] = embed_vector
        
fid = open("embedding_matrix.pkl","wb")
dump(embedding_matrix, fid)
fid.close()

In [29]:
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(np.squeeze(X1)), np.array(X2), np.array(y)

In [30]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length, batch_size, vocab_size):
    # loop for ever over images
    current_batch_size=0
    while 1:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            if current_batch_size == 0:
                X1, X2, Y = list(), list(), list()
            
            imageFeature_id = key.split('.')[0]
            photo = photos[imageFeature_id][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo, vocab_size)
            X1.extend(in_img)
            X2.extend(in_seq)
            Y.extend(out_word)
            current_batch_size += 1
            if current_batch_size == batch_size:
                current_batch_size = 0
                yield [[np.array(X1), np.array(X2)], np.array(Y)]

In [31]:
from pickle import load
fid = open('features.pkl', 'rb')
image_features = load(fid)
fid.close()

In [32]:
# test the data generator
caption_max_length = 33
batch_size = 1
vocab_size = 7057
generator = data_generator(image_captions_train, image_features, caption_train_tokenizer, caption_max_length, batch_size, vocab_size)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(47, 4096)
(47, 33)
(47, 7057)


In [56]:
!pip install pydot



In [33]:
from tensorflow.keras.layers import concatenate
def define_model_concat(vocab_size, max_length, embedding_matrix):
    inputs1 = Input(shape=(4096,))
    image_feature = Dropout(0.5)(inputs1)
    image_feature = Dense(256, activation='relu')(image_feature)
    inputs2 = Input(shape=(max_length,))
    language_feature = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=max_length, trainable=False)(inputs2)
    language_feature = Dropout(0.5)(language_feature)
    language_feature = LSTM(256)(language_feature)
    output = concatenate([image_feature, language_feature])
    output = Dense(256, activation='relu')(output)
    output = Dense(vocab_size, activation='softmax')(output)
    model = Model(inputs=[inputs1, inputs2], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    print(model.summary())
    plot_model(model, to_file='model_concat.png', show_shapes=True)
    return model

fid = open("embedding_matrix.pkl","rb")
embedding_matrix = load(fid)
fid.close()

caption_max_length = 33
vocab_size = 7506
post_rnn_model_concat = define_model_concat(vocab_size, caption_max_length, embedding_matrix)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 33)]         0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 33, 50)       375300      ['input_4[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 4096)         0           ['input_3[0][0]']                
                                                                                            

In [34]:
fid = open("features.pkl","rb")
image_features = load(fid)
fid.close()

fid = open("caption_train_tokenizer.pkl","rb")
caption_train_tokenizer = load(fid)
fid.close()

fid = open("image_captions_train.pkl","rb")
image_captions_train = load(fid)
fid.close()

fid = open("image_captions_dev.pkl","rb")
image_captions_dev = load(fid)
fid.close()

caption_max_length = 33
batch_size = 100
vocab_size = 7506
#generator = data_generator(image_captions_train, image_features, caption_train_tokenizer, caption_max_length, batch_size, vocab_size)

#epochs = 2
#steps = len(image_captions_train)
#steps_per_epoch = np.floor(steps/batch_size)

In [23]:
batch_size = 6
steps = len(image_captions_train)
steps_per_epoch = np.floor(steps/batch_size)

epochs = 100

for i in range(epochs):
	# create the data generator
	generator = data_generator(image_captions_train, image_features, caption_train_tokenizer, caption_max_length, batch_size, vocab_size)
	# fit for one epoch
	post_rnn_model_concat_hist=post_rnn_model_concat.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
	# save model
	post_rnn_model_concat.save('modelConcat_1_' + str(i) + '.h5')

  post_rnn_model_concat_hist=post_rnn_model_concat.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)




In [35]:
from pickle import load
from numpy import argmax
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.models import load_model

In [36]:
base_model = VGG16(include_top=True)
feature_extract_pred_model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

In [67]:
caption_train_tokenizer = load(open('caption_train_tokenizer.pkl', 'rb'))
max_length = 33
pred_model = load_model('modelConcat_A.h5')

In [38]:
def generate_caption(pred_model, caption_train_tokenizer, photo, max_length):
    in_text = '<START>'
    caption_text = list()
    for i in range(max_length):
            # integer encode input sequence
            sequence = caption_train_tokenizer.texts_to_sequences([in_text])[0]
            # pad input
            sequence = pad_sequences([sequence], maxlen=max_length)
            # predict next word
            model_softMax_output = pred_model.predict([photo,sequence], verbose=0)
            # convert probability to integer
            word_index = argmax(model_softMax_output)
            # map integer to word
            word = caption_train_tokenizer.index_word[word_index]
            #print(word)
            # stop if we cannot map the word
            if word is None:
                break
            # append as input for generating the next word
            in_text += ' ' + word
            # stop if we predict the end of the sequence
            if word != 'end':
                caption_text.append(word)
            if word == 'end':
                break
    return caption_text

In [39]:
def extract_feature(model, file_name):
    img = load_img(file_name, target_size=(224, 224)) #size is 224,224 by default
    x = img_to_array(img) #change to np array
    x = np.expand_dims(x, axis=0) #expand to include batch dim at the beginning
    x = preprocess_input(x) #make input confirm to VGG16 input format
    fc2_features = model.predict(x)
    return fc2_features

In [68]:
features = dict()
dir_data = "C:\\Users\\aabhatia\\Downloads\\testImages"

def generate_captions():
    for file in listdir(dir_data):
        caption_image_fileName = dir_data + '\\' + file
        print('Generating caption for {0}'.format(caption_image_fileName))
        photo = extract_feature(feature_extract_pred_model, caption_image_fileName)
        caption = generate_caption(pred_model, caption_train_tokenizer, photo, max_length)
        print(' '.join(caption))

generate_captions()

Generating caption for C:\Users\aabhatia\Downloads\testImages\114051287_dd85625a04.jpg
man in black shirt is standing on his bike
Generating caption for C:\Users\aabhatia\Downloads\testImages\1262583859_653f1469a9.jpg
man in yellow shirt rides unicycle bike along street
Generating caption for C:\Users\aabhatia\Downloads\testImages\2120411340_104eb610b1.jpg
dogs are playing with each other in the dirt
Generating caption for C:\Users\aabhatia\Downloads\testImages\2453971388_76616b6a82.jpg
children are playing in fountain fountain
Generating caption for C:\Users\aabhatia\Downloads\testImages\2473738924_eca928d12f.jpg
young boy in red shirt is running on the street
Generating caption for C:\Users\aabhatia\Downloads\testImages\2631300484_be8621d17b.jpg
man in red shirt is standing in front of crowd
Generating caption for C:\Users\aabhatia\Downloads\testImages\2723477522_d89f5ac62b.jpg
dogs are playing in fenced field
Generating caption for C:\Users\aabhatia\Downloads\testImages\2788945468_7

In [41]:
# evaluation of generated captions
from pickle import load
from keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Input
from keras.models import load_model
from numpy import argmax

In [42]:
fid = open("features.pkl","rb")
image_features = load(fid)
fid.close()

fid = open("caption_train_tokenizer.pkl","rb")
caption_train_tokenizer = load(fid)
fid.close()

fid = open("image_captions_test.pkl","rb")
image_captions_test = load(fid)
fid.close()

# pre-define the max sequence length (from training)
max_length = 33
# load the model
pred_model = load_model('modelConcat_1_9.h5')
#pred_model = load_model('model_3_0.h5')

base_model = VGG16(include_top=True) #define the image feature extraction model
feature_extract_pred_model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

In [79]:
image_captions_candidate = dict()
for image_fileName, reference_captions in image_captions_test.items():
    image_fileName_feature = image_fileName.split('.')[0]
    photo = extract_feature(feature_extract_pred_model, caption_image_fileName)
    image_captions_candidate[image_fileName] = generate_caption(pred_model, caption_train_tokenizer, photo, max_length)
    
fid = open("test_captions_post_concat","wb")
dump(image_captions_candidate, fid)
fid.close()



In [43]:
#greedy bleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from pickle import load

fid = open("test_captions_post_concat","rb")
image_captions_candidate = load(fid)
fid.close()

fid = open("image_captions_test.pkl","rb")
image_captions_test = load(fid)
fid.close()

chencherry = SmoothingFunction()

bleu_score = dict()
#bleu_score_beam5 = dict()
for image_fileName, reference_captions in image_captions_test.items():
    ref_cap_reformat=list()
    for cap in reference_captions:
        ref_cap_reformat.append(cap.split()[1:-1])
    
    bleu_score[image_fileName] = sentence_bleu(ref_cap_reformat, image_captions_candidate[image_fileName], smoothing_function=chencherry.method1)
    #bleu_score_beam5[image_fileName] = sentence_bleu(ref_cap_reformat, list(image_captions_candidate_beam5[image_fileName][-1].split()), smoothing_function=chencherry.method1)
    
    
#print(bleu_score)

In [44]:
import numpy as np
bleu_score_array = np.fromiter(bleu_score.values(), dtype=float)
print('mean bleu='+str(np.mean(bleu_score_array)) + '; median bleu='+str(np.median(bleu_score_array))+'; max bleu='+str(np.max(bleu_score_array))+'; min bleu='+str(np.min(bleu_score_array))+'; std bleu='+str(np.std(bleu_score_array)))

mean bleu=0.041603129621198096; median bleu=0.026658376817028866; max bleu=0.37700638045494705; min bleu=0.0; std bleu=0.041701279853638036


In [45]:
def flatten(lst):
    return sum( ([x] if not isinstance(x, list) else flatten(x) for x in lst), [] )

In [46]:
def generate_caption_beam(pred_model, caption_train_tokenizer, photo, max_length, vocab_size, beam_width):
    sequence = caption_train_tokenizer.texts_to_sequences(['<START>'])[0]
    sequence = pad_sequences([sequence], maxlen=max_length)
    model_softMax_output = np.squeeze(pred_model.predict([photo,sequence], verbose=0))
    most_likely_seq=np.argsort(model_softMax_output)[-beam_width:]
    most_likely_prob=np.log(model_softMax_output[most_likely_seq])


    most_likely_cap = list()
    for j in range(beam_width):
        most_likely_cap.append(list())
        most_likely_cap[j] =[[caption_train_tokenizer.index_word[most_likely_seq[j]]]]

    for i in range(max_length):    
        temp_prob = np.zeros((beam_width, vocab_size))
        for j in range(beam_width):
            if most_likely_cap[j][-1] != ['end']: #if not terminated
                num_words = len(most_likely_cap[j])
                sequence = caption_train_tokenizer.texts_to_sequences(most_likely_cap[j])
                sequence = pad_sequences(np.transpose(sequence), maxlen=max_length)
                model_softMax_output = pred_model.predict([photo,sequence], verbose=0)
                temp_prob[j,] = (1/(num_words)) *(most_likely_prob[j]*(num_words-1) + np.log(model_softMax_output)) #update most likily prob
            else:
                temp_prob[j,] = most_likely_prob[j] + np.zeros(vocab_size) - np.inf
                temp_prob[j,0] = most_likely_prob[j]


        x_idx, y_idx = np.unravel_index(temp_prob.flatten().argsort()[-beam_width:], temp_prob.shape)

        most_likely_cap_temp = list()
        for j in range(beam_width):
            most_likely_prob[j] = temp_prob[x_idx[j],y_idx[j]]
            most_likely_cap_temp.append(list())
            most_likely_cap_temp[j] = most_likely_cap[x_idx[j]].copy()
            if most_likely_cap_temp[j][-1] != ['end']:
                most_likely_cap_temp[j].append([caption_train_tokenizer.index_word[y_idx[j]]])

        most_likely_cap = most_likely_cap_temp.copy()

        finished = True
        for j in range(beam_width):
            if most_likely_cap_temp[j][-1] != ['end']:
                finished = False

        if finished == True:
            break

    final_caption = list()

    for j in range(beam_width):
        final_caption.append(' '.join(flatten(most_likely_cap[j][0:-1])))


    return final_caption, most_likely_prob

In [71]:
features = dict()
dir_data = "C:\\Users\\aabhatia\\Downloads\\testImages"
vocab_size = 7506
beam_width = 10
max_length = 33
pred_model = load_model('modelConcat_1_89.h5')

def generate_captions_beams():
    for file in listdir(dir_data):
        caption_image_fileName = dir_data + '\\' + file
        print('Generating caption for {0}'.format(caption_image_fileName))
        photo = extract_feature(feature_extract_pred_model, caption_image_fileName)
        #caption = generate_caption_beam(pred_model, caption_train_tokenizer, photo, max_length)
        caption, prob = generate_caption_beam(pred_model, caption_train_tokenizer, photo, max_length,vocab_size,beam_width)
        print(caption[4])
        print(prob[4])

generate_captions_beams()

Generating caption for C:\Users\aabhatia\Downloads\testImages\114051287_dd85625a04.jpg


  temp_prob[j,] = (1/(num_words)) *(most_likely_prob[j]*(num_words-1) + np.log(model_softMax_output)) #update most likily prob


children man man looks out record out of camper
-1.0586933
Generating caption for C:\Users\aabhatia\Downloads\testImages\1262583859_653f1469a9.jpg
woman man man rides unicycle down suburban street
-0.72435635
Generating caption for C:\Users\aabhatia\Downloads\testImages\2120411340_104eb610b1.jpg
brown dogs play in field
-0.794203
Generating caption for C:\Users\aabhatia\Downloads\testImages\2453971388_76616b6a82.jpg
group children boys are playing in water fountain
-0.83135283
Generating caption for C:\Users\aabhatia\Downloads\testImages\2473738924_eca928d12f.jpg
girl girl in red shirt is playing with water
-1.3087059
Generating caption for C:\Users\aabhatia\Downloads\testImages\2631300484_be8621d17b.jpg
small large man in pirate stance with his eyes closed and looks at his neck
-1.2208562
Generating caption for C:\Users\aabhatia\Downloads\testImages\2723477522_d89f5ac62b.jpg
tan brown brown dogs play in grass
-0.7741197
Generating caption for C:\Users\aabhatia\Downloads\testImages\278