In [None]:
import numpy as np
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
import keras.preprocessing.image as pic
import os
import re
import string
import pickle as pk    #for saving features
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Dense, Embedding, Add, LSTM, Dropout
from keras.models import Model

basepath = os.getcwd()

## GETTING AND PREPROCESSING RAW DATA

#### Extracting features from each image into a 4096-dimensional vector using the VGG16 model

In [None]:
def encode_images():
    """Encode every image of the Flickr8k image folder with a truncated VGG16.

    The network is cut at the layer just before the final softmax classifier,
    so each image is represented by the 4096-dimensional activation of the
    last fully connected layer.

    Returns:
        dict: maps the image name (file name up to its first '.') to the array
        returned by ``model.predict`` for that single image.
    """

    base_model = VGG16()
    # Select the second-to-last layer explicitly instead of calling
    # `layers.pop()`: on Keras versions where `model.layers` is a read-only
    # view, popping does not change the model and `layers[-1]` would still be
    # the softmax layer.
    model = Model(inputs = base_model.inputs, outputs = base_model.layers[-2].output)

    img_set = 'Flicker8k_Dataset'
    img_dir = os.path.join(basepath, img_set)
    features = {}

    for img_id in os.listdir(img_dir):

        # The image lives inside img_dir, not directly under basepath.
        path = os.path.join(img_dir, img_id)
        image = pic.load_img(path, target_size = (224, 224))   # VGG16 expects 224x224 inputs
        image = pic.img_to_array(image)                        # image -> pixel array
        image = np.expand_dims(image, axis = 0)                # add the batch axis: (1, nh, nw, nc)
        image = preprocess_input(image)                        # VGG16-specific input normalisation
        feature_image = model.predict(image)
        img_name = img_id.split('.')[0]  # each file in the directory is "img_id.jpg"
        features[img_name] = feature_image

    return features

In [None]:
# Build the path component by component: the original literal relied on a
# Windows-style "\" separator (and on "\F" not being an escape sequence).
a = os.path.join('Flicker8k_Text', 'Flickr8k.token.txt')
b = os.path.join(basepath, a)
b

#### Getting captions associated with each image id

In [None]:
def get_captions(caption_file = None):
    """Read the caption file and group the captions by image id.

    Each non-empty line is split on whitespace: the first token is the image
    reference (its part before the first '.' is used as the image id) and the
    remaining tokens, re-joined with single spaces, form the caption. Lines
    with fewer than two tokens (no caption) are skipped.

    Args:
        caption_file: path of the caption file. Defaults to
            ``<basepath>/Flicker8k_Text/Flickr8k.token.txt``.

    Returns:
        dict: image id -> list of caption strings, in file order.
    """

    if caption_file is None:
        # Joined component-wise so the path works on every platform.
        caption_file = os.path.join(basepath, 'Flicker8k_Text', 'Flickr8k.token.txt')

    with open(caption_file, 'r') as f:
        raw_text = f.read()

    captions = {}
    for line in raw_text.split('\n'):

        words = line.split()
        if len(words) < 2:  # no caption provided on this line
            continue

        img_id = words[0].split('.')[0]
        caption = ' '.join(words[1:])

        # each image id has several captions, so collect them in a list
        captions.setdefault(img_id, []).append(caption)

    return captions
        

In [None]:
def clean_text(captions):
    """Normalise every caption in place and wrap it in start/end markers.

    Each word is lower-cased, stripped of punctuation and digits, and then
    dropped if fewer than two characters remain (e.g. 'a', '"a."', '10', '-').
    The cleaned words are joined with single spaces and surrounded by
    'startseq' / 'endseq'.

    Args:
        captions: dict mapping image id -> list of caption strings. The lists
            are modified in place.

    Returns:
        None. Calling this twice on the same dict wraps the markers twice.
    """

    strip_punc = str.maketrans('', '', string.punctuation)

    for image in captions.keys():
        for i, caption in enumerate(captions[image]):
            new_caption = []
            for word in caption.split():

                word = word.lower().translate(strip_punc)  # removing punctuation
                word = re.sub(r"\d+", "", word)            # removing numbers
                # Filter on the *cleaned* word: checking the raw token let
                # one-character and empty words through.
                if len(word) < 2:
                    continue
                new_caption.append(word)

            captions[image][i] = 'startseq ' + ' '.join(new_caption) + ' endseq'

In [None]:
#features = encode_images()    #takes about 1 hr

In [None]:
# Load the cached image features when the pickle exists; otherwise compute
# them with encode_images() (slow, about an hour per the note above) and save
# them so later runs can reuse the file.
feature_file = 'feature_dict.pkl'

if os.path.exists(feature_file):
    # NOTE: pickle.load can execute arbitrary code; only load a file you created.
    with open(feature_file, 'rb') as f:
        features = pk.load(f)
else:
    features = encode_images()
    with open(feature_file, 'wb') as f:
        pk.dump(features, f)

In [None]:
# Read the raw captions, then normalise them. clean_text() rewrites the lists
# inside `captions` in place and returns nothing, so re-running only the
# second line would clean (and wrap) the captions a second time.
captions = get_captions()
clean_text(captions)

## MAKING INPUT DATA

#### Separating train, dev and test images

In [None]:
def get_dataset(file):
    """Read a split file and return the image ids it lists.

    The file is expected to hold one image file name per line; the id is the
    part of the name before its first '.'. Blank lines (including the one
    produced by a trailing newline) are ignored.

    Args:
        file: path of the split file.

    Returns:
        list: image ids in file order.
    """

    with open(file) as f:
        lines = f.read().split('\n')

    names = []
    for line in lines:
        line = line.strip()
        if not line:  # skip blank lines so '' never ends up in the id list
            continue
        names.append(line.split('.')[0])

    return names
        

# Image ids of the train / dev / test splits. Paths are joined per component
# so they do not depend on a Windows-style "\" separator.
train_id = get_dataset(os.path.join(basepath, 'Flicker8k_Text', 'Flickr_8k.trainImages.txt'))
dev_id = get_dataset(os.path.join(basepath, 'Flicker8k_Text', 'Flickr_8k.devImages.txt'))
test_id = get_dataset(os.path.join(basepath, 'Flicker8k_Text', 'Flickr_8k.testImages.txt'))

In [None]:
# Distribute the encoded images over the three splits.
features_train = {}
features_dev = {}
features_test = {}

# Sets give constant-time membership tests (the id lists hold thousands of entries).
train_ids = set(train_id)
dev_ids = set(dev_id)
test_ids = set(test_id)

for key, val in features.items():
    if key in train_ids:
        features_train[key] = val
    elif key in dev_ids:
        features_dev[key] = val
    elif key in test_ids:
        # Only images actually listed in the test split go here; images that
        # belong to no split are left out instead of silently becoming test data.
        features_test[key] = val

#### Making Vocabulary using Tokenizer to convert the sequences into integers (Method 1)

In [None]:
def get_tokenizer(captions):
    """Fit a Tokenizer on all captions and measure the longest caption.

    Args:
        captions: dict mapping image id -> list of caption strings.

    Returns:
        tuple: (number of whitespace-separated words in the longest caption,
        Tokenizer fitted on every caption).
    """

    # Flatten the per-image lists into one list, keeping dict order.
    flat_captions = [
        sentence
        for img_captions in captions.values()
        for sentence in img_captions
    ]

    word_counts = [len(sentence.split()) for sentence in flat_captions]
    longest = max(word_counts)

    fitted = Tokenizer()
    fitted.fit_on_texts(flat_captions)
    return longest, fitted

In [None]:
# Tx is the word count of the longest caption; tokenizer is fitted on all captions.
Tx, tokenizer = get_tokenizer(captions)
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index (see mask_zero in the Embedding layer)

#### Making Input Data

In [None]:
def make_data(features, captions, Tx, vocab_size, tokenizer):
    """Expand every (image, caption) pair into next-word training samples.

    For a caption encoded as the integer sequence ``s``, one sample is created
    per position ``i >= 1``: the input text is ``s[:i]`` padded to length
    ``Tx`` and the target is ``s[i]`` one-hot encoded over ``vocab_size``.

    Args:
        features: dict image id -> feature array; element ``[0]`` of each
            array is used as the image vector.
        captions: dict image id -> list of caption strings; must contain
            every key of ``features``.
        Tx: length the text inputs are padded to.
        vocab_size: number of classes of the one-hot targets.
        tokenizer: fitted tokenizer used to encode the captions.

    Returns:
        tuple of arrays ``(X_img, X_text, Y)`` with one row per sample.
    """

    X_img = []
    X_text = []
    Y = []

    for img_id in features.keys():
        # features[img_id] is a 2D array with a single row; keep only that row.
        # It is the same for every sample of this image, so look it up once.
        img_code = features[img_id][0]

        for caption in captions[img_id]:
            seq = tokenizer.texts_to_sequences([caption])[0]
            # Bound the loop by the encoded sequence itself: using the raw word
            # count would index past the end whenever the tokenizer yields
            # fewer tokens than caption.split().
            for i in range(1, len(seq)):
                inp = pad_sequences([seq[:i]], maxlen = Tx)[0]                # (1, Tx) -> (Tx,)
                op = to_categorical([seq[i]], num_classes = vocab_size)[0]   # (1, vocab_size) -> (vocab_size,)
                X_text.append(inp)
                Y.append(op)
                X_img.append(img_code)

    return np.asarray(X_img), np.asarray(X_text), np.asarray(Y)

In [None]:
# Training samples: image vectors, padded partial captions, one-hot next words.
X_img_train, X_text_train, Y_train = make_data(features_train, captions, Tx, vocab_size, tokenizer)

In [None]:
# Validation samples, built the same way from the dev-split features.
X_img_dev, X_text_dev, Y_dev = make_data(features_dev, captions, Tx, vocab_size, tokenizer)

## MAKING THE MODEL

<img src = "merged_model.png">

In [None]:
def make_model(Tx, vocab_size):
    """Build the merge-style captioning network.

    Two inputs are combined: a 4096-long image vector and a length-``Tx``
    sequence of word indices. Each branch is reduced to 256 units, the two
    are added, and a softmax over ``vocab_size`` predicts the next word.

    Args:
        Tx: length of the (padded) text input.
        vocab_size: size of the embedding vocabulary and of the output layer.

    Returns:
        The uncompiled Keras ``Model`` taking ``[image, text]`` inputs.
    """

    # --- image branch: dropout, then project to 256 units
    image_input = Input((4096,))
    image_branch = Dropout(0.5)(image_input)
    image_branch = Dense(256, activation = 'relu')(image_branch)

    # --- text branch: embedding (index 0 is masked as padding) -> dropout -> LSTM
    text_input = Input((Tx, ))
    text_branch = Embedding(vocab_size, 256, mask_zero = True)(text_input)
    text_branch = Dropout(0.5)(text_branch)
    text_branch = LSTM(256)(text_branch)

    # --- decoder: sum both branches and classify the next word
    merged = Add()([image_branch, text_branch])
    merged = Dense(256, activation = 'relu')(merged)
    next_word = Dense(vocab_size, activation = 'softmax')(merged)

    return Model(inputs = [image_input, text_input], outputs = next_word)

In [None]:
# Build the captioning network and compile it for next-word classification:
# the targets are one-hot vectors, hence categorical cross-entropy.
model = make_model(Tx, vocab_size)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Train for 20 epochs with batches of 1024 samples, validating on the dev samples.
model.fit([X_img_train, X_text_train], Y_train, epochs = 20, batch_size = 1024, validation_data = ([X_img_dev, X_text_dev], Y_dev))

In [None]:
# Persist the trained model to 'model.h5' (path relative to the working directory).
model.save('model.h5')

## GENERATING NEW CAPTIONS

In [None]:
# Feature extractor used at inference time: VGG16 cut at the layer before the
# softmax classifier. The layer is selected with layers[-2] rather than by
# calling layers.pop(), which leaves the model unchanged on Keras versions
# where `model.layers` is a read-only view.
model1 = VGG16()
model1 = Model(inputs = model1.inputs, outputs = model1.layers[-2].output)

def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Args:
        integer: index to look up.
        tokenizer: object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first matching word, or None when no word has that index.
    """
    matching_words = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matching_words, None)

def generate_sequence(model, tokenizer, photo, Tx):
    """Greedily generate a caption for one encoded photo.

    Starting from 'startseq', the text so far is encoded and padded to
    ``Tx``, the model predicts the next word, and the most probable word is
    appended. Generation stops after ``Tx`` steps, when the predicted index
    has no word, or when 'endseq' is predicted.

    Args:
        model: captioning model called as ``model.predict([photo, sequence])``.
        tokenizer: tokenizer used to encode the text and decode the indices.
        photo: image features passed to the model as its first input.
        Tx: padded sequence length and maximum number of generated words.

    Returns:
        str: the generated text, still beginning with 'startseq'.
    """

    caption = 'startseq'
    for _ in range(Tx):
        encoded = tokenizer.texts_to_sequences([caption])
        padded = pad_sequences(encoded, maxlen = Tx)
        probabilities = model.predict([photo, padded])
        best_index = np.argmax(probabilities)
        next_word = word_for_id(best_index, tokenizer)
        if next_word is None or next_word == 'endseq':
            break
        caption += ' ' + next_word

    return caption

def generate_caption(path):
    """Generate a caption for the image file at ``path``.

    The image is loaded at 224x224, preprocessed for VGG16 and encoded with
    the module-level ``model1``; the caption is then produced by
    ``generate_sequence`` using the module-level ``model``, ``tokenizer`` and
    ``Tx``.

    Args:
        path: path of the image file.

    Returns:
        str: the generated caption without the leading 'startseq ' marker.
    """

    pixels = pic.img_to_array(pic.load_img(path, target_size = (224, 224)))
    height, width = pixels.shape[0], pixels.shape[1]
    batch = preprocess_input(pixels.reshape(1, height, width, -1))  # (1, nh, nw, nc)
    photo_features = model1.predict(batch)
    generated = generate_sequence(model, tokenizer, photo_features, Tx)
    # Drop the first 9 characters, i.e. the 'startseq ' prefix.
    return generated[9:]

In [None]:
# Placeholder: set `path` to an image file before running this cell. The value
# is handed to pic.load_img inside generate_caption, so None will not work.
path = None           # add the path of the image to caption
generate_caption(path)