In [1]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
import json
import collections
import pickle
import time
import os
import cv2
import gensim
from keras.applications.resnet50 import ResNet50,preprocess_input,decode_predictions
from keras.preprocessing import image
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Flatten,Embedding,Dropout,Input,LSTM,add
from keras.models import Model,load_model
from keras.utils.np_utils import to_categorical 
#%xmode verbose 

Using TensorFlow backend.


In [10]:
def _read_json(path):
    """Return the object parsed from the JSON text file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


def _read_pickle(path):
    """Return the object unpickled from the binary file at `path`.

    NOTE: unpickling can execute arbitrary code, so only point this at
    trusted, locally produced files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


## All descriptions (JSON stored in a .txt file)
descriptions = _read_json('./resources/descriptions.txt')

## Vocabulary
vocab = _read_pickle('./resources/vocab.pkl')

## Training descriptions (JSON stored in a .txt file)
train_descriptions = _read_json('./resources/train_descriptions.txt')

## Word -> index mapping
word2idx = _read_pickle('./resources/word2idx.pkl')

## Index -> word mapping
idx2word = _read_pickle('./resources/idx2word.pkl')

## ResNet50 image features for the training images
encoded_train_images = _read_pickle("./resources/encoded_train_images.pkl")

## ResNet50 image features for the testing images
encoded_test_images = _read_pickle("./resources/encoded_test_images.pkl")

## Glove embeddings for all words
embedding_matrix = _read_pickle('./resources/embedding_matrix.pkl')

In [11]:
## Longest training caption, counted in whitespace-separated tokens
## (falls back to 0 when there are no captions at all)
maxlen = max(
    (
        len(caption.split())
        for captions in train_descriptions.values()
        for caption in captions
    ),
    default=0,
)
print(maxlen)

40


## Data Generator

In [12]:
def generate_data(train_descriptions,encoded_train,word2idx,max_len,batch_size):
    """Endlessly yield training batches of (image features, caption prefix) -> next word.

    Every caption is expanded into one sample per prefix: for a caption encoded
    as ``seq``, each ``i`` in ``1..len(seq)-1`` gives the input ``seq[:i]``
    (post-padded with 0 to ``max_len``) and the one-hot target ``seq[i]``.

    Parameters
    ----------
    train_descriptions : dict
        Maps an image id (without the ".jpg" suffix) to a list of caption strings.
    encoded_train : dict
        Maps ``image_id + ".jpg"`` to that image's feature vector.
    word2idx : dict
        Maps a word to its integer index. Words missing from the mapping are
        silently dropped from the caption.
    max_len : int
        Length every caption prefix is padded to.
    batch_size : int
        Number of *images* (not samples) accumulated before a batch is yielded.

    Yields
    ------
    list
        ``[[image_features, padded_prefixes], one_hot_targets]`` as numpy arrays,
        with one row per (caption, prefix) sample.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (the generator would otherwise loop
        forever without yielding anything).
    KeyError
        If an image id has no entry in ``encoded_train``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))

    # Width of the one-hot targets; loop-invariant, so computed once.
    # NOTE(review): this must equal the width of the model's softmax output
    # (the notebook derives that from len(embedding_matrix)) -- confirm they agree.
    num_classes = len(word2idx) + 1

    images, prefixes, targets = [], [], []
    n_images = 0
    while True:
        for img, cap_list in train_descriptions.items():
            n_images += 1
            photo = encoded_train[img + ".jpg"]
            for cap in cap_list:
                seq = [word2idx[word] for word in cap.split() if word in word2idx]
                for i in range(1, len(seq)):
                    # Input is the first i tokens, target is the token that follows.
                    prefix = pad_sequences([seq[:i]], maxlen=max_len, value=0, padding='post')[0]
                    target = to_categorical([seq[i]], num_classes=num_classes)[0]

                    images.append(photo)
                    prefixes.append(prefix)
                    targets.append(target)

            # A batch is complete once `batch_size` images have been expanded.
            if n_images == batch_size:
                yield [[np.array(images), np.array(prefixes)], np.array(targets)]

                images, prefixes, targets = [], [], []
                n_images = 0
#batch1=generate_data(train_descriptions,encoded_train_images,word2idx,maxlen,8)                

In [13]:
# Vocabulary size is taken from the first-axis length of the loaded embedding
# matrix (presumably one row per word index -- TODO confirm against word2idx).
vocab_size = len(embedding_matrix)
print(vocab_size)

1858


## Model Architecture 

In [14]:
## Image branch: dropout on the 2048-d feature vector, then a 256-unit ReLU projection
input_image_features = Input(shape=(2048,))
inp1 = Dropout(rate=0.3)(input_image_features)
inp2 = Dense(units=256, activation='relu')(inp1)

## Caption branch: embed the padded word indices (0 is masked), dropout, 256-unit LSTM
input_captions = Input(shape=(maxlen,))
cap1 = Embedding(
    input_dim=vocab_size,
    output_dim=50,
    mask_zero=True,
)(input_captions)
cap2 = Dropout(rate=0.3)(cap1)
cap3 = LSTM(units=256)(cap2)

In [15]:
# Decoder: sum the image and caption branch outputs element-wise, pass the
# result through a 256-unit ReLU layer, then a softmax over vocab_size classes.
decoder1 = add([inp2, cap3])
decoder2 = Dense(units=256, activation='relu')(decoder1)
outputs = Dense(units=vocab_size, activation='softmax')(decoder2)

# Two-input model: [image features, caption prefix] -> next-word distribution.
image_caption_model = Model(
    inputs=[input_image_features, input_captions],
    outputs=outputs,
)
image_caption_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 40, 50)       92900       input_4[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 2048)         0           input_3[0][0]                    
____________________________________________________________________________________________

In [16]:
# Find the caption Embedding layer by type rather than by a hard-coded position
# in `model.layers`: a positional index silently targets the wrong layer if the
# layer ordering ever changes.
embedding_layers = [
    layer for layer in image_caption_model.layers if isinstance(layer, Embedding)
]
if len(embedding_layers) != 1:
    raise RuntimeError(
        "expected exactly one Embedding layer, found {}".format(len(embedding_layers))
    )
embedding_layer = embedding_layers[0]

# Load the pre-computed embedding matrix and freeze it. `trainable` must be set
# before compile() for the freeze to take effect.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

image_caption_model.compile(loss='categorical_crossentropy',optimizer='adam')

## Training

In [9]:
epochs = 24
batch_size = 3  # number of images expanded into each generator batch
# One step consumes `batch_size` images, so this many steps cover the training set once.
steps = len(train_descriptions)//batch_size

def train():
    """Train `image_caption_model` for `epochs` epochs, saving the model after each one.

    A fresh generator is created for every epoch so each epoch starts from the
    beginning of `train_descriptions`. The model is written to
    ``./model_weights/model_<epoch>.h5`` after every epoch.
    """
    # Create the output directory up front; otherwise the first save() can fail
    # only after a full epoch of training has already been spent.
    os.makedirs('./model_weights', exist_ok=True)

    for i in range(epochs):
        generator = generate_data(train_descriptions,encoded_train_images,word2idx,maxlen,batch_size)
        image_caption_model.fit_generator(generator,epochs=1,steps_per_epoch=steps)
        image_caption_model.save('./model_weights/model_{}.h5'.format(i))

In [None]:
# Run the training loop: one fit pass and one model save per epoch.
train()