In [0]:
import pickle
import numpy as np
from keras.layers.merge import add
from keras.preprocessing import image
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions

In [0]:
# Fixed length every partial caption is padded to; also the width of the caption Input.
max_len = 74
# Number of word indices: one-hot target width, Embedding rows and softmax output size.
vocab_size = 5121
# Number of columns in the embedding matrix (length of one word vector).
embedding_dim = 50

In [0]:
# Directory that holds every pre-computed artefact loaded below.
FEATURES_DIR = './drive/My Drive/features/'


def _load_pickle(filename):
  """Unpickle and return the object stored in ``FEATURES_DIR + filename``.

  NOTE(security): ``pickle.load`` can execute arbitrary code while
  deserialising, so only point this at files you produced yourself.
  """
  with open(FEATURES_DIR + filename, 'rb') as f:
    return pickle.load(f)


# Mapping of image key -> list of caption strings (iterated by data_generator).
train_descriptions = _load_pickle('encoded_captions.pkl')
# Mapping of word -> integer index.
word2idx = _load_pickle('word2idx.pkl')
# Presumably the inverse mapping (index -> word); not used in the cells shown here.
idx2word = _load_pickle('idx2word.pkl')
# Mapping of word -> pre-trained embedding vector (queried with .get(word)).
embedding_index = _load_pickle('embedding_index.pkl')
# Mapping of image key -> encoded image feature fed to the model's image input.
encoded_img = _load_pickle('encoded_image_features.pkl')

In [0]:
def data_generator(train_descriptions,encoded_img,word2idx,max_len,batch_size,num_classes=None):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into (prefix -> next word) samples: for a word
    index sequence ``seq`` and each ``i`` in ``1..len(seq)-1`` the input is
    ``seq[:i]`` zero-padded on the right to ``max_len`` and the target is the
    one-hot encoding of ``seq[i]``.

    Args:
        train_descriptions: mapping of image key -> list of caption strings.
        encoded_img: mapping of image key -> image feature for that key.
        word2idx: mapping of word -> integer index; words missing from it
            are dropped from the caption.
        max_len: length every padded prefix is brought to.
        batch_size: number of *images* whose samples make up one batch.
        num_classes: width of the one-hot targets. Defaults to the
            notebook-level ``vocab_size`` when omitted.

    Yields:
        ``([image_features, padded_prefixes], one_hot_targets)`` as NumPy
        arrays, containing all samples of ``batch_size`` consecutive images.
    """
    if num_classes is None:
        # Backward-compatible default: fall back to the notebook-level constant.
        num_classes = vocab_size
    X1,X2,Y = [],[],[]
    n = 0
    while True:
        for key,cap_list in train_descriptions.items():
            n += 1
            img = encoded_img[key]
            for cap in cap_list:
                seq = [word2idx[word] for word in cap.split() if word in word2idx]
                for i in range(1,len(seq)):
                    xi = pad_sequences([seq[:i]],maxlen=max_len,value=0,padding='post')[0]
                    yi = to_categorical([seq[i]],num_classes=num_classes)[0]
                    X1.append(img)
                    X2.append(xi)
                    Y.append(yi)
            # Checked once per image, after ALL of its captions were added, so a
            # batch never ends in the middle of an image's caption list.
            if n == batch_size:
                # Keras documents the generator output as a tuple (inputs, targets).
                yield ([np.array(X1),np.array(X2)],np.array(Y))
                X1,X2,Y = [],[],[]
                n = 0

In [0]:
def get_embedding_matrix():
  """Assemble the (vocab_size, embedding_dim) weight table of word vectors.

  Row ``word2idx[word]`` receives the vector stored for ``word`` in
  ``embedding_index``; words with no entry there keep an all-zero row.
  Reads the notebook-level ``vocab_size``, ``embedding_dim``, ``word2idx``
  and ``embedding_index``.
  """
  weights = np.zeros((vocab_size,embedding_dim))
  for token in word2idx:
    vector = embedding_index.get(token)
    if vector is None:
      # No pre-trained vector for this word: leave its row at zero.
      continue
    weights[word2idx[token]] = vector
  return weights
  
# Build the weight table once; it is later loaded into the model's Embedding layer.
embedding_matrix = get_embedding_matrix()

In [0]:
# Image Features as Input
inp_img = Input(shape=(2048,))
# NOTE(review): 0.03 here versus 0.3 on the caption branch - confirm the smaller rate is intentional.
inp_img1 = Dropout(0.03)(inp_img)
inp_img2 = Dense(256,activation='relu')(inp_img1)
# Captions as Input
inp_cap = Input(shape=(max_len,))
# Keep a handle on the layer so its weights can be set directly instead of
# looking it up by position in model.layers, which breaks as soon as the
# graph is edited. mask_zero=True makes the layer ignore the 0 padding value.
embedding_layer = Embedding(input_dim=vocab_size,output_dim=embedding_dim,mask_zero=True)
inp_cap1 = embedding_layer(inp_cap)
inp_cap2 = Dropout(0.3)(inp_cap1)
inp_cap3 = LSTM(256)(inp_cap2)
# Combined Model
decoder1 = add([inp_img2,inp_cap3])
decoder2 = Dense(256,activation='relu')(decoder1)
outputs = Dense(vocab_size,activation='softmax')(decoder2)
model = Model(inputs=[inp_img,inp_cap],outputs=outputs)
# Initializing The Embedding Layer with the pre-built word vectors and
# freezing it (done before compile so the setting takes effect).
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
# Compiling The Model
model.compile(loss='categorical_crossentropy',optimizer='adam')
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 74)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 74, 50)       256050      input_6[0][0]                    
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 2048)         0           input_5[0][0]                    
____________________________________________________________________________________________

In [0]:
# Number of passes train() makes over the training set.
epochs = 20
# Number of images (not caption/word samples) the generator counts before yielding a batch.
batch_size = 3
# Batches drawn per pass; floor division means a trailing partial group of images is not counted.
steps = len(train_descriptions)//batch_size

In [0]:
!mkdir model_weights

In [0]:
def train():
  """Train ``model`` for ``epochs`` passes, checkpointing into ./model_weights/.

  Each pass builds a fresh ``data_generator`` and runs a single Keras epoch
  of ``steps`` batches. The model is saved as ``model_<i>.h5`` after every
  fifth pass (i = 0, 5, 10, ...) and additionally after the final pass, so
  the fully trained weights are always written to disk.

  Reads the notebook-level ``epochs``, ``steps``, ``batch_size``, ``max_len``,
  ``model``, ``train_descriptions``, ``encoded_img`` and ``word2idx``.
  """
  for i in range(epochs):
    generator = data_generator(train_descriptions,encoded_img,word2idx,max_len,batch_size)
    model.fit_generator(generator,epochs=1,steps_per_epoch=steps,verbose=1)
    # "i == epochs-1" guarantees a checkpoint of the last epoch even when it
    # does not fall on a multiple of five.
    if i%5==0 or i==epochs-1:
      model.save('./model_weights/model_'+str(i)+'.h5')

In [None]:
# Start training; train() writes its checkpoints under ./model_weights/.
train()