In [1]:
import os
import numpy as np
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input,Dropout,add,Embedding,Dense,LSTM
from tensorflow.keras.models import Model

In [2]:
# Read the training split file. Each line (trailing newline kept) is treated
# downstream as an image file name.
with open('Flickr8k_text/Flickr_8k.trainImages.txt', 'r') as file_obj:
    data_train = list(file_obj)

In [3]:
# Read the validation (dev) split file. Each line (trailing newline kept) is
# treated downstream as an image file name.
with open('Flickr8k_text/Flickr_8k.devImages.txt', 'r') as file_obj:
    data_validate = list(file_obj)

In [4]:
# Restore the mapping from image name to its list of caption strings.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts from a trusted source.
with open('image_captions_pkl', mode='rb') as captions_file:
    img_captions_dict = pickle.load(captions_file)

In [5]:
# Restore the mapping from image name to its pre-computed feature vector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts from a trusted source.
with open('image_features_pkl', mode='rb') as features_file:
    img_features_dict = pickle.load(features_file)

In [4]:
# Restore the fitted tokenizer used to turn caption words into integer ids.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts from a trusted source.
with open('tokenizer_pkl', mode='rb') as tokenizer_file:
    t = pickle.load(tokenizer_file)

In [5]:
# One slot more than the number of known words: id 0 is kept free for the
# padding value (the embedding layer is built with mask_zero=True).
vocab_len = 1 + len(t.word_index)
vocab_len

8362

In [10]:
def get_data(data, max_len=33):
    """Build next-word prediction samples for every image listed in ``data``.

    For each caption of each image, every proper prefix of the caption's
    whitespace-split words yields one sample: the image feature vector and the
    encoded prefix are the inputs, and the word that follows the prefix is the
    target class.

    Relies on the notebook globals ``img_captions_dict``, ``img_features_dict``,
    ``t`` (the tokenizer) and ``vocab_len``.

    Args:
        data: iterable of lines as read from a split file; the text before the
            first '.' of each (stripped) line is used as the image key.
            Blank lines are ignored.
        max_len: length every encoded prefix is padded/truncated to by
            ``pad_sequences``. Defaults to 33, the width of the model's text
            input.

    Returns:
        Tuple ``(x1, x2, y)``:
            x1 -- array of image feature vectors, one row per sample.
            x2 -- integer array of shape (n_samples, max_len) with the padded
                  encoded prefixes.
            y  -- one-hot array of shape (n_samples, vocab_len) with the
                  target word of each sample.

    Raises:
        KeyError: if an image key is missing from ``img_captions_dict`` or
            ``img_features_dict``.
    """
    x1 = []
    x2 = []
    y = []
    for line in data:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line in the split file)
            # instead of failing with a KeyError on an empty image key.
            continue
        img_name = line.split('.')[0]
        captions = img_captions_dict[img_name]
        features = img_features_dict[img_name]
        for caption in captions:
            words = caption.split()
            for pos in range(1, len(words)):
                target = t.texts_to_sequences([words[pos]])[0]
                if not target:
                    # The tokenizer produced no id for this word (unknown or
                    # filtered out), so there is no class to predict: skip the
                    # sample rather than raising IndexError.
                    continue
                x1.append(features)
                x2.append(t.texts_to_sequences([words[:pos]])[0])
                y.append(target[0])
    x1 = np.array(x1)
    x2 = pad_sequences(x2, maxlen=max_len)
    y = to_categorical(y, vocab_len)
    return x1, x2, y

In [11]:
# Expand the training split into (image features, padded prefix, one-hot next word) arrays.
x1_train, x2_train, y_train = get_data(data=data_train)

In [12]:
# Show the shape of each training array, in the order x1, x2, y.
for train_array in (x1_train, x2_train, y_train):
    print(train_array.shape)

(305623, 4096)
(305623, 33)
(305623, 8362)


In [16]:
# Expand the validation split into (image features, padded prefix, one-hot next word) arrays.
x1_val, x2_val, y_val = get_data(data=data_validate)

In [17]:
# Show the shape of each validation array, in the order x1, x2, y.
for val_array in (x1_val, x2_val, y_val):
    print(val_array.shape)

(51499, 4096)
(51499, 33)
(51499, 8362)


In [7]:
# Image branch: 4096-d feature vector -> dropout -> 256-unit ReLU projection.
inputs1 = Input(shape=(4096,))
fe1 = Dropout(rate=0.5)(inputs1)
fe2 = Dense(units=256, activation='relu')(fe1)

# Text branch: 33 token ids -> 256-d embedding (id 0 masked) -> dropout -> LSTM.
inputs2 = Input(shape=(33,))
se1 = Embedding(input_dim=vocab_len, output_dim=256, mask_zero=True)(inputs2)
se2 = Dropout(rate=0.5)(se1)
se3 = LSTM(units=256)(se2)

# Decoder: sum both 256-d branches, then predict a distribution over the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(units=256, activation='relu')(decoder1)
outputs = Dense(units=vocab_len, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [8]:
# Print the layer-by-layer summary of the compiled model (output shapes,
# parameter counts and layer connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 256)      2140672     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0           input_3[0][0]                    
______________________________________________________________________________________________

In [None]:
# Train one epoch per fit() call so a checkpoint can be written after every
# epoch; the checkpoint file name carries the 1-based epoch number.
epochs = 5
train_inputs = [x1_train, x2_train]
validation_set = ([x1_val, x2_val], y_val)
for epoch_no in range(1, epochs + 1):
    model.fit(train_inputs, y_train, epochs=1, validation_data=validation_set)
    checkpoint_name = 'model_image_captioning_' + str(epoch_no) + '.h5'
    model.save(checkpoint_name)

Train on 305623 samples, validate on 51499 samples
Train on 305623 samples, validate on 51499 samples
Train on 305623 samples, validate on 51499 samples
Train on 305623 samples, validate on 51499 samples
Train on 305623 samples, validate on 51499 samples
 67552/305623 [=====>........................] - ETA: 24:45 - loss: 3.4475 - accuracy: 0.3098