In [1]:
import json as js
import string
import sys

import keras
import tensorflow as tf
from numpy import array
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input,Dropout,Dense,Embedding,LSTM,add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer as Tokenizer
from tensorflow.keras.utils import plot_model,to_categorical
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
def _load_captions(path):
    """Read one caption JSON file and return its decoded content."""
    with open(path) as handle:
        return js.load(handle)


def _normalise_caption(caption, punctuation_table):
    """Lower-case a caption, strip punctuation and keep alphabetic words of 2+ letters."""
    words = (token.lower().translate(punctuation_table) for token in caption.split())
    return ' '.join(word for word in words if len(word) > 1 and word.isalpha())


tcap = _load_captions("train_captions.json")
testcap = _load_captions("test_captions.json")
vcap = _load_captions("validation_captions.json")

# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)
for split in (tcap, testcap, vcap):
    # Snapshot the keys so the progress bar knows its total up front.
    for image_id in tqdm(list(split.keys())):
        captions = split[image_id]
        # Slice assignment rewrites the existing list object in place.
        captions[:] = [_normalise_caption(caption, table) for caption in captions]

# Write the cleaned captions back over the files they were read from.
for path, split in (("train_captions.json", tcap),
                    ("test_captions.json", testcap),
                    ("validation_captions.json", vcap)):
    with open(path, 'w') as handle:
        js.dump(split, handle)

100%|██████████████████████████████████████████████████████████████████████████| 49620/49620 [00:06<00:00, 7534.86it/s]
100%|██████████████████████████████████████████████████████████████████████████| 16541/16541 [00:02<00:00, 7435.93it/s]
100%|██████████████████████████████████████████████████████████████████████████| 16540/16540 [00:02<00:00, 7329.44it/s]


In [4]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Return the set of distinct words used across all captions.

    Args:
        descriptions: mapping whose values are iterables of caption strings.

    Returns:
        A ``set`` containing every whitespace-separated token that occurs in
        any caption. Empty input yields an empty set.
    """
    vocabulary = set()
    for captions in descriptions.values():
        for caption in captions:
            # split() with no argument splits on any run of whitespace.
            vocabulary.update(caption.split())
    return vocabulary
   
def to_lines(descriptions):
    """Flatten a caption mapping into a single list of caption strings.

    Args:
        descriptions: mapping whose values are iterables of caption strings.

    Returns:
        A new ``list`` with every caption, in mapping iteration order and, within
        one key, in the order the captions are stored.
    """
    return [caption for captions in descriptions.values() for caption in captions]
 
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    """Fit a fresh ``Tokenizer`` on every caption in ``descriptions`` and return it.

    The captions are flattened with ``to_lines`` before fitting.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(to_lines(descriptions))
    return caption_tokenizer

def max_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Raises:
        ValueError: if ``descriptions`` contains no captions at all.
    """
    word_counts = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(word_counts)


def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    """Expand captions into (photo, caption-prefix) -> next-word training samples.

    Each caption is encoded with ``tokenizer``; for a caption of ``n`` tokens,
    ``n - 1`` samples are produced: the first ``i`` tokens (left-padded to
    ``max_length``) are the input and token ``i`` is the target.

    ``pad_sequences`` and ``array`` come from the notebook's import cell.

    Args:
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        max_length: length every input prefix is padded (or truncated) to.
        descriptions: mapping of image key -> list of caption strings.
        photos: mapping of image key -> feature container; element ``[0]`` is
            stored as the photo input for every sample of that image.
        vocab_size: width of the one-hot target vectors.

    Returns:
        Tuple ``(X1, X2, y)`` of arrays: photo features, padded prefixes and
        one-hot encoded next words, aligned row by row.

    Note:
        Every sample holds a dense one-hot row of width ``vocab_size``, so
        memory grows with (total caption tokens x vocab_size).
    """
    X1, X2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            n_pairs = len(seq) - 1
            if n_pairs < 1:
                # fewer than two tokens: no (prefix, next word) pair exists
                continue
            # every proper prefix is an input; the token that follows it is the target
            prefixes = [seq[:i] for i in range(1, len(seq))]
            # pad and one-hot encode once per caption instead of once per prefix;
            # each row equals what a single-item call would have returned
            padded = pad_sequences(prefixes, maxlen=max_length)
            targets = to_categorical(seq[1:], num_classes=vocab_size)
            # photo lookup stays after the guard so that images without usable
            # captions are never looked up in ``photos``
            X1.extend([photos[key][0]] * n_pairs)
            X2.extend(padded)
            y.extend(targets)
    return array(X1), array(X2), array(y)

def RNN_LSTM_Model(vocab_size, max_length, feature_size=4096, units=256, dropout_rate=0.5):
    """Build and compile the two-input captioning model.

    One branch projects a photo feature vector to ``units`` dimensions, the
    other embeds a padded word-index sequence and runs it through an LSTM.
    The two ``units``-wide vectors are summed and decoded into a softmax over
    the vocabulary.

    Args:
        vocab_size: number of output classes and size of the embedding table.
        max_length: length of the padded input word sequence.
        feature_size: length of the photo feature vector (default 4096).
        units: width of the dense, embedding and LSTM layers (default 256).
        dropout_rate: dropout applied on both branches (default 0.5).

    Returns:
        The compiled ``Model`` taking ``[photo_features, word_sequence]``.

    Side effects:
        Prints the model summary and writes the architecture diagram to
        ``model.png`` via ``plot_model``.
    """
    # feature extractor branch
    image_input = Input(shape=(feature_size,))
    image_dropped = Dropout(dropout_rate)(image_input)
    image_vector = Dense(units, activation='relu')(image_dropped)
    # sequence branch; mask_zero=True makes index 0 (padding) ignored downstream
    caption_input = Input(shape=(max_length,))
    embedded = Embedding(vocab_size, units, mask_zero=True)(caption_input)
    embedded_dropped = Dropout(dropout_rate)(embedded)
    caption_vector = LSTM(units)(embedded_dropped)
    # decoder: element-wise sum requires both branches to be ``units`` wide
    merged = add([image_vector, caption_vector])
    hidden = Dense(units, activation='relu')(merged)
    outputs = Dense(vocab_size, activation='softmax')(hidden)
    # tie it together [image, seq] [word]
    model = Model(inputs=[image_input, caption_input], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line)
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model
def add_pre_post(descriptions):
    """Wrap every caption in ``startseq`` / ``endseq`` markers, in place.

    The caption lists inside ``descriptions`` are modified directly and the
    same mapping object is returned. Captions that already carry both markers
    are left untouched, so calling this again (e.g. when a notebook cell is
    re-run) does not stack a second pair of markers.

    Args:
        descriptions: mapping of key -> mutable list of caption strings.

    Returns:
        ``descriptions`` itself, after modification.
    """
    prefix, suffix = "startseq ", " endseq"
    wrapped_min_len = len(prefix) + len(suffix)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            already_wrapped = (
                len(caption) >= wrapped_min_len
                and caption.startswith(prefix)
                and caption.endswith(suffix)
            )
            if not already_wrapped:
                captions[idx] = prefix + caption + suffix
    return descriptions
            

In [12]:
# Fit the tokenizer on the training captions.
tokenizer = create_tokenizer(tcap)
# Shallow copy: a new dict that still shares the caption lists with tcap.
descriptions = dict(tcap)
# Keras word indices start at 1; the extra slot covers index 0, which the
# model's Embedding(mask_zero=True) treats as padding.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 19508


In [13]:
# Build the test-split arrays. The captions must come from the same split as
# the photo features, so pass `testcap` (the original passed the training
# captions `tcap`, whose keys are looked up in `test_features`). The padding
# length is still taken from the training captions.
# NOTE(review): `test_features` is not defined in the visible cells - confirm it
# is loaded before this cell runs.
# NOTE(review): only `tcap` is passed through add_pre_post in the visible cells;
# confirm `testcap` receives the same startseq/endseq markers.
X1test, X2test, ytest = create_sequences(tokenizer, max_length(tcap), testcap, test_features, vocab_size)

In [5]:
# add_pre_post edits the caption lists of `tcap` in place and returns the same
# dict, so `describe` and `tcap` refer to one object after this line.
# NOTE(review): this cell ran as In [5], i.e. before the tokenizer cell (In [12])
# that appears above it. On a top-to-bottom run the tokenizer would be fit
# before the startseq/endseq markers exist - consider moving this cell above
# the tokenizer cell.
describe = add_pre_post(tcap)

{'COCO_train2014_000000318556.jpg': ['startseq very clean and well decorated empty bathroom endseq',
  'startseq blue and white bathroom with butterfly themed wall tiles endseq',
  'startseq bathroom with border of butterflies and blue paint on the walls above it endseq',
  'startseq an angled view of beautifully decorated bathroom endseq',
  'startseq clock that blends in with the wall hangs in bathroom endseq'],
 'COCO_train2014_000000116100.jpg': ['startseq panoramic view of kitchen and all of its appliances endseq',
  'startseq panoramic photo of kitchen and dining room endseq',
  'startseq wide angle view of the kitchen work area endseq',
  'startseq multiple photos of brown and white kitchen endseq',
  'startseq kitchen that has checkered patterned floor and white cabinets endseq'],
 'COCO_train2014_000000134754.jpg': ['startseq the two people are walking down the beach endseq',
  'startseq two people carrying surf boards on beach endseq',
  'startseq two teenagers at white sande