In [1]:
import os

from keras.layers import Input, Dense, Dropout, \
                         RepeatVector, LSTM, concatenate, \
                         Conv2D, MaxPooling2D, Flatten
from keras.models import Sequential, Model
from keras.layers import Bidirectional
from keras.optimizers import RMSprop
from keras import *

Using TensorFlow backend.


In [2]:
from keras.models import model_from_json


class AModel:
    """Base wrapper that stores a Keras model and persists it to disk.

    The architecture is written as JSON and the weights as HDF5, both under
    ``output_path`` and both named after ``self.name``. Subclasses are
    expected to assign ``self.model`` and ``self.name``.
    """

    def __init__(self, input_shape, output_size, output_path):
        """Record the construction parameters; no model is built here.

        Args:
            input_shape: stored as ``self.input_shape`` for subclasses.
            output_size: stored as ``self.output_size`` for subclasses.
            output_path: directory used by ``save`` and ``load``.
        """
        self.model = None
        self.input_shape = input_shape
        self.output_size = output_size
        self.output_path = output_path
        self.name = ""

    def _artifact_path(self, name, extension):
        """Return ``"<output_path>/<name>.<extension>"``."""
        return "{}/{}.{}".format(self.output_path, name, extension)

    def save(self):
        """Write the architecture (JSON) and the weights (HDF5) to disk.

        ``output_path`` is created when it does not exist yet, so that a
        missing directory cannot make the write fail after training.
        """
        # Serialize first: if self.model is unusable we fail before touching disk.
        model_json = self.model.to_json()

        # An empty output_path is left alone (os.makedirs("") would raise).
        if self.output_path and not os.path.isdir(self.output_path):
            os.makedirs(self.output_path)

        with open(self._artifact_path(self.name, "json"), "w") as json_file:
            json_file.write(model_json)
        self.model.save_weights(self._artifact_path(self.name, "h5"))

    def load(self, name=""):
        """Rebuild ``self.model`` from a JSON/HDF5 pair saved in ``output_path``.

        Args:
            name: base file name to load; an empty string (the default)
                means ``self.name``.
        """
        output_name = self.name if name == "" else name
        with open(self._artifact_path(output_name, "json"), "r") as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights(self._artifact_path(output_name, "h5"))

In [3]:
# --- Sequence / image dimensions -------------------------------------------
# Number of time steps of the textual input; also the number of times the
# image encoding is repeated before being merged with the text encoding.
CONTEXT_LENGTH = 48
# Not referenced by the cells shown here -- presumably the side length of
# the input screenshots; confirm against the data-loading code.
IMAGE_SIZE = 256

# --- Training schedule -------------------------------------------------------
BATCH_SIZE = 64   # batch size passed to Model.fit
EPOCHS = 10       # epochs used by both fit and fit_generator
# Not referenced by the cells shown here; fit_generator receives its
# steps_per_epoch as an argument.
STEPS_PER_EPOCH = 72000

# --- Initialisation ----------------------------------------------------------
# Passed as kernel_initializer to the Conv2D / Dense / LSTM layers.
weight_init = "glorot_uniform"

In [4]:
class pix2code(AModel):
    """Two-input Keras model: an image plus a partial token sequence.

    A convolutional encoder turns the image into a 1024-unit vector that is
    repeated ``CONTEXT_LENGTH`` times; a stacked bidirectional LSTM encodes
    the textual input. Both encodings are concatenated and decoded by two
    further bidirectional LSTMs followed by a softmax over ``output_size``
    classes.
    """

    def __init__(self, input_shape, output_size, output_path):
        """Build and compile the network.

        Args:
            input_shape: shape of one input image; given to the first
                ``Conv2D`` layer and to the visual ``Input``.
            output_size: size of the last axis of the textual input and
                number of units of the final softmax layer.
            output_path: directory used by ``save`` / ``load``.
        """
        AModel.__init__(self, input_shape, output_size, output_path)
        self.name = "pix2code"

        # ---- Image encoder: three (conv, conv, max-pool, dropout) stages ----
        image_model = Sequential()
        image_model.add(Conv2D(32, (3, 3), padding='valid', activation='relu',
                               input_shape=input_shape, kernel_initializer=weight_init))
        image_model.add(Conv2D(32, (3, 3), padding='valid', activation='relu',
                               kernel_initializer=weight_init))
        image_model.add(MaxPooling2D(pool_size=(2, 2)))
        image_model.add(Dropout(0.25))

        image_model.add(Conv2D(64, (3, 3), padding='valid', activation='relu',
                               kernel_initializer=weight_init))
        image_model.add(Conv2D(64, (3, 3), padding='valid', activation='relu',
                               kernel_initializer=weight_init))
        image_model.add(MaxPooling2D(pool_size=(2, 2)))
        image_model.add(Dropout(0.25))

        image_model.add(Conv2D(128, (3, 3), padding='valid', activation='relu',
                               kernel_initializer=weight_init))
        image_model.add(Conv2D(128, (3, 3), padding='valid', activation='relu',
                               kernel_initializer=weight_init))
        image_model.add(MaxPooling2D(pool_size=(2, 2)))
        image_model.add(Dropout(0.25))

        # Fully connected head of the image encoder.
        image_model.add(Flatten())
        image_model.add(Dense(1024, activation='relu', kernel_initializer=weight_init))
        image_model.add(Dropout(0.3))
        image_model.add(Dense(1024, activation='relu', kernel_initializer=weight_init))
        image_model.add(Dropout(0.3))

        # One copy of the image vector per time step of the textual input,
        # so the two encodings can be concatenated step by step.
        image_model.add(RepeatVector(CONTEXT_LENGTH))

        visual_input = Input(shape=input_shape)
        encoded_image = image_model(visual_input)

        # ---- Text encoder: two stacked bidirectional LSTMs ----
        # (Used in place of an earlier plain LSTM(128) x 2 variant.)
        language_model = Sequential()
        language_model.add(Bidirectional(
            LSTM(128, return_sequences=True, kernel_initializer=weight_init),
            input_shape=(CONTEXT_LENGTH, output_size)))
        language_model.add(Bidirectional(
            LSTM(128, return_sequences=True, kernel_initializer=weight_init)))

        textual_input = Input(shape=(CONTEXT_LENGTH, output_size))
        encoded_text = language_model(textual_input)

        # ---- Decoder over the merged encodings ----
        # (Used in place of an earlier plain LSTM(512) x 2 variant.)
        decoder = concatenate([encoded_image, encoded_text])
        decoder = Bidirectional(
            LSTM(512, return_sequences=True, kernel_initializer=weight_init))(decoder)
        decoder = Dropout(0.25)(decoder)
        # return_sequences=False: only the final output feeds the classifier.
        decoder = Bidirectional(
            LSTM(512, return_sequences=False, kernel_initializer=weight_init))(decoder)
        decoder = Dense(output_size, activation='softmax',
                        kernel_initializer=weight_init)(decoder)

        self.model = Model(inputs=[visual_input, textual_input], outputs=decoder)

        optimizer = RMSprop(lr=0.0001, clipvalue=1.0)
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                           metrics=['accuracy'])

    def fit(self, images, partial_captions, next_words):
        """Train on in-memory arrays, save the model, and return the history.

        Runs for ``EPOCHS`` epochs with batches of ``BATCH_SIZE`` and
        ``shuffle=False``.

        Returns:
            Whatever ``self.model.fit`` returns, mirroring ``fit_generator``.
        """
        history = self.model.fit([images, partial_captions], next_words,
                                 shuffle=False, epochs=EPOCHS,
                                 batch_size=BATCH_SIZE, verbose=1)
        self.save()
        return history

    def fit_generator(self, generator, steps_per_epoch):
        """Train from ``generator`` for ``EPOCHS`` epochs, then save the model.

        Returns:
            Whatever ``self.model.fit_generator`` returns.
        """
        history = self.model.fit_generator(generator, steps_per_epoch=steps_per_epoch,
                                           epochs=EPOCHS, verbose=1)
        self.save()
        return history

    def predict(self, image, partial_caption):
        """Return the first entry of the model's prediction for the inputs."""
        return self.model.predict([image, partial_caption], verbose=0)[0]

    def predict_batch(self, images, partial_captions):
        """Return the model's full prediction output for the inputs."""
        return self.model.predict([images, partial_captions], verbose=1)