In [8]:
import keras
from keras_applications import inception_v3
from keras_preprocessing.image import img_to_array, load_img

img_path = "datasets/flickr8k/Flickr8k_Dataset/667626_18933d713e.jpg"
img = load_img(img_path, target_size=(299, 299))
img_array = img_to_array(img)
img_array = inception_v3.preprocess_input(img_array)

In [9]:
from keras_preprocessing.text import Tokenizer

all_sentences = ["A women lay in side of pool.", "A young girl is lying in the sand , while ocean water is surrounding her.", "Girl wearing a bikini lying on her back in a shallow pool of clear blue water."]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)

sentences_to_encode = all_sentences[:2]
encoded_sentences = tokenizer.texts_to_sequences(sentences_to_encode)

In [10]:
from keras_applications.inception_v3 import InceptionV3
from keras.layers import BatchNormalization, Dense, RepeatVector

image_model = InceptionV3(include_top=False, weights='imagenet', pooling='avg')

for layer in image_model.layers:
    layer.trainable = False
    
embedding_size = 300
dense_input = BatchNormalization(axis=-1)(image_model.output)
image_dense = Dense(units=embedding_size)(dense_input)

image_embedding = RepeatVector(1)(image_dense)
image_input = image_model.input

In [11]:
from keras.layers import Embedding, Input

vocab_size = 2536
embedding_size = 300

sentence_input = Input(shape=[None])
word_embedding = Embedding(input_dim=vocab_size,
                           output_dim=embedding_size
                           )(sentence_input)

In [12]:
import tensorflow as tf

def categorical_crossentropy_from_logits(y_true, y_pred):
  y_true = y_true[:, :-1, :]  # Discard the last timestep
  y_pred = y_pred[:, :-1, :]  # Discard the last timestep
  loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_true,
                                                 logits=y_pred)
  return loss

def categorical_accuracy_with_variable_timestep(y_true, y_pred):
  y_true = y_true[:, :-1, :]  # Discard the last timestep
  y_pred = y_pred[:, :-1, :]  # Discard the last timestep

  # Flatten the timestep dimension
  shape = tf.shape(y_true)
  y_true = tf.reshape(y_true, [-1, shape[-1]])
  y_pred = tf.reshape(y_pred, [-1, shape[-1]])

  # Discard rows that are all zeros as they represent padding words.
  is_zero_y_true = tf.equal(y_true, 0)
  is_zero_row_y_true = tf.reduce_all(is_zero_y_true, axis=-1)
  y_true = tf.boolean_mask(y_true, ~is_zero_row_y_true)
  y_pred = tf.boolean_mask(y_pred, ~is_zero_row_y_true)

  accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_true, axis=1),
                                              tf.argmax(y_pred, axis=1)),
                                    dtype=tf.float32))
  return accuracy

In [13]:
from keras.layers import (BatchNormalization, Concatenate, Dense, LSTM,TimeDistributed)
from keras.models import Model
from keras.optimizers import Adam

sequence_input = Concatenate(axis=1)([image_embedding, word_embedding])

learning_rate = 0.00051
lstm_output_size = 300
vocab_size = 2536
lstm_layers = 3
dropout_rate = 0.22
input_ = sequence_input

for _ in range(lstm_layers):
  input_ = BatchNormalization(axis=-1)(input_)
  lstm_out = LSTM(units=lstm_output_size,
                  return_sequences=True,
                  dropout=dropout_rate,
                  recurrent_dropout=dropout_rate)(input_)
  input_ = lstm_out
sequence_output = TimeDistributed(Dense(units=vocab_size))(lstm_out)

model = Model(inputs=[image_input, sentence_input],
              outputs=sequence_output)
model.compile(optimizer=Adam(lr=learning_rate),
              loss=categorical_crossentropy_from_logits,
              metrics=[categorical_accuracy_with_variable_timestep])


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [18]:
from .dataset_providers import DatasetProvider

dataset_provider = DatasetProvider()

model.fit_generator(generator=dataset_provider.training_set(),
                    steps_per_epoch=dataset_provider.training_steps,
                    epochs=1,
                    validation_data=dataset_provider.validation_set(),
                    validation_steps=dataset_provider.validation_steps)

ModuleNotFoundError: No module named 'dataset_providers'