# CNN-LSTM (with soft attention) model

This notebook aims the construction, training and test of a CNN-LSTM model purposed in Vinyals et
al. [15].

The CNN-LSTM model is a encoder-decoder designed for image caption generation.

### Setup of libraries

In the next cell we link the GPU hardware to tensorflow for use this component in the training process.

In [1]:
import csv # To read the captions of a csv
import tensorflow as tf # To build and train the ANN
from tensorflow.keras.preprocessing.text import Tokenizer # To use the tokenizer to split the words
from tensorflow.keras.preprocessing.sequence import pad_sequences # 
import numpy as np
import pickle
from datetime import datetime
import time
import pandas as pd

In [None]:
tf.config.set_visible_devices([], 'GPU')

## Data reading

First, we upload the descriptions of a CSV to a list and the image id to another list to build the path where the images are stored.

In [2]:
def idToPath(id_image):
    id_image = str(id_image)
    complete_name=id_image+".jpg"
    while len(complete_name)<16:
        complete_name="0"+complete_name
    return "data/train2017/"+complete_name

In [3]:
PATH = "data/train_machine_spanish.xlsx"
df = pd.read_excel(PATH, names=["id_image","caption"])
df['caption'] = df.apply(lambda x: "smark "+x['caption']+" emark", axis=1)
df['id_image'] = df.apply(lambda x: idToPath(x['id_image']),axis=1)

In [4]:
PATH = "data/validation.xlsx"
val_df = pd.read_excel(PATH, names=["id_image","caption"])
val_df['caption'] = val_df.apply(lambda x: "smark "+x['caption']+" emark", axis=1)
val_df['id_image'] = val_df.apply(lambda x: idToPath(x['id_image']),axis=1)

### Setup of text data

We load the tokenizer that we initialized in word embeddings notebook.

In [5]:
# loading
with open('items/tokenizer_spanish.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

We transform the descriptions into a list of integers that represent the index in the tokenizer's word index vocabulary.

In [6]:
sentences_x = tokenizer.texts_to_sequences(df['caption'])

And its equivalent for the labels that are the same lists with each element moved one position to the left

In [7]:
sentences_y = [sentence[1:] for sentence in sentences_x]

We put pads on the right for every sentence that has less than 15 words.

In [8]:
pad_sentences_x = pad_sequences(sentences_x, padding='post',maxlen=15)
pad_sentences_y = pad_sequences(sentences_y, padding='post',maxlen=15)

In [9]:
pad_sentences_x[0]

array([   2,    5,   55,    7,    1,  279,  230,    9, 2988,  450,    8,
        139,    3,    0,    0], dtype=int32)

In [10]:
pad_sentences_y[0]

array([   5,   55,    7,    1,  279,  230,    9, 2988,  450,    8,  139,
          3,    0,    0,    0], dtype=int32)

### Model building

The embedding layer loaded from the model stored in word embeddings notebook. This model transform word indexes to embeddings for encoder inputs.

In [11]:
EPOCHS = 100
BATCH_SIZE=64

In [12]:
class Embeddings_Model(tf.keras.Model):
    
    def __init__(self, max_length, embedding_dimension):
        super(Embeddings_Model, self).__init__()
        weights = None
        # Load the weight of the layer embedding pre-trained
        with open('items/embeddingLayerWeights_spanish.pkl', 'rb') as handle:
            weights = pickle.load(handle)
        self.embedding = tf.keras.layers.Embedding(max_length+1, embedding_dimension ,weights=[weights])
        
    def call(self, inputs):
        x = self.embedding(inputs)
        
        return x

In [13]:
class CNN_Model(tf.keras.Model):
    
    def __init__(self):
        super(CNN_Model, self).__init__()
        pre_trained_model = tf.keras.applications.VGG16(include_top=False, weights='imagenet',
                                                            input_shape=(224, 224, 3))
        self.input_model = tf.keras.Sequential()
        for layer in pre_trained_model.layers[:-1]:
            self.input_model.add(layer)
    def call(self, inputs):
        x = self.input_model(inputs)
        return x

In [14]:
class AnnotationVectorInitializer(tf.keras.Model):
    
    def __init__(self, embedding_dimension):
        super(AnnotationVectorInitializer, self).__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.dense = tf.keras.layers.Dense(embedding_dimension, activation='tanh')
    def call(self, inputs):
        x = self.flatten(inputs)
        x = self.dense(x)
        return x

In [15]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, 64, hidden_size)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        
        # attention_weights shape == (batch_size, 64, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [16]:
class LSTM_Model(tf.keras.Model):
    
    def __init__(self, max_length, embedding_dimension, num_result_words):
        super(LSTM_Model, self).__init__()
        self.embedding_dimension = embedding_dimension
        self.lstm = tf.keras.layers.LSTM(embedding_dimension, input_shape=(num_result_words, embedding_dimension),
                                         return_sequences=True,dropout=0.1, return_state=True,
                                        recurrent_initializer='glorot_uniform')
        self.output_model = tf.keras.layers.Dense(max_length)
        self.flatten = tf.keras.layers.Flatten()
        self.softAttention = BahdanauAttention(embedding_dimension)
        
    def call(self, captions, initial_state1, initial_state2, features):
        #captions, initial_state, features = inputs[0],inputs[1], inputs[2]
        captions = tf.reshape(captions, [-1,1,captions.shape[-1]])
        features = tf.reshape(features,[-1,features.shape[-3]*features.shape[-2],512])
        context_vector, attention_weights = self.softAttention(features, initial_state1)
        z = tf.concat([tf.expand_dims(context_vector, 1), captions], axis=-1)
        x, state1, state2 = self.lstm(z,initial_state=[initial_state1, initial_state2])
        x = self.output_model(x)

        return x, state1, state2, attention_weights

In [17]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [41]:
class CNN_LSTM_Model():
    
    # This function declare every layer of the model
    def __init__(self):        
        # Number of differents words in the tokenizer vocabulary.
        self.max_length = 14276
        # Embedding dimension
        self.embedding_dimension = 512
        # Number of LSTM cells
        self.cells=1024
        # Maximum number or words that the model is able to generate in a caption.
        self.num_result_words = 15
        # Optimizer used
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        # Initialize the encoder
        self.encoder = CNN_Model()
        # Initialize the embedding layer
        self.embedding = Embeddings_Model(self.max_length, self.embedding_dimension)
        # Initialize the decoder
        self.decoder = LSTM_Model(self.max_length, self.cells, self.num_result_words)
        # Initializers of hidden states
        self.initializer1 = AnnotationVectorInitializer(self.cells)
        self.initializer2 = AnnotationVectorInitializer(self.cells)
    
    # This function load all images for the encoder can transform them to image embeddings
    # Parameters:
    #            image_paths: Collection of path to images
    #            load: Boolean that indicate if load all images or read a file with the embeddings
    # Return:
    #        A list with the image embeddings
    def encoder_predict(self, image_paths, load=False, test=False):
        predictions = []
        contador=0
        if load or test:
            # If load=True then we need to predict all image embeddings with the encoder
            # and stored for future uses
            for image_path in image_paths:
                # Visual indicator for longs process
                if contador%1000==0:
                    print("Procesando imagen",contador)
                contador+=1
                # Read the file with tensorflow function
                image = tf.io.read_file(image_path)
                # Transform the image to jpeg in RGB color space
                image = tf.image.decode_jpeg(image, channels=3)
                # Resize to inception_v3 input size
                image = tf.image.resize(image, (224, 224))
                # Add a dimension to tensor for the model
                image = np.expand_dims(image, axis=0)
                # Add the result of the prediction to the list
                predictions.append(self.encoder.predict(image))
            # Store the image embeddings list
            if not test:
                with open('items/VGGimage_embeddings_list.pkl', 'wb') as handle:
                    pickle.dump(predictions, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            # Load the image embeddings list
            with open('items/VGGimage_embeddings_list.pkl', 'rb') as handle:
                predictions = pickle.load(handle)[:len(image_paths)]
        
        return predictions
    
    
    # This function preprocess the data
    # Parameters:
    #           X_image: Image path list
    #           X_caption: Caption list
    #           Y_caption: Label caption list for compare with the predict caption
    def preprocess_data(self,X_image,X_caption,Y_caption, test=False, load=False):
        # Get the image embeddings
        image_embeddings = np.array(self.encoder_predict(X_image,load=load,test=test))
        
        # Lambda function for map the word embeddings
        map_embeddings = lambda x: self.embedding.predict(np.array([x]))
        # Map the word embeddings
        X_caption_embeddings = map_embeddings(X_caption)
        
        # A set of transforms in numpy array sizes
        Y_caption_embeddings = np.array(Y_caption)
        X_caption_embeddings = X_caption_embeddings.reshape((
            -1, self.num_result_words, self.embedding_dimension))
        Y_caption_embeddings = Y_caption_embeddings.reshape((
            -1, self.num_result_words))
        
        return image_embeddings, X_caption_embeddings, Y_caption_embeddings
    
    
    # This function train the model
    # Parameters:
    #           X_image: Image path list
    #           X_caption: Caption list
    #           Y_caption: Label caption list for compare with the predict caption
    def train(self, X_image, X_caption, Y_caption):
        loss = 0
        
        # We manually iterate the epochs
        for epoch in range(EPOCHS):
            # We build the batchs
            for batch in range(0,X_image.shape[0],BATCH_SIZE):
                image_embeddings_batch = None
                X_caption_embedding_batch = None
                Y_caption_embedding_batch = None
                if batch+BATCH_SIZE>=len(X_image):
                    image_embeddings_batch = X_image[batch:]
                else:
                    image_embeddings_batch = X_image[batch:batch+BATCH_SIZE]
                if batch+BATCH_SIZE>=len(X_caption):
                    X_caption_embedding_batch = X_caption[batch:]
                else:
                    X_caption_embedding_batch = X_caption[batch:batch+BATCH_SIZE]
                if batch+BATCH_SIZE>=len(Y_caption):
                    Y_caption_embedding_batch = Y_caption[batch:]
                else:
                    Y_caption_embedding_batch = Y_caption[batch:batch+BATCH_SIZE]
                # We manually compute the gradients from the loss
                # Reshape of the data
                image_embeddings_batch, X_caption_embedding_batch, Y_caption_embedding_batch = self.preprocess_data(
                    image_embeddings_batch,
                    X_caption_embedding_batch,
                    Y_caption_embedding_batch,
                    test=True,
                    load=False)
                with tf.GradientTape() as tape:
                    # Get the initial values of the hidden states
                    state_t1 = self.initializer1(tf.reduce_mean(image_embeddings_batch, axis=-1))
                    state_t2 = self.initializer2(tf.reduce_mean(image_embeddings_batch, axis=-1))
                    loss = 0.
                    # Each step computes the loss of error words in the sentences
                    for step in range(0,X_caption_embedding_batch.shape[1]):
                        token_t = X_caption_embedding_batch[:,:,step]
                        output_t, state_t1,state_t2, attention_weights = self.decoder(
                            token_t,
                            state_t1,
                            state_t2,
                            image_embeddings_batch)
                        loss += loss_function(Y_caption_embedding_batch[:,step],output_t[:,0])
                if(batch%100*BATCH_SIZE==0):
                    print("Epoch",epoch,"batch",batch,"Mean loss",float(loss/X_caption_embedding_batch.shape[2]),
                         datetime.now().strftime("%d-%b-%Y (%H:%M:%S.%f)"))
                # Get the trainable weights
                trainable_variables=self.decoder.trainable_variables+self.initializer1.trainable_variables+self.initializer2.trainable_variables
                # Compute the gradients
                gradients = tape.gradient(loss, trainable_variables)
                # Apply gradients to trainable weights
                self.optimizer.apply_gradients(zip(gradients, trainable_variables))
                
    # This function get the prediction by trained model
    # Parameters:
    #           X_image: Image path list
    #           X_caption: Caption list
    #           Y_caption: Label caption list for compare with the predict caption
    def predict(self, X_image, X_caption, Y_caption):
        
        # Reshape of the data
        image_embeddings, X_caption_embeddings, Y_caption_embeddings = self.preprocess_data(
            X_image,
            X_caption,
            Y_caption,
            test=True)
        
        images_list = []
        captions_list = []
        attention_list = []
        for example in range(0,len(image_embeddings)):
            tokens = []
            at = []
            input_token = tokenizer.word_index['smark']
            hidden_state1 = self.initializer1(tf.reduce_mean(tf.expand_dims(image_embeddings[example], 0), axis=-1))
            hidden_state2 = self.initializer2(tf.reduce_mean(tf.expand_dims(image_embeddings[example], 0), axis=-1))
            while input_token != tokenizer.word_index['emark'] and len(tokens)<self.num_result_words:
                tokens.append(input_token)
                tokens_array = self.embedding.predict([input_token])
                tokens_array = tf.expand_dims(tokens_array, 0)
                output_t, hidden_state1, hidden_state2, attention_weights = self.decoder(
                    tokens_array,
                    hidden_state1,
                    hidden_state2,
                    image_embeddings[example])
                input_token = np.argmax(output_t)
                at.append(attention_weights)
                
            images_list.append(X_image[example])
            captions_list.append(tokenizer.sequences_to_texts([tokens]))
            attention_list.append(at)
            if example%100==0:
                print("%d images predicted"%example)
        
        return images_list,captions_list,attention_list

In [42]:
model = CNN_LSTM_Model()

In [26]:
model.train(df['id_image'], pad_sentences_x, pad_sentences_y)

('Procesando imagen', 0)
('batch', 0)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Epoch', 0, 'batch', 0, 'Mean loss', 0.21451999247074127, '26-May-2020 (20:07:49.929571)')
('Procesando imagen', 0)
('batch', 64)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 128)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 192)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 256)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 320)
(

('Procesando imagen', 0)
('batch', 2944)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 3008)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 3072)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 3136)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 3200)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Epoch', 0, 'batch', 3200, 'Mean loss', 0.12354955822229385, '26-May-2020 (20:09:36.703887)')
('Procesando imagen', 0)
('bat

('Procesando imagen', 0)
('batch', 5888)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 5952)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 6016)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 6080)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 6144)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 6208)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))

('Procesando imagen', 0)
('batch', 8832)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 8896)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 8960)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 9024)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 9088)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 9152)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))

('Procesando imagen', 0)
('batch', 11776)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 11840)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 11904)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 11968)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 12032)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 12096)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

('Procesando imagen', 0)
('batch', 14720)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 14784)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 14848)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 14912)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 14976)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 15040)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

('batch', 17664)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 17728)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 17792)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 17856)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 17920)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 17984)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings

('Procesando imagen', 0)
('batch', 20672)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 20736)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 20800)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Epoch', 0, 'batch', 20800, 'Mean loss', 0.10629589110612869, '26-May-2020 (20:19:33.662316)')
('Procesando imagen', 0)
('batch', 20864)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 20928)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)

('Procesando imagen', 0)
('batch', 23616)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 23680)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 23744)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 23808)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 23872)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 23936)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

('Procesando imagen', 0)
('batch', 26560)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 26624)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 26688)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 26752)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 26816)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 26880)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

('Procesando imagen', 0)
('batch', 29504)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 29568)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 29632)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 29696)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 29760)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 29824)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

('Procesando imagen', 0)
('batch', 32448)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 32512)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 32576)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 32640)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 32704)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 32768)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

('Procesando imagen', 0)
('batch', 35392)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 35456)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 35520)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 35584)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 35648)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 35712)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

('batch', 38336)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 38400)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Epoch', 0, 'batch', 38400, 'Mean loss', 0.10676458477973938, '26-May-2020 (20:29:32.300943)')
('Procesando imagen', 0)
('batch', 38464)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 38528)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 38592)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 38656)
('X_cap

('Procesando imagen', 0)
('batch', 41280)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 41344)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 41408)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 41472)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 41536)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64, 15))
('image_embeddings_batch', (64, 1, 14, 14, 512))
('Procesando imagen', 0)
('batch', 41600)
('X_caption_embedding_batch', (64, 15, 512))
('Y_caption_embedding_batch', (64

KeyboardInterrupt: 

In [28]:
PATH = "data/validation.xlsx"
df_test = pd.read_excel(PATH, names=["id_image","caption"])
df_test['caption'] = df_test.apply(lambda x: "smark "+x['caption']+" emark", axis=1)
df_test['id_image'] = df_test.apply(lambda x: idToPath(x['id_image']),axis=1)

In [29]:
test_sentences_x = tokenizer.texts_to_sequences(df_test['caption'])

In [30]:
test_sentences_y = [sentence[1:] for sentence in test_sentences_x]

In [31]:
test_pad_sentences_x = pad_sequences(test_sentences_x, padding='post',maxlen=15)
test_pad_sentences_y = pad_sequences(test_sentences_y, padding='post',maxlen=15)

In [43]:
img, captions, attentions = model.predict(df_test['id_image'], test_pad_sentences_x, test_pad_sentences_y)

('Procesando imagen', 0)
('Procesando imagen', 1000)
('Procesando imagen', 2000)
('Procesando imagen', 3000)
('Procesando imagen', 4000)
0 images predicted
100 images predicted


KeyboardInterrupt: 

In [None]:
for i in range(len(img)):
    print(img[i])
    print(captions[i])

In [None]:
result_df = pd.DataFrame(columns=["id_image","caption"])
for i in range(len(img)):
    image = img[i][:-4].split('/')[-1]
    caption = captions[i][0][6:]
    result_df.loc[i] = [image,caption]

In [None]:
result_df.to_csv("results/val_M019_I.csv", encoding = 'utf-8',index=False)

## Implementacion del modelo

In [None]:
class CNN_LSTM_Model():
    
    def __init__(self):        
        self.max_length = 4028
        self.embedding_dimension = 512
        self.cells=1024
        self.num_result_words = 15
        
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.encoder = CNN_Model(self.embedding_dimension)
        self.embedding = Embeddings_Model(self.max_length, self.embedding_dimension)
        self.decoder = LSTM_Model(self.max_length, self.cells, self.num_result_words)
        
        self.initializer1 = AnnotationVectorInitializer(self.cells)
        self.initializer2 = AnnotationVectorInitializer(self.cells)
    
    # Esta funcion recibe un batch de imagenes que pre-procesa y alimenta a la GoogleNet.
    # La salida es el batch correspondiente a sus image embeddings.
    def encoder_predict(self, image_paths):
        predictions = []
        contador=0
        for image_path in image_paths:
            if contador%1000==0:
                print("Procesando imagen",contador,
                     datetime.now().strftime("%d-%b-%Y (%H:%M:%S.%f)"))
            contador+=1
            image = tf.io.read_file(image_path)
            image = tf.image.decode_jpeg(image, channels=3)
            image = tf.image.resize(image, (224, 224))
            #image = tf.keras.applications.VGG16.preprocess_input(image)
            image = np.expand_dims(image, axis=0)
            predictions.append(self.encoder.predict(image))
        
        return predictions
    
    # This function preprocess the data
    # Parameters:
    #           X_image: Image path list
    #           X_caption: Caption list
    #           Y_caption: Label caption list for compare with the predict caption
    def preprocess_data(self,X_image,X_caption,Y_caption, test=False):
        # Get the image embeddings
        image_embeddings = np.array(self.encoder_predict(X_image, test))
        
        # Lambda function for map the word embeddings
        map_embeddings = lambda x: self.embedding.predict(np.array([x]))
        # Map the word embeddings
        X_caption_embeddings = map_embeddings(X_caption)
        
        # A set of transforms in numpy array sizes
        Y_caption_embeddings = np.array(Y_caption)
        X_caption_embeddings = X_caption_embeddings.reshape((
            -1, self.num_result_words, self.embedding_dimension))
        Y_caption_embeddings = Y_caption_embeddings.reshape((
            -1, self.num_result_words))
        
        return image_embeddings, X_caption_embeddings, Y_caption_embeddings
    
    # This function train the model
    # Parameters:
    #           X_image: Image path list
    #           X_caption: Caption list
    #           Y_caption: Label caption list for compare with the predict caption
    def train(self, X_image, X_caption, Y_caption):
        loss = 0
        
        # Reshape of the data
        image_embeddings, X_caption_embeddings, Y_caption_embeddings = self.preprocess_data(
            X_image,
            X_caption,
            Y_caption,
            False)
        
        split_threshold = int(len(X_image)*-0.1)
        
        # We manually iterate the epochs
        for epoch in range(EPOCHS):
            # We build the batchs
            for batch in range(0,image_embeddings[:split_threshold].shape[0],BATCH_SIZE):
                image_embeddings_batch = None
                X_caption_embedding_batch = None
                Y_caption_embedding_batch = None
                if batch+BATCH_SIZE>=len(image_embeddings[:split_threshold]):
                    image_embeddings_batch = image_embeddings[:split_threshold][batch:]
                else:
                    image_embeddings_batch = image_embeddings[:split_threshold][batch:batch+BATCH_SIZE]
                if batch+BATCH_SIZE>=len(X_caption_embeddings[0][:split_threshold]):
                    X_caption_embedding_batch = X_caption_embeddings[:,:split_threshold][:,batch:]
                else:
                    X_caption_embedding_batch = X_caption_embeddings[:,:split_threshold][:,batch:batch+BATCH_SIZE]
                if batch+BATCH_SIZE>=len(Y_caption_embeddings[:split_threshold]):
                    Y_caption_embedding_batch = Y_caption_embeddings[:split_threshold][batch:]
                else:
                    Y_caption_embedding_batch = Y_caption_embeddings[:split_threshold][batch:batch+BATCH_SIZE]
                # We manually compute the gradients from the loss
                with tf.GradientTape() as tape:
                    # Get the initial values of the hidden states
                    state_t1 = self.initializer1(tf.reduce_mean(image_embeddings_batch, axis=-1))
                    state_t2 = self.initializer2(tf.reduce_mean(image_embeddings_batch, axis=-1))
                    loss = 0.
                    # Each step computes the loss of error words in the sentences
                    for step in range(0,X_caption_embedding_batch.shape[2]):
                        token_t = X_caption_embedding_batch[:,:,step]
                        output_t, state_t1,state_t2, attention_weights = self.decoder(
                            token_t,state_t1,state_t2,image_embeddings_batch)
                        loss += loss_function(Y_caption_embedding_batch[:,step],output_t[:,0])
                if(batch%100*BATCH_SIZE==0):
                    print("Epoch",epoch,"batch",batch,"Mean loss",float(loss/X_caption_embedding_batch.shape[2]),
                         datetime.now().strftime("%d-%b-%Y (%H:%M:%S.%f)"))
                # Get the trainable weights
                trainable_variables=self.decoder.trainable_variables+self.initializer1.trainable_variables+self.initializer2.trainable_variables
                # Compute the gradients
                gradients = tape.gradient(loss, trainable_variables)
                # Apply gradients to trainable weights
                self.optimizer.apply_gradients(zip(gradients, trainable_variables))
            

        images_list = []
        captions_list = []
        attention_list = []
        image_val = image_embeddings[split_threshold:]
        for example in range(len(image_val)):
            tokens = []
            at = []
            input_token = tokenizer.word_index['smark']
            hidden_state1 = self.initializer1(tf.reduce_mean(tf.expand_dims(image_val[example], 0), axis=-1))
            hidden_state2 = self.initializer2(tf.reduce_mean(tf.expand_dims(image_val[example], 0), axis=-1))
            while input_token != tokenizer.word_index['emark'] and len(tokens)<self.num_result_words:
                tokens.append(input_token)
                tokens_array = map_embeddings([input_token])
                tokens_array = tf.expand_dims(tokens_array, 0)
                #tokens_array = tokens_array.reshape((1, -1, self.embedding_dimension))
                output_t, hidden_state1, hidden_state2, attention_weights = self.decoder(tokens_array,hidden_state1,hidden_state2,image_val[example])
                input_token = np.argmax(output_t)
                at.append(attention_weights)
            images_list.append(X_image[split_threshold:][example])
            captions_list.append(tokenizer.sequences_to_texts([tokens]))
            attention_list.append(at)
        
        return images_list,captions_list,attention_list

In [None]:
class AnnotationVectorInitializer(tf.keras.Model):
    
    def __init__(self, embedding_dimension):
        super(AnnotationVectorInitializer, self).__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.dense = tf.keras.layers.Dense(embedding_dimension, activation='tanh')
    def call(self, inputs):
        x = self.flatten(inputs)
        x = self.dense(x)
        return x

In [None]:
class CNN_Model(tf.keras.Model):
    
    def __init__(self, embedding_dimension):
        super(CNN_Model, self).__init__()
        pre_trained_model = tf.keras.applications.VGG16(include_top=False, weights='imagenet',
                                                            input_shape=(224, 224, 3))
        self.input_model = tf.keras.Sequential()
        for layer in pre_trained_model.layers[:-1]:
            self.input_model.add(layer)
    def call(self, inputs):
        x = self.input_model(inputs)
        return x

In [None]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, 64, hidden_size)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        
        # attention_weights shape == (batch_size, 64, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [None]:
class LSTM_Model(tf.keras.Model):
    
    def __init__(self, max_length, embedding_dimension, num_result_words):
        super(LSTM_Model, self).__init__()
        self.embedding_dimension = embedding_dimension
        self.lstm = tf.keras.layers.LSTM(embedding_dimension, input_shape=(num_result_words, embedding_dimension),
                                         return_sequences=True,dropout=0.1, return_state=True,
                                        recurrent_initializer='glorot_uniform')
        self.output_model = tf.keras.layers.Dense(max_length)
        self.flatten = tf.keras.layers.Flatten()
        self.softAttention = BahdanauAttention(embedding_dimension)
        
    def call(self, captions, initial_state1, initial_state2, features):
        #captions, initial_state, features = inputs[0],inputs[1], inputs[2]
        captions = tf.reshape(captions, [-1,1,captions.shape[-1]])
        features = tf.reshape(features,[-1,features.shape[-3]*features.shape[-2],512])
        context_vector, attention_weights = self.softAttention(features, initial_state1)
        z = tf.concat([tf.expand_dims(context_vector, 1), captions], axis=-1)
        x, state1, state2 = self.lstm(z,initial_state=[initial_state1, initial_state2])
        x = self.output_model(x)

        return x, state1, state2, attention_weights

In [None]:
class Embeddings_Model(tf.keras.Model):
    
    def __init__(self, max_length, embedding_dimension):
        super(Embeddings_Model, self).__init__()
        weights = None
        with open('embeddingLayerWeights512.pkl', 'rb') as handle:
            weights = pickle.load(handle)
        self.embedding = tf.keras.layers.Embedding(max_length+1, embedding_dimension ,weights=[weights])
        
    def call(self, inputs):
        x = self.embedding(inputs)
        
        return x

In [None]:
def sparse_cross_entropy(y_true, y_pred):
    """
    Calculate the cross-entropy loss between y_true and y_pred.
    
    y_true is a 2-rank tensor with the desired output.
    The shape is [batch_size, sequence_length] and it
    contains sequences of integer-tokens.

    y_pred is the decoder's output which is a 3-rank tensor
    with shape [batch_size, sequence_length, num_words]
    so that for each sequence in the batch there is a one-hot
    encoded array of length num_words.
    """
    #y_true = tf.reshape(y_true, [-1,15])
    y_true = tf.dtypes.cast(y_true, tf.int32)
    # Calculate the loss. This outputs a
    # 2-rank tensor of shape [batch_size, sequence_length]
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)
    
    # Keras may reduce this across the first axis (the batch)
    # but the semantics are unclear, so to be sure we use
    # the loss across the entire 2-rank tensor, we reduce it
    # to a single scalar with the mean function.
    loss_mean = tf.reduce_mean(loss)
    return loss_mean

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


In [None]:
model = CNN_LSTM_Model()

In [None]:
len(pad_sentences_y)

In [None]:
img, captions, attentions = model.train_on_batch(path_dataset[:9000], pad_sentences_x[:9000], pad_sentences_y[:9000])

In [None]:
for i in range(len(img)):
    print(img[i])
    print(captions[i])

In [None]:
!pip install Pillow

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
attention_plot = np.zeros((15, 196))

print(len(attentions[0][0]))

attention_plot = attention_plot[:len(result), :]
len_result = len(img)
for i in range(len(img)): 
    temp_image = np.array(Image.open(image))

    fig = plt.figure(figsize=(10, 10))
    for l in range(len_result):
        temp_att = np.resize(attention_plot[l], (14, 14))
        ax = fig.add_subplot(len_result//2, len_result//2, l+1)
        ax.set_title(result[l])
        img = ax.imshow(temp_image)
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.tight_layout()
    plt.show()


In [None]:
self.embedding.predict(np.array([1]))

In [None]:
#contador = len(path_dataset)-len(predictions)
#print(contador)
for caption in predictions:
    index_list = []
    for token in caption:
        index_list.append(np.argmax(token))
    tokens_list = tokenizer.sequences_to_texts([index_list])
    print(path_dataset[contador])
    print(tokens_list)
    contador+=1

In [None]:
# This variable adjusts the maximum length of the vocabulary
# If the number is less than the actual length, we remove the least used words
max_length = 4028
# This variable adjust the dimensions of the embeddings. A high value may represent more complex embeddings
# but the artificial neural network will be larger.
embedding_dimension=1000

num_words = 42

Para el encoder vamos a usar los embeddings previamente creados de las imagenes, los cuales anadimos al modelo mediante una fully connected. La salida sera el estado inicial que reciba el decoder.

In [None]:
encoder_input = tf.keras.layers.Input(shape=(embedding_dimension, ))

In [None]:
encoder = tf.keras.layers.Dense(embedding_dimension,
                             activation='tanh')

In [None]:
num_batches = (len(image_embeddings)/32)
h0 = []
print(num_batches)
for i in range(len(image_embeddings)):
    h0.append(np.array(image_embeddings[i]))
h0 = np.array(h0)

Para el decoder tendremos una capa embedding previamente entrenada que transformara los integers a embeddings. Estos se introducen en el input de la LSTM.

In [None]:
decoder_input = tf.keras.layers.Input(shape=(None, ))

In [None]:
embedding_layer = tf.keras.layers.Embedding(max_length+1, embedding_dimension)
embedding_layer_weights = None
# loading
with open('embeddingLayerWeights.pkl', 'rb') as handle:
    embedding_layer_weights = pickle.load(handle)

In [None]:
lstm = tf.keras.layers.LSTM(42)

In [None]:
dense_layer = tf.keras.layers.Dense(len(tokenizer.word_index),
                             activation='tanh')

In [None]:
decoder_output = dense_layer(lstm(embedding_layer(decoder_input), initial_state=encoder(h0)))

In [None]:
model = tf.keras.Model(inputs=[encoder_input, decoder_input],
                      outputs=[decoder_output])