In [1]:
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import corpus_bleu

# Load data

We want to create:
* a set that contains all the photos names
* a dictionary that maps the names of the photos to the descriptions
* another dictionary that maps the names of the photos to the features extracted

### Photo ids set

In [2]:
def load_data(filename):
    # Open file
    with open(filename, 'r') as f:
        data = f.read()

    # Divide the data into each photo's name
    data = data.split('\n')
    # Init result set
    result = set()
    # Loop through each photo's name
    for line in data:
        # Avoid empty lines
        if len(line) < 1:
            continue
        # Remove .jpg and add to result set
        result.add(line.split('.')[0])

    return result

### Dictionary that maps photo's id to descriptions

In [3]:
def load_descriptions(filename, ids):
    # Open file
    with open(filename, 'r') as f:
        data = f.read()

    # Divide the data into each photo description    
    data = data.split('\n')
    # Init result dictionary
    result = dict()

    # Loop through each line in the data
    for line in data:
        # Divide photo id and description
        line = line.split()
        photo_id = line[0]
        description = ' '.join(line[1:])

        # Check if photo id is in ids
        if photo_id in ids:
            # Check if this photo id key has already been initiated
            if photo_id not in result:
                # Set the value to a list
                result[photo_id] = list()

            # Add to the description a start and end sequence tokens
            description = 'STARTSEQ ' + description + ' ENDSEQ'

            # Add description to list of values of the corresponding photo id
            result[photo_id].append(description)

    return result

### Dictionary that maps photo's id to features

In [4]:
def load_features(filename, ids):
    # Load the file that contains all the feature
    data = pickle.load(open(filename, 'rb'))
    # Map every photo id to the features saved inside the file
    result = {k: data[k] for k in ids}
    
    return result

# GloVe

In [5]:
def all_descs(data):
    all_descriptions = []
    for i in data.values():
        for j in i:
            all_descriptions.append(j)
            
    return all_descriptions

In [6]:
def create_tokenizer(data):
    all_descriptions = all_descs(data)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_descriptions)
    return tokenizer

In [7]:
embeddings_dict = {}

def create_emb_dict():
  with open(f'glove.6B.100d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [8]:
# Save embeddings dictionary
def save_embeddings(save=True):
    if save:
        pickle.dump(embeddings_dict, open(f'./saved_data/embeddings.pkl', 'wb'))

In [9]:
def init_emb(tokenizer, vocab_size):
  all_embs = np.stack(embeddings_dict.values())
  emb_mean, emb_std = all_embs.mean(), all_embs.std()
  embed_size = all_embs.shape[1]


  word_index = tokenizer.word_index

  embedding_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, embed_size))

  for word, i in word_index.items():
      embedding_vector = embeddings_dict.get(word)
      if embedding_vector is not None:
          embedding_matrix[i] = embedding_vector

  return embed_size, embedding_matrix

In [10]:
def create_sequence(tokenizer, maxlength, photos_to_descriptions, photos_to_features, vocab_size):
    X1, X2, y = [], [], []
    
    for k, v in photos_to_descriptions.items():
        for desc in v:
            seq = tokenizer.texts_to_sequences([desc])[0]

            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=maxlength)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                X1.append(photos_to_features[k][0])
                X2.append(in_seq)
                y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

# Prepare for training

In [11]:
photo_ids_train = load_data('./Flickr8k_text/Flickr_8k.trainImages.txt')

photo_to_descriptions_train = load_descriptions('./saved_data/descriptions.txt', ids=photo_ids_train)

photo_to_features_train = load_features('./saved_data/features.pkl', ids=photo_ids_train)

In [12]:
tokenizer = create_tokenizer(photo_to_descriptions_train)
vocab_size = len(tokenizer.word_index) + 1

# Number of words in the longest description
maxlength = max(len(d.split()) for d in all_descs(photo_to_descriptions_train))

## Glove set up

In [13]:
# Embeddings Dictionary
LOAD_EMBEDDINGS = True
if LOAD_EMBEDDINGS:
    embeddings_dict = pickle.load(open('./saved_data/embeddings.pkl', 'rb'))
else:
    create_emb_dict()

In [14]:
# Embedding Matrix
embed_size, embedding_matrix = init_emb(tokenizer=tokenizer, vocab_size=vocab_size)

In [15]:
save_embeddings(save=False)

# Building the Model

Structure of the model:
* ```Photo Features Extractor``` --> the <em>pre-trained VGG16 model</em> (the one we used to pre-process the images features)
* ``Sequence Processor`` --> <em>word embedding layer</em> + <em>LSTM layer</em>
* ``Decoder`` --> it process the outputs of the **Photo Feature Extractor** and **Sequence Processor** and merges them together using a Dense layer to make a prediction

In [16]:
# Features extraction model
inputs1 = tf.keras.Input(shape=(4096,))
feature_extraction_1 = tf.keras.layers.Dropout(0.5)(inputs1)
feature_extraction_2 = tf.keras.layers.Dense(256, activation='relu')(feature_extraction_1)

# Sequence processor model
inputs2 = tf.keras.Input(shape=(maxlength, ))
sequence_processor_1 = tf.keras.layers.Embedding(vocab_size, embedding_matrix.shape[1], mask_zero=True, weights=[embedding_matrix], trainable=False)(inputs2)
sequence_processor_2 = tf.keras.layers.Dropout(0.5)(sequence_processor_1)
sequence_processor_3 = tf.keras.layers.LSTM(256)(sequence_processor_2)

# Decoder model
decoder_1 = tf.keras.layers.add([feature_extraction_2, sequence_processor_3])
decoder_2 = tf.keras.layers.Dense(256, activation='relu')(decoder_1)
outputs = tf.keras.layers.Dense(vocab_size, activation='softmax')(decoder_2)

# Put everything together
model = tf.keras.Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam'
)

print(model.summary())
#print(tf.keras.utils.plot_model(model, show_shapes=True))

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 37)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 37, 100)      759300      input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 4096)         0           input_1[0][0]                    
_______________________________________________________________________________________

## Train with progessive overloading

We use **progressing overloading** to reduce the request of resources to the computer.

In [17]:
def create_sequence_progressive_overloading(tokenizer, maxlength, desc_list, photos_to_features, vocab_size):
    X1, X2, y = [], [], []
    
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]

        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=maxlength)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            X1.append(photos_to_features)
            X2.append(in_seq)
            y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

In [18]:
def data_generator(photos_to_descriptions, photos_to_features, tokenizer, maxlength, vocab_size):
    # Loop forever over images
    while True:
        for k, v in photos_to_descriptions.items():
            photo = photos_to_features[k][0]
            in_img, in_seq, out_word = create_sequence_progressive_overloading(tokenizer, maxlength, v, photo, vocab_size)
            yield ([np.array(in_img), np.array(in_seq)], np.array(out_word))

In [19]:
# Train the model
epochs = 30
steps = len(photo_to_descriptions_train)
for i in range(epochs):
    # Create generator
    generator = data_generator(photo_to_descriptions_train, photo_to_features_train, tokenizer, maxlength, vocab_size)
    # Fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # Save model in every iteration
    model.save('./models/model_' + str(i) + '.h5')

Instructions for updating:
Please use Model.fit, which supports generators.


# Evaluate Model

In [20]:
# Function that given an integer returns the corresponding word based on the tokenization
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word

    return None

In [21]:
def generate_description(model, tokenizer, photo, maxlength):
    # Word that starts the sequence
    in_text = 'STARTSEQ'
    # Iterate over the sequence
    for i in range(maxlength):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=maxlength)
        # Predict next word
        yhat = model.predict([photo, sequence], verbose=0)
        # Get the integer with the biggest probability
        yhat = np.argmax(yhat)
        # Get the word corresponding to the integer the model returned
        word = word_for_id(yhat, tokenizer)
        # Stop if no valid word was predicted
        if word is None:
            break
        # Append the predicted word as input for the next word
        in_text += ' ' + word
        # Stop if we predicted the end of the sequence
        if word.upper() == 'ENDSEQ':
            break

    return in_text

In [22]:
# Evaluate the skill of the model
def evaluate_model(model, photos_to_descriptions, photos_to_features, tokenizer, maxlength):
    actual, predicted = [], []

    i = 0

    for k, v in photos_to_descriptions.items():
        # Generate description
        yhat = generate_description(model, tokenizer, photos_to_features[k], maxlength)

        references = [d.split() for d in v]
        actual.append(references)
        predicted.append(yhat.split())
    
        # Calculate BLEU score
        # BLEU scores are used in text translation for evaluating translated text against one or more reference translations
        print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
        print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
        print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
        print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
        print('')

        # Keep track of the number of times the BLEU score was checked
        if i == 2:
            break
        else:
            i += 1

## Test the model

In [24]:
test = load_data('./Flickr8k_text/Flickr_8k.testImages.txt')

test_desc = load_descriptions('./saved_data/descriptions.txt', test)
test_features = load_features('./saved_data/features.pkl', test)

evaluate_model(model, test_desc, test_features, tokenizer, maxlength)

BLEU-1: 0.692308
BLEU-2: 0.339683
BLEU-3: 0.000000
BLEU-4: 0.000000

BLEU-1: 0.624426
BLEU-2: 0.337458
BLEU-3: 0.000000
BLEU-4: 0.000000

BLEU-1: 0.594369
BLEU-2: 0.350178
BLEU-3: 0.189938
BLEU-4: 0.000000

