In [48]:
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import corpus_bleu

# Load data

We want to create:
* a set that contains all the photos names
* a dictionary that maps the names of the photos to the descriptions
* another dictionary that maps the names of the photos to the features extracted

### Photo ids set

In [2]:
def load_data(filename):
    # Open file
    with open(filename, 'r') as f:
        data = f.read()

    # Divide the data into each photo's name
    data = data.split('\n')
    # Init result set
    result = set()
    # Loop through each photo's name
    for line in data:
        # Avoid empty lines
        if len(line) < 1:
            continue
        # Remove .jpg and add to result set
        result.add(line.split('.')[0])

    return result

### Dictionary that maps photo's id to descriptions

In [3]:
def load_descriptions(filename, ids):
    # Open file
    with open(filename, 'r') as f:
        data = f.read()

    # Divide the data into each photo description    
    data = data.split('\n')
    # Init result dictionary
    result = dict()

    # Loop through each line in the data
    for line in data:
        # Divide photo id and description
        line = line.split()
        photo_id = line[0]
        description = ' '.join(line[1:])

        # Check if photo id is in ids
        if photo_id in ids:
            # Check if this photo id key has already been initiated
            if photo_id not in result:
                # Set the value to a list
                result[photo_id] = list()

            # Add to the description a start and end sequence tokens
            description = 'STARTSEQ ' + description + ' ENDSEQ'

            # Add description to list of values of the corresponding photo id
            result[photo_id].append(description)

    return result

### Dictionary that maps photo's id to features

In [4]:
def load_features(filename, ids):
    # Load the file that contains all the feature
    data = pickle.load(open(filename, 'rb'))
    # Map every photo id to the features saved inside the file
    result = {k: data[k] for k in ids}
    
    return result

### Results

In [5]:
# print(f"Number of photo ids: {len(photo_ids)}, Number of key-value pairs of photos and descriptions: {len(photo_to_descriptions)}, Number of key-value pairs of photos and features: {len(photo_to_features)}")
# print('')
# print('Example of the content of photo ids:', next(iter(photo_ids)))
# print('')
# print('Example of the content of photo ids to descriptions:', photo_to_descriptions['1859726819_9a793b3b44'])
# print('')
# print('Example of the content of photo ids to features:', photo_to_features['1859726819_9a793b3b44'])

In [6]:
def all_descs(data):
    all_descriptions = []
    for i in data.values():
        for j in i:
            all_descriptions.append(j)
            
    return all_descriptions

In [7]:
def create_tokenizer(data):
    all_descriptions = all_descs(data)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_descriptions)
    return tokenizer

In [8]:
def create_sequence(tokenizer, maxlength, photos_to_descriptions, photos_to_features, vocab_size):
    X1, X2, y = [], [], []
    
    for k, v in photos_to_descriptions.items():
        for desc in v:
            seq = tokenizer.texts_to_sequences([desc])[0]

            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=maxlength)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                X1.append(photos_to_features[k][0])
                X2.append(in_seq)
                y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

## Train Dataset

In [9]:
photo_ids_train = load_data('./Flickr8k_text/Flickr_8k.trainImages.txt')

photo_to_descriptions_train = load_descriptions('./descriptions.txt', ids=photo_ids_train)

photo_to_features_train = load_features('./features.pkl', ids=photo_ids_train)

tokenizer_train = create_tokenizer(photo_to_descriptions_train)
vocab_size_train = len(tokenizer_train.word_index) + 1

# Number of words in the longest description
maxlength_train = max(len(d.split()) for d in all_descs(photo_to_descriptions_train))

X1train, X2train, ytrain = create_sequence(tokenizer_train, maxlength_train, photo_to_descriptions_train, photo_to_features_train, vocab_size_train)

## Test Dataset

In [10]:
photo_ids_test = load_data('./Flickr8k_text/Flickr_8k.devImages.txt')

photo_to_descriptions_test = load_descriptions('./descriptions.txt', ids=photo_ids_test)

photo_to_features_test = load_features('./features.pkl', ids=photo_ids_test)

tokenizer_test = create_tokenizer(photo_to_descriptions_test)
vocab_size_test = len(tokenizer_test.word_index) + 1

# Number of words in the longest description
maxlength_test = max(len(d.split()) for d in all_descs(photo_to_descriptions_test))

X1test, X2test, ytest = create_sequence(tokenizer_test, maxlength_test, photo_to_descriptions_test, photo_to_features_test, vocab_size_test)

# Building the Model

Structure of the model:
* ```Photo Features Extractor``` --> the <em>pre-trained VGG16 model</em> (the one we used to pre-process the images features)
* ``Sequence Processor`` --> <em>word embedding layer</em> + <em>LSTM layer</em>
* ``Decoder`` --> it process the outputs of the **Photo Feature Extractor** and **Sequence Processor** and merges them together using a Dense layer to make a prediction

In [29]:
# Features extraction model
inputs1 = tf.keras.Input(shape=(4096,))
feature_extraction_1 = tf.keras.layers.Dropout(0.5)(inputs1)
feature_extraction_2 = tf.keras.layers.Dense(256, activation='relu')(feature_extraction_1)

# Sequence processor model
inputs2 = tf.keras.Input(shape=(maxlength_train, ))
sequence_processor_1 = tf.keras.layers.Embedding(vocab_size_train, 256, mask_zero=True)(inputs2)
sequence_processor_2 = tf.keras.layers.Dropout(0.5)(sequence_processor_1)
sequence_processor_3 = tf.keras.layers.LSTM(256)(sequence_processor_2)

# Decoder model
decoder_1 = tf.keras.layers.add([feature_extraction_2, sequence_processor_3])
decoder_2 = tf.keras.layers.Dense(256, activation='relu')(decoder_1)
outputs = tf.keras.layers.Dense(vocab_size_train, activation='softmax')(decoder_2)

# Put everything together
model = tf.keras.Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam'
)

print(model.summary())
#print(tf.keras.utils.plot_model(model, show_shapes=True))

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 37)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 37, 256)      1943808     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 4096)         0           input_3[0][0]                    
_______________________________________________________________________________________

### Fit the model

We start by defining the filepath of the save model and when the model should be saved (which means when the model achives some results)

In [12]:
filepath = './models/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [13]:
# We can then fit the model
#model.fit([X1train, X2train], ytrain, epochs=1, verbose=2, callbacks=[checkpoint], validation_data=([X1test, X2test], ytest))

### Train with progessive overloading

In [33]:
def create_sequence_progressive_overloading(tokenizer, maxlength, desc_list, photos_to_features, vocab_size):
    X1, X2, y = [], [], []
    
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]

        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=maxlength)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            X1.append(photos_to_features)
            X2.append(in_seq)
            y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

In [34]:
def data_generator(photos_to_descriptions, photos_to_features, tokenizer, maxlength, vocab_size):
    # Loop forever over images
    while True:
        for k, v in photos_to_descriptions.items():
            photo = photos_to_features[k][0]
            in_img, in_seq, out_word = create_sequence_progressive_overloading(tokenizer, maxlength, v, photo, vocab_size)
            yield ([np.array(in_img), np.array(in_seq)], np.array(out_word))

In [35]:
# Train the model
epochs = 1
steps = len(photo_to_descriptions_train)
for i in range(epochs):
    # Create generator
    generator = data_generator(photo_to_descriptions_train, photo_to_features_train, tokenizer_train, maxlength_train, vocab_size_train)
    # Fit for one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # Save model
    model.save('./models/model_' + str(i) + '.h5')



# Evaluate Model

In [39]:
# Function that given an integer returns the corresponding word based on the tokenization
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word

    return None

In [40]:
def generate_description(model, tokenizer, photo, maxlength):
    # Word that starts the sequence
    in_text = 'STARTSEQ'
    # Iterate over the sequence
    for i in range(maxlength):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=maxlength)
        # Predict next word
        yhat = model.predict([photo, sequence], verbose=0)
        # Get the integer with the biggest probability
        yhat = np.argmax(yhat)
        # Get the word corresponding to the integer the model returned
        word = word_for_id(yhat, tokenizer)
        # Stop if no valid word was predicted
        if word is None:
            break
        # Append the predicted word as input for the next word
        in_text += ' ' + word
        # Stop if we predicted the end of the sequence
        if word.upper() == 'ENDSEQ':
            break

    return in_text

In [46]:
# Evaluate the skill of the model
def evaluate_model(model, photos_to_descriptions, photos_to_features, tokenizer, maxlength):
    actual, predicted = [], []

    for k, v in photos_to_descriptions.items():
        # Generate description
        yhat = generate_description(model, tokenizer, photos_to_features[k], maxlength)

        references = [d.split() for d in v]
        actual.append(references)
        predicted.append(yhat.split())
    
        # Calculate BLEU score
        # BLEU scores are used in text translation for evaluating translated text against one or more reference translations
        print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
        print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
        print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
        print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

## Test set

In [49]:
test = load_data('./Flickr8k_text/Flickr_8k.testImages.txt')

test_desc = load_descriptions('./descriptions.txt', test)
test_features = load_features('./features.pkl', test)

evaluate_model(model, test_desc, test_features, tokenizer_train, maxlength_train)

BLEU-1: 0.184211
BLEU-2: 0.099786
BLEU-3: 0.000000
BLEU-4: 0.000000
BLEU-1: 0.184211
BLEU-2: 0.122213
BLEU-3: 0.096690
BLEU-4: 0.000000
BLEU-1: 0.166667
BLEU-2: 0.102521
BLEU-3: 0.077049
BLEU-4: 0.000000
BLEU-1: 0.157895
BLEU-2: 0.097988
BLEU-3: 0.068787
BLEU-4: 0.000000


KeyboardInterrupt: 

# New predictions

* We can save the tokenizer with pickle, and then load it as well as the model
* We extract the features
* We generate a description

In [50]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model

In [55]:
def extract_features(file):
    print('Extracting features...')

    # Load model
    model = VGG16()
    # Re-structure the model (we remove the last layer from the model because we don't need to classify the photos)
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    # Load image from file
    image = load_img(file, target_size=(224, 224))
    # Convert the image pixels to a numpy array
    image = img_to_array(image)
    # Reshape image for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # Prepare image for the VGG model
    image = preprocess_input(image)
    # Extract features
    features = model.predict(image, verbose=0)
    return features

In [57]:
photo = extract_features('example.jpg')
description = generate_desc(model, tokenizer_train, photo, maxlength_train)
print(description)

Extracting features...
STARTSEQ a man in a red shirt is standing on a street endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq endseq
