# Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

In [2]:
# preprocess_input() is used to preprocess any given image to extract features of that image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
# load_img() is used to load image from file as a pil image
# img_to_array() is used to convert pil image instance to a numpy array so that our model can understand/interpret the image
from tensorflow.keras.preprocessing.image import load_img, img_to_array
# Model() can be instanciated to include the necessary layers given some input arrays or tensors to output arrays or tensors
from tensorflow.keras.models import Model
# Pickle is used to serialize and deserialize the python object structure so that any object on python can be pickled and saved to the disk
# So pickle.dumb() fuction is used to save object data to the file
from pickle import dump

# Clean Image Descriptions

In [3]:
# This function takes document text as arguments
# And returns dictonary of image identifiers and corresponding descriptions
def read_image_descriptions(filename):
    # Open the file as read only
    file = open(filename, 'r')
    # Read all the text from the file
    text = file.read()
    # After reading the file completely close the file
    file.close()
    
    # Initializing a dictionary named image_id_dict
    image_id_dict = dict()
    
    # Incorporating for loop to read each sentence from the text by splitting them on the basis of new line character ('\n')
    for line in text.split('\n'):
        # Splitting each sentence obtained from above on the basis of white space into indivudial words/tokens
        # You can also use nltk classes like words_tokenize() to tokenize sentence to words/tokens
        tokens = line.split()
        if len(line) < 2:
            continue
        # Now for each sentence we treat first token as image id and rest of the tokens as image description
        image_id, image_description = tokens[0], tokens[1:]
        # Remove file extension from the image_id
        image_id = image_id.split('.')[0]
        # Convert image description tokens back to string
        image_description = ' '.join(image_description)
        # Create the list if needed
        if image_id not in image_id_dict:
            image_id_dict[image_id] = list()
        # Store description
        image_id_dict[image_id].append(image_description)
        
    return image_id_dict

In [4]:
import string

In [5]:
# Fucntion to clean description of image
# This function takes dictionary of image_id and description as input
def clean_description_text(description):
    # Prepare translation table for removing punctuations
    # This uses the 3-argument version of str.maketrans with arguments (x, y, z) where 'x' and 'y' must be equal-length strings
    # Characters in 'x' are replaced by characters in 'y'.
    # 'z' is a string (string.punctuation here) where each character in the string is mapped to None
    translator = str.maketrans('', '', string.punctuation)
    
    for key, desc_list in description.items():
        for i in range(len(desc_list)):
            # Get the description at i'th index from all the given five descriptions for that image
            desc = desc_list[i]
            # Tokenize(grab each words as list) the description by splitting them based on whitespaces
            desc = desc.split()
            # Convert the words to lower case
            desc = [word.lower() for word in desc]
            # Remove punctions from each token
            # translate() method returns a string where each character is mapped to its corresponding character in the translation table
            desc = [word.translate(translator) for word in desc]
            # Removing single charactered words
            # Removing hanging 's' and 'a'
            desc = [word for word in desc if len(word)>1]
            # Take only alpahabets containing words and exclude those containing numbers or other special characters
            desc = [word for word in desc if word.isalpha()]
            # Convert the i'th image description tokens back to string
            desc_list[i] = ' '.join(desc)

In [6]:
# Function to save description to file
# description argument is a dictionary which contains mapping of image identifiers to the corresponding descriptions
# filename argument is the name, we want to give to the file in which we want to save the mapping of image identifiers to cleaned descriptions
def save_description(description, filename):
    lines = list()
    for key, desc_list in description.items():
        for desc in desc_list:
            # Save description of the image preceded by the image identifier
            lines.append(key + " " + desc)
    # Convert all the descriptions list as string into new lines
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [7]:
# Calling the above created functions in a series
filename = './Datasets/Flickr8k.token.txt'
descriptions = read_image_descriptions(filename)
print(f"Total number of descriptions: {len(descriptions)}")
clean_description_text(descriptions)
save_description(descriptions, 'InceptionV3_descriptions.txt')

Total number of descriptions: 8092


# Image Feature Extraction

In [8]:
# Fuction to extract features from each image to the directory
def get_image_features(directory):
    # Create an instance of the InceptionV3 model
    model = InceptionV3()
    # Restructuring our InceptionV3 model by removing/Popping off the last layer of the model
    # The last layer is used to classify the images. Since we are not classifying images here, we're removing the last layer
    model.layers.pop()
    # Keras model represents the actual neural network model.
    # Keras provides a two mode to create the model, simple and easy to use Sequential API as well as more flexible and advanced Functional API.
    # A ANN model can be created by simply calling Sequential() API
    # Sequential API is used to create models layer-by-layer
    # Functional model, you can define multiple input or output that share layers.
    # First, we create an instance for model using Model class and connect to the layers to access input and output to the model
    # model.inputs is the input fed to the model and model.layers[-1].output is the output of the last(-1) layer of the model
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    # Print the summary of the model
    print(model.summary())
    # This empty dictionary will be used to store image features
    features = dict()
    
    # Iterate through each images in given directory using for loop
    for file_name in tqdm(os.listdir(directory)):
        filename = f"{directory}/{file_name}"
        # load each filename as image and resize the image to given target_size
        image = load_img(filename, target_size=(299, 299))
        # Convert image pixels to numpy array
        image = img_to_array(image)
        # Before presenting any data to CNN you may sometimes need to reshape your data
        # We are reshaping the data without changing its content
        image = image.reshape(1, image.shape[0], image.shape[1], image.shape[2])
        # Preparing the image to fit to the InceptionV3 model
        image = preprocess_input(image)
        # Extracting the features from the image
        # By setting verbose 0, 1 or 2 you just say how do you want to 'see' the training progress for each epoch.
        # verbose=0 will show you nothing (silent)
        # verbose=1 will show you an animated progress bar like this: [===============================]
        # verbose=2 will just mention the number of epoch like this: Epoch 1/10
        feature = model.predict(image, verbose=0)
        # Getting the image id i.e. image12.jpg gets an image_id of image12
        image_id = file_name.split('.')[0]
        # Store the extracted feature to the empty 'features' dictionary created earlier
        features[image_id] = feature

    return features

In [9]:
directory = './Datasets/Flicker8k_Dataset'
features = get_image_features(directory)
print("The length of all the extracted features is:", len(features))

  0%|          | 0/8091 [00:00<?, ?it/s]

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 149, 149, 32) 96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 149, 149, 32) 0           batch_normalization[0][0]        
_______________________________________________________________________________________

100%|██████████| 8091/8091 [11:01<00:00, 12.23it/s]

The length of all the extracted features is: 8091





In [10]:
# Save all the features to pickle file
dump(features, open('./InceptionV3_features.pkl', 'wb'))

# Create Vocabulary of Image Descriptions

In [11]:
# Fuction to transform descriptions into sets
# The set object is the vacabulary of all the words
def create_vocabulary(description):
    all_desc = set()
    # Creating a list of words in description text and each word is added to set created above
    for key in description.keys():
        [all_desc.update(desc.split()) for desc in description[key]]
    return all_desc

In [12]:
# Calling the above created functions in a series
filename = './Datasets/Flickr8k.token.txt'
descriptions = read_image_descriptions(filename)
print(f"Total number of descriptions: {len(descriptions)}")
clean_description_text(descriptions)
vocabulary = create_vocabulary(descriptions)
print(f"The size of vocabulary is: {len(vocabulary)}")

Total number of descriptions: 8092
The size of vocabulary is: 8763


# Get Descriptions of Each Image

In [13]:
# Function to load a predefined list of image identifiers
# This function first creates the list of image identifiers and then convert them to set object
def get_identifiers_set(filename):
    # Read the text from given file
    # Open the file as read only
    file = open(filename, 'r')
    # Read all the text from the file
    text = file.read()
    # After reading the file completely close the file
    file.close()
    
    identifiers = list()
    # Iterate though each line using for loop
    for line in text.split('\n'):
        # Skip empty lines in case they exist
        if len(line) < 1:
            continue
        # Grab the image identifier from each line and then add it to a list object created above
        image_id = line.split('.')[0]
        identifiers.append(image_id)
    return set(identifiers)

In [14]:
# Fucntion to load pre-processed cleaned descriptions
# We will read descriptions from 'InceptionV3_descriptions.txt' file created earlier
def get_clean_descriptions(filename, identifiers):
    # Read the text from given file
    # Open the file as read only
    file = open(filename, 'r')
    # Read all the text from the file
    text = file.read()
    # After reading the file completely close the file
    file.close()
    
    descriptions = dict()
    # Iterate though each line using for loop
    for line in text.split('\n'):
        # Tokenize the lines by splitting them according to whitespaces
        tokens = line.split()
        # Separate image identifiers from their descriptions
        image_id, image_desc = tokens[0], tokens[1:]
        # Skip all the images that do not belong to the above create set of image identifiers
        if image_id in identifiers:
            # Since single image contains 5 descriptions, we should not repeat image_id for all 5 descriptions separately
            # If image_id is inserted for first description, other 4 descriptions should be added to the same key
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # Wrap descriptions in tokens
            # starttoken and endtoken are tokens to signal the start and end of the caption
            # We need these tokens because the captions are generated one word at a time
            # These tokens are added to descriptions as they are loaded
            # Later we need to encode these descriptions in our walk through
            # We need to add these texts before encoding the text so that these tokens also get encoded
            desc = "starttoken " + ' '.join(image_desc) + " endtoken"
            descriptions[image_id].append(desc)
            
    return descriptions

In [15]:
from pickle import load

In [16]:
# Function to load the image features from the given dataset
# filename argument is the pickle file (InceptionV3_features.pkl) that we created earlier
# dataset argument is the training dataset
def get_image_features(filename, dataset):
    # Load all features
    all_features = load(open(filename, 'rb'))
    # Load features
    features = {k: all_features[k] for k in dataset}
    return features

In [17]:
# Load training dataset (6000 out of 2000) as present in Flickr_8k.trainImages.txt
filename = './Datasets/Flickr_8k.trainImages.txt'
# Get identifiers of all the training images
train_id = get_identifiers_set(filename)
print(f"The length of train identifiers is: {len(train_id)}")
# Get cleaned/pre-processed descriptions from the file saved earlier for the training images
train_desc = get_clean_descriptions('./InceptionV3_descriptions.txt', train_id)
print(f"The length of train descriptions is: {len(train_desc)}")
# Get features of all the training images from the pickle file saved earlier
train_features = get_image_features('./InceptionV3_features.pkl', train_id)
print(f"The length of train features is: {len(train_features)}")
# Print list of training descriptions for given image_id
# train_desc is the dictionary of image_id that consists of list of descriptions for that image_id
train_desc['1000268201_693b08cb0e']

The length of train identifiers is: 6000
The length of train descriptions is: 6000
The length of train features is: 6000


['starttoken child in pink dress is climbing up set of stairs in an entry way endtoken',
 'starttoken girl going into wooden building endtoken',
 'starttoken little girl climbing into wooden playhouse endtoken',
 'starttoken little girl climbing the stairs to her playhouse endtoken',
 'starttoken little girl in pink dress going into wooden cabin endtoken']

# Tokenize Description

No machine learning model can operate with text based data as input. And so, here we will convert our text data to numerical data so that our model can understand this data. Inorder to encode the data we create the mapping of the words to numerical values.

In [18]:
# Function to convert dictionary of cleaned descriptions to list of cleaned descriptions
def to_list(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(one_of_five_desc) for one_of_five_desc in descriptions[key]]
    return all_desc

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [20]:
# Fucntion to create tokens out of descriptions
# i.e. Fucntion to tokenize the descriptions corpus
def create_tokenizer(descriptions):
    desc_list = to_list(descriptions)
    # Keras has Tokenizer class that can learn the mapping of words to numerical values from loaded descriptions
    tokenizer = Tokenizer()
    # fit_on_texts() method updates internal vocabulary based on a list of texts
    # This method creates the vocabulary index based on word frequency
    # So if you give it something like, "The cat sat on the mat."
    # It will create a dictionary s.t. word_index["the"] = 1; word_index["cat"] = 2
    # So lower integer means more frequent word
    # It is word -> index dictionary so every word gets a unique integer value. 0 is reserved for padding
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [21]:
# Get the size of vocabulary
tokenizer = create_tokenizer(train_desc)
# Adding 1 to the length because indexing starts from zero
vocab_size = len(tokenizer.word_index) + 1
print(f"The size of the vocabulary is: {vocab_size}")

The size of the vocabulary is: 7579


# Encoding/Mapping Descriptions to Numerical Values

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [23]:
# Funtion to calculate the length of the description having most words
def max_lengths(descriptions):
    desc_list = to_list(descriptions)
    return max(len(desc.split()) for desc in desc_list)

In [25]:
# Now we are ready to encode the text
# Funtion to create a sequences of images, descriptions and outputs (next_words)
def create_encoded_sequence(tokenizer, max_length, desc_list, image_feat):
    # Instancing list objects to store image features, image descriptions and predicted next words
    # Input text is encoded into numerical values
    # So output text i.e predicted next word will also be one-hot encoded values
    image_features, image_desc, next_words = list(), list(), list()
    
    # We'll iterate thorough each of the five descriptions of the given image using for loop
    for desc in desc_list:
        # texts_to_sequences() transforms each text in texts to a sequence of integers i.e. encode the sequence
        # So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary.
        # Only top (num_words-1) most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.
        seq = tokenizer.texts_to_sequences([desc])[0]
        
        # Using for loop to split one sequence to multiple x,y pairs
        for i in range(1, len(seq)):
            # Split into input and output pairs
            in_seq, out_seq = seq[:i], seq[i]
            # pad_sequences() is used to ensure that all sequences in a list have the same length.
            # By default this is done by padding 0 in the beginning of each sequence until each sequence has the same length as the longest sequence.
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # to_categorical() method that can be used to one-hot encode integer data.
            # If the integer data represents all the possible values of the classes, then the to_categorical() method can be used directly
            # Otherwise, the number of classes can be passed to the method as the num_classes parameter.
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            image_features.append(image_feat)
            image_desc.append(in_seq)
            next_words.append(out_seq)
            
    return np.array(image_features), np.array(image_desc), np.array(next_words)

# Merging CNN based models with LSTM models together

Here we have two different models: 
1. Sequence Processor (RNN/LSTM)
2. Feature Extractor (CNN)

Both Image Feature Extractor and Sequence Processor generate fixed length output vectors separately. These separate output vectors are passed though a common Decoder and get merged together. Here both the vectors are processed by a Dense layer to generate the image description output.

In [26]:
from tensorflow.keras.layers import Input, LSTM, Dropout, Embedding, Dense
from tensorflow.keras.layers import Add

In [27]:
# Function that houses the entire structure of our model
def create_model(vocab_size, max_length):
    # Sequence processor model
    input1 = Input(shape=(max_length,))
    seq1 = Embedding(vocab_size, 256, mask_zero=True)(input1)
    seq2 = Dropout(0.5)(seq1)
    seq3 = LSTM(256)(seq2)
    
    # Feature extractor model
    input2 = Input(shape=(2048,))
    feat1 = Dropout(0.5)(input2)
    feat2 = Dense(256, activation='relu')(feat1)
    
    # Decoder model
    decoder1 = Add()([feat2, seq3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    output = Dense(vocab_size, activation='softmax')(decoder2)
    
    # Tie all the image features and word sequence together using keras Model class
    model = Model(inputs=[input2, input1], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    print(model.summary())
    return model

# Training/Fitting model

In [29]:
# Function to load one image worth of data per batch
def data_generator(descriptions, image_feats, tokenizer, max_length):
    # while loop is used to go over each image
    while 1:
        # for loop is used to extract features for given image
        for key, desc_list in descriptions.items():
            # Retrive the image features
            image_feat = image_feats[key][0]
            # Create sequence for single image (not entire data) for given batch
            image_features, desc, next_words = create_encoded_sequence(tokenizer, max_length, desc_list, image_feat)
            # yield is used to return from a function without destroying the states of its local variable
            # When the function is called, the execution starts from the last yield statement.
            # Any function that contains a yield keyword is termed as generator
            yield ([image_features, desc], next_words)

In [30]:
# Load training dataset (6000 out of 2000) as present in Flickr_8k.trainImages.txt
filename = './Datasets/Flickr_8k.trainImages.txt'
# Get identifiers of all the training images
train_id = get_identifiers_set(filename)
print(f"The length of train identifiers is: {len(train_id)}")
# Get cleaned/pre-processed descriptions from the file saved earlier for the training images
train_desc = get_clean_descriptions('./InceptionV3_descriptions.txt', train_id)
print(f"The length of train descriptions is: {len(train_desc)}")
# Get features of all the training images from the pickle file saved earlier
train_features = get_image_features('./InceptionV3_features.pkl', train_id)
print(f"The length of train features is: {len(train_features)}")
# Prepare tokenizer
tokenizer = create_tokenizer(train_desc)
vocab_size = len(tokenizer.word_index) + 1
print(f"The size of the vocabulary is: {vocab_size}")
# Determine the maximum sequence length
max_length = max_lengths(train_desc)
print(f"The maximum length of train descriptions is: {max_length}")

The length of train identifiers is: 6000
The length of train descriptions is: 6000
The length of train features is: 6000
The size of the vocabulary is: 7579
The maximum length of train descriptions is: 34


In [None]:
# Here we will train the model and save our model after each epoch as .h5 filename begining with model_
model = create_model(vocab_size, max_length)
epochs = 10
# Steps is the size of train descriptions
steps = len(train_desc)

for i in range(epochs):
    # Create data generator
    generator = data_generator(train_desc, train_features, tokenizer, max_length)
    model.fit_generator(generator=generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save(f"./InceptionV3_Models/model_{i}.h5" )

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 34)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 34, 256)      1940224     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 2048)         0           input_3[0][0]                    
_______________________________________________________________________________________

# Model Evaluation using BLEU Score

In Natural Language Processing, we may arise with multiple senarios where there may be multiple correct outputs. In such case, Accuracy Score is not a great metics to use and in those cases BLEU comes into paly.

BLEU score stands for Bilingual Evaluation Understudy Score. BLEU, is a score for comparing a candidate translation of text to one or more reference translations. In simple language, BLEU Score is used to check how close the generated text is with respect to the expected text. Although developed for translation, it can be used to evaluate text generated for a suite of natural language processing tasks.

The value of BLEU Score ranges from 0 to 1. Higher the BLEU Score, better will be the predicted text.

In [32]:
# Function to map a word using its corresponding word id
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        # Check if we have a match for given word at corresponding index
        # If match found return the word else return None
        if index == integer:
            return word
    return None

In [33]:
# Funtion to generate a description for an image
def generate_desc(model, tokenizer, image_feat, max_length):
    # Seed/start the generation process
    in_text = 'starttoken'
    
    # Iterate over the entire length of the sequence
    # Here we will generate one word at a time by calling model recursively until 'END' string is detected
    for i in range(max_length):
        # Intiger encoded input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # Pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Predict next word
        yhat = model.predict([image_feat, sequence], verbose=0)
        # Concert probability to integer
        yhat = np.argmax(yhat)
        # Map integer to word
        word = word_for_id(yhat, tokenizer)
        # Stop if we cannot map the word
        if word is None:
            break
        # Add the generated next word to the original sequence
        in_text += " " + word
        if word == 'endtoken':
            break
    return in_text

In [34]:
from nltk.translate.bleu_score import corpus_bleu

In [35]:
# Function to evaluate the skill of the model
# i.e. determine how good the model is
def evaluate_model(model, descriptions, image_feat, tokenizer, max_length):
    # Instancing two lists one to store actual description and next to store predicted descriptions
    reference, candidate = list(), list()
    
    # Iterate for the entire set of images
    for key, desc_list in tqdm(descriptions.items()):
        # Generate descriptions
        yhat = generate_desc(model, tokenizer, image_feat[key], max_length)
        # Storing actual/reference and predicted/candidate descriptions
        references = [desc.split() for desc in desc_list]
        reference.append(references)
        candidate.append(yhat.split())
    
    print(f"Cumulative 1-gram: {corpus_bleu(reference, candidate, weights=(1,0,0,0))}")
    print(f"Cumulative 2-gram: {corpus_bleu(reference, candidate, weights=(0.5,0.5,0,0))}")
    print(f"Cumulative 3-gram: {corpus_bleu(reference, candidate, weights=(0.33,0.33,0.33,0))}")
    print(f"Cumulative 4-gram: {corpus_bleu(reference, candidate, weights=(0.25,0.25,0.25,0.25))}")

In [36]:
# Load training dataset (6000 out of 2000) as present in Flickr_8k.trainImages.txt
filename_train = './Datasets/Flickr_8k.trainImages.txt'
# Get identifiers of all the training images
train_id = get_identifiers_set(filename_train)
print(f"The length of train identifiers is: {len(train_id)}")
# Get cleaned/pre-processed descriptions from the file saved earlier for the training images
train_desc = get_clean_descriptions('./InceptionV3_descriptions.txt', train_id)
print(f"The length of train descriptions is: {len(train_desc)}")

# Prepare tokenizer
tokenizer = create_tokenizer(train_desc)
# Save all the features to pickle file
dump(tokenizer, open('./InceptionV3_tokenizer.pkl', 'wb'))

vocab_size = len(tokenizer.word_index) + 1
print(f"The size of the vocabulary is: {vocab_size}")
# Determine the maximum sequence length
max_length = max_lengths(train_desc)
print(f"The maximum length of train descriptions is: {max_length}")

The length of train identifiers is: 6000
The length of train descriptions is: 6000
The size of the vocabulary is: 7579
The maximum length of train descriptions is: 34


In [37]:
# Load test dataset (2000 out of 2000) as present in Flickr_8k.trainImages.txt
filename_test = './Datasets/Flickr_8k.testImages.txt'
# Get identifiers of all the test images
test_id = get_identifiers_set(filename_test)
print(f"The length of test identifiers is: {len(test_id)}")
# Get cleaned/pre-processed descriptions from the file saved earlier for the test images
test_desc = get_clean_descriptions('./InceptionV3_descriptions.txt', test_id)
print(f"The length of test descriptions is: {len(test_desc)}")

# Load test image features
test_features = get_image_features('./InceptionV3_features.pkl', test_id)
print(f"The length of test features is: {len(test_desc)}")

The length of test identifiers is: 1000
The length of test descriptions is: 1000
The length of test features is: 1000


In [38]:
from tensorflow.keras.models import load_model

In [39]:
# Load the model which has minimum loss
# Here I am working with model_8.h5
filename = './InceptionV3_Models/model_8.h5'
model = load_model(filename)
# Evaluate model
evaluate_model(model, test_desc, test_features, tokenizer, max_length)

100%|██████████| 1000/1000 [06:27<00:00,  2.58it/s]


Cumulative 1-gram: 0.52
Cumulative 2-gram: 0.27882658104112307
Cumulative 3-gram: 0.1629873472384631
Cumulative 4-gram: 0.09018026878364217


# Generating Image Description

In [40]:
# Fuction to extract features from each image to the directory
def extract_test_features(filename):
    # Create an instance of the InceptionV3 model
    model = InceptionV3()
    # Restructuring our InceptionV3 model by removing/Popping off the last layer of the model
    # The last layer is used to classify the images. Since we are not classifying images here, we're removing the last layer
    model.layers.pop()
    # Keras model represents the actual neural network model.
    # Keras provides a two mode to create the model, simple and easy to use Sequential API as well as more flexible and advanced Functional API.
    # A ANN model can be created by simply calling Sequential() API
    # Sequential API is used to create models layer-by-layer
    # Functional model, you can define multiple input or output that share layers.
    # First, we create an instance for model using Model class and connect to the layers to access input and output to the model
    # model.inputs is the input fed to the model and model.layers[-1].output is the output of the last(-1) layer of the model
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

    # load each filename as image and resize the image to given target_size
    image = load_img(filename, target_size=(299, 299))
    # Convert image pixels to numpy array
    image = img_to_array(image)
    # Before presenting any data to CNN you may sometimes need to reshape your data
    # We are reshaping the data without changing its content
    image = image.reshape(1, image.shape[0], image.shape[1], image.shape[2])
    # Preparing the image to fit to the InceptionV3 model
    image = preprocess_input(image)
    # Extracting the features from the image
    # By setting verbose 0, 1 or 2 you just say how do you want to 'see' the training progress for each epoch.
    # verbose=0 will show you nothing (silent)
    # verbose=1 will show you an animated progress bar like this: [===============================]
    # verbose=2 will just mention the number of epoch like this: Epoch 1/10
    feature = model.predict(image, verbose=0)
    return feature

In [42]:
tokenizer = load(open('InceptionV3_tokenizer.pkl', 'rb'))
# Pre-define the maximum sequence length (from taining)
max_length = 34
# Load the model with minimum error
model = load_model('./InceptionV3_Models/model_8.h5')
# Load the image of which you need to generate description
test_image = extract_test_features('./cycle.jpg')
# Generate description of the image
description = generate_desc(model, tokenizer, test_image, max_length)
print(description)

starttoken man in red shirt is riding bike down the street endtoken


In [44]:
# Remove start and end tokens
query = description
stopwords = ['starttoken', 'endtoken']
query_words = query.split()
result = [word for word in query_words if word not in stopwords]
result = ' '.join(result)
print(result)

man in red shirt is riding bike down the street
