# Generating Image Description

In [1]:
import numpy as np
# preprocess_input() is used to preprocess any given image to extract features of that image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
# load_img() loads an image file as a PIL image
# img_to_array() converts a PIL image instance to a NumPy array that the model can consume
from tensorflow.keras.preprocessing.image import load_img, img_to_array
# Model() can be instantiated from input tensors and output tensors; it groups the layers that connect them into a model
from tensorflow.keras.models import Model
# Pickle serializes and deserializes Python object structures, so a Python object can be pickled and saved to disk
# The pickle.load() function is used to load an object back from a pickle file
from pickle import load
# The load_model() function is used to load a saved model from an .h5 file
from tensorflow.keras.models import load_model

In [2]:
# Reverse vocabulary lookup: find the word that a given integer id stands for
def word_for_id(integer, tokenizer):
    """Return the word whose id in ``tokenizer.word_index`` equals ``integer``.

    ``tokenizer.word_index`` is read as a mapping of word -> id. Its entries
    are scanned in iteration order and the first word whose id compares equal
    to ``integer`` is returned. ``None`` is returned when nothing matches.
    """
    candidates = (
        token
        for token, token_id in tokenizer.word_index.items()
        if token_id == integer
    )
    # next() with a default yields the first match, or None when there is none
    return next(candidates, None)

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Function to generate a description for an image, one word at a time
def generate_desc(model, tokenizer, image_feat, max_length):
    """Greedily generate a caption for one image.

    Starting from the seed word 'starttoken', each step integer-encodes the
    caption produced so far with ``tokenizer``, pads it to ``max_length``,
    feeds it to ``model`` together with ``image_feat`` and appends the word
    whose id has the highest predicted score.

    Generation stops after ``max_length`` steps, when the predicted id has no
    word in the tokenizer vocabulary, or right after 'endtoken' is appended.

    Returns the space-separated caption, still including 'starttoken' and,
    when it was predicted, 'endtoken'.
    """
    # The caption is kept as a list of words and joined with single spaces on demand
    caption_words = ['starttoken']

    for _ in range(max_length):
        # Integer-encode the caption generated so far
        caption_so_far = " ".join(caption_words)
        encoded = tokenizer.texts_to_sequences([caption_so_far])[0]
        # Pad the encoded caption to the fixed length used for the model input
        padded = pad_sequences([encoded], maxlen=max_length)
        # Score every candidate next word and keep the arg-max index
        scores = model.predict([image_feat, padded], verbose=0)
        next_id = np.argmax(scores)
        # Translate the index back into a word
        next_word = word_for_id(next_id, tokenizer)
        if next_word is None:
            # The predicted index is not in the vocabulary: stop generating
            break
        caption_words.append(next_word)
        if next_word == 'endtoken':
            # End marker reached: the caption is complete
            break
    return " ".join(caption_words)

In [5]:
# Function to extract VGG16 features from a single image file
def extract_test_features(filename):
    """Extract a VGG16 feature array for the image stored at ``filename``.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis of size 1, run through ``preprocess_input`` and passed through
    a VGG16 network truncated at its second-to-last layer.

    Args:
        filename: Path of the image file to load.

    Returns:
        The array returned by ``predict`` for the single-image batch.
    """
    # Build the full VGG16 network with its default arguments
    base_model = VGG16()
    # Re-wire the network so that it ends at the second-to-last layer: the
    # final layer only classifies images, which is not needed here.
    # NOTE(review): for the stock VGG16 this is presumably the `fc2` layer - confirm
    # it matches the layer used when the training features were extracted.
    # The earlier `model.layers.pop()` call was dropped: in tf.keras the
    # `layers` property hands back a new list, so popping from it never
    # changed the model and only obscured which layer `[-2]` refers to.
    model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

    # Load the file as an image resized to the given target_size
    image = load_img(filename, target_size=(224, 224))
    # Convert the image pixels to a NumPy array
    image = img_to_array(image)
    # Add a leading batch axis of size 1 without changing the pixel data
    image = image.reshape((1,) + image.shape)
    # Apply the VGG16-specific input preprocessing
    image = preprocess_input(image)
    # Run the truncated network; verbose=0 keeps predict() silent
    feature = model.predict(image, verbose=0)
    return feature

In [6]:
# Load the tokenizer that was pickled to disk.
# NOTE: unpickling can execute arbitrary code - only load tokenizer files from a trusted source.
# The context manager closes the file handle as soon as the tokenizer has been read.
with open('./VGG16_tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
# Pre-define the maximum sequence length (from training)
max_length = 34
# Load the saved captioning model (per the original note: the checkpoint with minimum error)
model = load_model('./VGG16_Models/model_9.h5')
# Extract features for the image that needs a description
test_image = extract_test_features('./test.jpg')
# Generate the description of the image
description = generate_desc(model, tokenizer, test_image, max_length)
print(description)

starttoken little boy in pink shirt is playing in the air endtoken


In [7]:
# Strip the start/end marker words from the generated caption
query = description
stopwords = ['starttoken', 'endtoken']
query_words = query.split()
# Drop the marker words and re-join the rest with single spaces in one step
result = ' '.join(token for token in query_words if token not in stopwords)
print(result)

little boy in pink shirt is playing in the air


# Convert Generated Text to Audio

In [None]:
import pyttsx3

In [None]:
# Create the text-to-speech engine with the 'espeak' driver
engine = pyttsx3.init('espeak')

# Read the current speaking rate, then set it to 200
rate = engine.getProperty('rate')
engine.setProperty('rate', 200)

# Read the current volume, then set it to 1.0
volume = engine.getProperty('volume')
engine.setProperty('volume', 1.0)

# Prefer the second installed voice. When fewer than two voices are available
# the driver's default voice is kept instead of failing with an IndexError.
voices = engine.getProperty('voices')
if len(voices) > 1:
    engine.setProperty('voice', voices[1].id)

In [None]:
def speak(sound):
    """Pass ``sound`` to the module-level ``engine`` via ``say()`` and then call ``runAndWait()``."""
    tts = engine
    # Queue the text first, then let the engine process what was queued
    tts.say(sound)
    tts.runAndWait()

In [None]:
# Read the cleaned caption aloud only when this code runs as the entry point
if __name__ == "__main__":
    speak(result)