In [1]:
from keras.models import load_model
import pickle as pkl
from data_preprocessing.image_processing import FeatureExtractor
from commons.utils import load_pickle_file
from keras.preprocessing.sequence import pad_sequences
from numpy import argmax
#from models.model_eval import BeamSearch

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the tokenizer object stored under the preprocessed-data directory;
# it is later handed to BeamSearch for word <-> index conversion.
# NOTE(review): load_pickle_file presumably unpickles this file — unpickling
# can execute arbitrary code, so only load files you trust.
tokenizer = load_pickle_file('data/preprocessed/tokenizer.pkl')

In [3]:
# Image feature extractor used by test_caption() via this module-level name.
# NOTE(review): 'xception' presumably selects the Xception backbone, matching
# the 'xception.*' checkpoint loaded for the captioning model — confirm.
featureExtractor = FeatureExtractor('xception')

In [13]:
# Load the captioning model from a Keras HDF5 checkpoint in the working directory.
# NOTE(review): the file name suggests epoch 2 with loss 3.74 — confirm this is
# the intended checkpoint. The execution counter jumps from In [3] to In [13]
# here; verify the notebook still works with Restart Kernel -> Run All.
model = load_model('xception.epoch_02-loss_3.74.hdf5')

In [14]:
import heapq

class Beam:
    """
    Bounded collection that retains only the best-scoring sequences.
    Adapted from M. Tanti's article on beam search:
    https://geekyisawesome.blogspot.ie/2016/10/using-beam-search-to-generate-most.html
    """
    def __init__(self, n_top):
        """
        Create an empty beam
        :param n_top: maximum number of sequences retained at any time
        """
        self.heap = []
        self.n_top = n_top

    def append(self, proba, done, sequence):
        """
        Insert a candidate; when the beam grows past n_top the smallest
        (proba, done, sequence) tuple is evicted
        :param proba: probability of the sequence
        :param done: True when the sequence is complete, False otherwise
        :param sequence: the sequence itself
        :return: None
        """
        entry = (proba, done, sequence)
        heapq.heappush(self.heap, entry)
        overflow = len(self.heap) - self.n_top
        if overflow > 0:
            # The min-heap root is the weakest candidate currently stored
            heapq.heappop(self.heap)

    def __iter__(self):
        """
        Iterate over the stored tuples in internal heap order (not sorted)
        :return: iterator of (proba, is_finished, sequence) tuples
        """
        return self.heap.__iter__()

    def __str__(self):
        """
        Human-readable dump: a header line followed by one line per sequence
        :return: multi-line string, each line terminated by a newline
        """
        header = "Beam search of {} top sequences\n".format(self.n_top)
        rows = [
            "{} with {:.2E} probability\n".format(entry[2], entry[0])
            for entry in self.heap
        ]
        return header + "".join(rows)


class BeamSearch:
    """
    Beam-search caption decoder driven by a next-word prediction model.
    At every step each unfinished sequence is extended with every known
    word, and only the n_top highest-scoring sequences are kept.
    """
    def __init__(self, model, tokenizer, clip=50, n_top=3,
                 sentence_boundaries=('startseq', 'endseq'), verbose=True):
        """
        Init beam search
        :param model: model used for prediction; must expose predict()
        :param tokenizer: tokenizer used for word indexing; must expose
            word_index and texts_to_sequences()
        :param clip: maximum caption length
        :param n_top: number of best possible sequences kept each time
        :param sentence_boundaries: tuple of (start, end) of sequence tokens
        :param verbose: when True (default, the historical behaviour) the
            beam is printed after every decoding step; pass False to silence
        """
        self.model = model
        self.tokenizer = tokenizer
        self.clip = clip
        self.n_top = n_top
        self.verbose = verbose
        self.start_token = sentence_boundaries[0]
        self.end_token = sentence_boundaries[1]
        # Reverse lookup: word index -> word
        self.rev_word_idx = {v: k for k, v in tokenizer.word_index.items()}

    def _make_pred(self, features, curr_beam, elem):
        """
        Predict possible next word and append them to beam search.
        Only n_top possibilities will be kept.
        :param features: features of a given image
        :param curr_beam: Beam object currently used
        :param elem: tuple of (probability, done, sequence) from previous beam
        :return: None
        """
        prev_proba, prev_words = elem[0], elem[2]
        input_text = " ".join(prev_words)
        sequence = self.tokenizer.texts_to_sequences([input_text])[0]
        sequence = pad_sequences([sequence], maxlen=self.clip)
        # The first row of the prediction is read as one score per word index
        y_pred = self.model.predict([features, sequence])[0]
        for i, proba in enumerate(y_pred):
            word = self.rev_word_idx.get(i)
            if word is None:
                # Index unknown to the tokenizer: nothing to emit
                continue
            if word == self.end_token:
                # End token closes the sequence and is not stored in it
                curr_beam.append(prev_proba * proba, True, prev_words)
            else:
                curr_beam.append(prev_proba * proba, False, prev_words + [word])

    def search(self, features):
        """
        Performs beam search from image features
        :param features: features extracted from the image
        :return: (best_caption, probability) where best_caption excludes the
            start token and probability is the product of per-step scores
        :raises ValueError: if a decoding step yields no candidate at all
            (for instance n_top < 1, or no predicted index is a known word)
        """
        last_beam = Beam(self.n_top)
        last_beam.append(1.0, False, [self.start_token])
        while True:
            curr_beam = Beam(self.n_top)
            for elem in last_beam:
                if elem[1] is True:
                    # Finished sequences are carried over unchanged
                    curr_beam.append(*elem)
                else:
                    self._make_pred(features, curr_beam, elem)
            candidates = list(curr_beam)
            if not candidates:
                # Same exception type max() raised before, clearer message
                raise ValueError(
                    "Beam search produced no candidate sequence "
                    "(n_top={}); n_top must be >= 1 and the model must "
                    "predict at least one word known to the tokenizer"
                    .format(self.n_top))
            (best_proba, done, best_seq) = max(candidates)
            if self.verbose:
                print(curr_beam)
            best_seq = best_seq[1:]  # drop the start token
            if done is True or len(best_seq) >= self.clip:
                return " ".join(best_seq), best_proba
            last_beam = curr_beam

In [15]:
# Decoder with the constructor defaults: clip=50 (max caption length) and
# n_top=3 (sequences kept per step).
beam_search = BeamSearch(model, tokenizer)

In [16]:
def test_caption(image_filename, beam):
    features = featureExtractor.extract_features(image_filename)
    sentence, proba = beam.search(features)
    print("{} ({:.2E})".format(sentence, proba))

<img src="data/demo_images/boat.jpg" width="50%">

In [17]:
# Caption the boat demo image. BeamSearch.search() prints the beam after every
# decoding step, so the output below is long; the final line is the caption.
test_caption("data/demo_images/boat.jpg", beam_search)

Beam search of 3 top sequences
['startseq', 'sunsetendseq'] with 1.54E-01 probability
['startseq', 'ship'] with 2.46E-01 probability
['startseq', 'a'] with 3.01E-01 probability

Beam search of 3 top sequences
['startseq', 'sunsetendseq', 'ship'] with 1.25E-01 probability
['startseq', 'ship', 'ship'] with 1.36E-01 probability
['startseq', 'a', 'ship'] with 2.40E-01 probability

Beam search of 3 top sequences
['startseq', 'ship', 'ship', 'ship'] with 3.46E-02 probability
['startseq', 'a', 'ship', 'ship'] with 1.40E-01 probability
['startseq', 'sunsetendseq', 'ship', 'ship'] with 4.57E-02 probability

Beam search of 3 top sequences
['startseq', 'a', 'ship', 'ship', 'ship'] with 2.18E-02 probability
['startseq', 'a', 'ship', 'ship', 'sunsetendseq'] with 2.38E-02 probability
['startseq', 'a', 'ship', 'ship', 'on'] with 2.88E-02 probability

Beam search of 3 top sequences
['startseq', 'a', 'ship', 'ship', 'on', 'sunsetendseq'] with 5.80E-03 probability
['startseq', 'a', 'ship', 'ship', 'on',

<img src="data/demo_images/cat.jpg" width="50%">

In [9]:
# Caption the cat demo image (per-step beam dumps precede the final caption).
test_caption("data/demo_images/cat.jpg", beam_search)

Beam search of 3 top sequences
['startseq', 'white'] with 0.12 probability
['startseq', 'summer'] with 0.21 probability
['startseq', 'a'] with 0.35 probability

Beam search of 3 top sequences
['startseq', 'a', 'summer'] with 0.06 probability
['startseq', 'a', 'white'] with 0.14 probability
['startseq', 'summer', 'summer'] with 0.06 probability

Beam search of 3 top sequences
['startseq', 'summer', 'summer', 'summer'] with 0.02 probability
['startseq', 'a', 'white', 'frisbee'] with 0.03 probability
['startseq', 'a', 'white', 'white'] with 0.04 probability

Beam search of 3 top sequences
['startseq', 'a', 'white', 'white', 'frisbee'] with 0.01 probability
['startseq', 'a', 'white', 'white', 'white'] with 0.01 probability
['startseq', 'summer', 'summer', 'summer', 'summer'] with 0.01 probability

Beam search of 3 top sequences
['startseq', 'a', 'white', 'white', 'white', 'white'] with 0.00 probability
['startseq', 'summer', 'summer', 'summer', 'summer', 'summer'] with 0.00 probability
['s

<img src="data/demo_images/children.jpg" width="50%">

In [10]:
# Caption the children demo image (per-step beam dumps precede the final caption).
test_caption("data/demo_images/children.jpg", beam_search)

Beam search of 3 top sequences
['startseq', 'night'] with 0.14 probability
['startseq', 'children'] with 0.18 probability
['startseq', 'a'] with 0.15 probability

Beam search of 3 top sequences
['startseq', 'children', 'kick'] with 0.03 probability
['startseq', 'children', 'playing'] with 0.03 probability
['startseq', 'children', 'chasing'] with 0.04 probability

Beam search of 3 top sequences
['startseq', 'children', 'playing', 'chasing'] with 0.01 probability
['startseq', 'children', 'chasing', 'kick'] with 0.01 probability
['startseq', 'children', 'chasing', 'night'] with 0.01 probability

Beam search of 3 top sequences
['startseq', 'children', 'chasing', 'kick', 'kick'] with 0.00 probability
['startseq', 'children', 'chasing', 'kick', 'night'] with 0.00 probability
['startseq', 'children', 'playing', 'chasing', 'night'] with 0.00 probability

Beam search of 3 top sequences
['startseq', 'children', 'chasing', 'kick', 'kick', 'kick'] with 0.00 probability
['startseq', 'children', 'ch

<img src="data/demo_images/dog.jpg" width="50%">

In [11]:
# Caption the dog demo image (per-step beam dumps precede the final caption).
test_caption("data/demo_images/dog.jpg", beam_search)

Beam search of 3 top sequences
['startseq', 'a'] with 0.02 probability
['startseq', 'eating'] with 0.39 probability
['startseq', 'cream'] with 0.53 probability

Beam search of 3 top sequences
['startseq', 'cream', 'eating'] with 0.19 probability
['startseq', 'cream', 'cream'] with 0.29 probability
['startseq', 'eating', 'cream'] with 0.23 probability

Beam search of 3 top sequences
['startseq', 'cream', 'eating', 'cream'] with 0.12 probability
['startseq', 'eating', 'cream', 'cream'] with 0.15 probability
['startseq', 'cream', 'cream', 'cream'] with 0.17 probability

Beam search of 3 top sequences
['startseq', 'cream', 'eating', 'cream', 'cream'] with 0.08 probability
['startseq', 'cream', 'cream', 'cream', 'cream'] with 0.10 probability
['startseq', 'eating', 'cream', 'cream', 'cream'] with 0.11 probability

Beam search of 3 top sequences
['startseq', 'cream', 'eating', 'cream', 'cream', 'cream'] with 0.06 probability
['startseq', 'cream', 'cream', 'cream', 'cream', 'cream'] with 0.07

Beam search of 3 top sequences
['startseq', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream'] with 0.00 probability
['startseq', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream'] with 0.00 probability
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream'] with 0.00 probability

Beam search of 3 top sequences
['startseq', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream'] with 0.00 probability
['startseq', 'cream', 'eating'

Beam search of 3 top sequences
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating'] with 0.00 probability
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream'] with 0.00 probability
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream

Beam search of 3 top sequences
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'ends', 'cream', 'ends', 'labrador'] with 0.00 probability
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'ends', 'labrador', 'labrador'] with 0.00 probability
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 

Beam search of 3 top sequences
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'ends', 'labrador', 'ends', 'labrador', 'ends', 'labrador', 'ends', 'labrador', 'ends', 'ends'] with 0.00 probability
['startseq', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'cream', 'eating', 'cream', 'cream', 'cream', 'cream', 'cream', 'ends', 'labrador', 'ends', 'labrador', 'ends', 'ends', 'ends', 'ends', 'ends', 'ends'] with 0.00 probability
['startseq', 'eating', 'c