In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tensorflow==2.0.0
!pip install keras==2.3.1

In [3]:

import tensorflow
print(tensorflow.__version__)
import keras_preprocessing
print(keras_preprocessing.__version__)
import keras
print(keras.__version__)

2.0.0
1.1.2
2.3.1


Using TensorFlow backend.


In [None]:
from keras import backend as K

def preprocess_input(x, dim_ordering='default'):
    if dim_ordering == 'default':
        dim_ordering = K.image_dim_ordering()
    assert dim_ordering in {'tf', 'th'}

    if dim_ordering == 'th':
        x[:, 0, :, :] -= 104.006
        x[:, 1, :, :] -= 116.669
        x[:, 2, :, :] -= 122.679
        # 'RGB'->'BGR'
        x = x[:, ::-1, :, :]
    else:
        x[:, :, :, 0] -= 104.006
        x[:, :, :, 1] -= 116.669
        x[:, :, :, 2] -= 122.679
        # 'RGB'->'BGR'
        x = x[:, :, :, ::-1]
    return x


In [None]:
# -*- coding: utf-8 -*-
'''VGG16-places365 model for Keras

# Reference:
- [Places: A 10 million Image Database for Scene Recognition](http://places2.csail.mit.edu/PAMI_places.pdf)
'''

from __future__ import division, print_function
import os

import warnings
import numpy as np

from keras import backend as K
from keras.layers import Input
from keras.layers.core import Activation, Dense, Flatten
from keras.layers.pooling import MaxPooling2D
from keras.models import Model
from keras.layers import Conv2D
from keras.regularizers import l2
from keras.layers.core import Dropout
from keras.layers import GlobalAveragePooling2D
from keras.layers import GlobalMaxPooling2D
from keras_applications.imagenet_utils import _obtain_input_shape
from keras.engine.topology import get_source_inputs
from keras.utils.data_utils import get_file
from keras.utils import layer_utils
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input

WEIGHTS_PATH = 'https://github.com/GKalliatakis/Keras-VGG16-places365/releases/download/v1.0/vgg16-places365_weights_tf_dim_ordering_tf_kernels.h5'
WEIGHTS_PATH_NO_TOP = 'https://github.com/GKalliatakis/Keras-VGG16-places365/releases/download/v1.0/vgg16-places365_weights_tf_dim_ordering_tf_kernels_notop.h5'


def VGG16_Places365(include_top=True, weights='places',
                    input_tensor=None, input_shape=None,
                    pooling=None,
                    classes=365):

    if not (weights in {'places', None} or os.path.exists(weights)):
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization), `places` '
                         '(pre-training on Places), '
                         'or the path to the weights file to be loaded.')

    if weights == 'places' and include_top and classes != 365:
        raise ValueError('If using `weights` as places with `include_top`'
                         ' as true, `classes` should be 365')


    # Determine proper input shape
    input_shape = _obtain_input_shape(input_shape,
                                      default_size=224,
                                      min_size=48,
                                      data_format=K.image_data_format(),
                                      require_flatten =include_top)

    if input_tensor is None:
        img_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor

    # Block 1
    x = Conv2D(filters=64, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block1_conv1')(img_input)

    x = Conv2D(filters=64, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block1_conv2')(x)

    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block1_pool", padding='valid')(x)

    # Block 2
    x = Conv2D(filters=128, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block2_conv1')(x)

    x = Conv2D(filters=128, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block2_conv2')(x)

    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block2_pool", padding='valid')(x)

    # Block 3
    x = Conv2D(filters=256, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block3_conv1')(x)

    x = Conv2D(filters=256, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block3_conv2')(x)

    x = Conv2D(filters=256, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block3_conv3')(x)

    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block3_pool", padding='valid')(x)

    # Block 4
    x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block4_conv1')(x)

    x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block4_conv2')(x)

    x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block4_conv3')(x)

    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block4_pool", padding='valid')(x)

    # Block 5
    x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block5_conv1')(x)

    x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block5_conv2')(x)

    x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
               kernel_regularizer=l2(0.0002),
               activation='relu', name='block5_conv3')(x)

    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block5_pool", padding='valid')(x)

    if include_top:
        # Classification block
        x = Flatten(name='flatten')(x)
        x = Dense(4096, activation='relu', name='fc1')(x)
        x = Dropout(0.5, name='drop_fc1')(x)

        x = Dense(4096, activation='relu', name='fc2')(x)
        x = Dropout(0.5, name='drop_fc2')(x)
        
        x = Dense(365, activation='softmax', name="predictions")(x)

    else:
        if pooling == 'avg':
            x = GlobalAveragePooling2D()(x)
        elif pooling == 'max':
            x = GlobalMaxPooling2D()(x)

    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = img_input

    # Create model.
    model = Model(inputs, x, name='vgg16-places365')

    # load weights
    if weights == 'places':
        if include_top:
            weights_path = get_file('vgg16-places365_weights_tf_dim_ordering_tf_kernels.h5',
                                    WEIGHTS_PATH,
                                    cache_subdir='models')
        else:
            weights_path = get_file('vgg16-places365_weights_tf_dim_ordering_tf_kernels_notop.h5',
                                    WEIGHTS_PATH_NO_TOP,
                                    cache_subdir='models')

        model.load_weights(weights_path)

        if K.backend() == 'theano':
            layer_utils.convert_all_kernels_in_model(model)

        if K.image_data_format() == 'channels_first':
            if include_top:
                maxpool = model.get_layer(name='block5_pool')
                shape = maxpool.output_shape[1:]
                dense = model.get_layer(name='fc1')
                layer_utils.convert_dense_weights_data_format(dense, shape, 'channels_first')

            if K.backend() == 'tensorflow':
                warnings.warn('You are using the TensorFlow backend, yet you '
                              'are using the Theano '
                              'image data format convention '
                              '(`image_data_format="channels_first"`). '
                              'For best performance, set '
                              '`image_data_format="channels_last"` in '
                              'your Keras config '
                              'at ~/.keras/keras.json.')

    elif weights is not None:
        model.load_weights(weights)

    return model




In [None]:
#Import Necessary Libraries
from os import listdir
from pickle import dump
#from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
#from keras.applications.vgg16 import preprocess_input
from keras.models import Model

# extract features from each photo in the directory
def extract_features(directory):
	# load the model
	model = VGG16_Places365(weights='places')
	# re-structure the model
	model.layers.pop()
	model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
	# summarize
	print(model.summary())
	# extract features from each photo
	features = dict()
	for name in listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare the image for the VGG model
		image = preprocess_input(image)
		# get features
		feature = model.predict(image, verbose=0)
		# get image id
		image_id = name.split('.')[0]
		# store feature
		features[image_id] = feature
		print(len(features) ,':', name)
	return features

# extract features from all images
directory = '/content/drive/MyDrive/ICG/Flickr8k/Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, open('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/features.pkl', 'wb'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3093 : 3353278454_2f3a4d0bbc.jpg
3094 : 2676651833_3bb42bbb32.jpg
3095 : 2539933563_17ff0758c7.jpg
3096 : 394563330_68b566368c.jpg
3097 : 1468429623_f001988691.jpg
3098 : 3213622536_31da7f6682.jpg
3099 : 2888702775_0939a6680e.jpg
3100 : 3106791484_13e18c33d8.jpg
3101 : 2205336881_d9ee4179d3.jpg
3102 : 98377566_e4674d1ebd.jpg
3103 : 1022454428_b6b660a67b.jpg
3104 : 1348957576_c4a78eb974.jpg
3105 : 2555535057_007501dae5.jpg
3106 : 2655647656_ee450446ed.jpg
3107 : 2685788323_ceab14534a.jpg
3108 : 2716903793_fb7a3d8ba6.jpg
3109 : 961611340_251081fcb8.jpg
3110 : 1663751778_90501966f0.jpg
3111 : 2995461857_dd26188dcf.jpg
3112 : 3319586526_3994e9cd58.jpg
3113 : 3561639055_5ac66ae92f.jpg
3114 : 3211577298_14296db6fd.jpg
3115 : 3241726740_6d256d61ec.jpg
3116 : 3259002340_707ce96858.jpg
3117 : 2768972186_92787cd523.jpg
3118 : 3495490064_8db40a83af.jpg
3119 : 1475046848_831245fc64.jpg
3120 : 1473250020_dc829a090f.jpg
3121 : 34901860

In [None]:
import string

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# extract descriptions for images
def load_descriptions(doc):
	mapping = dict()
	# process lines
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		if len(line) < 2:
			continue
		# take the first token as the image id, the rest as the description
		image_id, image_desc = tokens[0], tokens[1:]
		# remove filename from image id
		image_id = image_id.split('.')[0]
		# convert description tokens back to string
		image_desc = ' '.join(image_desc)
		# create the list if needed
		if image_id not in mapping:
			mapping[image_id] = list()
		# store description
		mapping[image_id].append(image_desc)
	return mapping

def clean_descriptions(descriptions):
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for key, desc_list in descriptions.items():
		for i in range(len(desc_list)):
			desc = desc_list[i]
			# tokenize
			desc = desc.split()
			# convert to lower case
			desc = [word.lower() for word in desc]
			# remove punctuation from each token
			desc = [w.translate(table) for w in desc]
			# remove hanging 's' and 'a'
			desc = [word for word in desc if len(word)>1]
			# remove tokens with numbers in them
			desc = [word for word in desc if word.isalpha()]
			# store as string
			desc_list[i] =  ' '.join(desc)

# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
	# build a list of all description strings
	all_desc = set()
	for key in descriptions.keys():
		[all_desc.update(d.split()) for d in descriptions[key]]
	return all_desc

# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
	lines = list()
	for key, desc_list in descriptions.items():
		for desc in desc_list:
			lines.append(key + ' ' + desc)
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

# filename = 'Flickr8k_text/Flickr8k.token.txt'
filename = '/content/drive/MyDrive/ICG/Flickr8k/Flickr8k.token.txt'
# load descriptions
doc = load_doc(filename)
# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# clean descriptions
clean_descriptions(descriptions)
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# save to file
save_descriptions(descriptions, '/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/descriptions.txt')


Loaded: 8092 
Vocabulary Size: 8763


In [None]:
from pickle import load

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load a pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

# load photo features
def load_photo_features(filename, dataset):
	# load all features
	all_features = load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in dataset}
	return features

# load training dataset (6K)
filename = '/content/drive/MyDrive/ICG/Flickr8k/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/descriptions.txt', train)
print('Descriptions train: =%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/features.pkl', train)
print('Photos train: =%d' % len(train_features))


Dataset: 6000
Descriptions train: =6000
Photos train: =6000


In [None]:
from numpy import array
import tensorflow
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint




# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load a pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

# load photo features
def load_photo_features(filename, dataset):
	# load all features
	all_features = load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in dataset}
	return features



# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer



# calculate the length of the description with the most words
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo):
	X1, X2, y = list(), list(), list()
	# walk through each description for the image
	for desc in desc_list:
		# encode the sequence
		seq = tokenizer.texts_to_sequences([desc])[0]
		# split one sequence into multiple X,y pairs
		for i in range(1, len(seq)):
			# split into input and output pair
			in_seq, out_seq = seq[:i], seq[i]
			# pad input sequence
			in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
			# encode output sequence
			out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
			# store
			X1.append(photo)
			X2.append(in_seq)
			y.append(out_seq)
	return array(X1), array(X2), array(y)

# define the captioning model
def define_model(vocab_size, max_length):
	# feature extractor model
	inputs1 = Input(shape=(4096,))
	fe1 = Dropout(0.4)(inputs1)
	fe2 = Dense(256, activation='relu')(fe1)
	# sequence model
	inputs2 = Input(shape=(max_length,))
	se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
	se2 = Dropout(0.4)(se1)
	se3 = LSTM(256)(se2)
	# decoder model
	decoder1 = add([fe2, se3])
	decoder2 = Dense(256, activation='relu')(decoder1)
	outputs = Dense(vocab_size, activation='softmax')(decoder2)
	# tie it together [image, seq] [word]
	model = Model(inputs=[inputs1, inputs2], outputs=outputs)
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	# summarize model
	print(model.summary())
	
    
	return model

#Below code is used to progressively load the batch of data
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length):
	# loop for ever over images
	while 1:
		for key, desc_list in descriptions.items():
			# retrieve the photo feature
			photo = photos[key][0]
			in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
			yield [[in_img, in_seq], out_word]

			
filename = '/content/drive/MyDrive/ICG/Flickr8k/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/descriptions.txt', train)
print('Descriptions train: =%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/features.pkl', train)
print('Photos train: =%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

# train the model
model = define_model(vocab_size, max_length)

# train the model, run epochs manually and save after each epoch
epochs = 10
steps = len(train_descriptions)
for i in range(epochs):
	# create the data generator
	generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
	# fit for one epoch
	model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
	# save model
	print("Epoch " + str(i+1) + 'saved')
	model.save('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/0.4/model' + str(i+1) + '.h5')


Dataset: 6000
Descriptions train: =6000
Photos train: =6000
Vocabulary Size: 7579
Description Length: 34
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 256)      1940224     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0           input_1[0][0]        

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
Epoch 1saved
Epoch 1/1
Epoch 2saved
Epoch 1/1
Epoch 3saved
Epoch 1/1
Epoch 4saved
Epoch 1/1
Epoch 5saved
Epoch 1/1
Epoch 6saved
Epoch 1/1
Epoch 7saved
Epoch 1/1
Epoch 8saved
Epoch 1/1
Epoch 9saved
Epoch 1/1
1111/6000 [====>.........................] - ETA: 22:12 - loss: 2.9770 - accuracy: 0.3365

In [None]:
          
from numpy import argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu



# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load a pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

# load photo features
def load_photo_features(filename, dataset):
	# load all features
	all_features = load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in dataset}
	return features

# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer



# calculate the length of the description with the most words
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
	# seed the generation process
	in_text = 'startseq'
	# iterate over the whole length of the sequence
	for i in range(max_length):
		# integer encode input sequence
		sequence = tokenizer.texts_to_sequences([in_text])[0]
		# pad input
		sequence = pad_sequences([sequence], maxlen=max_length)
		# predict next word
		yhat = model.predict([photo,sequence], verbose=0)
		# convert probability to integer
		yhat = argmax(yhat)
		# map integer to word
		word = word_for_id(yhat, tokenizer)
		# stop if we cannot map the word
		if word is None:
			break
		# append as input for generating the next word
		in_text += ' ' + word
		# stop if we predict the end of the sequence
		if word == 'endseq':
			break
	return in_text

# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
	actual, predicted = list(), list()
	# step over the whole set
	for key, desc in descriptions.items():
		# generate description
		yhat = generate_desc(model, tokenizer, photos[key], max_length)
		# store actual and predicted
		references = [d.split() for d in desc]
		actual.append(references)
		predicted.append(yhat.split())
		#print('Actual:    %s' % desc)
		#print('Predicted: %s' % yhat)
		if len(actual) >= 5:
			break
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.33, 0.33, 0.33, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


# prepare training set

# load training dataset (6K)
filename = '/content/drive/MyDrive/ICG/Flickr8k/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Training Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/descriptions.txt', train)
print('Descriptions train: =%d' % len(train_descriptions))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)


# prepare test set

# load test set
filename = '/content/drive/MyDrive/ICG/Flickr8k/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Testing Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/descriptions.txt', test)
print('Descriptions test: =%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/features.pkl', test)
print('Photos test: =%d' % len(test_features))

# load the model which has minimum loss, in this case it was model_1
models = 10
for i in range(models):
  # load the model which has minimum loss, in this case it was model_1
  print('BLEU Score of epoch ' + str(i+1) )
  filename = '/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/0.4/model' + str(i+1) + '.h5'
  model = load_model(filename)
  # evaluate model
  evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)


Training Dataset: 6000
Descriptions train: =6000
Vocabulary Size: 7579
Description Length: 34
Testing Dataset: 1000
Descriptions test: =1000
Photos test: =1000
BLEU Score of epoch 1


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.304348
BLEU-2: 0.137919
BLEU-3: 0.270491
BLEU-4: 0.371375
BLEU Score of epoch 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.540000
BLEU-2: 0.289828
BLEU-3: 0.187838
BLEU-4: 0.281731
BLEU Score of epoch 3


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.580000
BLEU-2: 0.340588
BLEU-3: 0.208950
BLEU-4: 0.305408
BLEU Score of epoch 4


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.591837
BLEU-2: 0.347933
BLEU-3: 0.213691
BLEU-4: 0.310643
BLEU Score of epoch 5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.510204
BLEU-2: 0.215365
BLEU-3: 0.108354
BLEU-4: 0.185704
BLEU Score of epoch 6


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.557692
BLEU-2: 0.288202
BLEU-3: 0.184153
BLEU-4: 0.277534
BLEU Score of epoch 7


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.480000
BLEU-2: 0.178885
BLEU-3: 0.119498
BLEU-4: 0.082227
BLEU Score of epoch 8


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.574468
BLEU-2: 0.261513
BLEU-3: 0.180090
BLEU-4: 0.272883
BLEU Score of epoch 9


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.549020
BLEU-2: 0.244287
BLEU-3: 0.115823
BLEU-4: 0.195323
BLEU Score of epoch 10


OSError: ignored

In [4]:
!pip install rouge
!pip install nltk==3.4.5
import nltk
nltk.download('wordnet')

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0
Collecting nltk==3.4.5
[?25l  Downloading https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip (1.5MB)
[K     |████████████████████████████████| 1.5MB 8.7MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4.5-cp37-none-any.whl size=1449906 sha256=e5a6e31a4386d329e019d06c14e6ed4920adb9d81738856c7810f98500f1cdc1
  Stored in directory: /root/.cache/pip/wheels/96/86/f6/68ab24c23f207c0077381a5e3904b2815136b879538a24b483
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:

from numpy import argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
import nltk
from rouge import Rouge 


# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load a pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

# load photo features
def load_photo_features(filename, dataset):
	# load all features
	all_features = load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in dataset}
	return features

# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer



# calculate the length of the description with the most words
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None



# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
	# seed the generation process
	in_text = 'startseq'
	# iterate over the whole length of the sequence
	for i in range(max_length):
		# integer encode input sequence
		sequence = tokenizer.texts_to_sequences([in_text])[0]
		# pad input
		sequence = pad_sequences([sequence], maxlen=max_length)
		# predict next word
		yhat = model.predict([photo,sequence], verbose=0)
		# convert probability to integer
		yhat = argmax(yhat)
		# map integer to word
		word = word_for_id(yhat, tokenizer)
		# stop if we cannot map the word
		if word is None:
			break
		# append as input for generating the next word
		in_text += ' ' + word
		# stop if we predict the end of the sequence
		if word == 'endseq':
			break
	return in_text





def evaluate_ROUGE(model, descriptions, photos, tokenizer, max_length):
	actual, predicted = list(), list()
	# step over the whole set
	for key, desc in descriptions.items():
		# generate description
		yhat = generate_desc(model, tokenizer, photos[key], max_length)
		# store actual and predicted
		#references = [d.split() for d in desc]
		#actual.join(references)
		actual = desc
		predicted = yhat
		#predicted.join(yhat)
		
		#print('Actual:    %s' %actual)
		#print('Predected:  %s' %predicted)
		if len(actual) >= 5:
			break
		
		
	# calculate ROUGE score
	rouge = Rouge()
	scores = rouge.get_scores(predicted, ", ".join(actual), avg=True)
	print('ROUGE: ', scores)


def evaluate_METEOR(model, descriptions, photos, tokenizer, max_length):
	actual, predicted = '',''
	# step over the whole set
	
	for key, desc in descriptions.items():
		# generate description
		yhat = generate_desc(model, tokenizer, photos[key], max_length)
		# store actual and predicted
		#references = [d.split() for d in desc]
		#actual.join(references)
		actual = desc
		predicted = yhat
		#predicted.join(yhat)
		
		#print('Actual:    %s' %actual)
		#print('Predected:  %s' %predicted)
		if len(actual) >= 5:
			break
		
		
	# calculate METEOR score
	print('METEOR: %f' % nltk.translate.meteor_score.meteor_score(actual, predicted))
		

# prepare training set

# load training dataset (6K)
filename = '/content/drive/MyDrive/ICG/Flickr8k/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Training Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/descriptions.txt', train)
print('Descriptions train: =%d' % len(train_descriptions))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)


# prepare test set

# load test set
filename = '/content/drive/MyDrive/ICG/Flickr8k/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Testing Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/descriptions.txt', test)
print('Descriptions test: =%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/features.pkl', test)
print('Photos test: =%d' % len(test_features))

# load the model which has minimum loss, in this case it was model_1
models = 10
for i in range(models):
  # load the model which has minimum loss, in this case it was model_1
  filename = '/content/drive/MyDrive/ICG/VGG16_Places365_with_LSTM/0.2/model' + str(i+1) + '.h5'
  model = load_model(filename)
  # evaluate model
  print('METEOR Score of epoch ' + str(i+1) )
  evaluate_METEOR(model, test_descriptions, test_features, tokenizer, max_length)
  print('ROUGE Score of epoch ' + str(i+1) )
  evaluate_ROUGE(model, test_descriptions, test_features, tokenizer, max_length)




Training Dataset: 6000
Descriptions train: =6000
Vocabulary Size: 7579
Description Length: 34
Testing Dataset: 1000
Descriptions test: =1000
Photos test: =1000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 1
METEOR: 0.506051
ROUGE Score of epoch 1
ROUGE:  {'rouge-1': {'f': 0.2025316431725685, 'p': 0.7272727272727273, 'r': 0.11764705882352941}, 'rouge-2': {'f': 0.05194804968797446, 'p': 0.2, 'r': 0.029850746268656716}, 'rouge-l': {'f': 0.3076923043565089, 'p': 0.7272727272727273, 'r': 0.1951219512195122}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 2
METEOR: 0.506051
ROUGE Score of epoch 2
ROUGE:  {'rouge-1': {'f': 0.2025316431725685, 'p': 0.7272727272727273, 'r': 0.11764705882352941}, 'rouge-2': {'f': 0.05194804968797446, 'p': 0.2, 'r': 0.029850746268656716}, 'rouge-l': {'f': 0.3076923043565089, 'p': 0.7272727272727273, 'r': 0.1951219512195122}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 3
METEOR: 0.506051
ROUGE Score of epoch 3
ROUGE:  {'rouge-1': {'f': 0.2025316431725685, 'p': 0.7272727272727273, 'r': 0.11764705882352941}, 'rouge-2': {'f': 0.05194804968797446, 'p': 0.2, 'r': 0.029850746268656716}, 'rouge-l': {'f': 0.3076923043565089, 'p': 0.7272727272727273, 'r': 0.1951219512195122}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 4
METEOR: 0.357692
ROUGE Score of epoch 4
ROUGE:  {'rouge-1': {'f': 0.17073170448542538, 'p': 0.5, 'r': 0.10294117647058823}, 'rouge-2': {'f': 0.024999997278125297, 'p': 0.07692307692307693, 'r': 0.014925373134328358}, 'rouge-l': {'f': 0.2641509398932005, 'p': 0.5833333333333334, 'r': 0.17073170731707318}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 5
METEOR: 0.506051
ROUGE Score of epoch 5
ROUGE:  {'rouge-1': {'f': 0.2025316431725685, 'p': 0.7272727272727273, 'r': 0.11764705882352941}, 'rouge-2': {'f': 0.05194804968797446, 'p': 0.2, 'r': 0.029850746268656716}, 'rouge-l': {'f': 0.3076923043565089, 'p': 0.7272727272727273, 'r': 0.1951219512195122}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 6
METEOR: 0.356067
ROUGE Score of epoch 6
ROUGE:  {'rouge-1': {'f': 0.24999999648760338, 'p': 0.55, 'r': 0.16176470588235295}, 'rouge-2': {'f': 0.0465116244645757, 'p': 0.10526315789473684, 'r': 0.029850746268656716}, 'rouge-l': {'f': 0.2807017503477994, 'p': 0.5, 'r': 0.1951219512195122}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 7
METEOR: 0.506051
ROUGE Score of epoch 7
ROUGE:  {'rouge-1': {'f': 0.2025316431725685, 'p': 0.7272727272727273, 'r': 0.11764705882352941}, 'rouge-2': {'f': 0.05194804968797446, 'p': 0.2, 'r': 0.029850746268656716}, 'rouge-l': {'f': 0.3076923043565089, 'p': 0.7272727272727273, 'r': 0.1951219512195122}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 8
METEOR: 0.506051
ROUGE Score of epoch 8
ROUGE:  {'rouge-1': {'f': 0.2025316431725685, 'p': 0.7272727272727273, 'r': 0.11764705882352941}, 'rouge-2': {'f': 0.05194804968797446, 'p': 0.2, 'r': 0.029850746268656716}, 'rouge-l': {'f': 0.3076923043565089, 'p': 0.7272727272727273, 'r': 0.1951219512195122}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 9
METEOR: 0.160000
ROUGE Score of epoch 9
ROUGE:  {'rouge-1': {'f': 0.16470587915294121, 'p': 0.4117647058823529, 'r': 0.10294117647058823}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.17857142464923478, 'p': 0.3333333333333333, 'r': 0.12195121951219512}}


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


METEOR Score of epoch 10
METEOR: 0.361165
ROUGE Score of epoch 10
ROUGE:  {'rouge-1': {'f': 0.17283950347812838, 'p': 0.5384615384615384, 'r': 0.10294117647058823}, 'rouge-2': {'f': 0.02531645311969262, 'p': 0.08333333333333333, 'r': 0.014925373134328358}, 'rouge-l': {'f': 0.25925925560356655, 'p': 0.5384615384615384, 'r': 0.17073170731707318}}
