In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
from os import listdir
from numpy import argmax
from pickle import dump
from pickle import load
from nltk.translate.bleu_score import corpus_bleu
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

In [3]:
def load_doc(filename):
	file = open(filename, 'r')
	text = file.read()
	file.close()
	return text

In [4]:
def load_set(filename):
  doc=load_doc(filename)
  dataset=list()
  for i in doc.split("\n"):
    if len(i)<1:
      continue
    r=i.split(".")[0]
    dataset.append(r)
  return set(dataset)



In [5]:
def load_clean_description(filename,dataset):
  doc=load_doc(filename)
  descriptions = dict()
  for line in doc.split('\n'):
    tokens = line.split(" ")
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in dataset:
      if image_id not in descriptions:
        descriptions[image_id] = list()
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        descriptions[image_id].append(desc)
        return descriptions

In [6]:
def load_photo_features(filename, dataset):
	all_features = load(open(filename, 'rb'))
	features = {k: all_features[k] for k in dataset}
	return features

In [7]:
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [8]:
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

In [9]:
filename="/content/drive/My Drive/image_captioning/Flickr_8k.trainImages.txt"
dataset=load_set(filename)


In [10]:
train_desc=load_clean_description('/content/drive/My Drive/image_captioning/description.txt',dataset)
tokenizer = create_tokenizer(train_desc)
train_feat=load_photo_features("/content/drive/My Drive/image_captioning/features.pkl",dataset)

In [11]:
vocab_size = len(tokenizer.word_index) + 1
max_length = max_length(train_desc)

In [12]:
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
	X1, X2, y = list(), list(), list()
	# walk through each image identifier
	for key, desc_list in descriptions.items():
		# walk through each description for the image
		for desc in desc_list:
			# encode the sequence
			seq = tokenizer.texts_to_sequences([desc])[0]
			# split one sequence into multiple X,y pairs
			for i in range(1, len(seq)):
				# split into input and output pair
				in_seq, out_seq = seq[:i], seq[i]
				# pad input sequence
				in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
				# encode output sequence
				out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
				# store
				X1.append(photos[key][0])
				X2.append(in_seq)
				y.append(out_seq)
	return array(X1), array(X2), array(y)


In [13]:
def define_model(vocab_size, max_length):
	inputs1 = Input(shape=(4096,))
	fe1 = Dropout(0.5)(inputs1)
	fe2 = Dense(256, activation='relu')(fe1)
	inputs2 = Input(shape=(max_length,))
	se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
	se2 = Dropout(0.5)(se1)
	se3 = LSTM(256)(se2)
	decoder1 = add([fe2, se3])
	decoder2 = Dense(256, activation='relu')(decoder1)
	outputs = Dense(vocab_size, activation='softmax')(decoder2)
	model = Model(inputs=[inputs1, inputs2], outputs=outputs)
	model.compile(loss='categorical_crossentropy', optimizer='adam')

	return model

In [14]:
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_desc, train_feat, vocab_size)

In [15]:
filename="/content/drive/My Drive/image_captioning/Flickr_8k.devImages.txt"
dataset2=load_set(filename)
test_desc=load_clean_description('/content/drive/My Drive/image_captioning/description.txt',dataset2)
test_feat=load_photo_features("/content/drive/My Drive/image_captioning/features.pkl",dataset2)

In [16]:
X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_desc, test_feat, vocab_size)

In [17]:
model = define_model(vocab_size, max_length)
filepath ="/content/drive/My Drive/image_captioning/checkpoint.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit([X1train, X2train], ytrain, epochs=20, verbose=2, callbacks=[checkpoint], validation_data=([X1test, X2test], ytest))

Epoch 1/20
1/1 - 7s - loss: 3.9302 - val_loss: 2.9674

Epoch 00001: val_loss improved from inf to 2.96745, saving model to /content/drive/My Drive/image_captioning/checkpoint.h5
Epoch 2/20
1/1 - 0s - loss: 3.4475 - val_loss: 3.2128

Epoch 00002: val_loss did not improve from 2.96745
Epoch 3/20
1/1 - 0s - loss: 3.0617 - val_loss: 3.3025

Epoch 00003: val_loss did not improve from 2.96745
Epoch 4/20
1/1 - 0s - loss: 3.2020 - val_loss: 3.2011

Epoch 00004: val_loss did not improve from 2.96745
Epoch 5/20
1/1 - 0s - loss: 2.7832 - val_loss: 3.0995

Epoch 00005: val_loss did not improve from 2.96745
Epoch 6/20
1/1 - 0s - loss: 2.7730 - val_loss: 3.0724

Epoch 00006: val_loss did not improve from 2.96745
Epoch 7/20
1/1 - 0s - loss: 2.8459 - val_loss: 3.1117

Epoch 00007: val_loss did not improve from 2.96745
Epoch 8/20
1/1 - 0s - loss: 2.9830 - val_loss: 3.0759

Epoch 00008: val_loss did not improve from 2.96745
Epoch 9/20
1/1 - 0s - loss: 2.8986 - val_loss: 3.0303

Epoch 00009: val_loss did

<tensorflow.python.keras.callbacks.History at 0x7f487018a810>

In [18]:
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None
 
def generate_desc(model, tokenizer, photo, max_length):
	in_text = 'startseq'
	for i in range(max_length):
		sequence = tokenizer.texts_to_sequences([in_text])[0]
		sequence = pad_sequences([sequence], maxlen=max_length)
		yhat = model.predict([photo,sequence], verbose=0)

		yhat = argmax(yhat)
		word = word_for_id(yhat, tokenizer)
		if word is None:
			break
		in_text += ' ' + word
		if word == 'endseq':
			break
	return in_text

In [19]:
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
	actual, predicted = list(), list()
	# step over the whole set
	for key, desc_list in descriptions.items():
		# generate description
		yhat = generate_desc(model, tokenizer, photos[key], max_length)
		# store actual and predicted
		references = [d.split() for d in desc_list]
		actual.append(references)
		predicted.append(yhat.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [23]:
evaluate_model(model, test_desc, test_feat, tokenizer, max_length)

BLEU-1: 0.058824
BLEU-2: 0.242536
BLEU-3: 0.427430
BLEU-4: 0.492479


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
