In [0]:
# https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
# https://towardsdatascience.com/image-captioning-in-deep-learning-9cd23fb4d8d2
# https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8

In [1]:
import os
os.chdir('/Users/Adrien Delpierre/Documents/Projet')

In [21]:
!pip install pillow
!pip install nltk

Collecting nltk
  Downloading https://files.pythonhosted.org/packages/6f/ed/9c755d357d33bc1931e157f537721efb5b88d2c583fe593cc09603076cc3/nltk-3.4.zip (1.4MB)
Collecting singledispatch (from nltk)
  Downloading https://files.pythonhosted.org/packages/c5/10/369f50bcd4621b263927b0a1519987a04383d4a98fb10438042ad410cf88/singledispatch-3.4.0.3-py2.py3-none-any.whl
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk: started
  Running setup.py bdist_wheel for nltk: finished with status 'done'
  Stored in directory: C:\Users\Adrien Delpierre\AppData\Local\pip\Cache\wheels\4b\c8\24\b2343664bcceb7147efeb21c0b23703a05b23fcfeaceaa2a1e
Successfully built nltk
Installing collected packages: singledispatch, nltk
Successfully installed nltk-3.4 singledispatch-3.4.0.3


In [2]:
from os import listdir
from PIL import *
import numpy as np
from pickle import dump, load
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
def extract_features(directory):
	model = VGG16()
	# remove the classifier layers
	model.layers.pop()
	model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
	model.summary()
	# extract features from each photo
	features = {}
	for name in listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare the image for the VGG model
		image = preprocess_input(image)
		# get features
		feature = model.predict(image, verbose=0)
		# get image id
		image_id = name.split('.')[0]
		# store feature
		features[image_id] = feature
		#print('>%s' % name)
	return features

features = extract_features('Flicker8k_Dataset')
# Saving features in a file
dump(features, open('features.pkl', 'wb'));
 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [5]:
# On associe à l'id de chaque photo ses descriptions dans le dictionnaire mapping.
def load_desc(file_name):
  mapping = {}
  with open(file_name) as f:
    data = f.readlines()
    for line in data:
      tokens = line.split()
      if len(line) < 2:
        continue
      # take the first token as the image id, the rest as the description
      image_id, image_desc = tokens[0], tokens[1:]
      # remove filename from image id
      image_id = image_id.split('.')[0]
      # convert description tokens back to string
      image_desc = ' '.join(image_desc)
      # create the list if needed
      if image_id not in mapping:
        mapping[image_id] = []
      # store description
      mapping[image_id].append(image_desc)
  return mapping


def clean_desc(descriptions):
  for desc_list in descriptions.values():
    desc_list = [text_to_word_sequence(desc, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~)0123456789')
                 for desc in desc_list]


def to_vocabulary(descriptions):
	# build a list of all description strings
	all_desc = set()
	for key in descriptions.keys():
		[all_desc.update(d.split()) for d in descriptions[key]]
	return all_desc


def save_desc(descriptions, filename):
  with open(filename, 'w') as f:
    lines = []
    for key, desc_list in descriptions.items():
      lines = lines + [key + ' ' + desc for desc in desc_list]
    f.write('\n'.join(lines))
    

filename = 'Flickr8k_text/Flickr8k.token.txt'
# parse descriptions
descriptions = load_desc(filename)
print('%d descriptions chargées ' % len(descriptions))
# clean descriptions
clean_desc(descriptions)
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Taille du vocabulaire : %d ' % len(vocabulary))
# save to file
save_desc(descriptions, 'Flickr8k_text/descriptions.txt')

8092 descriptions chargées 
Taille du vocabulaire : 9630 


In [6]:
# load a pre-defined list of photo identifiers
def load_set(filename):
  with open(filename) as f:
    dataset = [line.split('.')[0] for line in f.readlines() if len(line) >= 1]
  return set(dataset)

def load_clean_desc(filename, dataset):
  with open(filename) as f:
    descriptions = {}
    for line in f.readlines():
      tokens = line.split()
      # split id from description
      image_id, image_desc = tokens[0], tokens[1:]
      # skip images not in the set
      if image_id in dataset:
        if image_id not in descriptions:
          descriptions[image_id] = []
        # wrap description in tokens
        desc = 'seqstart ' + ' '.join(image_desc) + ' seqend'
        descriptions[image_id].append(desc)
  return descriptions


def load_photo_features(filename, dataset):
	# load all features
  with open(filename, 'rb') as f:
    all_features = load(f)
    # filter features
    features = {k: all_features[k] for k in dataset}
  return features


filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset : %d' % len(train))
train_descriptions = load_clean_desc('Flickr8k_text/descriptions.txt', train)
print('Descriptions : train = %d' % len(train_descriptions))
train_features = load_photo_features('features.pkl', train)
print('Photos : train = %d' % len(train_features))

# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(desc):
	text = to_lines(descriptions)
	tok = Tokenizer()
	tok.fit_on_texts(text)
	return tok

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Taille du vocabulaire : %d' % vocab_size)

# calculate the length of the description with the most words
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

Dataset : 6000
Descriptions : train = 6000
Photos : train = 6000
Taille du vocabulaire : 8494


In [7]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo):
	X1, X2, y = [], [], []
	# walk through each description for the image
	for desc in desc_list:
		# encode the sequence
		seq = tokenizer.texts_to_sequences([desc])[0]
		# split one sequence into multiple X,y pairs
		for i in range(1, len(seq)):
			# split into input and output pair
			in_seq, out_seq = seq[:i], seq[i]
			# pad input sequence
			in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
			# encode output sequence
			out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
			# store
			X1.append(photo)
			X2.append(in_seq)
			y.append(out_seq)
	return np.array(X1), np.array(X2), np.array(y)

In [16]:
from tensorflow.keras.layers import *
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint

# define the captioning model
def define_model(vocab_size, max_length):
	# feature extractor model
	inputs1 = Input(shape=(1000,))
	fe1 = Dropout(0.5)(inputs1)
	fe2 = Dense(256, activation='relu')(fe1)
	# sequence model
	inputs2 = Input(shape=(max_l,))
	se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
	se2 = Dropout(0.5)(se1)
	se3 = LSTM(256)(se2)
	# decoder model
	decoder1 = add([fe2, se3])
	decoder2 = Dense(256, activation='relu')(decoder1)
	outputs = Dense(vocab_size, activation='softmax')(decoder2)
	# tie it together [image, seq] [word]
	model = Model(inputs=[inputs1, inputs2], outputs=outputs)
	# compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam')
	# summarize model
	model.summary()
	#plot_model(model, to_file='model.png', show_shapes=True)
	return model

In [9]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length):
	# loop for ever over images
	while 1:
		for key, desc_list in descriptions.items():
			# retrieve the photo feature
			photo = photos[key][0]
			in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
			yield [[in_img, in_seq], out_word]

In [10]:
# train dataset

# load training dataset (6K)
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_desc('Flickr8k_text/descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_l = max_length(train_descriptions)
print('Description Length: %d' % max_l)


Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 8494
Description Length: 40


In [17]:
# define the model
model = define_model(vocab_size, max_length)
# train the model, run epochs manually and save after each epoch
epochs = 20
steps = len(train_descriptions)
for i in range(epochs):
	# create the data generator
	generator = data_generator(train_descriptions, train_features, tokenizer, max_l)
	# fit for one epoch
	model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
	# save model
	model.save('model_' + str(i) + '.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 40)           0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 40, 256)      2174464     input_12[0][0]                   
__________________________________________________________________________________________________
dropout_4 (Dropout)             (None, 1000)         0           input_11[0][0]                   
__________________________________________________________________________________________________
dropout_5 

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
