In [33]:
import os
import glob
import string

from tensorflow.keras.applications import MobileNet
import tensorflow.keras.applications.mobilenet

from tensorflow.keras.applications.inception_v3 import InceptionV3
import tensorflow.keras.applications.inception_v3

from tqdm import tqdm
import tensorflow.keras.preprocessing.image
import pickle
from time import time
import numpy as np

import sys
from PIL import Image
sys.modules['Image'] = Image 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers

from tensorflow.keras.models import Model, load_model

from tensorflow.keras.layers import add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import pandas as pd

import  nltk.translate.bleu_score as bleu

# Captions with more than this many cleaned words are discarded while parsing.
DESCRIPTION_LIMIT = 32
# Minimum number of occurrences a word needs in order to enter the vocabulary.
WORD_OCCURANCE_LIMIT = 10

# Sentinel tokens wrapped around every caption stored in `lookup`.
START_TOKEN = '<start>'
END_TOKEN = '<end>'

In [34]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    The duration is rounded to hundredths of a second *before* it is split
    into hours/minutes/seconds, so a value such as 59.999 carries into the
    minutes field ("0:01:00.00") instead of producing an out-of-range
    seconds field ("0:00:60.00").

    Args:
        sec_elapsed: Elapsed time in seconds (int or float), expected >= 0.

    Returns:
        The formatted string, e.g. "1:01:01.50" for 3661.5.
    """
    # Work in integer centiseconds so rounding happens exactly once.
    total_centis = int(round(sec_elapsed * 100))
    h, rem = divmod(total_centis, 60 * 60 * 100)
    m, centis = divmod(rem, 60 * 100)
    s = centis / 100
    return f"{h}:{m:>02}:{s:>05.2f}"

In [35]:
# Colab-only: mount Google Drive so the files under `root_captioning`
# (captions CSV, image folder, cached pickles, saved model) are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Base directory that every dataset/cache/model path below is joined onto.
root_captioning = "/content/drive/My Drive/Colab Notebooks/captions"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Iterate through captions and generate vocabulary

In [36]:
def create_vocabulary(lex, lookup, min_count=None):
  """Build the vocabulary and the word <-> index mappings.

  Index 0 is always '<padding>' and the last index is always '<unk>'. The
  words in between are the members of `lex` that occur at least `min_count`
  times across all captions, in sorted order. Sorting matters: `lex` is
  normally a set, whose iteration order for strings can change between
  interpreter sessions, which would silently re-number the vocabulary and
  make a previously saved model's indices meaningless.

  Args:
    lex: Iterable of candidate words.
    lookup: Dict mapping an image key to a list of caption strings; captions
      are split on whitespace to count word occurrences.
    min_count: Minimum occurrence count for a word to be kept. Defaults to
      the module-level WORD_OCCURANCE_LIMIT when None.

  Returns:
    (vocab, word_to_idx, idx_to_word): the list of words, a dict from word
    to its position in `vocab`, and the inverse dict.
  """
  if min_count is None:
    min_count = WORD_OCCURANCE_LIMIT

  # Count how often each token appears over every caption of every image.
  word_count = {}
  for captions in lookup.values():
    for caption in captions:
      for w in caption.split():
        word_count[w] = word_count.get(w, 0) + 1

  vocab = ['<padding>']
  # sorted() gives a reproducible index assignment across kernel restarts.
  vocab.extend(word for word in sorted(lex) if word_count.get(word, 0) >= min_count)
  vocab.append('<unk>')

  word_to_idx = {w: i for i, w in enumerate(vocab)}
  idx_to_word = dict(enumerate(vocab))

  return vocab, word_to_idx, idx_to_word

# Translation table that deletes every character in string.punctuation.
null_punct = str.maketrans('', '', string.punctuation)
# Maps image file name -> list of '<start> ... <end>' caption strings.
lookup = dict()

captions_csv = os.path.join(root_captioning, 'Flickr30k_Text', 'results.csv')
# Explicit encoding so parsing does not depend on the runtime's locale.
with open(captions_csv, 'r', encoding='utf-8') as fp:
  fp.readline()  # skip the header row

  # Longest kept caption, in cleaned words (sentinel tokens added below).
  max_length = 0

  for line in tqdm(fp.read().splitlines()):
    # maxsplit=2 keeps the caption whole even if it contains a '|' itself.
    sections = line.split('|', 2)
    if len(sections) >= 3:
      # `image_id` rather than `id`, which would shadow the builtin for
      # every later cell of the notebook.
      image_id = sections[0].split(',')[0]

      # Lower-case, strip punctuation, then keep purely alphabetic words of
      # at least two characters.
      cleaned = (w.lower().strip().translate(null_punct) for w in sections[2].split())
      desc = [w for w in cleaned if len(w) > 1 and w.isalpha()]

      # Over-long captions are dropped entirely rather than truncated.
      if len(desc) <= DESCRIPTION_LIMIT:
        max_length = max(max_length, len(desc))
        lookup.setdefault(image_id, []).append(f'{START_TOKEN} {" ".join(desc)} {END_TOKEN}')

# Every distinct token that appears in a kept caption (sentinels included).
lex = set()
for captions in lookup.values():
  for caption in captions:
    lex.update(caption.split())


# Account for the start and end tokens wrapped around each caption.
max_length = max_length + 2
vocab, word_to_idx, idx_to_word = create_vocabulary(lex, lookup)
vocab_size = len(vocab)

100%|██████████| 158915/158915 [00:02<00:00, 64467.56it/s]


In [6]:
print(len(lookup)) # Number of images that have at least one kept caption
print(len(lex)) # Number of distinct tokens across all kept captions (before frequency filtering)
print(max_length) # Longest kept caption in tokens, including the start/end tokens

31783
19629
34


### Image processing


In [37]:

# Size every image is resized to before it is passed to the encoder.
WIDTH = 299
HEIGHT = 299
# Length of the flat feature vector encode_image() reshapes the encoder output to.
OUTPUT_DIM = 2048

# Full ImageNet-pretrained InceptionV3 classifier.
model = InceptionV3(weights='imagenet')
# Feature extractor: same input, but the output is taken from the
# second-to-last layer instead of the final classification layer.
encode_model = Model(model.input, model.layers[-2].output)

model.summary()

# Image file names to encode: every image that has at least one caption.
images = lookup.keys()
# Both paths below are relative to `root_captioning`.
images_folder = "Flickr30k_Dataset"
pickle_file = "pickles/embeddings.pickle"

def encode_image(img):
  """Encode a PIL image into a flat feature vector using `encode_model`.

  Args:
    img: A PIL image.

  Returns:
    A NumPy array reshaped to OUTPUT_DIM holding the encoder output.
  """
  # Image.LANCZOS is the same filter that used to be exposed as
  # Image.ANTIALIAS; the latter name was removed in Pillow 10.
  img = img.resize((WIDTH, HEIGHT), Image.LANCZOS)

  # Convert to a numpy array
  x = tensorflow.keras.preprocessing.image.img_to_array(img)
  # Add a leading batch axis: predict() expects a batch of images
  x = np.expand_dims(x, axis=0)
  # Perform the preprocessing needed by InceptionV3. Use the return value
  # instead of relying on the input being overwritten in place.
  x = tensorflow.keras.applications.inception_v3.preprocess_input(x)
  # Call the truncated InceptionV3 to get the encoding vector for the image.
  x = encode_model.predict(x)
  # Flatten to the form expected by the captioning network's image input.
  x = np.reshape(x, OUTPUT_DIM)

  return x

# Encode every captioned image once and cache the result on disk; later runs
# just reload the cache.
pickle_file_path = os.path.join(root_captioning, pickle_file)
if not os.path.exists(pickle_file_path):
  # Create the cache directory *before* the long encoding run, so the work is
  # not lost to a failing write at the very end.
  os.makedirs(os.path.dirname(pickle_file_path), exist_ok=True)

  image_encoddings = {}
  # `image_name` rather than `id`, which would shadow the builtin.
  for image_name in images:
    image_path = os.path.join(root_captioning, images_folder, image_name)
    img = tensorflow.keras.preprocessing.image.load_img(image_path, target_size=(HEIGHT, WIDTH))
    image_encoddings[image_name] = encode_image(img)

  with open(pickle_file_path, "wb") as fp:
    pickle.dump(image_encoddings, fp)

else:
  print('exists')
  # NOTE(security): pickle.load can execute arbitrary code. Only load cache
  # files this notebook wrote itself.
  with open(pickle_file_path, "rb") as fp:
    image_encoddings = pickle.load(fp)




Model: "inception_v3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
conv2d_94 (Conv2D)              (None, 149, 149, 32) 864         input_4[0][0]                    
__________________________________________________________________________________________________
batch_normalization_94 (BatchNo (None, 149, 149, 32) 96          conv2d_94[0][0]                  
__________________________________________________________________________________________________
activation_94 (Activation)      (None, 149, 149, 32) 0           batch_normalization_94[0][0]     
_______________________________________________________________________________________

### Split dataset

In [38]:
# A fixed seed keeps the train/test partition identical across sessions.
# Without it, a model reloaded from disk would be evaluated on a freshly drawn
# "test" set that may contain images it was trained on.
keys_train, keys_test = train_test_split(
    list(image_encoddings.keys()), test_size=0.033, random_state=42)

# Sets make each membership test O(1); testing against the lists is quadratic
# in the number of images.
train_key_set = set(keys_train)
test_key_set = set(keys_test)

train_img_encoddings = {k: v for k, v in image_encoddings.items() if k in train_key_set}
test_img_encoddings = {k: v for k, v in image_encoddings.items() if k in test_key_set}
train_img_descriptions = {k: v for k, v in lookup.items() if k in train_key_set}
test_img_descriptions = {k: v for k, v in lookup.items() if k in test_key_set}
print(len(train_img_encoddings))
print(len(test_img_encoddings))
print(len(train_img_descriptions))
print(len(test_img_descriptions))

30734
1049
30734
1049


1000092795.jpg ['<start> two young guys with shaggy hair look at their hands while hanging out in the yard <end>', '<start> two young white males are outside near many bushes <end>', '<start> two men in green shirts are standing in yard <end>', '<start> man in blue shirt standing in garden <end>', '<start> two friends enjoy time spent together <end>']
10002456.jpg ['<start> several men in hard hats are operating giant pulley system <end>', '<start> workers look down from up above on piece of equipment <end>', '<start> two men working on machine wearing hard hats <end>', '<start> four men on top of tall structure <end>', '<start> three men on large rig <end>']
1000268201.jpg ['<start> child in pink dress is climbing up set of stairs in an entry way <end>', '<start> little girl in pink dress going into wooden cabin <end>', '<start> little girl climbing the stairs to her playhouse <end>', '<start> little girl climbing into wooden playhouse <end>', '<start> girl going into wooden building 

### Load GloVe embeddings

In [39]:
# Width of each word vector; must match the GloVe file loaded below (glove.6B.200d).
EMBEDDING_DIM = 200

def load_glove_embeddings(glove_file):
  """Read a GloVe text file into a dict of word -> vector.

  Each non-blank line is split on whitespace; the first field is the word and
  the remaining fields are parsed as float32 coefficients.

  Args:
    glove_file: Path to the GloVe text file.

  Returns:
    Dict mapping each word to a 1-D float32 NumPy array.
  """
  glove_embeddings = {}
  # Explicit encoding: the vocabulary contains non-ASCII tokens, so relying
  # on the platform default can fail or mis-decode.
  with open(glove_file, "r", encoding="utf-8") as f:
    for line in tqdm(f):
      tokens = line.split()
      if not tokens:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      glove_embeddings[tokens[0]] = np.asarray(tokens[1:], dtype='float32')

  return glove_embeddings


# Load the pretrained vectors, then build one row per vocabulary index.
glove_path = os.path.join(root_captioning, 'glove.6B/glove.6B.200d.txt')
word_embeddings = load_glove_embeddings(glove_path)

# Rows start as zeros; words without a pretrained vector keep the zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_to_idx.items():
  embedding_vector = word_embeddings.get(word)
  if embedding_vector is None:
    continue
  embedding_matrix[i] = embedding_vector



400001it [00:24, 16426.69it/s]


### Model creation

In [40]:
IMAGE_OUTPUT_DIM = 2048

# Image branch: precomputed image feature vector -> 256-d representation.
inputs1 = Input(shape=(IMAGE_OUTPUT_DIM,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: padded partial caption (word indices) -> 256-d LSTM state.
inputs2 = Input(shape=(max_length,))
# Mask zero = True is introduced because of the padding
# Keep a direct reference to the layer so its weights can be set below
# without depending on its position in caption_model.layers.
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, mask_zero=True)
se1 = embedding_layer(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Merge both branches and predict a distribution over the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
caption_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

# Load the pretrained word vectors and freeze them (must happen before compile).
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
# Default learning rate of 0.001
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')
caption_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 34)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 200)      1080800     input_6[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 2048)         0           input_5[0][0]                    
____________________________________________________________________________________________

In [41]:
def data_generator(descriptions, image_encoddings, word_to_idx, max_length, vocab_size, num_photos_per_batch):
  """Endlessly yield training batches for the captioning model.

  Every caption is expanded into (prefix -> next word) samples: for a caption
  with tokens t0..tk, each i in 1..k yields the prefix t0..t(i-1), left-padded
  to `max_length`, as input and the one-hot encoding of ti as target. Samples
  are accumulated until `num_photos_per_batch` images have been processed.
  Images left over at the end of a pass are carried into the next pass.

  Args:
    descriptions: Dict mapping image key -> list of caption strings.
    image_encoddings: Dict mapping image key -> feature vector.
    word_to_idx: Dict mapping word -> index; words not present are mapped to
      the index of '<unk>'.
    max_length: Length every input sequence is padded to.
    vocab_size: Number of classes for the one-hot targets.
    num_photos_per_batch: Number of images whose samples form one batch.

  Yields:
    ([image_features, padded_prefixes], one_hot_next_words) as NumPy arrays.

  Raises:
    ValueError: If `descriptions` is empty or `num_photos_per_batch` < 1.
      Without these checks the loop would spin (or accumulate samples)
      forever without ever yielding.
  """
  if num_photos_per_batch < 1:
    raise ValueError("num_photos_per_batch must be at least 1")
  if not descriptions:
    raise ValueError("descriptions is empty; nothing to generate batches from")

  x1, x2, y = [], [], []
  n = 0
  while True:
    for key, desc_list in descriptions.items():
      n += 1
      photo = image_encoddings[key]

      # each photo has several descriptions
      for desc in desc_list:
        seq = [
            word_to_idx[word] if word in word_to_idx else word_to_idx['<unk>']
            for word in desc.split()
        ]

        # One sample per position: predict token i from tokens [0, i).
        for i in range(1, len(seq)):
          in_seq, out_seq = seq[:i], seq[i]
          in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
          out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
          x1.append(photo)
          x2.append(in_seq)
          y.append(out_seq)

      if n == num_photos_per_batch:
        yield ([np.array(x1), np.array(x2)], np.array(y))
        x1, x2, y = [], [], []
        n = 0

### Actual training

In [43]:
EPOCHS = 1


model_path = os.path.join(root_captioning, 'model/caption-model.hdf5')
if not os.path.exists(model_path):
  # Create the output directory up front so hours of training are not lost to
  # a failing save at the end.
  os.makedirs(os.path.dirname(model_path), exist_ok=True)

  # Phase 1: 2*EPOCHS passes at the compile-time learning rate, small batches.
  number_pics_per_bath = 3
  steps = len(train_img_descriptions) //  number_pics_per_bath
  for i in tqdm(range(EPOCHS*2)):
    generator = data_generator(train_img_descriptions, train_img_encoddings, word_to_idx, max_length, vocab_size, number_pics_per_bath)
    caption_model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

  # Phase 2: EPOCHS passes with a lower learning rate and larger batches.
  # `learning_rate` is the attribute name Keras optimizers define; assigning to
  # the `lr` alias is not supported by every Keras version.
  caption_model.optimizer.learning_rate = 1e-4
  number_pics_per_bath = 6
  steps = len(train_img_descriptions)//number_pics_per_bath
  for i in range(EPOCHS):
    generator = data_generator(train_img_descriptions, train_img_encoddings, word_to_idx, max_length, vocab_size, number_pics_per_bath)
    caption_model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

  caption_model.save(model_path)
else:
  # Rebind caption_model to the loaded model; discarding the return value
  # would leave the freshly initialised, untrained model in use below.
  # The loaded weights are only meaningful if word_to_idx is the same mapping
  # that was used when the model was trained.
  caption_model = load_model(model_path)

  0%|          | 0/2 [00:00<?, ?it/s]



 50%|█████     | 1/2 [1:17:01<1:17:01, 4621.75s/it]



100%|██████████| 2/2 [2:33:59<00:00, 4619.54s/it]




In [45]:

# Same strings as START_TOKEN / END_TOKEN defined in the first cell.
START = "<start>"
STOP = "<end>"

def generateCaption(photo):
    """Greedily decode a caption for one encoded image.

    Starting from START, the model is repeatedly asked for the most probable
    next word given the image and the words generated so far. Decoding stops
    when STOP is predicted or after `max_length` steps.

    Args:
        photo: Image feature array passed as the first model input.

    Returns:
        The generated words joined by single spaces, without START/STOP.
        If `max_length` steps pass without STOP being predicted, every
        generated word is kept (slicing off the last token unconditionally
        would drop a real word in that case).
    """
    words = []
    in_text = START
    for _ in range(max_length):
        sequence = [word_to_idx[w] for w in in_text.split() if w in word_to_idx]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = caption_model.predict([photo, sequence], verbose=0)
        # Greedy choice: index of the highest-probability word.
        word = idx_to_word[int(np.argmax(yhat))]
        if word == STOP:
            break
        words.append(word)
        in_text += ' ' + word
    return ' '.join(words)

In [79]:
def evaluateGeneratedCaption(imageIndex, candidate):
  """Score a generated caption against the test-set references of one image.

  Args:
    imageIndex: Key into `test_img_descriptions`.
    candidate: Generated caption as a whitespace-separated string.

  Returns:
    The value of nltk's sentence_bleu (default settings), computed with the
    reference captions stripped of their '<start>' / '<end>' tokens.
  """
  cleaned_references = []
  for description in test_img_descriptions[imageIndex]:
    tokens = [tok for tok in description.split() if tok not in ('<start>', '<end>')]
    cleaned_references.append(tokens)
  return bleu.sentence_bleu(cleaned_references, candidate.split())

In [80]:
# Show up to the first 20 test images with their generated caption and BLEU.
# Slicing (rather than indexing with range(20)) tolerates a smaller test set,
# and the key list is built once instead of on every iteration.
sample_pics = list(test_img_encoddings.keys())[:20]
for pic in sample_pics:
  image = test_img_encoddings[pic].reshape((1,OUTPUT_DIM))
  image_path = os.path.join(root_captioning,'Flickr30k_Dataset', pic)
  print(image_path)
  x=plt.imread(image_path)
  plt.imshow(x)
  plt.show()
  # Decode once and reuse the result: generation runs the model step by step,
  # so calling it a second time just for printing doubles the cost.
  caption = generateCaption(image)
  score = evaluateGeneratedCaption(pic, caption)
  print("Caption:",caption)
  print("BLEU Score:", score)
  print("_____________________________________")

Output hidden; open in https://colab.research.google.com to view.