In [48]:
# Display the installed TensorFlow version.
# NOTE(review): `tf` is imported in the cell that follows this one, so on a
# fresh kernel this cell only works after the import cell has been run.
tf.__version__

In [46]:
import tensorflow as tf
import matplotlib.pyplot as plt
import collections
import random
import numpy as np
from tensorflow.keras.initializers import Constant
import pandas as pd
import os
from tensorflow import keras
import time
import re
from tqdm import tqdm
import json
from PIL import Image
from tensorflow.keras.layers.experimental.preprocessing import Resizing
# Directory holding the image files; image file names from the captions table
# are appended to this prefix later in the notebook.
image_path = '../input/flickr8k/Images/'
# Captions table; later cells read its 'image' and 'caption' columns.
df = pd.read_csv('../input/flickr8k/captions.txt')

In [49]:
def auto_select_accelerator():
    """Pick a tf.distribute strategy, using a TPU when one can be resolved.

    Returns:
        A TPU strategy when the TPU cluster resolver succeeds; otherwise the
        strategy returned by ``tf.distribute.get_strategy()``.
    """
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        selected = tf.distribute.experimental.TPUStrategy(resolver)
        print("Running on TPU:", resolver.master())
    except ValueError:
        # ValueError is the only failure treated as "no TPU"; fall back.
        print("Not running on TPUs")
        selected = tf.distribute.get_strategy()

    print(f"Running on {selected.num_replicas_in_sync} replicas")
    return selected


strategy = auto_select_accelerator()

In [50]:
def process_caption(text):
    """Normalise a raw caption to lower-case words separated by single spaces.

    Every character that is not an ASCII letter is treated as a separator.
    Surrounding and repeated whitespace is then collapsed, so a trailing " ."
    is removed without cutting letters off captions that lack it (the former
    ``text[:-2]`` dropped the last two characters unconditionally).

    Args:
        text: Caption string.

    Returns:
        The cleaned caption; an empty string if no letters remain.
    """
    text = text.lower()
    text = re.sub("[^A-Za-z]", " ", text)
    # split()/join trims both ends and squeezes runs of spaces in one step.
    return " ".join(text.split())
# Clean every caption in place; the function is passed directly instead of via a lambda.
df['caption'] = df['caption'].apply(process_caption)

In [51]:
# Image directory prefix (same value as assigned in the import cell).
image_path = '../input/flickr8k/Images/'
# Maps full image path -> list of "<start> ... <end>" caption strings.
image_path_to_caption = collections.defaultdict(list)

# NOTE(review): `captions` is not referenced again in the visible notebook, and
# `train_captions` / `img_name_vector` are re-initialised in a later cell.
captions = df['caption']
train_captions = []
img_name_vector = []

for index,row in tqdm(df.iterrows()):
    # Wrap each caption in explicit start/end marker tokens.
    caption = f"<start> {row['caption']} <end>"
    image_path_new  = image_path + row['image']
    image_path_to_caption[image_path_new].append(caption)
# Number of distinct image paths that received at least one caption.
print(len(image_path_to_caption.keys()))

In [52]:
## The [:] slice below keeps every image; narrow it to work on a subset,
## and restore it to the full dataset when training.
train_image_paths = list(image_path_to_caption.keys())
train_image_paths = train_image_paths[:]

# NOTE(review): no random seed is set in the visible notebook, so this order
# differs from run to run.
random.shuffle(train_image_paths)

In [53]:
# Flatten the per-image caption lists into two parallel lists:
# train_captions[k] is a caption and img_name_vector[k] the image it describes.
train_captions = [
    caption
    for image_file in train_image_paths
    for caption in image_path_to_caption[image_file]
]
img_name_vector = [
    image_file
    for image_file in train_image_paths
    for _ in image_path_to_caption[image_file]
]

In [54]:
def load_image(image_path):
    """Read a JPEG file and prepare it as InceptionV3 input.

    Args:
        image_path: Path of the JPEG file (string or string tensor).

    Returns:
        Tuple ``(image, image_path)``: the decoded 3-channel image resized to
        299x299 and passed through InceptionV3's ``preprocess_input``, and the
        path that was given.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.io.decode_jpeg(raw_bytes, channels=3)
    resized = Resizing(299, 299)(pixels)
    prepared = tf.keras.applications.inception_v3.preprocess_input(resized)
    return prepared, image_path

In [55]:
# InceptionV3 with ImageNet weights and without its classification head.
image_model = tf.keras.applications.InceptionV3(include_top=False,weights='imagenet')
new_input = image_model.input
# Output of the last remaining layer is used as the image feature map.
hidden_layer = image_model.layers[-1].output
# Model mapping a preprocessed image batch to that feature map.
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

In [56]:
# Unique image paths (each image appears once per caption in img_name_vector).
encode_train = sorted(set(img_name_vector))

# Dataset of (preprocessed image, path) pairs in batches of 32, loaded in parallel.
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
image_dataset = image_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(32)

In [59]:
# Run every unique image through the feature extractor once and cache the
# result on disk, one .npy file per image in the working directory.
for img, path in image_dataset:
    batch_features = image_features_extract_model(img)
    # Collapse the two middle axes of the 4-D feature map into one axis.
    batch_features = tf.reshape(batch_features,
                                (batch_features.shape[0], -1, batch_features.shape[3]))

    for bf, p in zip(batch_features, path):
        # Keep only the file name; basename replaces the former hard-coded
        # [25:] offset, which only worked for one exact directory string.
        path_of_feature = os.path.basename(p.numpy().decode("utf-8"))
        # np.save appends ".npy", giving "./<image file name>.npy".
        np.save(path_of_feature, bf.numpy())

In [60]:
caption_dataset = tf.data.Dataset.from_tensor_slices(train_captions)

def standardize(inputs):
  """Lower-case the text and strip punctuation while keeping '<' and '>'.

  This replaces TextVectorization's default standardisation so that the
  "<start>" and "<end>" marker tokens are preserved. The punctuation list is
  wrapped in a character class; without the brackets the pattern was read as
  one long literal sequence and removed essentially nothing.
  """
  inputs = tf.strings.lower(inputs)
  return tf.strings.regex_replace(inputs,
                                  r"[!\"#$%&\(\)\*\+.,\-/:;=?@\[\\\]^_`{|}~]", "")

# Every caption is padded or truncated to this many token ids.
max_length = 50
# Upper bound on the vocabulary learned by the tokenizer.
vocabulary_size = 5000
tokenizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=vocabulary_size,
    standardize=standardize,
    output_sequence_length=max_length)  # reuse max_length instead of a second literal 50
tokenizer.adapt(caption_dataset)

def vectorize_text(text):
    """Tokenise one caption string; a trailing axis is added before the call."""
    text = tf.expand_dims(text, -1)
    return tokenizer(text)

# Dataset of token-id vectors, in the same order as train_captions.
cap_vector = caption_dataset.map(vectorize_text)

In [61]:
# Load GloVe vectors into a dict: word -> float32 vector.
embedding_dict={}
# Explicit UTF-8 so the result does not depend on the platform's default
# encoding; the `with` block closes the file, so no manual close() is needed.
with open('../input/glove-embeddings/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [None]:
# Build the embedding matrix: row k holds the GloVe vector of vocabulary
# entry k, or stays all-zero when the word has no GloVe entry.
vocab_words = tokenizer.get_vocabulary()
vocab = len(vocab_words)
embedding_matrix = np.zeros((vocab, 200))

for token, word in tqdm(enumerate(vocab_words)):
    glove_vector = embedding_dict.get(word)
    if glove_vector is not None:
        embedding_matrix[token] = glove_vector

In [None]:
# Lookup layers built from the tokenizer's vocabulary.
# word_to_index maps a token string to its integer id.
word_to_index = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary())

# index_to_word is the inverse mapping (invert=True): integer id -> token string.
index_to_word = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary(),
    invert=True)

In [None]:
# Group the caption vectors by image so the split is made per image and all
# captions of one image land in the same split.
img_to_cap_vector = collections.defaultdict(list)
for img, cap in zip(img_name_vector, cap_vector):
    img_to_cap_vector[img].append(tf.squeeze(cap))

img_keys = list(img_to_cap_vector.keys())
random.shuffle(img_keys)

# First 95% of the shuffled images go to training, the rest to validation.
slice_index = int(len(img_keys)*0.95)
img_name_train_keys = img_keys[:slice_index]
img_name_val_keys = img_keys[slice_index:]


def _expand_split(keys):
    """Return parallel (image names, caption vectors) lists for the given images."""
    names, caps = [], []
    for key in keys:
        key_caps = img_to_cap_vector[key]
        names.extend([key] * len(key_caps))
        caps.extend(key_caps)
    return names, caps


img_name_train, cap_train = _expand_split(img_name_train_keys)
img_name_val, cap_val = _expand_split(img_name_val_keys)


In [None]:
# Sanity check: within each split the image-name and caption lists should have equal length.
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

In [None]:
# Training batch size.
BATCH_SIZE = 64
# Shuffle buffer size of the training tf.data pipeline.
BUFFER_SIZE = 1000
# Embedding width; it is paired with embedding_matrix, whose rows are 200 wide.
embedding_dim = 200
# Width passed to the decoder (GRU, attention and first dense layer).
units = 512
# Floor division: a final partial batch is not counted here.
num_steps = len(img_name_train) // BATCH_SIZE

# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
# NOTE(review): features_shape is not referenced elsewhere in the visible
# notebook; attention_features_shape sizes the attention plot in evaluate().
features_shape = 2048
attention_features_shape = 64

In [None]:
def map_func(img_name, cap):
  """Load the cached feature array belonging to one image.

  Args:
    img_name: Image path as UTF-8 bytes.
    cap: Caption vector; returned unchanged.

  Returns:
    Tuple ``(img_tensor, cap)`` where ``img_tensor`` is the array stored in
    ``./<image file name>.npy``.
  """
  # Use the file name, whatever the directory prefix is; this replaces the
  # former hard-coded [25:] offset tied to one exact directory string.
  img_name = os.path.basename(img_name.decode('utf-8'))
  img_tensor = np.load('./'+ img_name  +'.npy')
  return img_tensor, cap

In [None]:
def _load_cached_features(name, caption):
    """Wrap map_func so it can run inside the tf.data pipeline."""
    return tf.numpy_function(map_func, [name, caption], [tf.float32, tf.int64])


# Training pipeline: (image name, caption) -> (cached features, caption),
# then shuffle, batch and prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    .map(_load_cached_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
class BahdanauAttention(tf.keras.Model):
  """Additive attention over encoded image features, conditioned on a hidden state."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, features, hidden):
    """Return the attention-weighted feature summary and the weights.

    Args:
      features: Encoder output; attention is normalised over its axis 1.
      hidden: Decoder hidden state; an axis is inserted at position 1 so it
        broadcasts against ``features``.

    Returns:
      Tuple ``(context_vector, attention_weights)``: the features summed over
      axis 1 after weighting, and the softmax weights themselves.
    """
    query = tf.expand_dims(hidden, 1)

    # One unnormalised score per feature location.
    energies = tf.nn.tanh(self.W1(features) + self.W2(query))
    scores = self.V(energies)

    # Normalise across locations (axis 1), then pool the features.
    attention_weights = tf.nn.softmax(scores, axis=1)
    context_vector = tf.reduce_sum(attention_weights * features, axis=1)

    return context_vector, attention_weights


In [None]:
class CNN_Encoder(tf.keras.Model):
    """Projects pre-extracted image features to ``embedding_dim`` channels.

    The features are computed and cached earlier in the notebook, so this
    encoder is only a dense layer followed by ReLU.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        # Applied to the last axis, which becomes embedding_dim wide.
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Return ``relu(fc(x))``."""
        return tf.nn.relu(self.fc(x))

In [None]:
class RNN_Decoder(tf.keras.Model):
  """One-step caption decoder: attention + frozen embedding + GRU + two dense layers."""

  def __init__(self, embedding_dim, units, vocab_size):
    """Build the layers.

    Args:
      embedding_dim: Width of the token embedding.
      units: Width of the GRU, the attention layers and the first dense layer.
      vocab_size: Number of token ids; also the width of the output logits.
    """
    super(RNN_Decoder, self).__init__()
    self.units = units

    # The embedding is initialised from the notebook-level `embedding_matrix`
    # and frozen (trainable=False).
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, embeddings_initializer=Constant(embedding_matrix),
                                               trainable=False)
    self.gru = tf.keras.layers.GRU(self.units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.units)

  def call(self, x, features, hidden):
    """Run one decoding step.

    Args:
      x: Token ids fed to the embedding (callers in this notebook pass one
        token per example).
      features: Encoder output attended over.
      hidden: Hidden state passed to the attention layer.

    Returns:
      Tuple ``(logits, state, attention_weights)``; ``logits`` has its
      leading axes flattened into one before the final dense layer.
    """
    # defining attention as a separate model
    context_vector, attention_weights = self.attention(features, hidden)

    # Embed the input token ids.
    x = self.embedding(x)

    # Prepend the context vector along the last (channel) axis.
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    # NOTE(review): no initial_state is passed, so `hidden` influences this
    # step only through the attention call above — confirm this is intended.
    output, state = self.gru(x)

    # Dense projection of the GRU output sequence.
    x = self.fc1(output)

    # Merge the batch and time axes: (batch * steps, channels).
    x = tf.reshape(x, (-1, x.shape[2]))

    # Logits over the vocabulary: (batch * steps, vocab_size).
    x = self.fc2(x)

    return x, state, attention_weights

  def reset_state(self, batch_size):
    """Return an all-zero hidden state of shape (batch_size, units)."""
    return tf.zeros((batch_size, self.units))


In [None]:
# Create the models, optimizer and loss object inside the strategy's scope.
with strategy.scope():
    encoder = CNN_Encoder(embedding_dim)
    # The vocabulary size is taken from the adapted tokenizer.
    decoder = RNN_Decoder(embedding_dim, units, len(tokenizer.get_vocabulary()))
    optimizer = tf.keras.optimizers.Adam() 
    # reduction='none' keeps per-example losses so loss_function can mask them.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
  """Cross-entropy with positions whose target id is 0 zeroed out.

  Args:
    real: Target token ids.
    pred: Logits for those targets.

  Returns:
    Mean of the masked per-example losses; masked positions still count in
    the denominator.
  """
  per_example = loss_object(real, pred)

  # 1.0 where the target is a real token, 0.0 where it is id 0.
  keep = tf.math.logical_not(tf.math.equal(real, 0))
  keep = tf.cast(keep, dtype=per_example.dtype)

  return tf.reduce_mean(per_example * keep)


In [None]:
# Checkpoint covering both models and the optimizer; the manager keeps at most
# the five most recent checkpoints under checkpoint_path.
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)


In [None]:
# Resume from the latest checkpoint if one exists.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
  # The number after the last '-' in the checkpoint name becomes the start epoch.
  # NOTE(review): the ckpt_manager.save() call in the training cell is
  # commented out, so the visible notebook never writes a checkpoint itself.
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
  # restoring the latest checkpoint in checkpoint_path
  ckpt.restore(ckpt_manager.latest_checkpoint)


In [None]:
# Per-epoch loss values, appended by the training loop and plotted afterwards.
loss_plot = []
# True runs the training loop; False takes the model-loading branch instead.
Train = True

In [None]:
@tf.function
def train_step(img_tensor, target):
  """Run one teacher-forced optimisation step on a batch.

  Args:
    img_tensor: Batch of cached image features fed to the encoder.
    target: Batch of caption token ids; axis 1 is the time axis.

  Returns:
    Tuple ``(loss, total_loss)``: the loss summed over time steps, and that
    sum divided by the caption length ``target.shape[1]``.
  """
  loss = 0

  # initializing the hidden state for each batch
  # because the captions are not related from image to image
  hidden = decoder.reset_state(batch_size=target.shape[0])

  # The first decoder input is the '<start>' token id for every example.
  dec_input = tf.expand_dims([word_to_index('<start>')] * target.shape[0], 1)

  with tf.GradientTape() as tape:
      features = encoder(img_tensor)
      # Position 0 is skipped: it is only ever used as input, never as target.
      for i in range(1, target.shape[1]):
          # passing the features through the decoder
          predictions, hidden, _ = decoder(dec_input, features, hidden)

          loss += loss_function(target[:, i], predictions)

          # using teacher forcing
          dec_input = tf.expand_dims(target[:, i], 1)

  # Average over the caption length; used for reporting only.
  total_loss = (loss / int(target.shape[1]))

  trainable_variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken of the summed loss, not of the averaged total_loss.
  gradients = tape.gradient(loss, trainable_variables)

  optimizer.apply_gradients(zip(gradients, trainable_variables))

  return loss, total_loss


In [None]:
# Total number of epochs; training starts at start_epoch.
EPOCHS = 15
if Train:
    
    for epoch in tqdm(range(start_epoch, EPOCHS)):
        start = time.time()
        total_loss = 0

        for (batch, (img_tensor, target)) in enumerate(dataset):
            
            batch_loss, t_loss = train_step(img_tensor, target)
            total_loss += t_loss

            # Report the per-token loss of every 100th batch.
            if batch % 100 == 0:
                average_batch_loss = batch_loss.numpy()/int(target.shape[1])
                print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')
        # Store the epoch-end loss value to plot later.
        # NOTE(review): num_steps is a floor division, while the dataset is
        # batched without drop_remainder, so a final partial batch is summed
        # into total_loss but not counted in the divisor.
        loss_plot.append(total_loss / num_steps)

#         if epoch % 5 == 0:
#           ckpt_manager.save()

        print(f'Epoch {epoch+1} Loss {total_loss/num_steps:.6f}')
        print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')
else:
    # NOTE(review): `model_directory` is not defined anywhere in the visible
    # notebook, so this branch would raise NameError as written; both models
    # are also loaded from the same path. Confirm the intended locations.
    encoder = keras.models.load_model(model_directory)
    decoder = keras.models.load_model(model_directory)


In [None]:
# Training loss per epoch, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(loss_plot)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss Plot')
plt.show()

In [None]:
# Create the output directory if it is missing. exist_ok=True tolerates an
# existing directory, while genuine failures (e.g. permissions) still surface
# instead of being hidden by a bare `except: pass`.
os.makedirs('./images/', exist_ok=True)

In [None]:
def evaluate(image):
    """Generate a caption for one image file by sampling from the decoder.

    Args:
        image: Path of the image file.

    Returns:
        Tuple ``(result, attention_plot)``. ``result`` is a list of words that
        starts with '<start>' and ends with '<end>' if that token was sampled
        within ``max_length`` steps. ``attention_plot`` holds one row of
        attention weights per decoding step, so row k belongs to
        ``result[k + 1]``.
    """
    attention_plot = np.zeros((max_length, attention_features_shape))

    hidden = decoder.reset_state(batch_size=1)

    # Same preprocessing and feature extraction as used for the cached features.
    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0],-1, img_tensor_val.shape[3]))

    features = encoder(img_tensor_val)

    dec_input = tf.expand_dims([word_to_index('<start>')], 0)
    result = []
    # The start marker is kept in the output.
    result.append('<start>')

    for i in range(max_length):
        
        predictions, hidden, attention_weights = decoder(dec_input,features,hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        # The next token is sampled from the logits (not argmax), so repeated
        # calls on the same image can return different captions.
        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        predicted_word = tf.compat.as_text(index_to_word(int(predicted_id)).numpy())
        result.append(predicted_word)

        if predicted_word == '<end>':
            # Early exit returns attention_plot at full size; rows of steps
            # that never ran stay zero.
            return result, attention_plot

        # Feed the sampled token back in as the next input.
        dec_input = tf.expand_dims([predicted_id], 0)

    # Reached only when no '<end>' was sampled; len(result) is then
    # max_length + 1, so this slice keeps every row.
    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot

In [None]:
def plot_attention(image, result, attention_plot, display_attention=False):
    """Show an image and, optionally, the attention map of each generated word.

    Args:
        image: Path of the image file.
        result: Caption words as returned by ``evaluate``; a leading
            '<start>' marker is skipped because no attention row belongs to it.
        attention_plot: Array with one row of 64 attention weights per
            decoding step.
        display_attention: When True, additionally draw one subplot per word
            with its attention map overlaid on the image.
    """
    temp_image = np.array(Image.open(image))
    plt.imshow(temp_image)

    if display_attention:
        # evaluate() prepends '<start>', so attention row k belongs to
        # result[k + 1]. Dropping the marker realigns titles with maps and
        # avoids indexing past the last attention row.
        words = result[1:] if len(result) > 0 and result[0] == '<start>' else result
        # zip() stops at the shorter sequence, so unused rows are ignored.
        steps = list(zip(words, attention_plot))

        # The figure is created only when it will actually be filled.
        fig = plt.figure(figsize=(10,10))
        grid_size = max(int(np.ceil(len(steps)/2)), 2)

        for i, (word, weights) in enumerate(steps):
            # 64 weights -> 8x8 grid laid over the image.
            temp_att = np.resize(weights, (8, 8))
            ax = fig.add_subplot(grid_size, grid_size, i+1)
            ax.set_title(word)
            img = ax.imshow(temp_image)
            ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

        plt.tight_layout()
    plt.show()
    #plt.plot()


In [None]:
# Pick a random validation example and compare its reference caption with a
# freshly generated one.
rid = np.random.randint(0, len(img_name_val))
image = img_name_val[rid]
# Decode the reference token ids back to words, skipping id 0.
real_caption = ' '.join([tf.compat.as_text(index_to_word(i).numpy())
                         for i in cap_val[rid] if i not in [0]])
result, attention_plot = evaluate(image)

print('Real Caption:', real_caption)
print('Prediction Caption:', ' '.join(result))
plot_attention(image, result, attention_plot,True)

## Checking the BLEU score

In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Per-image scores for each individual n-gram order (1 to 4).
BLEU1 = []
BLEU2 = []
BLEU3 = []
BLEU4 = []

counter = 0 
def convert_to_caption(real_caption):
    """Decode a vector of token ids into a space-joined caption, skipping id 0."""
    return ' '.join([tf.compat.as_text(index_to_word(i).numpy())
                         for i in real_caption if i not in [0]])

def _caption_tokens(words):
    """Return the caption words without empty strings and <start>/<end> markers."""
    return [w for w in words if w and w not in ('<start>', '<end>')]

# The stride of 5 treats each run of five consecutive validation entries as
# the captions of one image — assumes five captions per image; TODO confirm.
for i in range(0,len(img_name_val),5):
    # sentence_bleu expects token lists. Passing plain strings, as before,
    # made it compare characters instead of words.
    references = [_caption_tokens(convert_to_caption(j).split())
                  for j in cap_val[i:i+5]]

    counter = counter+1
    # Keep this image's attention map instead of reusing a stale global.
    result, attention_plot = evaluate(img_name_val[i])
    # Markers are dropped by value, so a caption that never produced '<end>'
    # no longer loses real characters to fixed-width slicing.
    hypothesis = _caption_tokens(result)
    predicted_caption = ' '.join(hypothesis)

    bleu1 = sentence_bleu(references, hypothesis, weights=(1, 0, 0, 0))
    # Show a few well-scoring examples from the first five images.
    if bleu1 > 0.7 and counter<=5:
        print(predicted_caption)
        plot_attention(img_name_val[i], result, attention_plot)
        print("-----------------------\n")
        
    BLEU1.append(bleu1)
    BLEU2.append(sentence_bleu(references, hypothesis, weights=(0, 1, 0, 0)))
    BLEU3.append(sentence_bleu(references, hypothesis, weights=(0, 0, 1, 0)))
    BLEU4.append(sentence_bleu(references, hypothesis, weights=(0, 0, 0, 1)))

In [None]:
# Report the mean of each per-image score list.
for label, scores in (
    ("The Average unigram Blue score on the validation set is", BLEU1),
    ("The Average bigram Blue score on the validation set is", BLEU2),
    ("The Average trigram Blue score on the validation set is", BLEU3),
    ("The Average fourgram Blue score on the validation set is", BLEU4),
):
    print(label, np.mean(np.array(scores)))

In [None]:
# os.mkdir('results/')
# np.save('results/',np.array(scores))
# os.mkdir('models/')

# encoder.save_weights('models/encoder_big_new')
# decoder.save_weights('models/decoder_big_new')

In [None]:
# rid = np.random.randint(0, len(img_name_val)//5)*5+1
# for i in range(rid, rid+5,1):
#     caption_decoded = ' '.join([tf.compat.as_text(index_to_word(j).numpy())
#                          for j in cap_val[i] if j not in [0]])
#     print(caption_decoded)
    
#     print(df.iloc[i]['caption'])
#     print("-----------------")
    
# # image = img_name_val[rid]
# # temp_image = np.array(Image.open(image))
# # plt.imshow(temp_image)
# #for i in (rid, rid+5):
    
# #for i in range(0)

In [None]:
# rid = np.random.randint(0, len(img_name_val))
# image = img_name_val[rid]
# real_caption = ' '.join([tf.compat.as_text(index_to_word(i).numpy())
#                          for i in cap_val[rid] if i not in [0]])
# result, attention_plot = evaluate(image)
# prediction_caption =  ' '.join(result)
# print("the Blue score for the image is ", sentence_bleu(prediction_caption, real_caption))
# print('Real Caption:', real_caption)
# print('Prediction Caption:', ' '.join(result))
# plot_attention(image, result, attention_plot)

In [None]:
# from nltk.translate.bleu_score import sentence_bleu
# index_word = dict([(index,word) for word, index in tokenizer.word_index.items()])


# nkeep = 5
# pred_good, pred_bad, bleus = [], [], [] 
# count = 0 
# for jpgfnm, image_feature, tokenized_text in zip(fnm_test,di_test,dt_test):
#     count += 1
#     if count % 200 == 0:
#         print("  {:4.2f}% is done..".format(100*count/float(len(fnm_test))))
    
#     caption_true = [ index_word[i] for i in tokenized_text ]     
#     caption_true = caption_true[1:-1] ## remove startreg, and endreg
#     ## captions
#     caption = predict_caption(image_feature.reshape(1,len(image_feature)))
#     caption = caption.split()
#     caption = caption[1:-1]## remove startreg, and endreg
    
#     bleu = sentence_bleu([caption_true],caption)
#     bleus.append(bleu)
#     if bleu > 0.7 and len(pred_good) < nkeep:
#         pred_good.append((bleu,jpgfnm,caption_true,caption))
#     elif bleu < 0.3 and len(pred_bad) < nkeep:
#         pred_bad.append((bleu,jpgfnm,caption_true,caption))
