In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
from copy import deepcopy
import random
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
import time

In [6]:
# Turn on memory growth for every GPU that TensorFlow reports.
devices = tf.config.experimental.list_physical_devices("GPU")
for gpu in devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

# Preprocessing Data

In [8]:
# Locations of the caption table and the image folder, relative to this notebook.
captions_path, images_path = (
    "../images/instagram_data/captions_csv.csv",
    "../images/instagram_data/img",
)


def convertFileToPath(filename):
    """Return the relative .jpg path for an entry of the "Image File" column.

    The entry is placed between the dataset directory prefix and the
    ".jpg" extension; a non-string argument raises TypeError.
    """
    return "".join(("../images/instagram_data/", filename, ".jpg"))
# Load the caption table; rows with any missing value are dropped.
captions_csv = pd.read_csv(captions_path).dropna()

# Derive the image paths with Series.map instead of assigning into the array
# returned by `.values`: that array can be a view of the DataFrame (so the
# frame was silently rewritten) and is read-only under pandas Copy-on-Write.
keys_vector = captions_csv["Image File"].map(convertFileToPath).to_numpy()
values_vector = captions_csv["Caption"].to_numpy()

# image path -> caption (a later duplicate path overwrites an earlier one).
caption_dict = dict(zip(keys_vector, values_vector))

# Shuffled copy of the paths; `keys_vector` itself keeps the CSV order.
training_paths = keys_vector.copy()
random.shuffle(training_paths)

# Captions listed in the same (shuffled) order as `training_paths`.
training_captions = [caption_dict[path] for path in training_paths]


FileNotFoundError: [Errno 2] No such file or directory: '../images/instagram_data/captions_csv.csv'

In [None]:
def preprocessImage(image_path):
    """Load one JPEG and prepare it for InceptionV3.

    The file is read, decoded to 3 channels, resized to 299x299 and passed
    through the InceptionV3 `preprocess_input` function.

    Returns:
        (image, image_path): the processed image together with the path it
        came from, so callers can tell which file produced which tensor.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (299, 299))
    return tf.keras.applications.inception_v3.preprocess_input(resized), image_path


In [None]:
# ImageNet-pretrained InceptionV3 without its classification head.
inception_model = tf.keras.applications.inception_v3.InceptionV3(
    include_top=False,
    weights="imagenet",
)
# Wrap it as a model mapping the network input to the output of its last layer.
new_input, hidden_layer = inception_model.input, inception_model.layers[-1].output
feature_extraction_network = tf.keras.Model(new_input, hidden_layer)

In [None]:
# Unique image paths in sorted order, preprocessed and grouped into batches of 32.
encoded_data = sorted(set(keys_vector))
image_dataset = (
    tf.data.Dataset.from_tensor_slices(encoded_data)
    .map(preprocessImage)
    .batch(32)
)

In [None]:
# Run every batch through the feature extractor and cache the result per image.
# (Took about 25 minutes on the author's laptop.)
for image_batch, path_batch in tqdm(image_dataset):
    batch_features = feature_extraction_network(image_batch)
    # Collapse the two middle axes into one, keeping the batch and last axes.
    batch_features = tf.reshape(
        batch_features,
        (batch_features.shape[0], -1, batch_features.shape[3]),
    )
    for feat, p in zip(batch_features, path_batch):
        # np.save appends ".npy", so the cache file sits next to the image.
        np.save(p.numpy().decode("utf-8"), feat.numpy())

In [None]:
# Prefix each caption with the "<start>" token. The startswith guard keeps the
# cell idempotent: re-running it no longer stacks a second "<start> " prefix.
training_captions = [
    caption if caption.startswith("<start> ") else "<start> " + caption
    for caption in training_captions
]

# Keep the 5000 most frequent words and map the rest to "<unk>". "<" and ">"
# are left out of the filter so "<start>" survives tokenization. The backslash
# is escaped explicitly: the bare "\]" was an invalid escape sequence, which
# newer Python versions warn about. The filter characters are unchanged.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=5000,
    oov_token="<unk>",
    filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~',
)
tokenizer.fit_on_texts(training_captions)

In [None]:
# Register '<pad>' under index 0 in both lookup tables of the tokenizer.
tokenizer.word_index.update({'<pad>': 0})
tokenizer.index_word.update({0: '<pad>'})

In [None]:
# Convert captions to lists of token ids, then pad them at the end ("post")
# so all rows of the resulting array have the same length.
training_sequences = tokenizer.texts_to_sequences(training_captions)
training_sequences_vector = tf.keras.preprocessing.sequence.pad_sequences(
    training_sequences,
    padding="post",
)

In [None]:
# Length (in tokens) of the longest caption before padding.
max_length = max(map(len, training_sequences))

In [None]:
# image path -> padded token sequence.
# The sequences were built from `training_captions`, which follows the
# shuffled order of `training_paths`. Pairing them with the unshuffled
# `keys_vector` (as before) attached captions to the wrong images, so the
# keys must come from `training_paths`.
sequences_dict = dict(zip(training_paths, training_sequences_vector))

In [None]:
# Shuffle the image paths and split them 80% / 20% into training and validation.
image_keys = list(sequences_dict)
random.shuffle(image_keys)

index = int(len(image_keys) * .8)
training_keys = image_keys[:index]
validation_keys = image_keys[index:]

In [None]:
# Token sequences listed in the same order as their keys in each split.
training_caps = [sequences_dict[path] for path in training_keys]
validation_caps = [sequences_dict[path] for path in validation_keys]

In [None]:
# Training configuration.
batch_size = 16        # examples per training batch
buffer_size = 500      # shuffle buffer size of the training dataset
embedding_dim = 256    # width of the encoder output and of the word embeddings
units = 512            # GRU / attention width in the decoder
vocab_size = 5001      # size of the embedding table and of the output layer
# Number of batches per epoch, rounded up because the dataset keeps the final
# partial batch. This used to be the number of samples, which made the epoch
# loss (total_loss / num_steps) too small by roughly a factor of batch_size.
num_steps = -(-len(training_keys) // batch_size)
features_shape = 2048
# NOTE(review): 299x299 inputs to InceptionV3 without its top presumably give
# an 8x8 grid, i.e. 64 attention positions after the reshape -- confirm
# whether 16 is intended before this value is used.
attention_features_shape = 16

In [None]:
def mapCacheToMemory(image, caption):
    """Load the cached feature array that belongs to an image path.

    Args:
        image: UTF-8 encoded bytes of the image path; ".npy" is appended to
            locate the cache file.
        caption: passed through unchanged.

    Returns:
        (array loaded from "<image>.npy", caption)
    """
    cache_file = "{}.npy".format(image.decode("utf-8"))
    return np.load(cache_file), caption

In [None]:
# Training pipeline: (path, caption) pairs -> cached features loaded through
# numpy -> shuffled -> batched.
dataset = (
    tf.data.Dataset.from_tensor_slices((training_keys, training_caps))
    .map(
        lambda path, caption: tf.numpy_function(
            mapCacheToMemory, [path, caption], [tf.float32, tf.int32]
        )
    )
    .shuffle(buffer_size)
    .batch(batch_size)
)

# Training the Model

In [None]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention: scores each feature vector against a hidden state."""

    def __init__(self, units):
        super().__init__()
        # W1 projects the features, W2 the hidden state, V turns the combined
        # projection into a single score per position.
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return (context_vector, attention_weights).

        assumes features is (batch, positions, dim) and hidden is
        (batch, units) -- TODO confirm against the callers.
        """
        # Insert an axis at position 1 so the hidden state broadcasts
        # against the projected features.
        hidden_with_axis = tf.expand_dims(hidden, 1)
        projected = self.W1(features) + self.W2(hidden_with_axis)
        score = self.V(tf.nn.tanh(projected))
        # Normalise the scores along axis 1, then take the weighted sum of
        # the features along that same axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [None]:
class CNN_Encoder(tf.keras.Model):
    """Projects its input to `embedding_dim` with one dense layer and a ReLU."""

    def __init__(self, embedding_dim):
        super().__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Apply the dense projection followed by ReLU."""
        return tf.nn.relu(self.fc(x))

In [None]:
class RNN_Decoder(tf.keras.Model):
    """Attention-based GRU decoder producing vocabulary logits for one step."""

    def __init__(self, embedding_dim, units, vocab_size):
        super().__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        """Run one decoding step.

        Args:
            x: token ids fed to the embedding layer.
            features: encoder output the attention attends over.
            hidden: state used as the attention query.

        Returns:
            (logits, state, attention_weights), where `state` is the state
            returned by the GRU.
        """
        # NOTE(review): `hidden` only drives the attention; the GRU below is
        # called without an initial state -- confirm this is intended.
        context_vector, attention_weights = self.attention(features, hidden)
        embedded = self.embedding(x)
        # Prepend the context vector to the embedded token along the last axis.
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
        output, state = self.gru(gru_input)
        projected = self.fc1(output)
        # Merge the leading axes so fc2 sees one row per position.
        flattened = tf.reshape(projected, (-1, projected.shape[2]))
        return self.fc2(flattened), state, attention_weights

    def resetState(self, batch_size):
        """Return an all-zero state of shape (batch_size, units)."""
        return tf.zeros((batch_size, self.units))

In [None]:
# Instantiate the two trainable parts of the captioning model.
encoder = CNN_Encoder(embedding_dim=embedding_dim)
decoder = RNN_Decoder(embedding_dim=embedding_dim, units=units, vocab_size=vocab_size)

In [None]:
optimizer = tf.keras.optimizers.Adam()
# Per-element cross-entropy on raw logits; the reduction is done in
# lossFunction after masking.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction="none",
)


def lossFunction(expected, predicted):
    """Cross-entropy averaged over all positions, with target id 0 zeroed out.

    Positions whose expected id is 0 contribute a loss of 0 but still count
    in the denominator of the mean.
    """
    per_position = loss_object(expected, predicted)
    keep = tf.cast(tf.math.not_equal(expected, 0), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep)


In [None]:
# Track encoder, decoder and optimizer in one checkpoint; keep the 5 newest on disk.
checkpoint_path = "./checkpoints/train"
checkpoint = tf.train.Checkpoint(
    encoder=encoder,
    decoder=decoder,
    optimizer=optimizer,
)
checkpoint_manager = tf.train.CheckpointManager(
    checkpoint,
    directory=checkpoint_path,
    max_to_keep=5,
)

In [None]:
# Start at epoch 1 unless a checkpoint exists; in that case restore it and take
# the number after the last '-' in its path as the starting epoch.
start_epoch = 1
latest_path = checkpoint_manager.latest_checkpoint
if latest_path:
    start_epoch = int(latest_path.rsplit('-', 1)[-1])
    checkpoint.restore(latest_path)

In [None]:
# Loss history filled in by the training loop.
loss_plot = []


@tf.function
def trainStep(image_tensor, target):
    """Run one optimisation step on a batch.

    Each decoding step is fed the previous target token, starting from
    "<start>", and the decoder is trained to predict the token at that step.

    Returns:
        (loss, total_loss): the loss summed over the decoding steps, and that
        sum divided by the sequence length of `target`.
    """
    loss = 0
    num_examples, seq_len = target.shape[0], target.shape[1]
    hidden = decoder.resetState(batch_size=num_examples)
    start_id = tokenizer.word_index["<start>"]
    decoder_input = tf.expand_dims([start_id] * num_examples, 1)

    with tf.GradientTape() as tape:
        features = encoder(image_tensor)
        for step in range(1, seq_len):
            expected = target[:, step]
            prediction, hidden, _ = decoder(decoder_input, features, hidden)
            loss += lossFunction(expected, prediction)
            # The ground-truth token becomes the next decoder input.
            decoder_input = tf.expand_dims(expected, 1)

    total_loss = loss / int(seq_len)
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return loss, total_loss


In [None]:
epochs = 20
# Epochs are 1-based (start_epoch defaults to 1), so the upper bound has to be
# inclusive; range(start_epoch, epochs) never ran the final epoch.
for epoch in range(start_epoch, epochs + 1):
    start = time.time()
    total_loss = 0
    num_batches = 0
    for batch, (image_tensor, target) in enumerate(dataset):
        batch_loss, t_loss = trainStep(image_tensor, target)
        total_loss += t_loss
        num_batches += 1

        if batch % 100 == 0:
            average_batch_loss = batch_loss.numpy() / int(target.shape[1])
            print(f"Epoch {epoch} Batch {batch} Loss {average_batch_loss:.4f}")

    # Average over the batches actually processed (guarded for an empty
    # dataset), and record one value per epoch. The append used to sit inside
    # the batch loop and stored a running partial total for every batch.
    epoch_loss = total_loss / max(num_batches, 1)
    loss_plot.append(epoch_loss)

    if epoch % 5 == 0:
        # The restore cell uses the checkpoint suffix as start_epoch, so label
        # the checkpoint with the next epoch to run. The default label is the
        # manager's save counter, which made a resumed run restart early.
        checkpoint_manager.save(checkpoint_number=epoch + 1)
    print(f"Epoch {epoch} Loss {epoch_loss:.6f}")
    print(f"Time taken for 1 epoch {time.time() - start:.2f} sec\n")