# Steps
1. Data collection
2. Understanding the data
3. Data cleaning
4. Loading the training set
5. Data preprocessing - images
6. Data preprocessing - Captions
7. Data preparation using Generator Function
8. Word Embeddings
9. Model Architecture
10. Inference
11. Evaluation

In [None]:
!pip install tensorflow

In [None]:
!pip install keras

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import re
import nltk
import string
import json
from time import time
import pickle
from keras.applications.vgg16 import VGG16
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from keras.preprocessing import image
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input,Dense,Dropout,Embedding, LSTM
from keras.layers.merge import add


In [None]:
#read text caption

def readTextFile(path):
    """Return the whole content of the text file at ``path`` as one string."""
    with open(path) as handle:
        return handle.read()
        

In [None]:
captions = readTextFile("flickr_dataset/Flickr8k_text/Flickr8k.token.txt")
# Keep every non-blank line. Slicing off the last element ([:-1]) would throw
# away a real caption whenever the file does not end with a newline.
captions = [line for line in captions.split("\n") if line.strip()]

In [None]:
len(captions)

In [None]:
captions[-1]

In [None]:
captions[0]

In [None]:
first, second = captions[0].split("\t")
print(first.split(".")[0])
print(second)

In [None]:
print(captions[2])

In [None]:
# Map image id -> list of raw caption strings for that image.
descriptions = {}

# Iterate over the lines that were actually read instead of a hard-coded
# count, so a shorter or longer caption file still works.
for line in captions:
    # Each line is "<image reference>\t<caption text>".
    first, tab, second = line.partition("\t")
    if not tab:
        # No tab means no caption on this line (blank or malformed): skip it.
        continue

    # The image id is everything before the first "." of the reference.
    img_name = first.split(".")[0]
    descriptions.setdefault(img_name, []).append(second)

In [None]:
descriptions["1000268201_693b08cb0e"]

In [None]:
!pip3 install opencv-python

In [None]:
IMG_PATH = "flickr_dataset/Flicker8k_Dataset/"

import cv2
from matplotlib import pyplot as plt

sample_path = IMG_PATH+"1000268201_693b08cb0e.jpg"
img = cv2.imread(sample_path)
# cv2.imread signals an unreadable/missing file by returning None, not raising.
if img is None:
    raise FileNotFoundError("Could not read image: " + sample_path)

# OpenCV loads images as BGR while matplotlib expects RGB. cvtColor returns a
# new array, so the result has to be assigned back to take effect.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.show()

# data cleaning

In [None]:
def clean_text(sentence):
    """Normalise a caption.

    Lower-cases the text, turns every character outside a-z into a space,
    and drops tokens that are a single character long. The surviving tokens
    are returned joined by single spaces.
    """
    letters_only = re.sub("[^a-z]", " ", sentence.lower())
    return " ".join(token for token in letters_only.split() if len(token) > 1)
    

In [None]:
clean_text("A cat is sitting over the house number 64")

In [None]:
# Clean every caption in place, so `descriptions` keeps the same list objects.
for caption_list in descriptions.values():
    caption_list[:] = [clean_text(raw_caption) for raw_caption in caption_list]

In [None]:
descriptions["1000268201_693b08cb0e"]

In [None]:
# Write the cleaned descriptions to disk as real JSON. str(dict) produces a
# Python repr with single quotes, which only parses back through fragile
# quote replacement.
with open("descriptions_1.txt","w") as f:
    json.dump(descriptions, f)

# Vocabulary
(the set of all unique words the model can predict)

In [None]:
import ast

# Reload the image id -> captions mapping saved in descriptions_1.txt.
with open("descriptions_1.txt","r") as f:
    raw_descriptions = f.read()

try:
    descriptions = json.loads(raw_descriptions)
except ValueError:
    # The file was written as a Python dict repr (single-quoted strings).
    # literal_eval parses that safely; blindly replacing ' with " would
    # corrupt any text that contains an apostrophe.
    descriptions = ast.literal_eval(raw_descriptions)

In [None]:
print(type(descriptions))

In [None]:
# Vocab: every distinct word that appears in any caption.
vocab = set()
for caption_list in descriptions.values():
    for sentence in caption_list:
        vocab.update(sentence.split())

print("Vocab Size : %d"% len(vocab))
    

In [None]:
# Flat list of every word occurrence across all captions (duplicates kept).
total_words = [
    word
    for caption_list in descriptions.values()
    for caption in caption_list
    for word in caption.split()
]

print("Total words %d"%len(total_words))

In [None]:
import collections

# Word -> number of occurrences over all captions.
counter = collections.Counter(total_words)
freq_cnt = {word: count for word, count in counter.items()}
print(len(freq_cnt))

In [None]:
# Words must occur more than this many times to stay in the vocabulary.
threshold = 10

# Rank the (word, count) pairs from most to least frequent.
ranked_pairs = sorted(freq_cnt.items(), key=lambda pair: pair[1], reverse=True)

# Drop the rare words, then keep just the words in ranked order.
sorted_freq_cnt = [(word, count) for word, count in ranked_pairs if count > threshold]
total_words = [word for word, _count in sorted_freq_cnt]

In [None]:
#sorted_freq_cnt
'''
[('in', 18987),
 ('the', 18420),
 ('on', 10746),
 ('is', 9345),
 ('and', 8863),
 ('dog', 8138),
 ('with', 7765),
 ('man', 7275),.............]
 '''

# prepare train/test data

In [None]:
# Raw text of the train/test split files, each read as a single string.
train_file_data = readTextFile("flickr_dataset/Flickr8k_text/Flickr_8k.trainImages.txt")
test_file_data = readTextFile("flickr_dataset/Flickr8k_text/Flickr_8k.testImages.txt")

In [None]:
print(train_file_data[10])

In [None]:
# One entry per line of each split file; the last piece after the final
# newline is discarded. The id is the text before the first "." of a row.
train_rows = train_file_data.split("\n")[:-1]
test_rows = test_file_data.split("\n")[:-1]

train = [row.partition(".")[0] for row in train_rows]
test = [row.partition(".")[0] for row in test_rows]

print(train[:5])

In [None]:
# Training captions, each wrapped in the start/end markers that tell the
# decoder where a sentence begins and stops.
train_descriptions = {}

for img_id in train:
    train_descriptions[img_id] = [
        "startseq "+cap+" endseq" for cap in descriptions[img_id]
    ]
        

In [None]:
train_descriptions["1000268201_693b08cb0e"]

# Transfer Learning

images --> Features
Text ----> Features

## Step 1 Image Feature Extraction


In [None]:
# Load ResNet50 with ImageNet weights for 224x224 RGB input and print its layers.
model = ResNet50(weights="imagenet", input_shape=(224,224,3))
model.summary()

In [None]:
# resnet --> extract features
model.layers[-2]

In [None]:
model.layers[-2].output

In [None]:
# Same input as ResNet50, but the output is taken from the second-to-last
# layer, so predict() returns that layer's activations as image features.
model_new = Model(model.input, model.layers[-2].output)

In [None]:
def preprocess_img(img):
    """Load the image file at path ``img`` as a preprocessed batch of one.

    The image is resized to 224x224, converted to an array, given a leading
    batch axis and passed through the ResNet50 ``preprocess_input``.
    """
    pil_image = image.load_img(img, target_size=(224,224))
    pixels = image.img_to_array(pil_image)
    batch = np.expand_dims(pixels, axis = 0)
    # Normalisation
    return preprocess_input(batch)

In [None]:
img = preprocess_img(IMG_PATH+"1000268201_693b08cb0e.jpg")
plt.imshow(img[0])
plt.axis("off")
plt.show()
# print(img)

In [None]:
def encode_img(img):
    """Return the flattened ``model_new`` feature vector for image path ``img``."""
    batch = preprocess_img(img)
    # Flatten the prediction into a 1-D vector.
    return model_new.predict(batch).reshape((-1,))

In [None]:
encode_img(IMG_PATH+"1000268201_693b08cb0e.jpg")

In [None]:
start = time()
encoding_train = {}
#image_id ---> feature_vector extracted from Resnet Image
# The loop variable must be `img_id`: it was previously spelled `img_ig`,
# so the body kept using a stale global `img_id` and encoded one and the
# same image on every iteration.
for ix, img_id in enumerate(train):
    # IMG_PATH already ends with "/", so no extra separator is needed.
    img_path = IMG_PATH+img_id+".jpg"
    encoding_train[img_id] = encode_img(img_path)
    
    if ix%100==0 :
        print("Encoding in Progress Time step %d "%ix)
end_t = time()
print("Total time taken :", end_t - start)


In [None]:
# Persist the train image-id -> feature-vector dict to disk with pickle.
with open("encoded_train_features.pk1","wb") as f:
    pickle.dump(encoding_train, f)

In [None]:
start = time()
encoding_test = {}
#image_id ---> feature_vector extracted from Resnet Image
# The loop variable must be `img_id`: it was previously spelled `img_ig`,
# so the body kept using a stale global `img_id` and encoded one and the
# same image on every iteration.
for ix, img_id in enumerate(test):
    # IMG_PATH already ends with "/", so no extra separator is needed.
    img_path = IMG_PATH+img_id+".jpg"
    encoding_test[img_id] = encode_img(img_path)
    
    if ix%100==0 :
        print("Test Encoding in Progress Time step %d "%ix)
end_t = time()
print("Total time taken(test) :", end_t - start)


In [None]:
# Persist the test image-id -> feature-vector dict to disk with pickle.
with open("encoded_test_features.pk1","wb") as f:
    pickle.dump(encoding_test, f)

### Data Pre-processing for Captions

In [None]:
# Vocab
len(total_words)

In [None]:
# Two-way lookup between vocabulary words and integer ids.
# Ids start at 1; 0 is never assigned to a word here.
word_to_idx = {}
idx_to_word = {}

for idx, word in enumerate(total_words, start=1):
    word_to_idx[word] = idx
    idx_to_word[idx] = word
    

In [None]:
print(word_to_idx["dog"])
print(idx_to_word[1])
print(len(idx_to_word))

In [None]:
# Register the sequence markers right after the highest index in use.
# Hard-coding 1846/1847 is only correct for one particular vocabulary size
# and would overwrite or skip indices for any other threshold or dataset.
for token in ('startseq', 'endseq'):
    if token not in word_to_idx:
        token_idx = max(idx_to_word, default=0) + 1
        idx_to_word[token_idx] = token
        word_to_idx[token] = token_idx

# +1 because word indices start at 1, leaving index 0 for padding.
vocab_size = len(word_to_idx)+1
print("Vocab Size", vocab_size)

In [None]:
# Length, in words, of the longest training caption (0 if there are none).
max_len = max(
    (
        len(caption.split())
        for caption_list in train_descriptions.values()
        for caption in caption_list
    ),
    default=0,
)

print(max_len)

## Data Loader

In [None]:
    def data_generator(train_descriptions, encoding_train,word_to_idx, max_len, batch_size):
        """Yield training batches for the captioning model, forever.

        Every caption is expanded into (prefix -> next word) samples: for a
        caption encoded as indices ``seq``, each ``seq[:i]`` (padded to
        ``max_len``) is an input and ``seq[i]`` (one-hot) is its target.

        Args:
            train_descriptions: dict of image id -> list of caption strings.
            encoding_train: dict of image id -> image feature vector.
            word_to_idx: dict of word -> integer index; unknown words are
                dropped from the caption.
            max_len: length every input sequence is padded to.
            batch_size: number of *images* (not samples) per yielded batch.

        Yields:
            ``([image_features, padded_sequences], one_hot_targets)`` as a
            tuple of NumPy arrays.

        Note:
            The one-hot width is read from the module-level ``vocab_size``.
        """
        X1, X2, y = [], [], []

        n = 0
        while True:
            for key, desc_list in train_descriptions.items():
                n += 1

                # Features are stored under the bare image id; fall back to
                # "<id>.jpg" so dictionaries keyed by file name still work.
                if key in encoding_train:
                    photo = encoding_train[key]
                else:
                    photo = encoding_train[key + ".jpg"]

                for desc in desc_list:
                    seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]
                    for i in range(1, len(seq)):
                        # 0 denotes the padding word. [0] removes the batch
                        # axis that pad_sequences adds, so each sample is a
                        # flat (max_len,) row.
                        xi = pad_sequences([seq[:i]], maxlen=max_len, value=0, padding='post')[0]
                        yi = to_categorical([seq[i]], num_classes=vocab_size)[0]

                        X1.append(photo)
                        X2.append(xi)
                        y.append(yi)

                # Checked once per image, so all captions of an image end up
                # in the same batch.
                if n == batch_size:
                    # A tuple (inputs, targets) is the form Keras expects
                    # from a generator.
                    yield ([np.array(X1), np.array(X2)], np.array(y))

                    X1, X2, y = [], [], []
                    n = 0
                        

# Word Embedding

In [None]:
f = open("./saved/glove.6B.50d.txt",encoding='utf8')

In [None]:
# word -> embedding vector, parsed from the already-open embeddings file `f`.
embedding_index = {}

for line in f:
    fields = line.split()
    # The first field is the word, the remaining fields are its coefficients.
    embedding_index[fields[0]] = np.array(fields[1:], dtype='float')


In [None]:
f.close()

In [None]:
embedding_index['apple']

In [None]:
def get_embedded_matrix():
    """Build the (vocab_size, 50) embedding matrix for the vocabulary.

    Row ``idx`` holds the ``embedding_index`` vector of the word mapped to
    ``idx`` in ``word_to_idx``; words without an embedding keep a zero row.
    """
    emb_dim = 50
    matrix = np.zeros((vocab_size, emb_dim))
    for word, idx in word_to_idx.items():
        vector = embedding_index.get(word)
        if vector is None:
            # No pretrained vector for this word: leave its row as zeros.
            continue
        matrix[idx] = vector

    return matrix

In [None]:
print(type(word_to_idx))

In [None]:
word_to_idx['the']

In [None]:
embedding_matrix = get_embedded_matrix()
embedding_matrix.shape

In [None]:
embedding_matrix[1847]

# Model Architecture

In [None]:
# Image branch: 2048-dim feature vector -> dropout (0.3) -> 256-unit ReLU dense layer.
input_img_features = Input(shape=(2048,))
inp_img1 = Dropout(0.3)(input_img_features)
inp_img2 = Dense(256, activation='relu')(inp_img1)

In [None]:
# Captions as Input
# Caption branch: sequence of max_len word indices -> 50-dim embeddings
# (mask_zero=True, so index 0 is treated as padding) -> dropout (0.3) -> 256-unit LSTM.
input_captions = Input(shape=(max_len,))
inp_cap1 = Embedding(input_dim = vocab_size, output_dim = 50, mask_zero = True)(input_captions)
inp_cap2 = Dropout(0.3)(inp_cap1)
inp_cap3 = LSTM(256)(inp_cap2)

In [None]:
# Decoder: add the two 256-dim branch outputs element-wise, pass them through
# a 256-unit ReLU dense layer, then a softmax over the vocab_size word indices.
decoder1 = add([inp_img2, inp_cap3])
decoder2 = Dense(256, activation ='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

#Combined Model
# Note: this rebinds `model`, which previously referred to the ResNet50 network.
model = Model(inputs = [input_img_features,input_captions], outputs = outputs)

In [None]:
model.summary()

In [None]:
# Find the Embedding layer by type rather than by position: the index of a
# layer in `model.layers` depends on how Keras orders the graph, so a
# hard-coded `model.layers[2]` can silently point at a different layer.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))

# Load the pretrained vectors and freeze them so training does not alter them.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [None]:
# The loss name was misspelled ('categorcal_crossentropy'), which Keras
# rejects as an unknown loss. Targets are one-hot vectors, hence the
# categorical cross-entropy.
model.compile(loss='categorical_crossentropy',optimizer = 'adam')

# Train our model

In [None]:
# Training Of Model
# batch_size is passed to data_generator; steps is the number of generator
# batches drawn per epoch (one batch per batch_size training images).
epochs = 20
batch_size  = 3
steps = len(train_descriptions)//batch_size


In [None]:
 import os

 # Make sure the checkpoint directory exists before the first save; saving an
 # .h5 file into a missing directory can fail depending on the Keras version.
 os.makedirs('./model_weights', exist_ok=True)

 # Train one epoch at a time so a checkpoint is written after every epoch.
 for i in range(epochs):
        generator= data_generator(train_descriptions, encoding_train, word_to_idx, max_len, batch_size)
        model.fit_generator(generator, epochs = 1,steps_per_epoch = steps, verbose =1 )
        model.save('./model_weights/model_'+ str(i)+'.h5')

In [None]:
def predict_caption(photo):
    """Generate a caption for one image with greedy decoding.

    Starting from ``startseq``, the model is asked for the next word given
    the image and the words produced so far; the most probable word is
    appended each time until ``endseq`` is predicted or ``max_len`` words
    have been generated.

    Args:
        photo: image features passed unchanged to ``model.predict``
            (presumably a batch of one feature vector, e.g. shape
            (1, 2048) -- TODO confirm against the caller).

    Returns:
        The generated caption as a space-separated string, without the
        ``startseq``/``endseq`` markers.
    """
    words = ["startseq"]
    for _ in range(max_len):
        sequence = [word_to_idx[w] for w in words if w in word_to_idx]
        sequence = pad_sequences([sequence], maxlen = max_len, padding='post')

        ypred = model.predict([photo, sequence])
        ypred = ypred.argmax() #word with max prob always - Greedy Sampling
        word = idx_to_word.get(ypred)
        if word is None:
            # The predicted index (e.g. the padding index 0) has no word:
            # stop rather than raise.
            break

        # Extend the running caption. Overwriting it instead would hide the
        # prefix from the model and leave nothing to return.
        words.append(word)

        if word == 'endseq':
            break

    # Drop the leading 'startseq'; drop 'endseq' only if it was actually
    # generated, so a caption cut off at max_len keeps its last word.
    caption_words = words[1:]
    if caption_words and caption_words[-1] == 'endseq':
        caption_words.pop()

    return ' '.join(caption_words)