In [1]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
import json
import collections
import pickle
import time
import os
import cv2
import gensim
from keras.applications.resnet50 import ResNet50,preprocess_input,decode_predictions
from keras.preprocessing import image
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Flatten,Embedding,Dropout,Input,LSTM,add
from keras.models import Model,load_model
from keras.utils.np_utils import to_categorical 
#%xmode verbose

Using TensorFlow backend.


In [39]:
def readTextFiles(path):
    with open(path) as f:
        return f.read()
captions = readTextFiles("flickr8k/Data/Flickr_TextData/Flickr8k.token.txt").split('\n')

# last index is an empty string so we need to clip it
captions = captions[:-1]
print(captions[90])

1022975728_75515238d8.jpg#0	A black dog running in the surf .


## Making dictionary to map image id with its captions

In [40]:
dictImgCap = {}

for line in captions:
    imgId,caption = line.split('\t')
    imgId = imgId.split(".")[0]
    
    if dictImgCap.get(imgId) is None:
        dictImgCap[imgId]=[]
        
    dictImgCap[imgId].append(caption)
    
dictImgCap=json.dumps(dictImgCap)

with open('./resources/descriptions.txt','w') as f:
    f.write(dictImgCap)

dictImgCap=json.loads(dictImgCap)    

## Creating vocab dictionary

In [41]:
all_words=[]
for key in dictImgCap.keys():
    [all_words.append(word.lower()) for sentence in dictImgCap[key] for word in sentence.split()]
len(all_words)

476706

### Getting words which has frequency more than 10

In [45]:
counter = collections.Counter(all_words)
counter = dict(counter)

vocab = [word for word,freq in counter.items() if freq>10]
vocab = sorted(vocab)
print(len(vocab))

with open('./resources/vocab.pkl','wb') as f:
    pickle.dump(vocab,f)

1855


## Preparing Training and Testing Data

In [43]:
train_images = readTextFiles("flickr8k/Data/Flickr_TextData/Flickr_8k.trainImages.txt").split('\n')[:-1]
test_images = readTextFiles("flickr8k/Data/Flickr_TextData/Flickr_8k.testImages.txt").split('\n')[:-1]

In [44]:
train_descriptions = {}
for img in train_images:
    img=img.split('.')[0]
    train_descriptions[img]=[]
    for caption in dictImgCap[img]:
        cap_to_add = "<s> " + caption + "<e>"
        train_descriptions[img].append(cap_to_add)
        
train_desc=json.dumps(train_descriptions)

with open('./resources/train_descriptions.txt','w') as f:
    f.write(train_desc)

## Extracting Image features using Transfer learning

In [16]:
model = ResNet50(weights='imagenet',input_shape=(224,224,3))
model.summary()


Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 112, 112, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________

In [17]:
new_model = Model(model.input,model.layers[-2].output)
new_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 112, 112, 64) 256         conv1[0][0]                      
____________________________________________________________________________________________

Total params: 23,587,712
Trainable params: 23,534,592
Non-trainable params: 53,120
__________________________________________________________________________________________________


In [18]:
def preprocess_image(img_path):
    img = image.load_img(img_path,target_size=(224,224))
    img = image.img_to_array(img)
    img = np.expand_dims(img,axis=0)
    img = preprocess_input(img)  ## preprocessing for ResNet50
    return img
def encode_image(img_path):
    img = preprocess_image(img_path)
    img_feature_vector = new_model.predict(img)
    img_feature_vector = img_feature_vector.reshape((-1,))
    return img_feature_vector

In [None]:
encoded_train_data = {}
for i,img in enumerate(train_images):
    img_path = "flickr8k/Data/Images/"+img
    encoded_img = encode_image(img_path)
    encoded_train_data[img]=encoded_img
    
    if i%100==0:
        print("Images Processed : %d"%(i+1))
        
with open("./resources/encoded_train_images.pkl","wb") as f:
    pickle.dump(encoded_train_data,f)        

In [None]:
encoded_test_data = {}
for i,img in enumerate(test_images):
    img_path = "flickr8k/Data/Images/"+img
    encoded_img = encode_image(img_path)
    encoded_test_data[img]=encoded_img
    
    if i%100==0:
        print("Images Processed : %d"%(i+1))

with open("./resources/encoded_test_images.pkl","wb") as f:
    pickle.dump(encoded_test_data,f)

## Preprocessing Captions

In [47]:
## making word2idx and idx2word dictionaries
word2idx = {}
idx2word = {}

for i,word in enumerate(vocab):
    word2idx[word] = i+1
    idx2word[i+1] = word

## adding <s> and <e> to vocab
word2idx["<s>"] = 1857
idx2word[1857] = "<s>"

word2idx["<e>"] = 1858
idx2word[1858] = "<e>"


with open('./resources/word2idx.pkl','wb') as f:
    pickle.dump(word2idx,f)
with open('./resources/idx2word.pkl','wb') as f:
    pickle.dump(idx2word,f)

In [48]:
maxlen=0
for item in train_descriptions.keys():
    for cap in train_descriptions[item]:
        if len(cap.split()) > maxlen:
            maxlen=len(cap.split())
print(maxlen) 

39


## Using Glove Embeddings

In [22]:
f=open('glove.6B.50d.txt',encoding='utf8')

In [23]:
glove_embedding={}

for line in f:
    value= line.split()
    word = value[0]
    embedding = np.array(value[1:],dtype='float32')
    glove_embedding[word]=embedding 

f.close()

In [49]:
def get_embedding_matrix():
    embedding_matrix = np.zeros((len(word2idx)+1,50))
    for word,idx in word2idx.items():
        embedding = glove_embedding.get(word)
        if embedding is not None:
            embedding_matrix[idx] = glove_embedding[word] 
    return embedding_matrix

In [50]:
embedding_matrix = get_embedding_matrix()
with open('./resources/embedding_matrix.pkl','wb') as f:
    pickle.dump(embedding_matrix,f)