In [58]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
import json
import collections
import pickle
import time
import os
import re
import cv2
import gensim
from tensorflow.keras.applications.resnet50 import ResNet50,preprocess_input,decode_predictions
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense,Flatten,Embedding,Dropout,Input,LSTM,add
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings('ignore')
#%xmode verbose

In [50]:
## Loading data

flickr_data = pd.read_csv('flickr30k/results.csv',delimiter='|')
flickr_data.drop(columns=[' comment_number'],axis=1,inplace=True)

## Making dictionary to map image id with its captions

In [51]:
grouped_by_name = flickr_data.groupby(['image_name',' comment'])['image_name'].agg({'Frequency':'count'})

dictImgCap = {k.split('.')[0]:list(grouped_by_name.loc[k].index) for k in grouped_by_name.index.levels[0]}

dictImgCap = json.dumps(dictImgCap)

with open('./resources/descriptions.txt','w') as f:
    f.write(dictImgCap)

dictImgCap=json.loads(dictImgCap)

In [52]:
#lowercasing the captions

for img_id, captions in dictImgCap.items():
    for caption in captions:
        caption = re.sub('[^a-z]+',' ',caption.lower())
        caption = ' '.join(caption.split(' '))

## Creating vocab dictionary

In [53]:
all_words=[]
for key in dictImgCap.keys():
    [all_words.append(word.lower()) for sentence in dictImgCap[key] for word in sentence.split()]
len(all_words)

2127459

### Getting words which has frequency more than 10

In [54]:
counter = collections.Counter(all_words)
counter = dict(counter)

vocab = [word for word,freq in counter.items() if freq>10]
vocab = sorted(vocab)
print(len(vocab))

with open('./resources/vocab.pkl','wb') as f:
    pickle.dump(vocab,f)

5173


## Preparing Training and Testing Data

In [55]:
train_images = list(dictImgCap.keys())
#test_images = readTextFiles("flickr8k/Data/Flickr_TextData/Flickr_8k.testImages.txt").split('\n')[:-1]

In [56]:
train_descriptions = {}
for img in train_images:
    train_descriptions[img]=[]
    for caption in dictImgCap[img]:
        cap_to_add = "<s> " + caption + " <e>"
        train_descriptions[img].append(cap_to_add)
        
train_desc=json.dumps(train_descriptions)

with open('./resources/train_descriptions.txt','w') as f:
    f.write(train_desc)

## Extracting Image features using Transfer learning

In [59]:
model = ResNet50(weights='imagenet',input_shape=(224,224,3))
model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalizationV1) (None, 112, 112, 64) 256         conv1[0][0]                      
___________________________________________________________________________________________

In [60]:
new_model = Model(model.input,model.layers[-2].output)
new_model.save('./resources/resNet50.h5')
new_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalizationV1) (None, 112, 112, 64) 256         conv1[0][0]                      
______________________________________________________________________________________________

In [61]:
def preprocess_image(img_path):
    img = image.load_img(img_path,target_size=(224,224))
    img = image.img_to_array(img)
    img = np.expand_dims(img,axis=0)
    img = preprocess_input(img)  ## preprocessing for ResNet50
    return img
def encode_image(img_path):
    img = preprocess_image(img_path)
    img_feature_vector = new_model.predict(img)
    img_feature_vector = img_feature_vector.reshape((-1,))
    return img_feature_vector

In [62]:
encoded_train_data = {}
for i,img in enumerate(train_images):
    img_path = "flickr30k/flickr30k_images/"+img+".jpg"
    encoded_img = encode_image(img_path)
    encoded_train_data[img]=encoded_img
    
    if i%100==0:
        print("Images Processed : %d"%(i+1))
        
with open("./resources/encoded_train_images.pkl","wb") as f:
    pickle.dump(encoded_train_data,f)        

Images Processed : 1
Images Processed : 101
Images Processed : 201
Images Processed : 301
Images Processed : 401
Images Processed : 501
Images Processed : 601
Images Processed : 701
Images Processed : 801
Images Processed : 901
Images Processed : 1001
Images Processed : 1101
Images Processed : 1201
Images Processed : 1301
Images Processed : 1401
Images Processed : 1501
Images Processed : 1601
Images Processed : 1701
Images Processed : 1801
Images Processed : 1901
Images Processed : 2001
Images Processed : 2101
Images Processed : 2201
Images Processed : 2301
Images Processed : 2401
Images Processed : 2501
Images Processed : 2601
Images Processed : 2701
Images Processed : 2801
Images Processed : 2901
Images Processed : 3001
Images Processed : 3101
Images Processed : 3201
Images Processed : 3301
Images Processed : 3401
Images Processed : 3501
Images Processed : 3601
Images Processed : 3701
Images Processed : 3801
Images Processed : 3901
Images Processed : 4001
Images Processed : 4101
Imag

In [None]:
# encoded_test_data = {}
# for i,img in enumerate(test_images):
#     img_path = "flickr8k/Data/Images/"+img
#     encoded_img = encode_image(img_path)
#     encoded_test_data[img]=encoded_img
    
#     if i%100==0:
#         print("Images Processed : %d"%(i+1))

# with open("./resources/encoded_test_images.pkl","wb") as f:
#     pickle.dump(encoded_test_data,f)

## Preprocessing Captions

In [63]:
## making word2idx and idx2word dictionaries
word2idx = {}
idx2word = {}

for i,word in enumerate(vocab):
    word2idx[word] = i+1
    idx2word[i+1] = word

## adding <s> and <e> to vocab
word2idx["<s>"] = 5174
idx2word[5174] = "<s>"

word2idx["<e>"] = 5175
idx2word[5175] = "<e>"


with open('./resources/word2idx.pkl','wb') as f:
    pickle.dump(word2idx,f)
with open('./resources/idx2word.pkl','wb') as f:
    pickle.dump(idx2word,f)

In [64]:
maxlen=0
for item in train_descriptions.keys():
    for cap in train_descriptions[item]:
        if len(cap.split()) > maxlen:
            maxlen=len(cap.split())
print(maxlen) 

84


## Using Glove Embeddings

In [65]:
f=open('glove.6B.50d.txt',encoding='utf8')

In [66]:
glove_embedding={}

for line in f:
    value= line.split()
    word = value[0]
    embedding = np.array(value[1:],dtype='float32')
    glove_embedding[word]=embedding 

f.close()

In [67]:
def get_embedding_matrix():
    embedding_matrix = np.zeros((len(word2idx)+1,50))
    for word,idx in word2idx.items():
        embedding = glove_embedding.get(word)
        if embedding is not None:
            embedding_matrix[idx] = glove_embedding[word] 
    return embedding_matrix

In [68]:
embedding_matrix = get_embedding_matrix()
with open('./resources/embedding_matrix.pkl','wb') as f:
    pickle.dump(embedding_matrix,f)

In [72]:
embedding_matrix[5173]

array([ 1.07609999e+00,  1.03639996e+00, -2.39590004e-01,  7.58300012e-04,
       -3.62280011e-01, -1.61500001e+00, -1.73150003e+00, -8.29580009e-01,
        7.73880005e-01,  2.51269996e-01,  3.84810001e-01,  8.21200013e-03,
        8.76010001e-01,  1.57900006e-01,  3.41720015e-01,  9.21370029e-01,
        3.60289991e-01,  5.74419975e-01, -1.05949998e+00,  1.83920002e+00,
       -4.26209986e-01, -1.16740000e-02, -1.27969995e-01,  5.64079992e-02,
       -5.40910006e-01, -1.23590004e+00, -4.60689992e-01,  3.81830007e-01,
       -7.29040027e-01, -1.21910000e+00,  6.65069997e-01,  5.14090002e-01,
        1.04759997e-02, -5.18060029e-01,  1.91180006e-01,  3.34459990e-01,
        1.08930004e+00,  8.13400000e-02,  7.68519998e-01,  1.49480000e-01,
       -6.87500000e-01, -1.77059993e-01,  9.17180032e-02,  5.18920004e-01,
        1.36059999e+00, -2.12099999e-01, -7.09619999e-01, -2.67419994e-01,
        5.75829983e-01,  9.92359966e-02])