In [6]:
import numpy as np
import requests
from urllib import request
from keras.preprocessing.image import load_img,img_to_array
from matplotlib import  pyplot as plt
from PIL import  Image
from io import  BytesIO
import io
from keras.preprocessing.text import  Tokenizer
from keras.preprocessing.sequence import  pad_sequences
from keras.utils import np_utils
from keras.models import Model
from tensorflow.keras.applications.vgg16 import VGG16
from keras.layers import *
from tqdm import tqdm
from nltk.corpus import stopwords

In [2]:
srcFile='c:/Ady/pysource/GoogleImageCaptioningDS/Train_GCC-training.tsv'

In [3]:
fp=open(srcFile,'r',encoding='ISO-8859-1')
allDescUrl=fp.readlines()[:10000]
fp.close()

In [4]:
def getImageDataFromURL(entry):
    desc,url=entry.split('\t')
    img=None
    validDesc=None
    try:
        if desc is not None and url is not None:
            resp=request.urlopen(url,timeout=5)
            if resp.getcode()==200 and resp.getcode()!=404:
                img=Image.open(BytesIO(resp.read()))
                img_arr=img_to_array(img.resize(size=(224,224)))
                
                if img_arr.shape==(224,224,3):
                    img_arr=img_arr/255
                    validDesc=desc
                    #print('Added:',url)
                else:
                    pass
            return validDesc,img_arr
        else:
            print('Discarded:',url)
            return None,None
        
    except Exception as e:
        return None,None

In [7]:
vgg16=VGG16(input_shape=(224,224,3),weights='imagenet',include_top=False)
vgg16.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [6]:
vision_inp=Input(shape=(224,224,3))
x=vgg16.get_layer('block5_pool').output
x=Flatten()(x)
img_out=Dense(4096,activation='relu')(x)
model=Model(inputs=vgg16.input,outputs=img_out)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [7]:
def getEncodedImageData(model,img_arr):
    return model.predict(img_arr.reshape(1,224,224,3))

In [9]:
imgDescList=[]
for entry in tqdm(allDescUrl[:1000]):
    desc,img=getImageDataFromURL(entry)
    if desc is not None and img is not None:
        imgDescList.append([getEncodedImageData(model,img),desc])

len(imgDescList)


  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]
  0%|                                                                                 | 1/1000 [00:00<10:03,  1.66it/s]
  0%|▏                                                                                | 2/1000 [00:01<09:44,  1.71it/s]
  0%|▏                                                                                | 3/1000 [00:01<09:47,  1.70it/s]
  0%|▎                                                                                | 4/1000 [00:02<11:51,  1.40it/s]
  0%|▍                                                                                | 5/1000 [00:03<13:06,  1.27it/s]
  1%|▍                                                                                | 6/1000 [00:04<12:57,  1.28it/s]
  1%|▌                                                                                | 7/1000 [00:05<11:57,  1.38it/s]
  1%|▋                                 

874

In [10]:
imgArr=np.array([x[0] for x in imgDescList])

In [11]:
docList=[['<start>']+x[1].split(' ')+['<end>'] for x in imgDescList]
wordList=[]
for doc in docList:
    for word in doc:
        wordList.append(word)
uniqueWords=list(set(wordList))
word2idx=dict((w,i) for i,w in enumerate(uniqueWords))
idx2word=dict((i,w) for i,w in enumerate(uniqueWords))

In [12]:
nb_vocab=len(uniqueWords)+1
nb_vocab

2392

In [13]:
word2idx['<start>'],word2idx['<end>']

(574, 1455)

In [14]:
in_seq=[]
in_encodedImg=[]
out_seq=[]

for i,doc in enumerate(docList):
    encodedImgData=imgArr[i]
    for i in range(0,len(doc)-1,1):
        in_encodedImg.append(encodedImgData)
        in_seq.append([word2idx[x] for x in doc[:i+1]])
        out_seq.append(word2idx[doc[i+1]])
in_seq=np.array(in_seq)
out_seq=np.array(out_seq)
maxlen=max([len(x) for x in in_seq])
in_seq=pad_sequences(in_seq,maxlen=maxlen)
in_encodedImg=np.array(in_encodedImg)
out_seq_matrix=np.zeros((len(out_seq),nb_vocab))
for i,out in enumerate(out_seq):
    out_seq_matrix[i][out]=1

in_encodedImg=in_encodedImg.reshape((in_encodedImg.shape[0],4096))
in_encodedImg.shape,in_seq.shape,out_seq_matrix.shape,maxlen

((9919, 4096), (9919, 42), (9919, 2392), 42)

In [15]:
img_in_shape=(4096,)
text_in_shape=(maxlen,)

In [16]:
imgInpLayer=Input(img_in_shape)
img_l1=Dropout(rate=0.5)(imgInpLayer)
img_out=Dense(4096,activation='relu')(img_l1)

txtInpLayer=Input(text_in_shape)

embLayer=Embedding(output_dim=128,input_dim=nb_vocab,input_length=maxlen)(txtInpLayer)
lstm1=LSTM(64,return_sequences=True,activation='relu')(embLayer)
batchNorm1=BatchNormalization()(lstm1)
lstm2=LSTM(128,activation='relu')(lstm1)
batchNorm1=BatchNormalization()(lstm2)
dense1=Dense(4096,activation='relu')(batchNorm1)

merged=add([img_out,dense1])

dense2=Dense(1000,activation='relu')(merged)
drop1=Dropout(rate=0.20)(dense2)
batch_2=BatchNormalization()(drop1)
dense3=Dense(500,activation='relu')(drop1)
drop2=Dropout(rate=0.20)(dense3)
batch_3=BatchNormalization()(drop2)
out=Dense(nb_vocab,activation='softmax')(batch_3)

captionModel=Model([imgInpLayer,txtInpLayer],out)
captionModel.summary()

W1116 21:37:45.485755 11504 deprecation.py:506] From C:\Users\dhararn\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 42)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 42, 128)      306176      input_4[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 42, 64)       49408       embedding_1[0][0]                
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
lstm_2 (LS

In [17]:
captionModel.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])

W1116 21:37:53.572315 11504 deprecation_wrapper.py:119] From C:\Users\dhararn\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [18]:
hist=captionModel.fit([in_encodedImg,in_seq],out_seq_matrix,batch_size=1024,epochs=500,validation_split=0.2,verbose=1,shuffle=True)

W1116 21:37:58.064290 11504 deprecation.py:323] From C:\Users\dhararn\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 7935 samples, validate on 1984 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500

In [None]:
idx=np.random.randint(len(allDescUrl[:439]))
desc,img=getImageDataFromURL(allDescUrl[idx])
if desc is not None and img is not None:
    plt.imshow(img)
    plt.show()

    testEncImage=getEncodedImageData(model=model,img_arr=img)
    testStartSeq=np.zeros_like(in_seq[0])
    testStartSeq[0]=word2idx['<start>']
    testStartSeq=testStartSeq.reshape((1,maxlen))
    outArr=[]
    pred=captionModel.predict([testEncImage,testStartSeq])[0]
    predWordList=[]
    for x in np.argsort(pred)[::-1]:
        try:
            if idx2word[x] is not None:
                predWordList.append(idx2word[x])
            else:
                break
        except Exception as e:
            pass
            
    if '<end>' in predWordList:
        print('Has end')
    else:
        print('Has no end')
    for idx in np.argsort(pred)[::-1]:
        try:
            predWord=idx2word[idx]
            if predWord or idx2word[idx]!='<end>':
                outArr.append(predWord)
            else:
                break
        except Exception as e:
            pass
print(' '.join(outArr[:10]))

In [None]:
idx=np.random.randint(len(allDescUrl[:439]))
desc,img=getImageDataFromURL(allDescUrl[idx])
if desc is not None and img is not None:
    plt.imshow(img)
    plt.show()

    testEncImage=getEncodedImageData(model=model,img_arr=img)
    testStartSeq=np.zeros_like(in_seq[0])
    testStartSeq[0]=word2idx['<start>']
    testStartSeq=testStartSeq.reshape((1,maxlen))
    outArr=[]
    pred=captionModel.predict([testEncImage,testStartSeq])[0]
    #print(min(pred),max(pred))
    pred=np.argsort(pred)
    #print(pred,pred[::-1])
    for p in pred[::-1]:
        #print(pred[p])
        word=idx2word[p]
        if word!='<end>':
            outArr.append(word)
        else:
            break
    print(' '.join(outArr))
            #idx2word[np.argmax(pred)])
    #for i in range(10):
    #    preds=captionModel.predict([testEncImage,testStartSeq])[0]
    #    word=idx2word[np.argmax(preds)]
    #    if word!='<end>':
    #        outArr.append(word)
    #        preds=preds[1:]
    #    else:
    #        break
    #print(' '.join(outArr))
else:
    print('Img not available')