# Data Downloading

In [None]:
!mkdir /root/.kaggle/
!mv /content/kaggle.json /root/.kaggle/
!kaggle datasets download -d adityajn105/flickr8k

mkdir: cannot create directory ‘/content/.kaggle/’: File exists


In [None]:
!unzip -q /content/flickr8k.zip -d "/content/drive/MyDrive/Image Captioning/data"

# Encoder

In [None]:
import os, pickle, sys, tqdm
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from keras.models import Model

In [None]:
inceptionv3 = InceptionV3()
encoder = Model(inceptionv3.input, inceptionv3.layers[-2].output)
encoder.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv2d (Conv2D)                (None, 149, 149, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 batch_normalization (BatchNorm  (None, 149, 1

In [None]:
data_path = '/content/drive/MyDrive/Image Captioning/data/Images'
imgs = os.listdir(data_path)
features = {}

for img in tqdm.tqdm(imgs, leave=True, ncols=80,):
    img_path = f'{data_path}/{img}'
    x = load_img(img_path, target_size=(299, 299))
    x = img_to_array(x)[None, :, :, :]
    x = preprocess_input(x)
    feature_vec = encoder.predict(x, verbose=0)
    feature_vec = feature_vec.squeeze()
    img_id = img.split('.')[0]
    features[img_id] = feature_vec

100%|███████████████████████████████████████| 8091/8091 [44:57<00:00,  3.00it/s]


In [None]:
pickle.dump(features, open('/content/drive/MyDrive/Image Captioning/data/features.pkl', 'wb'))

# ETL

In [None]:
import pandas as pd
import re, string

In [None]:
captions = pd.read_csv('/content/drive/MyDrive/Image Captioning/data/captions.txt')
captions.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [None]:
captions.shape

(40455, 2)

In [None]:
def clean(txt):
    txt = txt.lower()
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub('', txt).strip()

In [None]:
df_result = captions.copy()
df_result['caption'] = df_result['caption'].apply(clean)

In [None]:
captions['caption'][:5].values

array(['A child in a pink dress is climbing up a set of stairs in an entry way .',
       'A girl going into a wooden building .',
       'A little girl climbing into a wooden playhouse .',
       'A little girl climbing the stairs to her playhouse .',
       'A little girl in a pink dress going into a wooden cabin .'],
      dtype=object)

In [None]:
df_result['caption'][:5].values

array(['a child in a pink dress is climbing up a set of stairs in an entry way',
       'a girl going into a wooden building',
       'a little girl climbing into a wooden playhouse',
       'a little girl climbing the stairs to her playhouse',
       'a little girl in a pink dress going into a wooden cabin'],
      dtype=object)

In [None]:
# df_result.to_csv('/content/drive/MyDrive/Image Captioning/data/result.txt', header=True, index=False, sep=',', mode='a')


## If you want to make a **dict** result instead of **DataFrame** result

In [None]:
# img_captions = {}

# for row in captions.iterrows():
#     id, cap = row[1]

#     cleaned_cap = clean(cap)

#     if id not in img_captions:
#         img_captions[id] = []
#     img_captions[id].append(cleaned_cap)

# img_captions['1000268201_693b08cb0e.jpg']

# Preprocessing

## Data and special tokens

In [None]:
cap = pd.read_csv('/content/drive/MyDrive/Image Captioning/data/result.txt')
cap.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,a girl going into a wooden building
2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse
3,1000268201_693b08cb0e.jpg,a little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,a little girl in a pink dress going into a woo...


In [None]:
cap.caption = cap.caption.apply(lambda txt:'<sos> '+txt+' <eos>')
cap.image = cap.image.str.replace('.jpg', '')
cap.head()

  cap.image = cap.image.str.replace('.jpg', '')


Unnamed: 0,image,caption
0,1000268201_693b08cb0e,<sos> a child in a pink dress is climbing up a...
1,1000268201_693b08cb0e,<sos> a girl going into a wooden building <eos>
2,1000268201_693b08cb0e,<sos> a little girl climbing into a wooden pla...
3,1000268201_693b08cb0e,<sos> a little girl climbing the stairs to her...
4,1000268201_693b08cb0e,<sos> a little girl in a pink dress going into...


In [None]:
cap_dict = cap.groupby('image')['caption'].agg(list).to_dict()
len(cap_dict)

8091

## Tokenization

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
all_caps = []

for id in cap_dict:
    [all_caps.append(c) for c in cap_dict[id]]

len(all_caps)

40455

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_caps)

In [None]:
vocab_size = len(tokenizer.word_index)
vocab_size

8830

In [None]:
sequences = tokenizer.texts_to_sequences(all_caps)

In [None]:
all_caps[0]

'<sos> a child in a pink dress is climbing up a set of stairs in an entry way <eos>'

In [None]:
sequences[0]

[2, 1, 42, 4, 1, 90, 170, 7, 119, 53, 1, 395, 12, 392, 4, 28, 5223, 693, 3]

# Data Generator

In [None]:
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

In [None]:
max_len = max(map(len, all_caps))
max_len

207

In [None]:
with open('/content/drive/MyDrive/Image Captioning/data/features.pkl', 'rb') as f:
    features = pickle.load(f)

In [None]:
def data_generator():
    for img_id, caps_list in cap_dict.items():
        x1, x2, y = [], [], []
        sequences = tokenizer.texts_to_sequences(caps_list)
        for seq in sequences:
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                x1.append(features[img_id])
                x2.append(in_seq)
                y.append(out_seq)
        if len(x1)==len(x2)==len(y):
            x2 = pad_sequences(x2, maxlen=max_len)
            y = to_categorical(y, num_classes = vocab_size)
            yield [[np.array(x1), np.array(x2)], np.array(y)]

In [None]:
f = next(iter(data_generator()))
len(f)

2

# Model

## Architecture Building

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, Dropout
from keras.layers import add

In [None]:
encoder.outputs[0].shape

TensorShape([None, 2048])

In [None]:
# Feature Extractor
in1 = Input(shape=(2048,), name='in1')
fe = Dropout(0.5)(in1)
fe = Dense(256)(fe)

fe.shape

TensorShape([None, 256])

In [None]:
# Sequence Model
in2 = Input(shape=(max_len,), name='in2')
sm = Embedding(vocab_size, 256, mask_zero=True)(in2)
sm = Dropout(0.2)(sm)
sm = LSTM(256)(sm)

sm.shape

TensorShape([None, 256])

In [None]:
# Decoder Block
db = add([fe, sm])
db = Dense(256, 'relu')(db)
outs = Dense(vocab_size, 'softmax', name='outs')(db)

outs.shape

TensorShape([None, 8830])

In [None]:
# Model
model = Model(inputs=[in1, in2], outputs=outs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 in2 (InputLayer)               [(None, 207)]        0           []                               
                                                                                                  
 in1 (InputLayer)               [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 207, 256)     2260480     ['in2[0][0]']                    
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['in1[0][0]']                    
                                                                                            

## Training

In [None]:
xy_generator = data_generator()
history = model.fit(xy_generator, epochs=2, steps_per_epoch=len(cap_dict)*0.5) #batch_size = 256

Epoch 1/2
 569/4045 [===>..........................] - ETA: 36:10 - loss: 4.2894 - accuracy: 0.2585

# Prediction

In [50]:
wrd2idx = tokenizer.word_index

In [None]:
def greedy_search(pic):
	start = '<sos>'
	for i in range(max_len):
		seq = [wrd2idx[word] for word in start.split() if word in wrd2idx]
		seq = pad_sequences([seq], maxlen = max_len)
		yhat = model.predict([pic, seq])
		yhat = np.argmax(yhat)
		word = wrd2idx[yhat]
		start += ' ' + word
		if word == '<eos>':
			break
	final = start.split()
	final = final[1:-1]
	final = ' '.join(final)
	return final
