In [1]:
from PIL import Image
import numpy as np
from math import ceil
import matplotlib.pyplot as plt
import pickle
import json
import string
import os

In [2]:
from keras.applications.xception import Xception # importing cnn model for image processing
from tensorflow.keras.preprocessing.text import Tokenizer # importing tokenizer for vocabulary
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, add
from keras.models import Model, load_model

In [3]:
def load_pickle_model(filename):
    with open(filename, 'rb') as file:
        model=pickle.load(file)
    return model

Data Cleaning

In [4]:
# load all captions in given file and cleaning them
def load_captions_data(filename):
    caption_dict={}
    with open(filename, "r") as file:
        data_lines=file.readlines()

    for line in data_lines:
        image_name=line.split('\t')[0].split('#')

        if not image_name[0] in caption_dict:
            caption_dict[image_name[0]]=[]

        cleaned_caption=clean_caption(line.split('\t')[1][:-1])
        caption_dict[image_name[0]].append(cleaned_caption)

    return caption_dict

In [5]:
# clean each caption
def clean_caption(original_caption):
    # tokenizing each word in lowercase and punctuations removed
    tokens=original_caption.lower().translate({string.punctuation: ''}).split()

    # remove all hanging 's and a
    tokens=[token for token in tokens if len(token)>1]

    # removing words with numbers in them
    tokens=[token for token in tokens if token.isalpha()]

    # converting back to string
    cleaned_caption='startseq ' + ' '.join(tokens) + ' endseq'

    return cleaned_caption

In [6]:
def save_data(data, filepath):
    with open(filepath, 'w') as file:
        json.dump(data, file)

In [7]:
filename="Flickr8k_Dataset/Flickr8k_text/Flickr8k.token.txt"

caption_data=load_captions_data(filename)
print("Caption Data Loaded Successfully")

save_data(caption_data, "data_models/captions_file.json")
print("Caption Data Saved Successfully")

Caption Data Loaded Successfully
Caption Data Saved Successfully


Extract Features

In [46]:
model=Xception(include_top=False, weights="imagenet", pooling='avg') # load cnn model

In [47]:
# extract features of all images in the directory by using given cnn model
def extract_features(model, directory):
    image_files=os.listdir(directory)
    features={}

    for file in image_files:
        filename=directory+'/'+file
        image=Image.open(filename).resize((299,299))
        image=np.expand_dims(image, axis=0)
        image=image/255
        feature=model.predict(image)
        features[file]=feature

    return features

In [None]:
features=extract_features(model, "Flickr8k_Dataset/Flicker8k_images")

In [None]:
with open('data_models/extracted_image_features.pickle', 'wb') as file:
    pickle.dump(features, file)

In [48]:
with open('data_models/xception_model.pickle', 'wb') as file:
    pickle.dump(model, file)

Loading Training Data

In [8]:
# to get list of all image names in given file
def load_image_names(filename):
    with open(filename, 'r') as file:
        lines=file.readlines()

    # to remove '\n' at the end of each line
    names=list(map(lambda line: line[:-1], lines))
    return names

In [9]:
train_images=load_image_names("Flickr8k_Dataset/Flickr8k_text/Flickr_8k.trainImages.txt")

In [10]:
# to get description of each images in image_names
def load_image_description(filename, image_names):
    with open(filename, 'r') as file:
        data=json.load(file)

    description={}
    for image in image_names:
        description[image]=data[image]

    return description

In [11]:
train_description=load_image_description("data_models/captions_file.json", train_images)

In [12]:
# to get features of all images in image_names
def load_image_features(filename, image_names):
    with open(filename, 'rb') as file:
        data=pickle.load(file)

    features={}
    for image in image_names:
        features[image]=data[image]

    return features

In [13]:
train_features=load_image_features('data_models/extracted_image_features.pickle', train_images)

Tokenizing Vocabulary

In [14]:
# converting dictionary of captions into list of tokenizing
def dict_to_list(dict):
    list=[]

    for namesList in dict.values():
        [list.append(name) for name in namesList]

    return list

In [15]:
# creating tokenizer fitting into text of all captions in the list
# this will vectorize each text corpus (mapping some integer to a word in vocabulary)
def create_tokenizer(descriptions):
    desc_list=dict_to_list(descriptions)
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [16]:
tokenizer=create_tokenizer(train_description)
with open("data_models/tokenizer.pickle", "wb") as file:
    pickle.dump(tokenizer, file)

vocab_size=len(tokenizer.word_index)+1
vocab_size # no. of words mapped in tokenizer

7266

In [17]:
max_length=max(len(desc) for desc in dict_to_list(train_description))
max_length # max length of any caption

186

Create a Data Generator

In [18]:
# to create list of features, input sequences and output sequences for given given description list
def create_sequence(feature, desc_list, tokenizer, max_length, vocab_size):
    # features list, in_seq list, out_seq list
    x1, x2, y=list(), list(), list()
    for desc in desc_list:
        # convert text to corresponding
        sequence=tokenizer.texts_to_sequences([desc])[0]

        # divide one sequence into in_seq and out_seq at each i
        for i in range(1, len(sequence)):
            in_seq, out_seq=sequence[:i], sequence[i]

            in_seq=pad_sequences([in_seq], maxlen=max_length)[0] # to make each in_seq of size equal to max_length
            out_seq=to_categorical([out_seq], num_classes=vocab_size)[0] # to encode out_seq to some class

            x1.append(feature)
            x2.append(in_seq)
            y.append(out_seq)

    return np.array(x1), np.array(x2), np.array(y)

In [19]:
# to generate data of all image features, their input sequences and output sequences
def data_generator(descriptions, features, tokenizer, max_length, vocab_size, batch_size):
    X1_batch, X2_batch, y_batch = [], [], []
    for key, description_list in descriptions.items():
        feature=features[key][0]
        input_image, input_seq, output_seq=create_sequence(feature, description_list, tokenizer, max_length, vocab_size)
        [X1_batch.append(image) for image in input_image]
        [X2_batch.append(seq) for seq in input_seq]
        [y_batch.append(output) for output in output_seq]

        if len(X1_batch)>=batch_size:
            yield (np.array(X1_batch[:batch_size]), np.array(X2_batch[:batch_size])), np.array(y_batch[:batch_size])
            X1_batch, X2_batch, y_batch=X1_batch[batch_size:], X2_batch[batch_size:], y_batch[batch_size:]

    if len(X1_batch)>0:
        yield (np.array(X1_batch), np.array(X2_batch)), np.array(y_batch)

In [20]:
batch_size=64
generated_data=data_generator(train_description, train_features, tokenizer, max_length, vocab_size, batch_size)
for x in generated_data:
    batch_len=len(x[0][0])

batch_len

17

Define CNN-RNN Model

In [21]:
def define_model(vocab_size, max_length):
    # Features from CNN model compressed from 2048 into 256
    inputs1=Input(shape=(2048,))
    fl1=Dropout(0.5)(inputs1)
    fl2=Dense(256, activation="relu")(fl1)

    # LSTM sequence model
    inputs2=Input(shape=(max_length,))
    sl1=Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    sl2=Dropout(0.5)(sl1)
    sl3=LSTM(256)(sl2)

    # Merge both models
    decoder1=add([fl2, sl3])

    decoder2=Dense(256, activation="relu")(decoder1)
    outputs=Dense(vocab_size, activation="softmax")(decoder2)

    # Merge all layers into model
    model=Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Summarize the model
    print(model.summary())
    # plot_model(model, to_file="data_models/model.png", show_shapes=True)

    return model

Training our Image Caption Generator Model

In [22]:
print("vocab size =", vocab_size)
print("max length =", max_length)
print("no. of train features =", len(train_features))
print("no. of train descriptions =", len(train_description))

vocab size = 7266
max length = 186
no. of train features = 6000
no. of train descriptions = 6000


In [84]:
# defining our model
model=define_model(vocab_size, max_length)

None


In [25]:
# data generation
batch_size=64
generated_data=data_generator(train_description, train_features, tokenizer, max_length, vocab_size, batch_size)

In [86]:
len(train_description)

6000

In [26]:
epochs=60
batch_size=64
# model training
model.fit(generated_data, epochs=epochs, steps_per_epoch=ceil(len(train_description)/batch_size), verbose=1)

model.save("data_models/image_captioning_model.keras")

Epoch 1/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 522ms/step - accuracy: 0.2773 - loss: 3.9245
Epoch 2/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 576ms/step - accuracy: 0.2508 - loss: 4.2061
Epoch 3/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 579ms/step - accuracy: 0.2537 - loss: 4.1466
Epoch 4/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 622ms/step - accuracy: 0.2528 - loss: 4.0564
Epoch 5/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 616ms/step - accuracy: 0.2714 - loss: 4.0100
Epoch 6/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 640ms/step - accuracy: 0.2844 - loss: 3.8261
Epoch 7/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 654ms/step - accuracy: 0.2457 - loss: 3.9192
Epoch 8/30
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 628ms/step - accuracy: 0.2727 - loss: 3.8315
Epoch 9/30
[1m94/94[0m [32m━━

In [27]:
test_images=load_image_names("Flickr8k_Dataset/Flickr8k_text/Flickr_8k.testImages.txt")
test_description=load_image_description("data_models/captions_file.json", test_images)
test_features=load_image_features('data_models/extracted_image_features.pickle', test_images)

In [28]:
generated_data=data_generator(test_description, test_features, tokenizer, max_length, vocab_size, 64)

In [32]:
model.evaluate(generated_data, steps=100)

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 147ms/step - accuracy: 0.2952 - loss: 3.8418


[3.9012291431427, 0.2915624976158142]

Testing the Image Captioning Model

In [33]:
# extract features of given image
def extract_features(model, image):
    image=image.resize((299,299))
    image=np.expand_dims(image, axis=0)
    image=image/255
    feature=model.predict(image)
    return feature

In [34]:
# generate description of given image
def generate_description(model, tokenizer, features, max_length):
    in_text=""
    sequence=np.zeros((1,max_length))
    for i in range(max_length):
        pred=model.predict([[features, sequence]], verbose=0)
        token=np.argmax(pred)
        word=tokenizer.index_word.get(token)
        sequence[0][i]=token

        if(not word):
            break
        in_text+=" "+word
        if(word=="endseq"):
            break

    in_text=in_text.replace("startseq", "")
    in_text=in_text.replace("endseq", "")

    return in_text

In [35]:
xception_model=load_pickle_model("data_models/xception_model.pickle")
main_model=load_model("data_models/image_captioning_model.keras")

In [36]:
tokenizer=load_pickle_model("data_models/tokenizer.pickle")
vocab_size=len(tokenizer.word_index)+1
max_length=186

In [37]:
image_path="Flickr8k_Dataset/Flicker8k_images/44856031_0d82c2c7d1.jpg"
image=None
try:
    image=Image.open(image_path)
except:
    print("Invalid Image!")

if(image):
    print("Extracting Image Features...")
    features=extract_features(xception_model, image)
    print("Generating Image Caption")
    generated_desc=generate_description(main_model, tokenizer, features, max_length)
    print("Image Caption:", generated_desc)

Extracting Image Features...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 955ms/step
Generating Image Caption
Image Caption:  the orange dog is running through the air 
