In [88]:
import numpy as np
from PIL import Image # Python Imaging Library
import os
import string
from pickle import dump # pickle = built-in Python module that allows you to store and retrieve Python objects
from pickle import load
from keras.applications.xception import Xception # to get pretrained model Xception (CNN)
from keras.applications.xception import preprocess_input
from tensorflow.keras.utils import load_img
from tensorflow.keras.utils import img_to_array
from keras.preprocessing.text import Tokenizer # for text tokenization
from keras.utils import pad_sequences # padding the sequence of text
from keras.utils import to_categorical
from keras.layers import add
from keras.models import Model, load_model # define, train, evaluate the model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout # keras to build our CNN and LSTM (Long Short term memory)
from tqdm.notebook import tqdm as tqdm # to check loop progress
tqdm().pandas()

0it [00:00, ?it/s]

In [2]:
# Function to load document file into memory
def load_file(filename):
    # open the file
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [3]:
# Function to get all images with their captions
def image_captions(filename):
    file = load_file(filename)
    # splits the text in lines
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [4]:
# Function to clean the dictionary - "descriptions"
def text_clean(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            img_caption.replace("-", " ")
            descp = img_caption.split()
            # uppercase to lowercase
            descp = [wrd.lower() for wrd in descp]
            # remove punctuation
            descp = [wrd.translate(table) for wrd in descp]
            # remove hanging 's and a
            descp = [wrd for wrd in descp if(len(wrd)>1)]
            # remove words containing numbers with them
            descp = [wrd for wrd in descp if(wrd.isalpha())]
            # converting back to string
            img_caption = ' '.join(descp)
            captions[img][i] = img_caption
    return captions

In [5]:
dataset_text = "F:\\IIITN\\6th sem\\Machine Learning\\Image_Captioning_Project\\Flickr8k_text" # Location of token text file
dataset_images = "F:\\IIITN\\6th sem\\Machine Learning\\Image_Captioning_Project\\Flickr8k_Dataset\\Flicker8k_Dataset" # location of images

In [6]:
# Create a vocabulary of the model
def text_vocab(descriptions):
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

In [7]:
# Function to save descriptions in a file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key+'\t'+desc)
    data = "\n".join(lines)
    file = open(filename, "w")
    file.write(data)
    file.close()

In [8]:
filename = dataset_text + "\\" + "Flickr8k.token.txt"

In [9]:
descriptions = image_captions(filename)
print("Length of descriptions = ", len(descriptions))

Length of descriptions =  8092


In [10]:
clean_descriptions = text_clean(descriptions)

In [11]:
vocabulary = text_vocab(clean_descriptions)

In [12]:
print("Length of vocabulary = ", len(vocabulary))

Length of vocabulary =  8763


In [13]:
save_descriptions(clean_descriptions, "descriptions.txt")

In [14]:
# Function to extract features from images
def extract_features(directory):
    # Create instance of Xception model
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for pic in tqdm(os.listdir(directory)):
        file = directory + "\\" + pic
        image = Image.open(file)
        # resize the image 299 * 299 * 3
        image.resize((299,299))
        # adds an extra dimension needed by Xception model
        image = np.expand_dims(image, axis=0)
        # These operations are done so that the pixel values are in the range -1 to 1
        image = image / 127.5
        image = image - 1.0
        feature = model.predict(image)
        features[pic] = feature
    return features

In [15]:
features = extract_features(dataset_images)

  0%|          | 0/8091 [00:00<?, ?it/s]























































In [16]:
dump(features, open("features.p","wb"))

In [17]:
features = load(open("features.p","rb"))

In [18]:
# Loading dataset for model training
def load_photos(filename):
    file = load_file(filename)
    photos = file.split("\n")[:-1]
    return photos

In [19]:
filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"
train_imgs = load_photos(filename)

In [20]:
def load_clean_descriptions(filename, photos):
    # loading clean descriptions
    file = load_file(filename)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if(len(words)<1):
            continue
        image, image_caption = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = ' ' + " ".join(image_caption) + ' '
            descriptions[image].append(desc)
    return descriptions

In [22]:
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)

In [23]:
print(train_descriptions)



In [26]:
# load features
def load_features(photos):
    all_features = load(open("features.p", "rb"))
    features = {k: all_features[k] for k in photos}
    return features

In [27]:
train_features = load_features(train_imgs)

In [28]:
print(train_features)

{'2513260012_03d33305cf.jpg': array([[0.00311506, 0.08923321, 0.00812232, ..., 0.11387293, 0.00838462,
        0.01839934]], dtype=float32), '2903617548_d3e38d7f88.jpg': array([[0.09869961, 0.06049246, 0.09424561, ..., 0.16256562, 0.09222282,
        0.00852073]], dtype=float32), '3338291921_fe7ae0c8f8.jpg': array([[1.5440936e-01, 0.0000000e+00, 4.9055638e-05, ..., 1.9618135e-02,
        6.2073600e-02, 0.0000000e+00]], dtype=float32), '488416045_1c6d903fe0.jpg': array([[0.36046287, 0.00130885, 0.01537671, ..., 0.06985299, 0.00859223,
        0.        ]], dtype=float32), '2644326817_8f45080b87.jpg': array([[0.00333591, 0.        , 0.09399841, ..., 0.02665227, 0.0120783 ,
        0.        ]], dtype=float32), '218342358_1755a9cce1.jpg': array([[0.02230419, 0.01388508, 0.02007393, ..., 0.00058924, 0.01833913,
        0.19266416]], dtype=float32), '2501968935_02f2cd8079.jpg': array([[0.06621931, 0.06007456, 0.30950847, ..., 0.02200619, 0.00563516,
        0.41551018]], dtype=float32), '26

In [29]:
from keras.preprocessing.text import Tokenizer

In [30]:
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [33]:
def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [34]:
tokenizer = create_tokenizer(train_descriptions)

In [35]:
print(tokenizer)

<keras.preprocessing.text.Tokenizer object at 0x000001F7BC07CCA0>


In [36]:
dump(tokenizer, open('tokenizer.p', 'wb'))

In [37]:
vocab_size = len(tokenizer.word_index) + 1

In [38]:
print(vocab_size)

7577


In [39]:
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

In [40]:
max_length = max_length(descriptions)

In [41]:
print(max_length)

32


In [59]:
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            # retrieve features
            feature = features[key][0]
            inp_image, inp_seq, op_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield [[inp_image, inp_seq], op_word]

In [60]:
def create_sequences(tokenizer, max_length, desc_list, feature):
    x_1, x_2, y = list(), list(), list()
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
    for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        x_1.append(feature)
        x_2.append(in_seq)
        y.append(out_seq)
    return np.array(x_1), np.array(x_2), np.array(y)

In [61]:
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))

In [62]:
print(a.shape)

(8, 2048)


In [63]:
print(b.shape)

(8, 32)


In [81]:
print(c.shape)

(8, 7577)


In [65]:
from keras.utils import plot_model

In [66]:
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [67]:
print('Dataset: ', len(train_imgs))

Dataset:  6000


In [68]:
print('Descriptions: train=', len(train_descriptions))

Descriptions: train= 6000


In [69]:
print('Photos: train=', len(train_features))

Photos: train= 6000


In [70]:
print('Vocabulary Size:', vocab_size)

Vocabulary Size: 7577


In [71]:
print('Description Length: ', max_length)

Description Length:  32


In [73]:
model = define_model(vocab_size, max_length)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 32, 256)      1939712     ['input_5[0][0]']                
                                                                                                  
 dropout_2 (Dropout)            (None, 2048)         0           ['input_4[0][0]']                
                                                                                            

In [74]:
epochs = 10

In [75]:
steps = len(train_descriptions)

In [79]:
os.mkdir("models")

In [80]:
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")

  model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)




In [82]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import argparse

In [85]:
img_path = "C:\\Users\\Dell\\Downloads\\test_1.jpg"

In [86]:
def extract_features(filename, model):
    try:
        image = Image.open(filename)
    except:
        print("ERROR: Can't open image! Ensure that image path and extension is correct")
    image = image.resize((299,299))
    image = np.array(image)
    if image.shape[2] == 4:
           image = image[..., :3]
    image = np.expand_dims(image, axis=0)
    image = image/127.5
    image = image - 1.0
    feature = model.predict(image)
    return feature