In [1]:
import numpy as np
from PIL import Image # Python Imaging Library
import os
import string
from pickle import dump # pickle = built-in Python module that allows you to store and retrieve Python objects
from pickle import load
from keras.applications.xception import Xception # to get pretrained model Xception (CNN)
from keras.applications.xception import preprocess_input
from tensorflow.keras.utils import load_img
from tensorflow.keras.utils import img_to_array
from keras.preprocessing.text import Tokenizer # for text tokenization
from keras.utils import pad_sequences # padding the sequence of text
from keras.utils import to_categorical
from keras.layers import add
from keras.models import Model, load_model # define, train, evaluate the model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout # keras to build our CNN and LSTM (Long Short term memory)
from tqdm.notebook import tqdm as tqdm # to check loop progress
tqdm().pandas()

0it [00:00, ?it/s]

In [2]:
# Function to load document file into memory
def load_file(filename):
    # open the file
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [3]:
# Function to get all images with their captions
def image_captions(filename):
    file = load_file(filename)
    # splits the text in lines
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [4]:
# Function to clean the dictionary - "descriptions"
def text_clean(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            img_caption.replace("-", " ")
            descp = img_caption.split()
            # uppercase to lowercase
            descp = [wrd.lower() for wrd in descp]
            # remove punctuation
            descp = [wrd.translate(table) for wrd in descp]
            # remove hanging 's and a
            descp = [wrd for wrd in descp if(len(wrd)>1)]
            # remove words containing numbers with them
            descp = [wrd for wrd in descp if(wrd.isalpha())]
            # converting back to string
            img_caption = ' '.join(descp)
            captions[img][i] = img_caption
    return captions

In [5]:
dataset_text = "F:\\IIITN\\6th sem\\Machine Learning\\Image_Captioning_Project\\Flickr8k_text" # Location of token text file
dataset_images = "F:\\IIITN\\6th sem\\Machine Learning\\Image_Captioning_Project\\Flickr8k_Dataset\\Flicker8k_Dataset" # location of images

In [6]:
# Create a vocabulary of the model
def text_vocab(descriptions):
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

In [7]:
# Function to save descriptions in a file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key+'\t'+desc)
    data = "\n".join(lines)
    file = open(filename, "w")
    file.write(data)
    file.close()

In [8]:
filename = dataset_text + "\\" + "Flickr8k.token.txt"

In [9]:
descriptions = image_captions(filename)
print("Length of descriptions = ", len(descriptions))

Length of descriptions =  8092


In [10]:
clean_descriptions = text_clean(descriptions)

In [11]:
vocabulary = text_vocab(clean_descriptions)

In [12]:
print("Length of vocabulary = ", len(vocabulary))

Length of vocabulary =  8763


In [13]:
save_descriptions(clean_descriptions, "descriptions.txt")

In [14]:
# Function to extract features from images
def extract_features(directory):
    # Create instance of Xception model
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for pic in tqdm(os.listdir(directory)):
        file = directory + "\\" + pic
        image = Image.open(file)
        # resize the image 299 * 299 * 3
        image.resize((299,299))
        # adds an extra dimension needed by Xception model
        image = np.expand_dims(image, axis=0)
        # These operations are done so that the pixel values are in the range -1 to 1
        image = image / 127.5
        image = image - 1.0
        feature = model.predict(image)
        features[pic] = feature
    return features

In [None]:
features = extract_features(dataset_images)

  0%|          | 0/8091 [00:00<?, ?it/s]

























In [None]:
dump(features, open("features.p","wb"))

In [None]:
features = load(open("features.p","rb"))

In [None]:
# Loading dataset for model training
def load_photos(filename):
    file = load_file(filename)
    photos = file.split("\n")[:-1]
    return photos

In [None]:
filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"
train_imgs = load_photos(filename)