In [41]:
!pip install nltk




In [42]:
import pandas as pd
import numpy as np
import nltk
import pickle
from os import path
import gensim
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.translate.bleu_score import corpus_bleu
from tensorflow.keras import preprocessing
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Embedding, LSTM, add

nltk.download("stopwords")
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wolfe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv("captions.txt")

In [4]:
images = df.iloc[:, 0]
captions = df.iloc[:, 1]
print(images.shape)
print(captions.shape)

(40455,)
(40455,)


In [5]:
#### preprocess the captions #######
for i, caption in enumerate(captions):
    _caption = caption.lower()
    tokens = _caption.split()
    tokens = [word for word in tokens if not word in stop_words]
    # token length greater than one. removes dangling characters
    tokens = [word for word in tokens if len(word)>1]
    # remove tokens with numbers in them
    tokens = [word for word in tokens if word.isalpha()]
    #Add start and end sequence tokens
    captions[i] = "<start> " + " ".join(tokens) + " <end>"


print(captions[0:5])

0    <start> child pink dress climbing set stairs e...
1             <start> girl going wooden building <end>
2    <start> little girl climbing wooden playhouse ...
3    <start> little girl climbing stairs playhouse ...
4    <start> little girl pink dress going wooden ca...
Name: caption, dtype: object


In [6]:
train_x, test_x, train_y, test_y = train_test_split(images, captions)
print(train_x[0:5])
print(train_y[0:5])

133       103195344_5d2dc613a3.jpg
36801     460195978_fc522a4979.jpg
37935     514905846_b54d13946a.jpg
7897     2341254813_c53a5ef27a.jpg
15624    2855667597_bf6ceaef8e.jpg
Name: image, dtype: object
133      <start> man sitting front metal sculpture fron...
36801    <start> grey grey sweashirt running alongside ...
37935          <start> boy shirt tie jumps staircase <end>
7897           <start> child jumps air bowling alley <end>
15624    <start> four large black dogs running grass <end>
Name: caption, dtype: object


In [7]:
#### Create a vocabulary from the train captions in train_y #####
#Eliminate infrequently occurring words from the vocabulary

vocabulary = set()
word_counts = {}

for caption in train_y:
    tokens = caption.split(" ")
    for token in tokens:
        vocabulary.add(token)
        if token in word_counts:
            word_counts[token] += 1
        else:
            word_counts[token] = 1

core_vocab = [i for i in word_counts if word_counts[i] >= 5]


In [8]:
#### create a list of captions corresponding to a particular image ######

image_to_caption = defaultdict(list)

for img_, caption in zip(images, captions):
    _image = img_.replace(".jpg", "").strip()
    image_to_caption[_image].append(caption)




In [9]:
model = InceptionV3(weights="imagenet")
model = Model(model.input, model.layers[-2].output)

In [33]:
# Convert all the images to size 299x299 as expected by the
# encoded_images = []
# for image in images:
#   path = path.join("images", image)
#   img = image.load_img(path, target_size=(299, 299))
#   X = image.img_to_array(img)
#   X = np.expand_dims(x, axis=0)
#   X = preprocess_input(x)
#   X = np.reshape(x, x.shape[1])

dir_path = os.path.dirname(os.path.realpath("__file__")) + "\\images\\"
encoded_images = []
encoded_image_dict = {}

for i in os.listdir(dir_path):
    image_path = path.join(dir_path, i)
    img = preprocessing.image.load_img(image_path, target_size=(299, 299))
    X = preprocessing.image.img_to_array(img)
    X = np.expand_dims(X, axis=0)
    X = preprocess_input(X)
    image_features = model.predict(X, verbose=0)
    encoded_images.append(image_features.reshape(2048,))
    encoded_image_dict[i] = image_features.reshape(2048,)

pickle.dump(encoded_images, open("image_pickle.p", "wb"))
pickle.dump(encoded_image_dict, open("image_dict_pickle.p", "wb"))

"""
image_path = path.join(dir_path,"images\\1000268201_693b08cb0e.jpg")
img = preprocessing.image.load_img(image_path, target_size=(299, 299))
X = preprocessing.image.img_to_array(img)
print(X.shape)
X = np.expand_dims(X, axis=0)
print(X.shape)
X = preprocess_input(X)
print(X.shape)
"""

'\nimage_path = path.join(dir_path,"images\\1000268201_693b08cb0e.jpg")\nimg = preprocessing.image.load_img(image_path, target_size=(299, 299))\nX = preprocessing.image.img_to_array(img)\nprint(X.shape)\nX = np.expand_dims(X, axis=0)\nprint(X.shape)\nX = preprocess_input(X)\nprint(X.shape)\n'

In [11]:
#Create python dictionaries to encode indices of unique words in vocabulary
w2idx = {core_vocab[i]: i for i in range(len(core_vocab))}
idx2w = {i: core_vocab[i] for i in range(len(core_vocab))}

In [12]:
#Get max caption length
max_caption_length = max(len(i.split(' ')) for i in train_y)
print(max_caption_length)

22


In [37]:
def generate_data(caption_dict, images, w2idx, max_len, batch_size):
    image_sequence, input_words, output_words = [], [], []

    for i in range(len(images)):

        for img_name, caption_list in caption_dict.items():
            photo = images[img_name + '.jpg']

            for j in caption_list:
                caption = [w2idx[w] for w in caption_list if w in w2idx]

                for k in caption:
                    input_sequence = pad_sequences([caption[:k]], maxlen = max_len)[0]
                    output_sequence = to_categorical([caption[k]], num_classes = len(w2idx))[0]
                    
                    image_sequence.append(photo)
                    input_words.append(input_sequence)
                    output_words.append(output_sequence)

            if i == batch_size:
                yield [[np.array(image_sequence), np.array(input_words)], np.array(output_words)]
                image_sequence, input_words, output_words = [], [], []

In [34]:
#Create dictionary of word embeddings for full vocabulary
w2v_model = KeyedVectors.load_word2vec_format("glove.6B.200d.txt", binary=False)
embedding_dict = {i: w2v_model[i] for i in core_vocab if i in w2v_model}
pickle.dump(embedding_dict, open("w2v_embeddings.p", "wb"))

In [15]:
#Create matrix of word embeddings
emb_dimensions = 200
word_embeddings = np.zeros((len(core_vocab), emb_dimensions))
for word, index in w2idx.items():
    if word in embedding_dict:
        word_embeddings[index] = embedding_dict[word]


In [39]:
#Create merge architecture model
image_inputs = Input(shape=(2048,))
image_layer1 = Dropout(0.5)(image_inputs)
image_layer2 = Dense(256, activation = 'relu')(image_layer1)

caption_inputs = Input(shape = max_caption_length)
caption_layer1 = Embedding(len(core_vocab), emb_dimensions, mask_zero = True)(caption_inputs)
caption_layer2 = Dropout(0.5)(caption_layer1)
caption_layer3 = LSTM(256)(caption_layer2)

decoder_layer1 = add([image_layer2, caption_layer3])
decoder_layer2 = Dense(256, activation = 'relu')(decoder_layer1)
output = Dense(len(core_vocab), activation = 'softmax')(decoder_layer2)

merge_model = Model(inputs = [image_inputs, caption_inputs], outputs = output)
merge_model.layers[2].set_weights([word_embeddings])
merge_model.layers[2].trainable = False

merge_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [35]:
def unpickle_images(file_name):
    dir_path = os.path.dirname(os.path.realpath("__file__"))
    with open(dir_path + '\\' + file_name, 'rb') as p:
        encoded_images = pickle.load(p)
    return encoded_images

In [40]:
encoded_images = unpickle_images('image_dict_pickle.p')
num_epochs = 20
num_steps = len(encoded_images)
for i in range(num_epochs):
    data_generator = generate_data(image_to_caption, encoded_images, w2idx, max_caption_length, 5)
    merge_model.fit(data_generator, epochs = 1, steps_per_epoch = num_steps, verbose = 1)
    model.save('model_epoch_' + str(i) + '.h5')

02_97e5819b79
399212516_d68046b277
399246804_b4b5dc70e1
399679638_d3036da331
400562847_e15aba0aac
400851260_5911898657
401079494_562454c4d6
401476986_73918145a3
403523132_73b9a1a4b3
403678611_73978faed7
404216567_75b50b5a36
404702274_fa8b3fe378
404850242_3a55a4c874
404890608_33f138aefa
405051459_3b3a3ba5b3
405253184_5f611f3880
405331006_4e94e07698
405534893_2d0f3b0147
405534993_5158644f98
405537503_f66ecc5073
405615014_03be7ef618
405961988_fcfe97f31e
405970010_8cebaa77d3
406248253_27b5eba25a
406642021_9ec852eccf
406901451_7eafd7568a
407008823_bdd7fc6ed5
407569668_19b3f8eaf6
407678652_1f475acd65
408233586_f2c1be3ce1
408573233_1fff966798
408627152_1feaa4b94e
408748500_e8dc8c0c4f
409327234_7b29eecb4e
410042380_517ccee020
410413536_11f1127c46
410422753_de506155fa
410453140_5401bf659a
411011549_1298d2b4d2
411175971_0fffd3b8c6
411216802_aead9e67e3
411863595_d77156687e
412056525_191724b058
412082368_371df946b3
412101267_7257e6d8c0
412203580_2c7278909c
413231421_43833a11f5
413737417_b0a8b445e9

ValueError: in user code:

    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py:531 train_step  **
        y_pred = self(x, training=True)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\network.py:719 call
        convert_kwargs_to_constants=base_layer_utils.call_context().saving)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\network.py:888 _run_internal_graph
        output_tensors = layer(computed_tensors, **kwargs)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:886 __call__
        self.name)
    C:\Users\wolfe\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:216 assert_input_compatibility
        ' but received input with shape ' + str(shape))

    ValueError: Input 0 of layer dense_3 is incompatible with the layer: expected axis -1 of input shape to have value 2048 but received input with shape [None, 1]


In [None]:
def generate_caption(image, max_len):
    caption_ = recursive_caption(image, '<start>', max_len)
    caption_list = caption_.split(' ')[1:-1]
    return ' '.join(caption_list)

def recursive_caption(image, current_caption, max_len):
    caption_indices = [w2idx[i] for i in current_caption.split() if i in w2idx]
    padded_caption = pad_sequences([caption_indices], maxlen = max_len + len(caption_indices))

    new_token = idx2w[np.argmax(merge_model.predict([image, padded_caption], verbose=0))]
    caption = current_caption + new_token

    if new_token == '<end>':
        return caption
    if depth == 0:
        return caption
        
    recursive_caption(image, caption, max_len-1)


In [None]:
def model_evaluation(model, caption_list, images, max_len):
    ground_truth, predictions = [], []
    
    for key, captions in caption_list.items():

        prediction = generate_caption(images[key], max_len)
        predictions.append(prediction)

        g_truth = [caption.split() for caption in captions]
        ground_truth.append(g_truth.split())
    
    bleu_1 = corpus_bleu(ground_truth, predictions, weights = [1.0, 0, 0, 0])
    bleu_2 = corpus_bleu(ground_truth, predictions, weights = [.5, .5, 0, 0])
    bleu_3 = corpus_bleu(ground_truth, predictions, weights = [.3, .3, .3, 0])
    bleu_4 = corpus_bleu(ground_truth, predictions, weights = [.25, .25, .25, .25])
    
    return [bleu_1, bleu_2, bleu_3, bleu_4]           