In [56]:
!pip install nltk




In [146]:
import os
import pickle
from collections import Counter, defaultdict
from os import path

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras import preprocessing
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Embedding, LSTM, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image

# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

# English stop words, held in a set for fast membership tests while
# the captions are being cleaned.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wolfe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [112]:
# Load the caption table; later cells read column 0 as the image file name
# and column 1 as the caption text.
CAPTIONS_FILE = "captions.txt"
df = pd.read_csv(CAPTIONS_FILE)

In [113]:
# First column -> image file names, second column -> caption text.
images, captions = df.iloc[:, 0], df.iloc[:, 1]

# Both columns come from the same frame, so the two shapes printed match.
for column in (images, captions):
    print(column.shape)

(40455,)
(40455,)


In [114]:
#### preprocess the captions #######
def clean_caption(caption):
    """Normalise one raw caption and wrap it in sequence markers.

    The caption is lower-cased and split on whitespace. Tokens are dropped
    when they are stop words, are a single character long (dangling
    characters), or contain anything other than letters (e.g. digits).

    Args:
        caption: raw caption string.

    Returns:
        The surviving tokens joined by single spaces, prefixed with
        "<start> " and suffixed with " <end>".
    """
    tokens = [
        word
        for word in caption.lower().split()
        if word not in stop_words and len(word) > 1 and word.isalpha()
    ]
    return "<start> " + " ".join(tokens) + " <end>"


# Build a new Series instead of assigning element by element. The previous loop
# wrote through `captions[i]`, which is a label lookup while `enumerate` yields
# positions (the two only agree for a default 0..n-1 index), and it assigned
# into a column selected from `df`. Re-running this cell is still safe: the
# "<start>"/"<end>" markers are not alphabetic, so they are stripped and re-added.
captions = captions.map(clean_caption)

print(captions[0:5])

0    <start> child pink dress climbing set stairs e...
1             <start> girl going wooden building <end>
2    <start> little girl climbing wooden playhouse ...
3    <start> little girl climbing stairs playhouse ...
4    <start> little girl pink dress going wooden ca...
Name: caption, dtype: object


In [115]:
# Fix the shuffle seed so the split, and everything derived from train_y below
# (vocabulary, maximum caption length), is reproducible across kernel restarts.
# NOTE(review): rows are split per caption; if an image has several caption
# rows they can land on both sides of the split. Confirm whether a per-image
# (grouped) split is intended.
train_x, test_x, train_y, test_y = train_test_split(images, captions, random_state=42)
print(train_x.head())
print(train_y.head())

36492      44129946_9eeb385d77.jpg
34893    3711664623_ef87105ea7.jpg
8999      241347441_d3dd9b129f.jpg
7305     2295447147_458cfea65a.jpg
11672    2584020755_14e2b3e8fc.jpg
Name: image, dtype: object
36492    <start> sun setting man woman watch boat go <end>
34893    <start> two adults two boys posing mountains l...
8999     <start> football player dressed red looks fiel...
7305                         <start> brown dog field <end>
11672    <start> child stands front dancing wedding par...
Name: caption, dtype: object


In [116]:
#### Create a vocabulary from the train captions in train_y #####
#Eliminate infrequently occurring words from the vocabulary

# Minimum number of occurrences in train_y for a token to be kept.
MIN_WORD_COUNT = 5

# Count every space-separated token across the training captions
# (this includes the "<start>" and "<end>" markers).
word_counts = Counter(
    token for caption in train_y for token in caption.split(" ")
)

# Every distinct token seen in training, before frequency filtering.
vocabulary = set(word_counts)

# Counter preserves first-seen order, so core_vocab is ordered by the first
# occurrence of each token, exactly like a plain dict filled in a loop.
core_vocab = [token for token, count in word_counts.items() if count >= MIN_WORD_COUNT]


In [117]:
#### create a list of captions corresponding to a particular image ######

# Maps an image id (file name with ".jpg" removed and surrounding whitespace
# stripped) to the list of its captions, in the order they appear.
image_to_caption = defaultdict(list)

for file_name, caption_text in zip(images, captions):
    image_id = file_name.replace(".jpg", "").strip()
    image_to_caption[image_id].append(caption_text)




In [118]:
# Load InceptionV3 with ImageNet weights, then re-wire it so that the output
# of its second-to-last layer becomes the model output. Each image is later
# turned into a feature vector by calling model.predict on it.
inception = InceptionV3(weights="imagenet")
model = Model(inputs=inception.input, outputs=inception.layers[-2].output)

In [124]:
# Encode every file in the "images" directory with the truncated InceptionV3
# model. Each image is loaded at 299x299 (the target_size passed to load_img),
# run through preprocess_input, and reduced to a flat 2048-element vector.

# The images live in an "images" folder under the working directory. The
# previous code reached the same folder via os.path.realpath("__file__") (a
# string literal, not the __file__ variable) and hard-coded Windows separators.
# The trailing "" keeps the trailing path separator the old value had.
dir_path = path.join(os.getcwd(), "images", "")
encoded_images = []

# os.listdir gives no ordering guarantee; sort so that the order of the
# feature vectors is deterministic from run to run.
# NOTE(review): file names are not stored next to the features — confirm how
# downstream code maps a vector back to its image.
for file_name in sorted(os.listdir(dir_path)):
    image_path = path.join(dir_path, file_name)
    img = preprocessing.image.load_img(image_path, target_size=(299, 299))
    X = preprocessing.image.img_to_array(img)
    X = np.expand_dims(X, axis=0)  # add a leading batch axis for predict
    X = preprocess_input(X)
    image_features = model.predict(X, verbose=0)
    encoded_images.append(image_features.reshape(2048,))

# Use a context manager so the pickle file is flushed and closed.
with open("image_pickle.p", "wb") as pickle_file:
    pickle.dump(encoded_images, pickle_file)

(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)
(2048,)


KeyboardInterrupt: 

In [134]:
#Create python dictionaries to encode indices of unique words in vocabulary
# Indices follow the order of core_vocab and start at 0.
# NOTE(review): the Embedding layer in the model cell is built with
# mask_zero=True, which treats index 0 as padding, while index 0 here belongs
# to the first core_vocab word. Confirm whether indices should start at 1.
w2idx = {word: position for position, word in enumerate(core_vocab)}
idx2w = dict(enumerate(core_vocab))

In [135]:
#Get max caption length
# Length is measured in space-separated tokens, so the "<start>" and "<end>"
# markers are counted as well.
caption_lengths = [len(caption_text.split(' ')) for caption_text in train_y]
max_caption_length = max(caption_lengths)
print(max_caption_length)

21


In [136]:
#Create dictionary of word embeddings for full vocabulary
# NOTE(review): load_word2vec_format is called without no_header, so the file
# is presumably already in word2vec text format (with a header line) — confirm.
w2v_model = KeyedVectors.load_word2vec_format("glove.6B.200d.txt", binary=False)

# Keep only the vectors of core_vocab words that the loaded model knows about.
embedding_dict = {}
for vocab_word in core_vocab:
    if vocab_word in w2v_model:
        embedding_dict[vocab_word] = w2v_model[vocab_word]

In [137]:
#Create matrix of word embeddings
# Row i holds the vector of the word whose w2idx index is i. Words without a
# pre-trained vector in embedding_dict keep an all-zero row.
emb_dimensions = 200
word_embeddings = np.zeros((len(core_vocab), emb_dimensions))
for word, index in w2idx.items():
    vector = embedding_dict.get(word)
    if vector is not None:
        word_embeddings[index] = vector


In [148]:
#Create merge architecture model
# Image branch: 2048-element image feature vector -> dropout -> 256 units.
image_inputs = Input(shape=(2048,))
image_layer1 = Dropout(0.5)(image_inputs)
image_layer2 = Dense(256, activation='relu')(image_layer1)

# Caption branch: word-index sequence -> embedding -> dropout -> LSTM(256).
# `shape` must be a tuple; a bare int is not accepted by every Keras version.
caption_inputs = Input(shape=(max_caption_length,))
# Keep a handle on the embedding layer so its weights can be set directly,
# instead of relying on its position in merge_model.layers.
# NOTE(review): mask_zero=True treats index 0 as padding, but w2idx assigns
# index 0 to the first core_vocab word — confirm the index scheme.
embedding_layer = Embedding(len(core_vocab), emb_dimensions, mask_zero=True)
caption_layer1 = embedding_layer(caption_inputs)
caption_layer2 = Dropout(0.5)(caption_layer1)
caption_layer3 = LSTM(256)(caption_layer2)

# Decoder: element-wise sum of both 256-unit branches -> dense -> softmax
# over the core vocabulary.
decoder_layer1 = add([image_layer2, caption_layer3])
decoder_layer2 = Dense(256, activation='relu')(decoder_layer1)
output = Dense(len(core_vocab), activation='softmax')(decoder_layer2)

merge_model = Model(inputs=[image_inputs, caption_inputs], outputs=output)

# Load the pre-computed embedding matrix and freeze it before compiling so the
# optimizer does not update it.
embedding_layer.set_weights([word_embeddings])
embedding_layer.trainable = False

merge_model.compile(loss='categorical_crossentropy', optimizer='adam')