In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
import string 

In [2]:
df = pd.read_csv('../input/flickr8k/captions.txt' , sep=',')
df = df.rename(columns={"image": "image_name", "caption": "image_caption"})
df['image_caption'] = df['image_caption'].astype(str)
df['image_name'] = df['image_name'].astype(str)

In [3]:
def get_caption_list(df):
    caption_list = df['image_caption'].to_list()

    def clean_captions(caption_list):
        table = str.maketrans('', '', string.punctuation)

        for i in tqdm(range(len(caption_list))):
                desc = caption_list[i]
                # tokenize
                desc = desc.split()
                # convert to lower case
                desc = [word.lower() for word in desc]
                # remove punctuation from each token
                desc = [w.translate(table) for w in desc]
                # remove hanging 's' and 'a'
                desc = [word for word in desc if len(word)>1]
                # remove tokens with numbers in them
                desc = [word for word in desc if word.isalpha()]
                # store as string
                caption_list[i] =  ' '.join(desc)

        return (caption_list)
    caption_list = clean_captions(caption_list)
    
    for i, caption in enumerate(caption_list):
        caption_list[i] = '<start> ' + caption + ' <end>'
    
    
    return caption_list

In [4]:
caption_list = get_caption_list(df)
df['image_caption'] = caption_list

100%|██████████| 40455/40455 [00:00<00:00, 73807.60it/s]


In [5]:
vocabulary = []
for i, key  in enumerate(caption_list):
    word_list = (caption_list[i].split())
    for word in word_list:
        vocabulary.append(word)
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 453811


In [6]:
#from collections import Counter 
#def removeElements(lst, k): 
#    counted = Counter(lst) 
#    return [el for el in lst if counted[el] >= k] 
#k = 8
#vocabulary = ((removeElements(vocabulary, k))) 
#vocabulary.update(['<unk>'])
vocabulary = set(vocabulary)

In [7]:
ixtoword = {}
wordtoix = {}
ix = 1
for w in vocabulary:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1

In [None]:
ixtoword = np.load('../input/fork-of-image-captioning/ixtoword.npy',allow_pickle='TRUE').item()
wordtoix = np.load('../input/fork-of-image-captioning/wordtoix.npy',allow_pickle='TRUE').item()

In [None]:
max(len(d.split()) for d in caption_list)

In [8]:
caption_df =  pd.Series(caption_list, name ='image_caption') 
image_name_df = df['image_name']
df_new = pd.concat([image_name_df, caption_df], axis=1)
df_new = df_new.dropna(axis= 0)

In [9]:
df

Unnamed: 0,image_name,image_caption
0,1000268201_693b08cb0e.jpg,<start> child in pink dress is climbing up set...
1,1000268201_693b08cb0e.jpg,<start> girl going into wooden building <end>
2,1000268201_693b08cb0e.jpg,<start> little girl climbing into wooden playh...
3,1000268201_693b08cb0e.jpg,<start> little girl climbing the stairs to her...
4,1000268201_693b08cb0e.jpg,<start> little girl in pink dress going into w...
...,...,...
40450,997722733_0cb5439472.jpg,<start> man in pink shirt climbs rock face <end>
40451,997722733_0cb5439472.jpg,<start> man is rock climbing high in the air <...
40452,997722733_0cb5439472.jpg,<start> person in red shirt climbing up rock f...
40453,997722733_0cb5439472.jpg,<start> rock climber in red shirt <end>


In [10]:
from sklearn.model_selection import train_test_split
df_train , df_test = train_test_split(df,test_size=0.33, random_state=42)
df_train , df_val = train_test_split(df_train, test_size = 0.2 , random_state= 42)

In [11]:
def same_length_caption(caption , max_len=50):

    '''
    Takes caption as input and makes them of equal length
    
    Parameters:-
    caption (list) - The list of embedded caption to be made of particular length
    max_len (int) - The max length of the caption
    
    Return type:-
    
    caption (list) :- Returns a list with zero padding of length = max_len
    '''
    
    
    if(len(caption) == max_len):
        return (caption)
    else:
        for i in range((max_len-len(caption))):
            caption.append(0)
    return caption

In [12]:
def word_to_ix(caption , vocab):
    '''
    Maps the words to integers according to custom vocabulary
    
    Parameters:-
    caption (list) - The caption to be embedded
    vocab (dict) - The custom mapping that wil be used as vocabulary
    
    Return type:-
    
    caption (list) :- Returns a list after mapping them according to 'vocab'
    '''
        
    transformed_caption=[]
    for word in caption:
        if (word in wordtoix.keys()):
            transformed_caption.append(wordtoix[word])
        #else:
        #    transformed_caption.append(wordtoix['<unk>'])
    return (transformed_caption)
        

In [13]:
def ix_to_word(caption , vocab):
    '''
    Takes caption as input and maps them to words as defined by 'vocab'
    
    Parameters:-
    caption (list) - The list of embedded caption to be made of particular length
    vocab (dict) - The dictionary that wil be used as mapping
    
    Return type:-
    
    caption (list) :- Returns a list after converting respective integers to words according to vocab
    '''
    
    transformed_caption=[]
    for word in caption:
        if (word in ixtoword.keys()):
            transformed_caption.append(ixtoword[word])
        else:
            transformed_caption.append('<unk>')
    return (transformed_caption)

In [14]:
def generator(samples, batch_size=32):
    """
    Yields the next training batch.
    """
    num_samples = len(samples)
    
    while True:
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples.iloc[offset:offset+batch_size]

            input_seq=[]
            output_seq=[]
            features_list = []

            for batch_sample in batch_samples.index:
                caption_text = batch_samples.at[batch_sample , 'image_caption']
                caption = caption_text.split()
                caption = word_to_ix(caption , wordtoix)
                caption = same_length_caption(caption , max_len = 34)
                image_name = batch_samples.at[batch_sample , 'image_name']
                features = np.load('../input/image-caption-dataset/' 
                                   + image_name[0:-4]
                                   +'.npy'
                                  )
                features = (features).tolist()
                features_list.append(features)
                input_seq.append(caption[:-1])
                output_seq.append(caption[1:])
            
            features_list = np.array(features_list)
            input_seq = np.array(input_seq)
            output_seq = np.array(output_seq)

            yield [features_list, input_seq] ,output_seq

In [15]:

glove_dir = '../input/glove/glove.6B.200d.txt'
embeddings_index = {} # empty dictionary
f = open(glove_dir, encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()


In [16]:
vocab_size = len(wordtoix) + 1

In [17]:
embedding_dim = 200
# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoix.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

In [18]:
max_length = 34

In [19]:
inputs1 = tf.keras.layers.Input(shape=(64 , 2048))
fe2 = tf.keras.layers.GRU(200, return_sequences = False)(inputs1)

In [20]:
inputs2 = tf.keras.layers.Input(shape=(max_length-1))
se1 = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embedding_dim)(inputs2)
se2 = tf.keras.layers.GRU(200,return_sequences=True )(se1 , initial_state =fe2)
se3 = tf.keras.layers.GRU(200,return_sequences=True )(se2 , initial_state =fe2)

In [None]:
att = tf.keras.layers.Attention(512)([se2 , fe2])

In [21]:
output = tf.keras.layers.Dense(vocab_size,activation='softmax')(se3)
model = tf.keras.Model(inputs=[inputs1, inputs2],
                      outputs=[output])

In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 64, 2048)]   0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 200)      1753200     input_2[0][0]                    
__________________________________________________________________________________________________
gru (GRU)                       (None, 200)          1350000     input_1[0][0]                    
______________________________________________________________________________________________

In [23]:
model.layers[2]

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7f44bc165e90>

In [24]:
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

In [26]:

opt = tf.keras.optimizers.RMSprop(learning_rate=0.01)

model.compile(loss= 'sparse_categorical_crossentropy', optimizer=opt , metrics= ['accuracy'])

In [27]:
lr_red = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.2, patience=2, verbose=0, mode='auto',
    min_delta=0.0001, cooldown=0, min_lr=0.0000001
)

es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0, patience=10, verbose=1, mode='auto',baseline=None, restore_best_weights=True)

In [28]:
BATCH_SIZE = 256

In [29]:
train_generator = generator(df_train,batch_size=BATCH_SIZE)
val_generator = generator(df_val,batch_size=BATCH_SIZE)

In [30]:
history = model.fit_generator(
        train_generator,
        validation_data=val_generator,
        validation_steps=(len(df_val)/BATCH_SIZE),
        steps_per_epoch= (len(df_train)/BATCH_SIZE),
        epochs=200, verbose=1,
        callbacks = [lr_red, es]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
 2/84 [..............................] - ETA: 4:43 - loss: 0.4336 - accuracy: 0.8883

KeyboardInterrupt: 

In [31]:
from tensorflow.keras.backend import manual_variable_initialization
manual_variable_initialization(True)

In [32]:
model.save(
    'my_file', overwrite=True, include_optimizer=True, save_format='h5',
    signatures=None, options=None
)

In [33]:
np.save('wordtoix.npy', wordtoix) 
np.save('ixtoword.npy', ixtoword)