# Pipeline for preprocessing image captions 

Uses some code from [this TensorFlow tutorial](https://www.tensorflow.org/tutorials/text/image_captioning) for tokenizing the captions.

In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import os
from tqdm import tqdm
import time
# TensorFlow GPU setup: build a TF1-compat session config with `allow_growth`
# enabled and open a session with it (required to make TensorFlow work on the
# author's system).
# NOTE(review): presumably this stops TF from reserving all GPU memory up
# front — confirm it still takes effect under TF 2.x eager execution.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)
import pickle

2.4.1


In [2]:
# Root directory of the dataset sample; the input and output paths below are built from it.
data_dir = r'data/30k_sample/'

In [3]:
# Path to the metadata file that is loaded into the captions dataframe.
# Per the original author it holds the captions plus the paths to the images
# and their embeddings.
path_to_train_meta = os.path.join(data_dir, r'sent_added')

In [4]:
# Load the caption metadata; the file is parsed as CSV even though its name has no extension.
data_df = pd.read_csv(path_to_train_meta)

In [5]:
# Wrap every caption in explicit sequence-boundary markers: '<start> ' is
# prepended and ' <end>' is appended, and the result goes into a new column.
data_df['captions_processed'] = data_df['caption'].map(
    lambda caption: ''.join(('<start> ', caption, ' <end>'))
)

In [6]:
# Build the caption vocabulary.
# - Words that are not in the vocabulary are represented by the "<unk>" token.
# - `filters` lists the characters removed from captions during tokenization.
#   '<' and '>' are not in the list, so the '<start>' / '<end>' markers survive.
# - The backslash is spelled '\\': the earlier '\]' was an invalid escape
#   sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
#   The resulting filter string is unchanged.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    oov_token="<unk>",
    filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ',
)
# Learn the word -> index mapping from the wrapped captions.
tokenizer.fit_on_texts(data_df['captions_processed'])

In [7]:
# Register an explicit padding token at index 0 in both lookup tables, so the
# zeros that fill padded sequences have a '<pad>' entry to decode to.
# NOTE(review): relies on the Keras Tokenizer never assigning index 0 to a real word — confirm.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [8]:
# Preview the captions after the <start>/<end> markers were added.
data_df['captions_processed']

0        <start> drum and drumsticks pattern repeat sea...
1        <start> abstract image with bright triangles o...
2             <start> digital art selected for the # <end>
3        <start> girl at the computer prints royalty fr...
4        <start> roadside home and tree , along the roa...
                               ...                        
29923           <start> the house of deity : mosaics <end>
29924               <start> actor at a fashion event <end>
29925    <start> man conducted the service for person <...
29926    <start> supermodel is photographed at a portra...
29927    <start> character dressed in the traditional w...
Name: captions_processed, Length: 29928, dtype: object

In [9]:
# Keep only the 10k most common words to limit the vocabulary size.
# Used below as the highest token index that survives the trimming.
num_words = 10000

In [10]:
# Limit the vocabulary to token indices 0..num_words. The bound is inclusive
# because real words are numbered from 1 (index 0 is the '<pad>' entry).
tokenizer.word_index = {
    word: idx for word, idx in tokenizer.word_index.items() if idx <= num_words
}
# Rebuild the inverse table from the trimmed mapping so that index_word no
# longer lists words that were just dropped from word_index; previously the two
# tables (and the pickled tokenizer) disagreed about the vocabulary.
tokenizer.index_word = {idx: word for word, idx in tokenizer.word_index.items()}

In [11]:
# Vocabulary size after trimming; counts every entry of word_index, including '<pad>'.
vocab_len = len(tokenizer.word_index)
vocab_len

10001

In [12]:
# Encode each wrapped caption as a list of integer token ids and store it in a new column.
# NOTE(review): words dropped from word_index are presumably encoded as '<unk>' — confirm.
data_df['captions_tokenized'] = tokenizer.texts_to_sequences(data_df['captions_processed'])
# Preview the encoded captions.
data_df['captions_tokenized'] 

0        [2, 5362, 9, 6457, 85, 1497, 124, 7, 47, 167, ...
1            [2, 186, 123, 10, 335, 1918, 8, 5, 18, 17, 3]
2                              [2, 381, 92, 574, 12, 4, 3]
3         [2, 74, 14, 4, 542, 2405, 565, 237, 187, 874, 3]
4                     [2, 6458, 80, 9, 86, 275, 4, 174, 3]
                               ...                        
29923                          [2, 4, 81, 6, 905, 5745, 3]
29924                            [2, 20, 14, 5, 46, 87, 3]
29925                    [2, 51, 4502, 4, 1414, 12, 13, 3]
29926               [2, 2257, 16, 599, 14, 5, 88, 1113, 3]
29927                     [2, 155, 857, 7, 4, 340, 253, 3]
Name: captions_tokenized, Length: 29928, dtype: object

In [13]:
# Sanity check: print the number of tokens in each encoded caption. It should
# equal the caption's word count, minus any words that consist only of
# filtered punctuation (e.g. a standalone '.').
print(data_df['captions_tokenized'].map(len))

0        14
1        11
2         7
3        11
4         9
         ..
29923     7
29924     7
29925     8
29926     9
29927     8
Name: captions_tokenized, Length: 29928, dtype: int64


In [14]:
# Longest encoded caption, in tokens. The previous code stored the whole Series
# of per-caption lengths under `max_len`; reduce it with .max() so the name
# holds the scalar it describes.
max_len = data_df['captions_tokenized'].apply(len).max()
# For comparison with the token counts: whitespace-separated word counts of the
# wrapped captions. These can be larger than the token counts because
# punctuation-only words are filtered out during tokenization.
data_df['captions_processed'].apply(lambda x: len(x.split()))

0        15
1        12
2         8
3        11
4        11
         ..
29923     8
29924     7
29925     8
29926    10
29927     9
Name: captions_processed, Length: 29928, dtype: int64

In [15]:
# We will now pad the sequences.
# First pull the encoded captions out of the dataframe as a plain Python list.
list_of_tokenized_captions = data_df['captions_tokenized'].tolist()

In [16]:
# Pad the encoded captions at the end ('post') and convert the result straight
# back to nested Python lists.
padded_tok_captions = tf.keras.preprocessing.sequence.pad_sequences(
    list_of_tokenized_captions,
    padding='post',
).tolist()


In [17]:
# Length of the padded sequences, read off the first padded caption.
max_len = len(padded_tok_captions[0])
max_len

50

In [18]:
# Store the padded sequences as a new column. The Series is built on
# data_df's own index so rows are matched positionally; a bare pd.Series(...)
# gets a fresh 0..n-1 index and would be aligned by label, silently producing
# NaN / misplaced rows if data_df's index were ever not the default range.
data_df['captions_tokenized_padded'] = pd.Series(padded_tok_captions, index=data_df.index)
# Preview the padded captions.
data_df['captions_tokenized_padded']

0        [2, 5362, 9, 6457, 85, 1497, 124, 7, 47, 167, ...
1        [2, 186, 123, 10, 335, 1918, 8, 5, 18, 17, 3, ...
2        [2, 381, 92, 574, 12, 4, 3, 0, 0, 0, 0, 0, 0, ...
3        [2, 74, 14, 4, 542, 2405, 565, 237, 187, 874, ...
4        [2, 6458, 80, 9, 86, 275, 4, 174, 3, 0, 0, 0, ...
                               ...                        
29923    [2, 4, 81, 6, 905, 5745, 3, 0, 0, 0, 0, 0, 0, ...
29924    [2, 20, 14, 5, 46, 87, 3, 0, 0, 0, 0, 0, 0, 0,...
29925    [2, 51, 4502, 4, 1414, 12, 13, 3, 0, 0, 0, 0, ...
29926    [2, 2257, 16, 599, 14, 5, 88, 1113, 3, 0, 0, 0...
29927    [2, 155, 857, 7, 4, 340, 253, 3, 0, 0, 0, 0, 0...
Name: captions_tokenized_padded, Length: 29928, dtype: object

In [19]:
# Re-check the vocabulary size (number of entries in word_index) after tokenizing and padding.
vocab_len = len(tokenizer.word_index)
vocab_len

10001

In [20]:
# Confirm the special tokens are present by printing the index of each one,
# in the order: padding, unknown word, caption start, caption end.
for special_token in ('<pad>', '<unk>', '<start>', '<end>'):
    print(tokenizer.word_index[special_token])

0
1
2
3


In [21]:
# Save the dataframe, including the new caption columns, into the data directory
# (row index is not written).
# NOTE(review): the list-valued columns are presumably written as their string
# representation, so a reader would have to parse them back into lists — confirm.
data_df.to_csv(os.path.join(data_dir, "tokenized_sent_added"),index=False)

In [22]:
# Pickle the fitted tokenizer into the data directory.
tokenizer_path = os.path.join(data_dir, r'tokenizer.pickle')

# Saving: serialize with the highest pickle protocol available.
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [23]:
# Loading: read the tokenizer back from the pickle file, replacing the in-memory one.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [24]:
# Vocabulary size of the reloaded tokenizer (number of entries in word_index).
vocab_len = len(tokenizer.word_index)
vocab_len


10001

In [25]:
# Confirm the reloaded tokenizer still has the special tokens by printing the
# index of each one, in the order: padding, unknown word, caption start, caption end.
for special_token in ('<pad>', '<unk>', '<start>', '<end>'):
    print(tokenizer.word_index[special_token])

0
1
2
3
