# Pipeline for preprocessing image captions 

Uses some code from [this TF Tutorial](https://www.tensorflow.org/tutorials/text/image_captioning) for tokenizing captions

In [29]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

#initialization code required to make tensorflow work on the author's system:
#open a TF1-compat session whose GPU options have allow_growth switched on
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
session = tf.compat.v1.Session(config=config)
print(tf.__version__)

2.4.1


In [30]:
#root directory for the input metadata and for the files written by this notebook
data_dir = 'data/30k_sample/'

In [31]:
#path to the dataframe with the captions and the paths to the images and their embeddings
path_to_train_meta = os.path.join(data_dir, 'sent_added_neg')

In [32]:
#load the captions metadata table into memory
data_df = pd.read_csv(filepath_or_buffer=path_to_train_meta)

In [33]:
#number of rows in the metadata table
data_df.shape[0]

8912

In [34]:
#preprocess captions to include start and end tokens
data_df['captions_processed'] = data_df['caption'].map(
    lambda caption: ' '.join(('<start>', caption, '<end>'))
)

In [35]:
#Build the caption vocabulary; words outside it map to the "<unk>" token.
#'<' and '>' are not part of `filters`, so the '<start>' / '<end>' markers are not stripped.
#The backslash is spelled '\\' because '\]' is not a valid Python escape sequence;
#the resulting filter string is character-for-character the same.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts(data_df['captions_processed'])

In [36]:
#register the padding token at index 0 in both lookup directions
tokenizer.word_index.update({'<pad>': 0})
tokenizer.index_word.update({0: '<pad>'})

In [37]:
#preview the captions with the start/end tokens attached
data_df['captions_processed']

0       <start> drum and drumsticks pattern repeat sea...
1       <start> girl at the computer prints royalty fr...
2             <start> shades of purple in the lobby <end>
3       <start> lupine plants thrive in the deep fores...
4       <start> the abstract figure of the smoke on a ...
                              ...                        
8907    <start> everyone needs a deck -- a place cover...
8908    <start> vector image on the white background <...
8909    <start> little girl studying in classroom with...
8910    <start> dagger with skulls and a vector art il...
8911         <start> man picking fruit over a fence <end>
Name: captions_processed, Length: 8912, dtype: object

In [38]:
#vocabulary size before trimming (counts the '<pad>' entry registered above)
print(len(tokenizer.word_index))

8624


In [39]:
#cap on the vocabulary: only the most common words are kept
num_words = 8_000

In [40]:
#limit vocab size to num_words: keep indices 0..num_words ('<pad>' sits at 0, real tokens start at 1)
tokenizer.word_index = {word: idx for word, idx in tokenizer.word_index.items() if idx <= num_words} # <= because tokenizer is 1 indexed
#rebuild the reverse lookup from the trimmed mapping so index_word and word_index stay consistent
tokenizer.index_word = {idx: word for word, idx in tokenizer.word_index.items()}

In [41]:
#vocabulary size after trimming: indices 0..num_words inclusive
vocab_len = len(tokenizer.word_index)
vocab_len

8001

In [42]:
#turn every processed caption into its list of integer token ids
data_df['captions_tokenized'] = tokenizer.texts_to_sequences(texts=data_df['captions_processed'])
data_df['captions_tokenized']

0       [2, 3485, 10, 3486, 88, 869, 165, 7, 28, 197, ...
1         [2, 82, 17, 5, 246, 2239, 277, 123, 57, 708, 3]
2                        [2, 1443, 6, 449, 7, 5, 2240, 3]
3       [2, 4940, 450, 4941, 7, 5, 1078, 4942, 234, 5,...
4           [2, 5, 290, 1079, 6, 5, 803, 8, 4, 14, 15, 3]
                              ...                        
8907    [2, 1068, 773, 4, 1604, 4, 306, 359, 9, 58, 22...
8908                         [2, 24, 83, 8, 5, 14, 15, 3]
8909    [2, 143, 82, 1616, 7, 1172, 9, 875, 17, 5, 79,...
8910            [2, 4380, 9, 1566, 10, 4, 24, 158, 21, 3]
8911                   [2, 41, 1985, 1011, 62, 4, 506, 3]
Name: captions_tokenized, Length: 8912, dtype: object

In [43]:
#sanity check: the number of tokens should equal the number of words, minus any tokens dropped by the filters (e.g. '.')
print(data_df['captions_tokenized'].map(len))

0       14
1       11
2        8
3       13
4       12
        ..
8907    20
8908     8
8909    14
8910    10
8911     8
Name: captions_tokenized, Length: 8912, dtype: int64


In [44]:
#length of the longest tokenized caption (reduce with .max() so max_len is a single number, not a Series)
max_len = data_df['captions_tokenized'].map(len).max()
#whitespace-separated word count per processed caption, for comparison with the token counts
data_df['captions_processed'].map(lambda caption: len(caption.split()))

0       15
1       11
2        8
3       13
4       12
        ..
8907    22
8908     8
8909    15
8910    10
8911     8
Name: captions_processed, Length: 8912, dtype: int64

In [45]:
#We will now pad the sequences: first pull the token-id lists out of the dataframe as a plain Python list
list_of_tokenized_captions = list(data_df['captions_tokenized'])

In [46]:
#pad at the end ('post') of each sequence, then convert the padded array back to nested Python lists
padded_tok_captions = tf.keras.preprocessing.sequence.pad_sequences(
    list_of_tokenized_captions, padding='post'
).tolist()


In [47]:
#length of the first padded row, i.e. the padded caption length
max_len = len(padded_tok_captions[0])
max_len

50

In [48]:
#attach the padded sequences as a new column; column assignment aligns on index labels,
#so reuse data_df's own index to keep rows matched by position even if that index is not 0..n-1
data_df['captions_tokenized_padded'] = pd.Series(padded_tok_captions, index=data_df.index)
data_df['captions_tokenized_padded']

0       [2, 3485, 10, 3486, 88, 869, 165, 7, 28, 197, ...
1       [2, 82, 17, 5, 246, 2239, 277, 123, 57, 708, 3...
2       [2, 1443, 6, 449, 7, 5, 2240, 3, 0, 0, 0, 0, 0...
3       [2, 4940, 450, 4941, 7, 5, 1078, 4942, 234, 5,...
4       [2, 5, 290, 1079, 6, 5, 803, 8, 4, 14, 15, 3, ...
                              ...                        
8907    [2, 1068, 773, 4, 1604, 4, 306, 359, 9, 58, 22...
8908    [2, 24, 83, 8, 5, 14, 15, 3, 0, 0, 0, 0, 0, 0,...
8909    [2, 143, 82, 1616, 7, 1172, 9, 875, 17, 5, 79,...
8910    [2, 4380, 9, 1566, 10, 4, 24, 158, 21, 3, 0, 0...
8911    [2, 41, 1985, 1011, 62, 4, 506, 3, 0, 0, 0, 0,...
Name: captions_tokenized_padded, Length: 8912, dtype: object

In [49]:
#re-check the vocabulary size after tokenizing and padding
vocab_len = len(tokenizer.word_index)
vocab_len

8001

In [50]:
#make sure the special tokens are present and show the index of each
for special_token in ('<pad>', '<unk>', '<start>', '<end>'):
    print(tokenizer.word_index[special_token])

0
1
2
3


In [51]:
#save preprocessed captions (the dataframe now carries the tokenized and padded columns)
#NOTE(review): list-valued columns are written to CSV as text, so a reader has to parse them back into lists - confirm the consumer does
data_df.to_csv(path_or_buf=os.path.join(data_dir, 'tokenized_sent_added_neg'), index=False)

In [52]:
#pickle the fitted tokenizer and store it next to the data
tokenizer_path = os.path.join(data_dir, 'tokenizer_neg.pickle')

#write with the highest pickle protocol available to this interpreter
with open(tokenizer_path, mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)


In [53]:
#read the tokenizer back from disk to check the round trip
#NOTE: pickle.load can execute arbitrary code, so only load pickle files from a trusted source
with open(tokenizer_path, mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [54]:
#vocabulary size of the reloaded tokenizer
vocab_len = len(tokenizer.word_index)
vocab_len


8001

In [55]:
#make sure the reloaded tokenizer still has the special tokens and show the index of each
for special_token in ('<pad>', '<unk>', '<start>', '<end>'):
    print(tokenizer.word_index[special_token])

0
1
2
3
