# Pipeline for preprocessing image captions: test set

The caption tokenization code is adapted from [this TF Tutorial](https://www.tensorflow.org/tutorials/text/image_captioning).

In [54]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import os
from tqdm import tqdm
import time
# Initialization code required to make TensorFlow work on my system:
# open a TF1-compat session whose GPU options have allow_growth enabled,
# then report the installed TensorFlow version.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
session = tf.compat.v1.Session(config=config)
print(tf.__version__)
import pickle

2.4.1


In [55]:
# Directory of the test-set sample; the caption metadata is read from here and
# the tokenized result is written back here.
test_data_dir = r'data/initial_sample/'
# Directory of the training sample; the pickled tokenizer is loaded from here.
train_data_dir = r'data/30k_sample/'

In [56]:
# Path to the file backing the dataframe with captions and paths to the images
# and their embeddings.
# NOTE(review): despite "train" in the variable name, this path is built from
# test_data_dir, so it refers to the test-set metadata file.
path_to_train_meta = os.path.join(test_data_dir, r'sent_added')

In [57]:
# Read the caption metadata file into a DataFrame; later cells add the
# processed, tokenized and padded caption columns to it.
data_df = pd.read_csv(path_to_train_meta)

In [58]:
# Preprocess captions to include the '<start>' and '<end>' tokens.
# Fail fast on missing captions: a NaN would otherwise only surface as an
# opaque "can only concatenate str" TypeError in the middle of the column.
if data_df['caption'].isna().any():
    raise ValueError(
        "data_df['caption'] contains missing values; "
        "cannot add <start>/<end> tokens"
    )
# Vectorized string concatenation instead of a per-row lambda.
data_df['captions_processed'] = '<start> ' + data_df['caption'] + ' <end>'

In [59]:
# Path to the pickled tokenizer used for this test set.
# Note that the file is looked up under train_data_dir, not test_data_dir.
tokenizer_file = r'tokenizer_old_pos.pickle'
tokenizer_path = os.path.join(train_data_dir, tokenizer_file)

In [60]:
# Load the pickled tokenizer object from tokenizer_path.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point tokenizer_path at a file from a trusted source.
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [61]:
# Convert every processed caption into a sequence of token ids with the loaded
# tokenizer and store the sequences in a new column.
data_df['captions_tokenized'] = tokenizer.texts_to_sequences(data_df['captions_processed'])
# Display the new column as the cell output.
data_df['captions_tokenized'] 

0                              [2, 1288, 9, 5, 73, 20, 3]
1       [2, 289, 111, 10, 37, 31, 747, 61, 4, 212, 146...
2       [2, 270, 328, 6, 242, 27, 1374, 4, 97, 12, 5, ...
3                        [2, 5, 411, 210, 46, 23, 306, 3]
4       [2, 4, 3893, 1, 1092, 10, 24, 16, 34, 127, 346...
                              ...                        
9298                       [2, 5323, 338, 14, 4, 1752, 3]
9299           [2, 7321, 8, 6757, 214, 103, 3821, 227, 3]
9300    [2, 112, 127, 320, 117, 85, 10, 1334, 7, 202, ...
9301               [2, 36, 6, 5, 1493, 559, 7, 4, 369, 3]
9302       [2, 10322, 8, 37, 298, 6, 1, 5630, 8, 2882, 3]
Name: captions_tokenized, Length: 9303, dtype: object

In [62]:
# Sanity check: the number of tokens per caption should be the same as the
# number of words (with one token removed for the '.').
print(data_df['captions_tokenized'].apply(len))

0        7
1       20
2       13
3        8
4       26
        ..
9298     7
9299     9
9300    12
9301    10
9302    11
Name: captions_tokenized, Length: 9303, dtype: int64


In [63]:
# Length, in tokens, of the longest tokenized caption.
# Previously this name was bound to the whole Series of per-caption lengths
# rather than to a single maximum value.
max_len = data_df['captions_tokenized'].apply(len).max()
# Whitespace-separated word count of each processed caption, displayed as the
# cell output for comparison with the token counts.
data_df['captions_processed'].apply(lambda x: len(x.split()))

0        7
1       22
2       15
3        9
4       30
        ..
9298     8
9299     9
9300    13
9301    11
9302    11
Name: captions_processed, Length: 9303, dtype: int64

In [64]:
# Next step is padding: pull the token-id sequences out of the dataframe
# as a plain Python list.
list_of_tokenized_captions = list(data_df['captions_tokenized'])

In [65]:
# Pad the token sequences with padding='post' and convert the resulting array
# straight back into nested Python lists.
padded_tok_captions = tf.keras.preprocessing.sequence.pad_sequences(
    list_of_tokenized_captions,
    padding='post',
).tolist()


In [66]:
# Length of the first padded caption.
# NOTE(review): presumably every padded row shares this length — confirm.
max_len = len(padded_tok_captions[0])
# Display the value as the cell output.
max_len

41

In [67]:
# Attach the padded sequences to the dataframe.
# The Series is built on data_df's own index so that the column assignment
# (which aligns by index label) keeps row i paired with padded caption i even
# if data_df does not have a default 0..n-1 index; a length mismatch now
# raises instead of silently producing NaN rows.
data_df['captions_tokenized_padded'] = pd.Series(padded_tok_captions, index=data_df.index)
# Display the new column as the cell output.
data_df['captions_tokenized_padded']

0       [2, 1288, 9, 5, 73, 20, 3, 0, 0, 0, 0, 0, 0, 0...
1       [2, 289, 111, 10, 37, 31, 747, 61, 4, 212, 146...
2       [2, 270, 328, 6, 242, 27, 1374, 4, 97, 12, 5, ...
3       [2, 5, 411, 210, 46, 23, 306, 3, 0, 0, 0, 0, 0...
4       [2, 4, 3893, 1, 1092, 10, 24, 16, 34, 127, 346...
                              ...                        
9298    [2, 5323, 338, 14, 4, 1752, 3, 0, 0, 0, 0, 0, ...
9299    [2, 7321, 8, 6757, 214, 103, 3821, 227, 3, 0, ...
9300    [2, 112, 127, 320, 117, 85, 10, 1334, 7, 202, ...
9301    [2, 36, 6, 5, 1493, 559, 7, 4, 369, 3, 0, 0, 0...
9302    [2, 10322, 8, 37, 298, 6, 1, 5630, 8, 2882, 3,...
Name: captions_tokenized_padded, Length: 9303, dtype: object

In [68]:
# Number of entries in the tokenizer's word -> index mapping.
vocab_len = len(tokenizer.word_index)
# Display the value as the cell output.
vocab_len

10325

In [69]:
# Make sure the tokenizer knows the special tokens (padding, unknown, start,
# end) by printing the index each one maps to; a missing token raises KeyError.
for special_token in ('<pad>', '<unk>', '<start>', '<end>'):
    print(tokenizer.word_index[special_token])

0
1
2
3


In [70]:
# Save the preprocessed captions: write the dataframe (without its row index)
# as CSV into the test data directory.
output_path = os.path.join(test_data_dir, "tokenized_old_sent_added_pos")
data_df.to_csv(output_path, index=False)