# Pipeline for preprocessing image captions: test set

This notebook reuses some code from [this TF Tutorial](https://www.tensorflow.org/tutorials/text/image_captioning) for tokenizing captions.

In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import os
from tqdm import tqdm
import time
# Initialization code required to make TensorFlow work on my system.
# Build a TF1-compat session config and turn on the GPU `allow_growth` option
# (presumably so GPU memory is claimed on demand rather than all at once -- see TF docs).
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
# Create a session with that config; `session` is not referenced again in the cells shown here.
session = tf.compat.v1.Session(config=config)
# Report the TensorFlow version in use.
print(tf.__version__)
import pickle

2.4.1


In [15]:
# Root folders (relative to the notebook) for the two data samples used below:
# the training sample and the test sample.
train_data_dir = 'data/30k_sample/'
test_data_dir = 'data/initial_sample/'

In [16]:
# Location of the CSV holding the captions together with the paths to the
# images and their embeddings.
# NOTE: despite the "train" in the variable name, this path is built from the
# test-set directory.
path_to_train_meta = os.path.join(test_data_dir, 'sent_added')

In [17]:
# Load the caption metadata table into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer=path_to_train_meta)

In [18]:
# Preprocess captions to include start and end tokens.
def _wrap_with_boundary_tokens(caption):
    """Return `caption` surrounded by the '<start>' and '<end>' marker tokens."""
    return '<start> ' + caption + ' <end>'


data_df['captions_processed'] = data_df['caption'].apply(_wrap_with_boundary_tokens)

In [19]:
# Path to the pickled tokenizer used for this test set.
# The file is looked up inside the training-sample directory.
tokenizer_file = 'tokenizer_old.pickle'
tokenizer_path = os.path.join(train_data_dir, tokenizer_file)

In [20]:
# Deserialize the saved tokenizer from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open(tokenizer_path, mode='rb') as tokenizer_fh:
    tokenizer = pickle.load(tokenizer_fh)

In [21]:
# Convert every processed caption into its sequence of integer token ids
# using the loaded tokenizer, then display the new column.
caption_texts = data_df['captions_processed'].tolist()
data_df['captions_tokenized'] = tokenizer.texts_to_sequences(caption_texts)
data_df['captions_tokenized']

0                              [2, 1392, 8, 5, 60, 17, 3]
1       [2, 344, 168, 10, 34, 32, 1005, 55, 4, 228, 21...
2       [2, 306, 296, 6, 157, 26, 1234, 4, 82, 15, 5, ...
3                        [2, 5, 545, 236, 52, 24, 362, 3]
4       [2, 4, 5269, 1, 928, 10, 22, 21, 30, 99, 363, ...
                              ...                        
9298                       [2, 5845, 326, 11, 4, 2287, 3]
9299            [2, 3531, 9, 4814, 200, 91, 3167, 226, 3]
9300    [2, 159, 99, 279, 124, 85, 10, 1496, 7, 272, 8...
9301               [2, 44, 6, 5, 1248, 612, 7, 4, 466, 3]
9302        [2, 6219, 9, 34, 338, 6, 1, 4178, 9, 2078, 3]
Name: captions_tokenized, Length: 9303, dtype: object

In [22]:
# Sanity check: the number of tokens per caption should match its number of
# words (with one token removed for the '.').
token_counts = data_df['captions_tokenized'].map(len)
print(token_counts)

0        7
1       20
2       13
3        8
4       26
        ..
9298     7
9299     9
9300    12
9301    10
9302    11
Name: captions_tokenized, Length: 9303, dtype: int64


In [23]:
# Longest tokenized caption, in tokens. Reduce the per-caption lengths with
# .max() so `max_len` holds a single number, as its name implies, rather than
# the whole Series of lengths.
max_len = data_df['captions_tokenized'].apply(len).max()
# Whitespace-separated word count of each processed caption, displayed for
# comparison with the per-caption token counts.
data_df['captions_processed'].apply(lambda x: len(x.split()))

0        7
1       22
2       15
3        9
4       30
        ..
9298     8
9299     9
9300    13
9301    11
9302    11
Name: captions_processed, Length: 9303, dtype: int64

In [24]:
# Pull the tokenized captions out of the DataFrame as a plain Python list of
# lists; this is the input to the padding step.
list_of_tokenized_captions = data_df['captions_tokenized'].to_list()

In [25]:
# Pad the token sequences at the end ('post'), then convert the padded
# result back to nested Python lists.
padded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    list_of_tokenized_captions,
    padding='post',
)
padded_tok_captions = padded_matrix.tolist()


In [26]:
# The padded sequence length is read off the first padded caption.
first_padded_caption = padded_tok_captions[0]
max_len = len(first_padded_caption)
max_len

41

In [27]:
# Store the padded token lists as a new column.
# The Series is built on data_df's own index: column assignment aligns on
# index labels, so a Series with a fresh 0..n-1 index would silently yield
# NaN/misaligned rows if data_df ever carried a non-default index. Passing the
# index also makes a row-count mismatch fail loudly.
data_df['captions_tokenized_padded'] = pd.Series(padded_tok_captions, index=data_df.index)
data_df['captions_tokenized_padded']

0       [2, 1392, 8, 5, 60, 17, 3, 0, 0, 0, 0, 0, 0, 0...
1       [2, 344, 168, 10, 34, 32, 1005, 55, 4, 228, 21...
2       [2, 306, 296, 6, 157, 26, 1234, 4, 82, 15, 5, ...
3       [2, 5, 545, 236, 52, 24, 362, 3, 0, 0, 0, 0, 0...
4       [2, 4, 5269, 1, 928, 10, 22, 21, 30, 99, 363, ...
                              ...                        
9298    [2, 5845, 326, 11, 4, 2287, 3, 0, 0, 0, 0, 0, ...
9299    [2, 3531, 9, 4814, 200, 91, 3167, 226, 3, 0, 0...
9300    [2, 159, 99, 279, 124, 85, 10, 1496, 7, 272, 8...
9301    [2, 44, 6, 5, 1248, 612, 7, 4, 466, 3, 0, 0, 0...
9302    [2, 6219, 9, 34, 338, 6, 1, 4178, 9, 2078, 3, ...
Name: captions_tokenized_padded, Length: 9303, dtype: object

In [28]:
# Vocabulary size = number of entries in the tokenizer's word -> index mapping.
word_to_index = tokenizer.word_index
vocab_len = len(word_to_index)
vocab_len

12942

In [29]:
# Make sure the special tokens (padding, unknown, start, end) are present in
# the tokenizer vocabulary by printing the index of each one.
for special_token in ('<pad>', '<unk>', '<start>', '<end>'):
    print(tokenizer.word_index[special_token])

0
1
2
3


In [30]:
# Save the preprocessed captions next to the input file in the test-set
# directory, without writing the DataFrame index.
tokenized_output_path = os.path.join(test_data_dir, "tokenized_sent_added")
data_df.to_csv(tokenized_output_path, index=False)