# Pipeline for preprocessing image captions: test set

The caption tokenization uses some code from [this TensorFlow tutorial](https://www.tensorflow.org/tutorials/text/image_captioning).

In [45]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import os
from tqdm import tqdm
import time
# Initialization code required to make TensorFlow work on my system.
# Builds a TF1-compat session config with allow_growth enabled on its GPU
# options (per the TensorFlow docs, GPU memory is then allocated on demand
# instead of being reserved up front) and opens a session with it.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
# Print the installed TensorFlow version.
print(tf.__version__)
import pickle

2.4.1


In [46]:
# Root directories of the two data samples referenced in this notebook.
train_data_dir = "data/30k_sample/"
test_data_dir = "data/initial_sample/"

In [47]:
# Path to the dataframe file with the captions and the paths to the images and
# their embeddings. NOTE: despite the "train" in the variable name, the path is
# built from the test data directory.
path_to_train_meta = os.path.join(test_data_dir, "sent_added")

In [48]:
# Load the caption metadata table from disk.
data_df = pd.read_csv(filepath_or_buffer=path_to_train_meta)

In [49]:
# Preprocess captions to include start and end tokens.
def _add_boundary_tokens(caption):
    """Return `caption` with '<start> ' prepended and ' <end>' appended."""
    return '<start> ' + caption + ' <end>'


data_df['captions_processed'] = data_df['caption'].apply(_add_boundary_tokens)

In [50]:
# Location of the pickled tokenizer applied to this test set. Note that the
# file is looked up inside the training data directory.
tokenizer_file = "tokenizer_neg.pickle"
tokenizer_path = os.path.join(train_data_dir, tokenizer_file)

In [51]:
# Restore the pickled tokenizer object from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# tokenizer_path must only ever point at a file from a trusted source.
with open(tokenizer_path, mode='rb') as tokenizer_fh:
    tokenizer = pickle.load(tokenizer_fh)

In [52]:
# Convert every processed caption into its sequence of integer token ids,
# store the result as a new column, and display that column.
caption_sequences = tokenizer.texts_to_sequences(data_df['captions_processed'])
data_df['captions_tokenized'] = caption_sequences
data_df['captions_tokenized']

0                              [2, 1671, 8, 4, 51, 15, 3]
1       [2, 719, 7762, 9, 39, 32, 6000, 52, 5, 219, 76...
2       [2, 519, 237, 6, 86, 26, 1008, 5, 54, 18, 4, 3...
3                       [2, 4, 1930, 250, 74, 22, 780, 3]
4       [2, 5, 1, 1, 693, 9, 19, 110, 29, 59, 427, 310...
                              ...                        
9291                          [2, 1, 303, 11, 5, 5796, 3]
9292           [2, 1871, 10, 3441, 160, 62, 2434, 198, 3]
9293    [2, 1025, 59, 209, 165, 88, 9, 2145, 7, 542, 1...
9294              [2, 210, 6, 4, 912, 796, 7, 5, 1392, 3]
9295      [2, 4436, 10, 39, 545, 6, 1, 2908, 10, 1342, 3]
Name: captions_tokenized, Length: 9296, dtype: object

In [53]:
# Sanity check: the number of tokens per caption should be the same as the
# number of words (with one token removed for the ".").
print(data_df['captions_tokenized'].map(len))

0        7
1       20
2       13
3        8
4       26
        ..
9291     7
9292     9
9293    12
9294    10
9295    11
Name: captions_tokenized, Length: 9296, dtype: int64


In [54]:
# Number of token ids in each tokenized caption.
tokens_per_caption = data_df['captions_tokenized'].map(len)
# max_len is a single number: the length of the longest tokenized caption
# (not the whole per-row Series of lengths).
max_len = tokens_per_caption.max()
# Whitespace-separated word count of each processed caption, displayed so it
# can be compared against the token counts.
data_df['captions_processed'].apply(lambda x: len(x.split()))

0        7
1       22
2       15
3        9
4       30
        ..
9291     8
9292     9
9293    13
9294    11
9295    11
Name: captions_processed, Length: 9296, dtype: int64

In [55]:
# Pull the tokenized captions out of the frame as a plain Python list so the
# sequences can be padded.
list_of_tokenized_captions = list(data_df['captions_tokenized'])

In [56]:
# Pad at the end ("post") of every sequence, then turn the resulting array back
# into nested Python lists. No maxlen is passed, so per the Keras docs the
# target length is that of the longest sequence in this list.
# NOTE(review): confirm this length matches what the downstream model expects.
padded_tok_captions = tf.keras.preprocessing.sequence.pad_sequences(
    list_of_tokenized_captions, padding='post'
).tolist()


In [57]:
# Padded sequence length, read off the first padded row (pad_sequences pads
# all rows to one common length). Displayed as the cell output.
max_len = len(padded_tok_captions[0])
max_len

41

In [58]:
# Attach the padded sequences as a new column and display it. The Series is
# built on data_df's own index because column assignment aligns on index
# labels: a fresh 0..n-1 index would silently yield NaN rows if data_df ever
# carried a non-default index (e.g. after filtering).
data_df['captions_tokenized_padded'] = pd.Series(padded_tok_captions, index=data_df.index)
data_df['captions_tokenized_padded']

0       [2, 1671, 8, 4, 51, 15, 3, 0, 0, 0, 0, 0, 0, 0...
1       [2, 719, 7762, 9, 39, 32, 6000, 52, 5, 219, 76...
2       [2, 519, 237, 6, 86, 26, 1008, 5, 54, 18, 4, 3...
3       [2, 4, 1930, 250, 74, 22, 780, 3, 0, 0, 0, 0, ...
4       [2, 5, 1, 1, 693, 9, 19, 110, 29, 59, 427, 310...
                              ...                        
9291    [2, 1, 303, 11, 5, 5796, 3, 0, 0, 0, 0, 0, 0, ...
9292    [2, 1871, 10, 3441, 160, 62, 2434, 198, 3, 0, ...
9293    [2, 1025, 59, 209, 165, 88, 9, 2145, 7, 542, 1...
9294    [2, 210, 6, 4, 912, 796, 7, 5, 1392, 3, 0, 0, ...
9295    [2, 4436, 10, 39, 545, 6, 1, 2908, 10, 1342, 3...
Name: captions_tokenized_padded, Length: 9296, dtype: object

In [59]:
# Number of entries in the tokenizer's word -> index mapping; displayed as the
# cell output.
vocab_len = len(tokenizer.word_index)
vocab_len

8001

In [60]:
# Make sure the special tokens (pad, unk, start, end) exist in the vocabulary
# by printing the index assigned to each; a missing token raises KeyError.
for special_token in ('<pad>', '<unk>', '<start>', '<end>'):
    print(tokenizer.word_index[special_token])

0
1
2
3


In [61]:
# Save the preprocessed captions: write the whole frame, including the new
# tokenized columns, into the test data directory. index=False leaves the row
# index out of the file.
output_path = os.path.join(test_data_dir, "tokenized_sent_added_neg")
data_df.to_csv(output_path, index=False)