"""Data preparation for training image captioning model
This script will do the followings:
1) Come up with a vocab list by pooling all training and val captions
2) Convert each word from captions to an integer based on the vocab list
3) Produce image-name-index mapping, that maps an image to an integer based on its name (e.g. COCO_train2014_000000417432.jpg -> 1)
4) Rename all images using the image-name-index mapping above
import json
import os
import collections
import tensorflow as tf
import re
import h5py
import argparse
import sys
import numpy as np
import pandas as pd
# Parsed command-line flags; assigned in the ``__main__`` section from
# ``parser.parse_known_args()`` and read by main().
FLAGS = None
# Reserved tokens. main() gives them the first vocabulary indices
# (0 .. len(BUFFER_TOKENS) - 1), ahead of every real word.
BUFFER_TOKENS = ['<NULL>', '<START>', '<END>', '<UNK>']
def _parse_sentence(s):
s = s.replace('.', '')
s = s.replace(',', '')
s = s.replace('"', '')
s = s.replace("'", '')
s = s.lower()
s = re.sub("\s\s+", " ", s)
s = s.split(' ')
return s
def preprocess_json_files(path_to_dir):
    """Extract captions, image ids and image urls from the COCO caption files.

    Only files whose name contains ``captions_train2014`` or
    ``captions_val2014`` are read; every other entry in the directory is
    reported and skipped.

    Args:
        path_to_dir: Directory holding the caption JSON files.

    Returns:
        dict: Maps each processed file name to a tuple
        ``(caps, images, urls)`` where

        * ``caps`` is a list of sentences, each a list of words,
        * ``images`` is a list of image ids, parallel to ``caps``,
        * ``urls`` is a dict mapping each image id to its ``flickr_url``.

    Raises:
        AssertionError: If ``path_to_dir`` does not exist.
    """
    assert os.path.exists(path_to_dir), 'Path to directory of files does not exist!'
    results = {}
    for file_name in os.listdir(path_to_dir):
        if 'captions_train2014' not in file_name and 'captions_val2014' not in file_name:
            print("Skipping file {}".format(file_name))
            # Actually skip it: without this the file would still be opened
            # and handed to json.load, which fails on non-JSON files such as
            # the .h5/.csv/.txt outputs this script writes to the same dir.
            continue
        temp_path = os.path.join(path_to_dir, file_name)
        with open(temp_path, 'r') as f:
            data = json.load(f)
        annotations = data['annotations']
        images = [item['image_id'] for item in annotations]
        urls = {img['id']: img['flickr_url'] for img in data['images']}
        caps = [_parse_sentence(item['caption']) for item in annotations]
        results[file_name] = (caps, images, urls)
        # Drop the parsed JSON before loading the next (large) file.
        del data
    return results
def rename_images(dir, image_id_to_idx):
    """Rename every image in ``dir`` from its COCO name to ``<index>.jpg``.

    The image id is taken from the second-to-last piece of the file name
    when split on ``.`` and ``_`` (``COCO_train2014_000000417432.jpg`` ->
    ``417432``) and translated to an index through the CSV mapping.

    Args:
        dir: Directory containing the images to rename (renamed in place).
        image_id_to_idx: Path to a CSV file with the columns
            ``image_index`` and ``image_id``.

    Files whose name carries no numeric id, or whose id is absent from the
    mapping, are reported and left untouched instead of aborting the run
    half-way through the directory.
    """
    image_dict = pd.read_csv(image_id_to_idx)  # cols: image_index, image_id
    image_dict = image_dict.set_index('image_id')['image_index'].to_dict()
    for img_name in os.listdir(dir):
        name_parts = re.split(r'\.|_', img_name)
        try:
            image_id = int(name_parts[-2])
        except (IndexError, ValueError):
            print("Skipping file {} (no numeric image id in its name)".format(img_name))
            continue
        if image_id not in image_dict:
            print("Skipping file {} (image id not found in {})".format(img_name, image_id_to_idx))
            continue
        original_img_path = os.path.join(dir, img_name)
        new_img_path = os.path.join(dir, '{0}.jpg'.format(image_dict[image_id]))
        os.rename(original_img_path, new_img_path)
    print("Renaming images for folder {} done. ".format(dir))
def _encode_caption(words, word_to_idx, padding_len):
    """Encode one tokenised caption as integers: words, <END>, then <NULL> padding.

    A caption that fits comes back with exactly ``padding_len - 1`` entries
    (the <START> column added later brings it to ``padding_len``). A caption
    that is too long comes back longer than that, which is how main() spots
    and drops it.
    """
    unk_idx = word_to_idx['<UNK>']
    encoded = [word_to_idx.get(w, unk_idx) for w in words]
    encoded.append(word_to_idx['<END>'])
    # A negative multiplier yields [], so over-long captions are not padded.
    encoded += [word_to_idx['<NULL>']] * (padding_len - 1 - len(encoded))
    return encoded


def _save_split(h5_file, split, out_dir, captions, image_idx, image_id_to_idx, image_idx_to_id, urls):
    """Write one split's captions, image indices, id->index CSV and url list."""
    h5_file.create_dataset('{0}_captions'.format(split), data=captions)
    h5_file.create_dataset('{0}_image_idx'.format(split), data=image_idx)
    df = pd.DataFrame({
        'image_index': list(image_id_to_idx.values()),
        'image_id': list(image_id_to_idx.keys()),
    })
    df.to_csv(os.path.join(out_dir, '{0}_image_id_to_idx.csv'.format(split)), index=False)
    # One url per line; line number == image index.
    with open(os.path.join(out_dir, '{0}2014_urls.txt'.format(split)), 'w') as f:
        for idx in range(len(image_idx_to_id)):
            f.write(urls[image_idx_to_id[idx]] + '\n')


def main(_):
    """Build the vocabulary, encode all captions and optionally rename images.

    Reads its configuration from the module-level ``FLAGS`` (``file_dir``,
    ``total_vocab``, ``padding_len``, ``train_image_dir``, ``val_image_dir``)
    and writes into ``FLAGS.file_dir``:

    * ``coco2014_vocab.json``  - ``idx_to_word`` / ``word_to_idx`` tables,
    * ``coco2014_captions.h5`` - ``{train,val}_captions`` and
      ``{train,val}_image_idx`` datasets,
    * ``{train,val}_image_id_to_idx.csv`` and ``{train,val}2014_urls.txt``.

    Args:
        _: Unused positional argument (argv), kept so the function can be
            invoked by an app runner.
    """
    results = preprocess_json_files(FLAGS.file_dir)

    ## get the vocabulary
    # Pool the words of every file exactly once (the first file used to be
    # counted twice, inflating its word frequencies).
    counter = collections.Counter()
    for caps, _images, _urls in results.values():
        for sentence in caps:
            counter.update(sentence)
    vocab = counter.most_common(FLAGS.total_vocab)
    if vocab:
        # Index by the real vocab size: fewer than total_vocab distinct
        # words may exist, in which case total_vocab/2 is out of range.
        print("\nVocab generated! Most, median and least frequent words from the vocab are: \n{0}\n{1}\n{2}\n".format(vocab[0], vocab[len(vocab) // 2], vocab[-1]))

    ## create word_to_idx, and idx_to_word
    all_tokens = list(BUFFER_TOKENS) + [word for word, _count in vocab]
    word_to_idx = {}
    idx_to_word = {}
    for idx, token in enumerate(all_tokens):
        word_to_idx[token] = idx
        idx_to_word[idx] = token
    word_dict = {'idx_to_word': idx_to_word, 'word_to_idx': word_to_idx}
    with open(os.path.join(FLAGS.file_dir, 'coco2014_vocab.json'), 'w') as f:
        json.dump(word_dict, f)

    # The special-token indices are looked up from word_to_idx rather than
    # taken from separately maintained *_IDX constants.
    start_idx = word_to_idx['<START>']
    target_len = FLAGS.padding_len - 1  # caption length before <START> is added

    ## convert sentences into encoding/integers
    with h5py.File(os.path.join(FLAGS.file_dir, 'coco2014_captions.h5'), 'w') as h:
        for k, (captions, images, all_urls) in results.items():
            if 'train' in k:
                split = 'train'
            elif 'val' in k:
                split = 'val'
            else:
                print("Strange file name found in dir: {0}, \nit does not belong to train nor val, so it is not able to save results!".format(k))
                continue

            encoded = [_encode_caption(s, word_to_idx, FLAGS.padding_len) for s in captions]
            # Keep only captions that fit; over-long ones come back longer
            # than target_len and are dropped together with their image id.
            valid_rows = [i for i, row in enumerate(encoded) if len(row) == target_len]
            all_captions = np.array([encoded[i] for i in valid_rows])
            all_images = np.array(images)[valid_rows]
            assert all_images.shape[0] == all_captions.shape[0], "Processing error! all_captions and all_images diff in length."

            # prepend the START token column (END is already inside each row)
            col_start = np.array([start_idx] * all_images.shape[0]).reshape(-1, 1)
            all_captions = np.hstack([col_start, all_captions])

            ## create dicts that map image ids to 0,...,total_images-1 and back
            image_idx_to_id = dict(enumerate(set(all_images)))
            image_id_to_idx = {image_id: idx for idx, image_id in image_idx_to_id.items()}
            all_images_idx = np.array([image_id_to_idx[image_id] for image_id in all_images])

            ## save all the data
            _save_split(h, split, FLAGS.file_dir, all_captions, all_images_idx,
                        image_id_to_idx, image_idx_to_id, all_urls)

    print("Data generation done.\n Start renaming images in sequence ...")
    if FLAGS.train_image_dir != '':
        train_dict = os.path.join(FLAGS.file_dir, 'train_image_id_to_idx.csv')
        rename_images(FLAGS.train_image_dir, train_dict)
    if FLAGS.val_image_dir != '':
        val_dict = os.path.join(FLAGS.file_dir, 'val_image_id_to_idx.csv')
        rename_images(FLAGS.val_image_dir, val_dict)
    print("all done. ")
if __name__ == '__main__':
    # NOTE(review): the add_argument() calls in this section were not intact
    # (only `default=` / `help=` fragments survived). The flag names below are
    # taken from the FLAGS attributes main() reads and the help texts are the
    # surviving ones; everything marked "assumed" must be confirmed against
    # the original script.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--file_dir',
        type=str,
        default='/home/ubuntu/COCO/dataset/COCO_captioning/',
        help='Path to captions_train2014.json, captions_val2014.json'
    )
    # NOTE(review): the same default path appeared a second time in the
    # damaged section; which argument it belonged to is unknown.
    parser.add_argument(
        '--total_vocab',
        type=int,
        required=True,  # assumed: original default not recoverable, so no value is invented
        help='Total number of vacobulary to use.'
    )
    parser.add_argument(
        '--padding_len',
        type=int,
        required=True,  # assumed: original default not recoverable, so no value is invented
        help='Total len of padding the sentence.'
    )
    parser.add_argument(
        '--train_image_dir',
        type=str,
        default='',  # assumed: main() treats '' as "do not rename"
        help='Absolute path to training dir containing images that are to be renamed.'
    )
    parser.add_argument(
        '--val_image_dir',
        type=str,
        default='',  # assumed: main() treats '' as "do not rename"
        help='Absolute path to val dir containing images that are to be renamed.'
    )
    FLAGS, unparsed = parser.parse_known_args()
    # NOTE(review): the fused last line ended in `argv=[sys.argv[0]] + unparsed)`,
    # which matches the TensorFlow 1.x app-runner call; confirm the TF version in use.
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)