In [1]:
import numpy as np
import pandas as pd
import json
from pycocotools.coco import COCO
import skimage.io as io
import matplotlib.pyplot as plt
import json
import pylab
# Default matplotlib figure size, given as (width, height) in inches.
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

In [12]:
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
def load(train=True, data_dir='Dataset'):
    """Read a caption-annotation JSON file and group the captions per image.

    The file must contain an ``'images'`` list (entries with ``'id'`` and
    ``'file_name'``) and an ``'annotations'`` list (entries with
    ``'image_id'`` and ``'caption'``).

    Parameters
    ----------
    train : bool
        If True read ``captions_train2017.json``, otherwise
        ``captions_val2017.json``.
    data_dir : str
        Directory that holds the annotation files. Defaults to
        ``'Dataset'``, the location that used to be hard-coded.

    Returns
    -------
    tuple
        ``(ids, filenames, captions)`` - three tuples of equal length,
        ordered by ascending image id. ``captions[i]`` is the list of
        caption strings of image ``ids[i]`` (empty if the image has no
        annotation). Three empty tuples are returned when the file lists
        no images.

    Raises
    ------
    KeyError
        If an annotation refers to an image id that is missing from
        ``'images'``.
    """
    if train:
        json_name = 'captions_train2017.json'
    else:
        json_name = 'captions_val2017.json'

    # Forward slashes are accepted by open() on every platform.
    path = '{}/{}'.format(data_dir, json_name)

    with open(path, "r", encoding="utf-8") as file:
        data_raw = json.load(file)

    # One record per image, keyed by image id; captions are filled in below.
    records = {image['id']: {'filename': image['file_name'], 'captions': []}
               for image in data_raw['images']}

    for ann in data_raw['annotations']:
        records[ann['image_id']]['captions'].append(ann['caption'])

    # zip(*[]) yields nothing, so the 3-way unpacking below would raise
    # ValueError for a file without images; return empty results instead.
    if not records:
        return (), (), ()

    records_list = [(key, record['filename'], record['captions'])
                    for key, record in sorted(records.items())]

    ids, filenames, captions = zip(*records_list)

    return ids, filenames, captions

In [3]:
# Load the training split; the image ids (first return value) are discarded.
_, filenames_train, captions_train = load(train=True)

In [4]:
# Marker words that mark_captions() puts before and after every caption.
# The embedded space separates each marker from the caption text, so it
# becomes a word of its own (it is looked up with .strip() in the
# tokenizer vocabulary further down).
mark_start = 'ssss '
mark_end = ' eeee'

In [5]:
def mark_captions(captions_listlist):
    """Surround every caption with the start and end marker words.

    Reads the module-level ``mark_start`` and ``mark_end`` strings.

    Parameters
    ----------
    captions_listlist : iterable of iterables of str
        One inner collection of captions per image.

    Returns
    -------
    list of list of str
        Same nesting as the input, each caption rewritten as
        ``mark_start + caption + mark_end``.
    """
    marked_all = []

    for group in captions_listlist:
        marked_group = []
        for text in group:
            marked_group.append(mark_start + text + mark_end)
        marked_all.append(marked_group)

    return marked_all

In [6]:
# Add the start/end markers to every training caption, then display the
# marked captions of the first image as a quick check.
captions_train_marked = mark_captions(captions_train)
captions_train_marked[0]

['ssss Closeup of bins of food that include broccoli and bread. eeee',
 'ssss A meal is presented in brightly colored plastic trays. eeee',
 'ssss there are containers filled with different kinds of foods eeee',
 'ssss Colorful dishes holding meat, vegetables, fruit, and bread. eeee',
 'ssss A bunch of trays that have different food. eeee']

In [7]:
# Original (unmarked) captions of the first image, for comparison.
captions_train[0]

['Closeup of bins of food that include broccoli and bread.',
 'A meal is presented in brightly colored plastic trays.',
 'there are containers filled with different kinds of foods',
 'Colorful dishes holding meat, vegetables, fruit, and bread.',
 'A bunch of trays that have different food.']

In [8]:
def flatten(captions_listlist):
    """Collapse a nested collection of captions into one flat list.

    Parameters
    ----------
    captions_listlist : iterable of iterables
        One inner collection of captions per image.

    Returns
    -------
    list
        All captions in their original order, without the per-image
        grouping.
    """
    flat = []

    for group in captions_listlist:
        flat.extend(group)

    return flat

In [9]:
# One flat list of all marked training captions; this is what the
# tokenizer is fitted on.
captions_train_flat = flatten(captions_train_marked)

In [10]:
# Vocabulary size limit passed to the tokenizer as its num_words argument.
num_words = 10000

In [13]:
class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that is fitted at construction time and adds
    helpers for converting integer tokens back into words."""

    def __init__(self, texts, num_words=None):
        """Fit the tokenizer on ``texts`` and build the reverse lookup.

        Parameters
        ----------
        texts : list of str
            Texts used to build the vocabulary.
        num_words : int or None
            Forwarded unchanged to ``Tokenizer.__init__``.
        """
        super().__init__(num_words=num_words)

        self.fit_on_texts(texts)

        # Reverse of word_index: integer token -> word.
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

    def token_to_word(self, token):
        """Return the word for ``token``; token 0 maps to a single space.

        Raises ``KeyError`` for any other token missing from
        ``index_to_word``.
        """
        if token == 0:
            return " "
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of ``tokens`` with single spaces, skipping 0s."""
        return " ".join(self.index_to_word[token]
                        for token in tokens
                        if token != 0)

    def captions_to_tokens(self, captions_listlist):
        """Convert nested captions to nested token sequences.

        Each inner collection of captions is passed to
        ``texts_to_sequences``; the per-image grouping is preserved.
        """
        sequences = []

        for group in captions_listlist:
            sequences.append(self.texts_to_sequences(group))

        return sequences

In [14]:
# Fit the tokenizer on all marked training captions.
tokenizer = TokenizerWrap(texts=captions_train_flat, num_words=num_words)

In [15]:
# Integer token of the start marker; .strip() removes the separating
# space so the key matches the word stored in the vocabulary.
token_start = tokenizer.word_index[mark_start.strip()]
token_start

2

In [16]:
# Integer token of the end marker; .strip() removes the separating
# space so the key matches the word stored in the vocabulary.
token_end = tokenizer.word_index[mark_end.strip()]
token_end

3

In [17]:
# Convert every marked caption into a sequence of integer tokens,
# keeping the per-image grouping.
tokens_train = tokenizer.captions_to_tokens(captions_train_marked)

In [18]:
# Token sequences for the captions of the first image.
tokens_train[0]

[[2, 841, 5, 2864, 5, 61, 26, 1984, 238, 9, 433, 3],
 [2, 1, 429, 10, 3310, 7, 1025, 390, 501, 1110, 3],
 [2, 63, 19, 993, 143, 8, 190, 958, 5, 743, 3],
 [2, 299, 725, 25, 343, 208, 264, 9, 433, 3],
 [2, 1, 170, 5, 1110, 26, 446, 190, 61, 3]]

In [19]:
# Marked captions of the first image, to compare against its token
# sequences.
captions_train_marked[0]

['ssss Closeup of bins of food that include broccoli and bread. eeee',
 'ssss A meal is presented in brightly colored plastic trays. eeee',
 'ssss there are containers filled with different kinds of foods eeee',
 'ssss Colorful dishes holding meat, vegetables, fruit, and bread. eeee',
 'ssss A bunch of trays that have different food. eeee']