<a href="https://colab.research.google.com/github/basselkassem/nlp-toolkit/blob/master/image_caption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Data

In [29]:
# Fetch the COCO 2014 training images (~13 GB) and the caption annotations.
# `-c` resumes a partial download and does nothing when the local file is
# already complete, so re-running this cell does not fetch the archives again
# (plain `wget` would save a second copy as `train2014.zip.1`).
!wget -c http://images.cocodataset.org/zips/train2014.zip
!wget -c http://images.cocodataset.org/annotations/annotations_trainval2014.zip

--2020-08-08 23:27:08--  http://images.cocodataset.org/zips/train2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.176.91
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.176.91|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13510573713 (13G) [application/zip]
Saving to: ‘train2014.zip’


2020-08-08 23:40:15 (16.4 MB/s) - ‘train2014.zip’ saved [13510573713/13510573713]



In [6]:
# List the working directory to check which downloaded files are present.
!ls

annotations_trainval2014.zip  sample_data


In [30]:
%%time
from zipfile import ZipFile

# Unpack the training-image archive into the current working directory.
with ZipFile('train2014.zip') as images_archive:
  images_archive.extractall()

CPU times: user 33.3 s, sys: 26.5 s, total: 59.8 s
Wall time: 4min 18s


In [8]:
%%time
# Unpack the annotation files; `ZipFile` comes from the import in an earlier cell.
with ZipFile('annotations_trainval2014.zip') as annotations_archive:
  annotations_archive.extractall()

CPU times: user 5.98 s, sys: 1.19 s, total: 7.17 s
Wall time: 7.35 s


In [9]:
# Show the contents of the `annotations` directory.
!ls annotations

captions_train2014.json   instances_val2014.json
captions_val2014.json	  person_keypoints_train2014.json
instances_train2014.json  person_keypoints_val2014.json


# Import Libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

%matplotlib inline

In [97]:
from sklearn.model_selection import train_test_split

In [116]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Dense, Embedding, GRU, Flatten
from tensorflow.keras.applications import InceptionV3, inception_v3

print(tf.__version__)

2.3.0


# Prepare Data

In [10]:
image_folder = 'train2014/'
annotation_file = 'annotations/captions_train2014.json'

# Parse the captions file and keep only the entries stored under its
# 'annotations' key.
with open(annotation_file) as ref_file:
  annotations = json.load(ref_file)['annotations']

In [57]:
SAMPLES_NUM = 100
SEED = 42

# Draw a reproducible subset of caption records. Sampling without replacement
# keeps the same record from being picked twice, so an identical
# (image, caption) pair cannot appear more than once in the sample.
# Note: this raises ValueError if SAMPLES_NUM exceeds len(annotations).
rng = np.random.default_rng(SEED)
captions_list = rng.choice(annotations, size = SAMPLES_NUM, replace = False)

In [58]:
def get_img_name(img_id):
  """Return the train2014 JPEG file name for an integer image id.

  The id is zero-padded to 12 digits, e.g. 9 -> 'COCO_train2014_000000000009.jpg'.
  """
  padded_id = format(img_id, '012d')
  return 'COCO_train2014_' + padded_id + '.jpg'

# Build two aligned lists: image_paths[i] is the image file for captions[i].
image_paths = [
    image_folder + get_img_name(item['image_id']) for item in captions_list
]
captions = [item['caption'] for item in captions_list]

# Data Processing

## Image Feature Extraction

In [69]:
def process_image(img):
  """Resize `img` to 299x299 and apply the InceptionV3 input preprocessing."""
  resized = tf.image.resize(img, size = (299, 299))
  preprocessed = inception_v3.preprocess_input(resized)
  return preprocessed

def read_image(img_path):
  """Read a JPEG file and return `(preprocessed_image, img_path)`.

  The path is returned together with the image so the caller can tell which
  file each image came from.
  """
  raw_bytes = tf.io.read_file(img_path)
  decoded = tf.image.decode_jpeg(raw_bytes, channels = 3)
  return process_image(decoded), img_path

def create_feature_extractor():
  """Build a model mapping images to the output of InceptionV3's last layer.

  InceptionV3 is loaded with ImageNet weights and without its top layers.
  """
  base_model = InceptionV3(include_top=False, weights = 'imagenet')
  last_layer = base_model.layers[-1]
  return Model(inputs = base_model.input, outputs = last_layer.output)

# Shared extractor instance used by the feature-caching loop.
feature_extractor = create_feature_extractor()

Load the images, extract features using inception_v3 and save the results to disk. The reasons for doing this are: 
*  Use TensorFlow's parallel data-loading capabilities to load the images
*  Reduce the amount of computation required during training by extracting features from the images at an early stage


In [78]:
# De-duplicate the paths so every image is read and encoded only once.
unique_image_paths = sorted(set(image_paths))

image_ds = (
    tf.data.Dataset.from_tensor_slices(unique_image_paths)
    .map(read_image, num_parallel_calls = tf.data.experimental.AUTOTUNE)
    .batch(32)
)

for imgs_batch, path_batch in image_ds:
  img_features_batch = feature_extractor(imgs_batch)
  # Collapse the two spatial axes into one: (batch, h, w, c) -> (batch, h * w, c).
  img_features_batch = tf.reshape(
      img_features_batch,
      shape = (img_features_batch.shape[0], -1, img_features_batch.shape[-1]),
  )

  # np.save appends '.npy', so each array is stored next to its image file.
  for img_features, path in zip(img_features_batch, path_batch):
    img_path = path.numpy().decode('utf-8')
    np.save(img_path, img_features)

## Caption Preprocessing

In [94]:
words_num = 5000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words = words_num,
    # Raw string: the filter set contains a literal backslash, and '\]' in a
    # normal string literal is an invalid escape sequence. '<' and '>' are
    # not in the set, so tokens such as '<unk>' are left intact.
    filters = r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ',
    oov_token = '<unk>',
)
tokenizer.fit_on_texts(captions)
# Register an explicit token for index 0, the value used for padding below.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Convert captions to integer sequences, then right-pad them to equal length.
captions_seqs = tokenizer.texts_to_sequences(captions)
captions_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    captions_seqs, 
    padding = 'post',
)

## Creating Train/Test Datasets

In [104]:
# Hold out 20% of the (image path, caption sequence) pairs for validation.
# A fixed random_state makes the split identical on every run.
img_paths_tr, img_paths_val, captions_seqs_tr, captions_seqs_val = train_test_split(
    image_paths, captions_seqs, train_size = 0.8, random_state = 42,
)
len(img_paths_tr), len(img_paths_val), len(captions_seqs_tr), len(captions_seqs_val)

(80, 20, 80, 20)

In [106]:
# Number of (features, caption) pairs per batch of the training dataset.
BATCH_SIZE = 64
# NOTE(review): presumably the +1 reserves a slot for the padding index 0 — confirm.
VOCAB_SIZE = words_num + 1
# Size of the shuffle buffer used when building the training dataset.
BUFFER_SIZE = 1000

def load_img_features(img_path, caption):
  """Load the saved feature array for an image and pass the caption through.

  `img_path` arrives as UTF-8 bytes; the array is read from '<img_path>.npy'.
  Returns `(features, caption)` with the caption unchanged.
  """
  features_file = '{}.npy'.format(img_path.decode('utf-8'))
  features = np.load(features_file)
  return features, caption

# Pipeline: (path, caption) pairs -> load saved features -> shuffle -> batch -> prefetch.
train_ds = (
    tf.data.Dataset.from_tensor_slices((img_paths_tr, captions_seqs_tr))
    .map(
        # load_img_features uses np.load, so it is wrapped in tf.numpy_function.
        lambda img_path, caption: tf.numpy_function(
            load_img_features, [img_path, caption], [tf.float32, tf.int32],
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
)

In [114]:
# Pull one batch from the training dataset and display the shapes of its
# feature and caption tensors.
sample_img_batch, sample_cap_batch = next(iter(train_ds))
sample_img_batch.shape, sample_cap_batch.shape

(TensorShape([64, 64, 2048]), TensorShape([64, 45]))

# Build the Model

## Attention Mechanism

In [None]:
class Attention(Model):
  """Attention module (placeholder, not implemented yet).

  Args:
    units: size setting for the attention layers; stored on the instance so
      it is available once the layers are defined.
  """

  def __init__(self, units):
    super().__init__()
    # TODO: build the attention layers from `units`.
    self.units = units

  def call(self, features, states):
    """Placeholder forward pass; currently returns None.

    NOTE(review): `features` are presumably the encoded image features and
    `states` the decoder hidden state — confirm when implementing.
    """
    return None

## CNN Encoder

In [None]:
class CNNEncoder(Model):
  """CNN encoder (placeholder, not implemented yet)."""

  def __init__(self):
    super().__init__()

  def call(self):
    # NOTE(review): takes no input argument yet — confirm the intended
    # signature before wiring this into the model.
    return None

## RNN Decoder

In [None]:
class RNNDecoder(Model):
  """RNN decoder (placeholder, not implemented yet)."""

  def __init__(self):
    super().__init__()

  def call(self):
    # NOTE(review): takes no input argument yet — confirm the intended
    # signature before wiring this into the model.
    return None

## Loss Function

## Checkpoints

## Training

## Evaluation