In [None]:
!pip install tensorflow_text

In [1]:
import random
import json
import tensorflow as tf
import zipfile

In [2]:
coco = tf.keras.utils.get_file(
        'train2017.zip',
        'http://images.cocodataset.org/zips/train2017.zip',
        extract=True
    )
annotations = tf.keras.utils.get_file(
        'captions_train2017.json',
        'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',
        extract=True
    )

Downloading data from http://images.cocodataset.org/zips/train2017.zip
[1m19336861798/19336861798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 0us/step
Downloading data from http://images.cocodataset.org/annotations/annotations_trainval2017.zip
[1m252907541/252907541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


annotations/instances_train2017.json

annotations/instances_val2017.json

annotations/captions_train2017.json

annotations/captions_val2017.json

annotations/person_keypoints_train2017.json

annotations/person_keypoints_val2017.json

In [3]:
try:
    with zipfile.ZipFile(annotations, 'r') as zip_file:
        with zip_file.open('annotations/instances_train2017.json') as f:
            annotations_data = json.load(f)
            print("Successfully loaded JSON data")
            if isinstance(annotations_data, dict):
                print("Available keys:", annotations_data.keys())
except Exception as e:
    print(f"Error: {e}")

Successfully loaded JSON data
Available keys: dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])


In [4]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=5000,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ '
)

In [5]:
try:
    with zipfile.ZipFile(annotations, 'r') as zip_file:

        with zip_file.open('annotations/captions_train2017.json') as f:
            annotations_data = json.load(f)
            captions = [ann['caption'] for ann in annotations_data['annotations']]
            tokenizer.fit_on_texts(captions)
            image_size = 224
            vocab_size = len(tokenizer.word_index) + 1
            max_length = 50
            print(f"Number of captions loaded: {len(captions)}")
            print(f"Vocabulary size: {vocab_size}")

            print("\nFirst few captions:")
            for cap in captions[:3]:
                print(cap)

except Exception as e:
    print(f"Error: {e}")

Number of captions loaded: 591753
Vocabulary size: 27951

First few captions:
A bicycle replica with a clock as the front wheel.
A room with blue walls and a white sink and door.
A car that seems to be parked illegally behind a legally parked car


In [6]:
captions = [ann['caption'] for ann in annotations_data['annotations']]
tokenizer.fit_on_texts(captions)

image_size = 224
vocab_size = len(tokenizer.word_index) + 1
max_length = 50

In [7]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np

In [8]:
image_size = 224
vocab_size = 5001
max_length = 50
embedding_dim = 256
num_heads = 8
ff_dim = 512
num_transformer_blocks = 4

In [9]:
def create_vit_encoder(image_size):
    inputs = layers.Input(shape=(image_size, image_size, 3))

    patch_size = 16
    num_patches = (image_size // patch_size) ** 2
    projection_dim = 768

    patches = layers.Conv2D(
        filters=projection_dim,
        kernel_size=patch_size,
        strides=patch_size,
        padding="valid"
    )(inputs)

    patches = layers.Reshape((num_patches, projection_dim))(patches)

    positional_embedding = layers.Embedding(input_dim=num_patches, output_dim=projection_dim)
    positions = tf.range(start=0, limit=num_patches, delta=1)
    encoded_patches = patches + positional_embedding(positions)

    for _ in range(num_transformer_blocks):
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim)(x1, x1)
        x2 = layers.Add()([x1, attention_output])

        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        ffn_output = layers.Dense(ff_dim, activation="relu")(x3)
        ffn_output = layers.Dense(projection_dim)(ffn_output)
        encoded_patches = layers.Add()([x2, ffn_output])

    model = models.Model(inputs, encoded_patches)
    return model

def create_text_decoder(vocab_size, embedding_dim, max_length):
    inputs = layers.Input(shape=(max_length,))

    # Word Embeddings + Positional Embeddings
    word_embeddings = layers.Embedding(vocab_size, embedding_dim)(inputs)
    positional_embeddings = layers.Embedding(max_length, embedding_dim)(tf.range(start=0, limit=max_length, delta=1))
    embeddings = word_embeddings + positional_embeddings

    x = embeddings
    for _ in range(num_transformer_blocks):
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embedding_dim)(x1, x1)
        x2 = layers.Add()([x1, attention_output])

        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        ffn_output = layers.Dense(ff_dim, activation="relu")(x3)
        ffn_output = layers.Dense(embedding_dim)(ffn_output)
        x = layers.Add()([x2, ffn_output])

    outputs = layers.Dense(vocab_size, activation="softmax")(x)
    model = models.Model(inputs, outputs)
    return model

def create_image_captioning_model(image_size, vocab_size, embedding_dim, max_length):
    vit_encoder = create_vit_encoder(image_size)
    text_decoder = create_text_decoder(vocab_size, embedding_dim, max_length)

    image_inputs = layers.Input(shape=(image_size, image_size, 3), name='input_layer_2')
    text_inputs = layers.Input(shape=(max_length,), name='input_layer_3')

    encoded_image = vit_encoder(image_inputs)

    encoded_image = layers.GlobalAveragePooling1D()(encoded_image)
    encoded_image = layers.Dense(embedding_dim, activation="relu")(encoded_image)
    encoded_image = layers.RepeatVector(max_length)(encoded_image)


    embeddings = layers.Concatenate(axis=2)([encoded_image, text_decoder(text_inputs)])

    outputs = layers.Dense(vocab_size, activation="softmax")(embeddings)

    model = models.Model(inputs=[image_inputs, text_inputs], outputs=outputs)
    return model

In [None]:
"""try:
    from tensorflow.keras.preprocessing.text import Tokenizer
except:
    !pip install tensorflow_text"""

In [10]:
import os

In [11]:
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [12]:
image_dir = '/root/.keras/datasets/train2017'
annotations_file = '/root/.keras/datasets/annotations/captions_train2017.json'

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<unk>")
with open(annotations_file, 'r') as f:
    annotations_data = json.load(f)
    captions = [ann['caption'] for ann in annotations_data['annotations']]
tokenizer.fit_on_texts(captions)

In [14]:
class ImageCaptioningDataGenerator(Sequence):
    def __init__(self, image_dir, annotations_file, tokenizer, batch_size, image_size, max_length):
        with open(annotations_file, 'r') as f:
            annotations_data = json.load(f)
        self.image_dir = image_dir
        self.annotations = annotations_data['annotations']
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.image_size = image_size
        self.max_length = max_length
        self.indices = np.arange(len(self.annotations))

    def __len__(self):
        return len(self.annotations) // self.batch_size

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_annotations = [self.annotations[i] for i in batch_indices]


        batch_images = np.zeros((self.batch_size, self.image_size, self.image_size, 3))
        batch_sequences = np.zeros((self.batch_size, self.max_length))
        batch_targets = np.zeros((self.batch_size, self.max_length))

        # Fill batch arrays
        for i, annotation in enumerate(batch_annotations):
            image_id = annotation['image_id']
            image_path = os.path.join(self.image_dir, f"{str(image_id).zfill(12)}.jpg")
            img = load_img(image_path, target_size=(self.image_size, self.image_size))
            img = img_to_array(img) / 255.0
            batch_images[i] = img

            caption = annotation['caption']
            seq = self.tokenizer.texts_to_sequences([caption])[0]
            if len(seq) > self.max_length:
                seq = seq[:self.max_length]


            batch_sequences[i, :len(seq)] = seq

            # Target sequence, shifted by 1
            if len(seq) > 1:
                batch_targets[i, :len(seq)-1] = seq[1:]

        return {
            'input_layer_2': batch_images,
            'input_layer_3': batch_sequences
        }, batch_targets

In [17]:
model = create_image_captioning_model(image_size, vocab_size, embedding_dim, max_length)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

I cannot train the model due to the limitations of Google Colab environment, but if you manage to train the model, you can make inference using the following cell. Below the inference cell, you can find the training code. Don't forget to reinitialize the model by simply running the previous cell after running the next cell (inference). Good luck!

In [16]:
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def preprocess_image(image_path, image_size):
    img = load_img(image_path, target_size=(image_size, image_size))
    img = img_to_array(img) / 255.0
    return np.expand_dims(img, axis=0)

def generate_caption(model, tokenizer, image_path, image_size, max_length):

    image = preprocess_image(image_path, image_size)

    # Initialize the input sequence with zeros
    input_sequence = np.zeros((1, max_length))

    for i in range(max_length):
        predictions = model.predict({'input_layer_2': image, 'input_layer_3': input_sequence})

        predicted_word_id = np.argmax(predictions[0, i])

        if predicted_word_id == tokenizer.word_index.get('<end>', None):
            break

        input_sequence[0, i] = predicted_word_id

    caption = []
    for word_id in input_sequence[0]:
        if word_id == 0:
            continue
        word = tokenizer.index_word.get(word_id, '')
        if word == '<end>':
            break
        caption.append(word)

    return ' '.join(caption)


image_path = 'apple.jpg'  # Replace with your image path
caption = generate_caption(model, tokenizer, image_path, image_size, max_length)
print("Generated Caption:", caption)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms

You probably got a silly caption, because the model is not trained. You can train it using the cells below.

In [18]:
batch_size = 2
train_generator = ImageCaptioningDataGenerator(
    image_dir=image_dir,
    annotations_file=annotations_file,
    tokenizer=tokenizer,
    batch_size=batch_size,
    image_size=image_size,
    max_length=max_length
)

In [None]:
history = model.fit(
    train_generator,
    epochs=1,
    steps_per_epoch=len(train_generator)
)

  self._warn_if_super_not_called()


[1m   264/295876[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14:13:17[0m 173ms/step - accuracy: 0.7789 - loss: 2.3163