# Image Caption Generator with Voice Integration

## Import Libraries

In [2]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
!pip install omegaconf

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [4]:
import torch

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import warnings
warnings.filterwarnings('ignore')
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.applications import InceptionV3
from keras.layers import TextVectorization
from keras.preprocessing.image import load_img, img_to_array
from tqdm import tqdm_notebook
from collections import Counter
import pickle
from tensorflow.keras.utils import serialize_keras_object, deserialize_keras_object

In [6]:
# Fetch the Silero English text-to-speech model (plus its bundled example
# text) through torch.hub.
model, example_text = torch.hub.load(
    "snakers4/silero-models",
    "silero_tts",
    language="en",
    speaker="v3_en",
)

Using cache found in C:\Users\USER/.cache\torch\hub\snakers4_silero-models_master


## Preprocessing

In [7]:
# ---- Shared hyperparameters ----
IMAGE_SIZE = (299, 299)  # spatial size of the images fed to the CNN
SEQ_LENGTH = 24          # fixed length allowed for any caption sequence
VOCAB_SIZE = 13000       # vocabulary size
EMBED_DIM = 512          # width of both image and token embeddings
FF_DIM = 512             # per-layer units in the feed-forward network
EPOCHS = 30              # number of training epochs


#Image Augmentation

from keras.saving import register_keras_serializable

# Random flip / rotation / contrast pipeline used to augment image batches.
image_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.2),
        layers.RandomContrast(0.3),
    ]
)

## Model

In [8]:
@tf.keras.utils.register_keras_serializable()
def get_cnn_model():
    """Build the frozen InceptionV3 feature extractor.

    Returns:
        A Keras model that maps an image batch of shape
        ``(batch, *IMAGE_SIZE, 3)`` to a sequence of feature vectors of shape
        ``(batch, positions, channels)``.
    """
    # ImageNet-pretrained backbone without its classification head.
    backbone = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        input_shape=(*IMAGE_SIZE, 3),
    )

    # The backbone is used purely as a fixed feature extractor.
    backbone.trainable = False

    # Last convolutional feature map: (None, 8, 8, 2048) for 299x299 inputs.
    feature_map = backbone.output
    channels = feature_map.shape[-1]

    # Collapse the spatial grid into a single axis: (batch, positions, channels).
    feature_seq = layers.Reshape((-1, channels))(feature_map)

    return keras.models.Model(inputs=backbone.input, outputs=feature_seq)

#Encoder

@tf.keras.utils.register_keras_serializable()
class TransformerEncoderBlock(layers.Layer):
    """Encoder that projects CNN features and applies self-attention to them.

    The block applies layer normalisation, a ReLU dense projection to
    ``embed_dim``, multi-head self-attention, then a residual connection
    followed by a second layer normalisation.

    Args:
        embed_dim: Output width of the dense projection; also used as the
            attention ``key_dim``.
        dense_dim: Stored and serialised but not used by any sub-layer; kept
            so existing callers and saved configs keep working.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, dropout=0.0)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.dense_1 = layers.Dense(embed_dim, activation="relu")

    def call(self, inputs, training=False, mask=None):
        """Encode a batch of image feature sequences.

        Args:
            inputs: Feature sequence produced by the CNN.
            training: Forwarded to the attention layer. Defaults to ``False``
                so the layer can be called without the flag, matching the
                decoder's signature.
            mask: Accepted for API compatibility; not used.

        Returns:
            Tensor whose last dimension is ``embed_dim``.
        """
        inputs = self.layernorm_1(inputs)
        inputs = self.dense_1(inputs)
        attention_output_1 = self.attention_1(query=inputs,
                                              value=inputs,
                                              key=inputs,
                                              attention_mask=None,
                                              training=training)
        # Residual connection around the attention, then normalise.
        out_1 = self.layernorm_2(inputs + attention_output_1)
        return out_1

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config

#Decoder

@tf.keras.utils.register_keras_serializable()
class PositionalEmbedding(layers.Layer):
    """Token embedding plus a learned position embedding.

    Token embeddings are scaled by ``sqrt(embed_dim)`` before the position
    embeddings are added.

    Args:
        sequence_length: Number of distinct positions that can be embedded.
        vocab_size: Number of distinct token ids that can be embedded.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_scale = tf.math.sqrt(tf.cast(embed_dim, tf.float32))

    def call(self, inputs):
        """Embed token ids and add the embedding of each position index."""
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)  # 0 .. length-1
        embedded_tokens = self.token_embeddings(inputs)
        embedded_tokens = embedded_tokens * self.embed_scale
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Mark every non-zero token id as valid (id 0 is treated as padding)."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config


@tf.keras.utils.register_keras_serializable()
class TransformerDecoderBlock(layers.Layer):
    """Transformer decoder mapping caption token ids to vocabulary probabilities.

    Pipeline: positional embedding -> masked self-attention -> cross-attention
    over the encoder outputs -> feed-forward network -> softmax layer.

    Args:
        embed_dim: Width of the token embeddings and of the feed-forward
            output; also the attention ``key_dim``.
        ff_dim: Hidden units of the feed-forward network.
        num_heads: Number of heads in both attention layers.
        **kwargs: Forwarded to ``layers.Layer``.

    The embedding's sequence length and vocabulary size, and the width of the
    softmax layer, come from the module-level ``SEQ_LENGTH`` and ``VOCAB_SIZE``.
    """

    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads

        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, dropout=0.1)
        self.cross_attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, dropout=0.1)

        self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
        self.ffn_layer_2 = layers.Dense(embed_dim)

        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

        # Use this block's own `embed_dim` (not the global EMBED_DIM) so the
        # embedding width always matches `ffn_layer_2` in the residual sums.
        self.embedding = PositionalEmbedding(
            embed_dim=embed_dim,
            sequence_length=SEQ_LENGTH,
            vocab_size=VOCAB_SIZE,
        )
        self.out = layers.Dense(VOCAB_SIZE, activation="softmax")

        self.dropout_1 = layers.Dropout(0.3)
        self.dropout_2 = layers.Dropout(0.5)

        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training=False, mask=None):
        """Predict a vocabulary distribution for every caption position.

        Args:
            inputs: Caption token ids, shape [B, L].
            encoder_outputs: Encoded image features attended to by the
                cross-attention layer.
            training: Enables dropout in the attention and dropout layers.
            mask: Optional validity mask of shape [B, L]; when omitted, every
                non-zero token id in ``inputs`` counts as valid.

        Returns:
            Softmax probabilities of shape [B, L, VOCAB_SIZE].
        """
        x = self.embedding(inputs)  # [B, L, D]

        # Padding mask over the caption positions.
        if mask is None:
            dec_pad = tf.math.not_equal(inputs, 0)
        else:
            dec_pad = tf.cast(mask, tf.bool)

        # A position may attend only to valid positions at or before itself.
        causal = self.get_causal_attention_mask(x)           # [B, L, L]
        dec_pad_exp = tf.expand_dims(dec_pad, axis=1)        # [B, 1, L]
        combined = tf.math.logical_and(causal, dec_pad_exp)  # [B, L, L]

        # --- Self attention ---
        attn1 = self.attention_1(
            query=x, value=x, key=x,
            attention_mask=combined,
            training=training,
        )
        out1 = self.layernorm_1(x + attn1)

        # --- Cross attention ---
        # [B, L, 1]: broadcasts over the encoder positions, masking padded queries.
        cross_mask = tf.expand_dims(dec_pad, axis=2)
        attn2 = self.cross_attention_2(
            query=out1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=cross_mask,
            training=training,
        )
        out2 = self.layernorm_2(out1 + attn2)

        # --- FFN ---
        ffn = self.ffn_layer_1(out2)
        ffn = self.dropout_1(ffn, training=training)
        ffn = self.ffn_layer_2(ffn)
        ffn = self.layernorm_3(out2 + ffn)
        ffn = self.dropout_2(ffn, training=training)

        return self.out(ffn)  # [B, L, vocab]

    def get_causal_attention_mask(self, x):
        """
        Lower-triangular boolean mask [B, L, L] where True means "can attend".
        """
        shape = tf.shape(x)
        B = shape[0]
        L = shape[1]
        i = tf.range(L)[:, None]
        j = tf.range(L)[None, :]
        # True for j <= i (past + current), False for future
        mask = tf.math.greater_equal(i, j)              # [L, L] bool
        mask = tf.expand_dims(mask, axis=0)             # [1, L, L]
        mask = tf.tile(mask, [B, 1, 1])                 # [B, L, L]
        return mask

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "ff_dim": self.ff_dim,
            "num_heads": self.num_heads,
        })
        return config


#Model architecture

@tf.keras.utils.register_keras_serializable()
class ImageCaptioningModel(keras.Model):
    """End-to-end captioning model: CNN features -> encoder -> decoder.

    Args:
        cnn_model: Model that turns an image batch into feature sequences.
        encoder: Layer applied to the CNN features.
        decoder: Layer called as ``decoder(tokens, encoder_out, ...)`` that
            returns per-position vocabulary probabilities.
        num_captions_per_image: How many captions each sample carries; the
            train/test steps iterate over this many caption slots.
        image_aug: Optional callable applied to the image batch during
            training only.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, cnn_model, encoder, decoder, num_captions_per_image=5, image_aug=None,  **kwargs):
        super().__init__(**kwargs)
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.loss_tracker = keras.metrics.Mean(name="loss")
        self.acc_tracker = keras.metrics.Mean(name="accuracy")
        self.num_captions_per_image = num_captions_per_image
        self.image_aug = image_aug

        # Shape summary printed on every construction (including model loading).
        print()
        print(f'CNN input shape: {cnn_model.input_shape}')
        print(f'CNN output shape: {cnn_model.output_shape}', end='\n'*2)
        print(f'Encoder input ---> Dense layer shape: {cnn_model.output_shape} ---> (None, {cnn_model.output_shape[1]}, {EMBED_DIM})')
        print(f'Encoder output shape: (None, {cnn_model.output_shape[1]}, {EMBED_DIM})', end='\n'*2)
        print(f'Decoder input 1 (Caption) ---> Positional Embedding shape: (None, {SEQ_LENGTH-1}) ---> (None, {SEQ_LENGTH-1}, {EMBED_DIM})')
        print(f'Decoder input 2 (Embedded image features) shape: (None, {cnn_model.output_shape[1]}, {EMBED_DIM})')
        print(f'Decoder output (MH Cross-Attention) shape: (None, {SEQ_LENGTH-1}, {EMBED_DIM})')
        print(f'Decoder prediction (Dense layer) shape: (None, {SEQ_LENGTH-1}, {VOCAB_SIZE})')

    def calculate_loss(self, y_true, y_pred, mask):
        """Mean loss over the positions where ``mask`` is true.

        Assumes the compiled loss returns one value per token (no reduction)
        so it can be multiplied by the mask - TODO confirm at compile time.
        Returns 0 instead of NaN when the mask selects no positions.
        """
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

    def calculate_accuracy(self, y_true, y_pred, mask):
        """Fraction of masked-in positions whose arg-max prediction is correct.

        Returns 0 instead of NaN when the mask selects no positions.
        """
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.math.divide_no_nan(tf.reduce_sum(accuracy), tf.reduce_sum(mask))

    def _compute_caption_loss_and_acc(self, img_embed, batch_seq, training=True):
        """Run encoder + decoder on one caption per image; return (loss, accuracy)."""
        encoder_out = self.encoder(img_embed, training=training)
        # Teacher forcing: predict token t+1 from tokens up to t.
        batch_seq_inp = batch_seq[:, :-1]
        batch_seq_true = batch_seq[:, 1:]
        # True where the target is a real token, False where it is padding (id 0).
        mask = tf.math.not_equal(batch_seq_true, 0)
        batch_seq_pred = self.decoder(batch_seq_inp, encoder_out, training=training, mask=mask)
        loss = self.calculate_loss(batch_seq_true, batch_seq_pred, mask)
        acc = self.calculate_accuracy(batch_seq_true, batch_seq_pred, mask)
        return loss, acc

    def train_step(self, batch_data):
        """Apply one optimizer update per caption slot and refresh the trackers.

        The tracked loss is the sum over the caption slots; the tracked
        accuracy is their mean.
        """
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0

        # Augmentation is applied only when a pipeline was supplied.
        if self.image_aug is not None:
            batch_img = self.image_aug(batch_img)

        # 1. Get image embeddings
        img_embed = self.cnn_model(batch_img)

        # 2. Pass each caption slot to the decoder together with the encoder
        #    outputs and compute loss and accuracy for it.
        for i in range(self.num_captions_per_image):
            with tf.GradientTape() as tape:
                loss, acc = self._compute_caption_loss_and_acc(img_embed, batch_seq[:, i], training=True)

                # 3. Accumulate loss and accuracy
                batch_loss += loss
                batch_acc += acc

            # 4. Collect the trainable weights. This stays inside the loop,
            #    after the forward pass, so that the variables of lazily built
            #    layers already exist on the very first step.
            train_vars = (self.encoder.trainable_variables + self.decoder.trainable_variables)

            # 5. Get the gradients
            grads = tape.gradient(loss, train_vars)

            # 6. Update the trainable weights
            self.optimizer.apply_gradients(zip(grads, train_vars))

        # 7. Update the trackers
        batch_acc /= float(self.num_captions_per_image)
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_acc)

        # 8. Return the loss and accuracy values
        return {"loss": self.loss_tracker.result(),
                "acc": self.acc_tracker.result()}

    def test_step(self, batch_data):
        """Same bookkeeping as ``train_step`` but with no augmentation or weight updates."""
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0

        # 1. Get image embeddings
        img_embed = self.cnn_model(batch_img)

        # 2. Evaluate every caption slot against the same image embeddings.
        for i in range(self.num_captions_per_image):
            loss, acc = self._compute_caption_loss_and_acc(img_embed, batch_seq[:, i], training=False)

            # 3. Accumulate loss and accuracy
            batch_loss += loss
            batch_acc += acc

        batch_acc /= float(self.num_captions_per_image)

        # 4. Update the trackers
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_acc)

        return {"loss": self.loss_tracker.result(),
                "acc": self.acc_tracker.result()}

    @property
    def metrics(self):
        """Trackers exposed to Keras through the ``metrics`` property."""
        return [self.loss_tracker, self.acc_tracker]

    def call(self, inputs, training=False):
        """Forward pass.

        Args:
            inputs: Either a dict with keys ``"img"`` and ``"seq"`` or an
                ``(images, token_ids)`` pair.
            training: Forwarded to every sub-model.

        Returns:
            The decoder output for the given token ids.
        """
        if isinstance(inputs, dict):
            batch_img = inputs["img"]
            batch_seq = inputs["seq"]
        else:
            batch_img, batch_seq = inputs
        img_embed = self.cnn_model(batch_img, training=training)
        encoder_out = self.encoder(img_embed, training=training)
        output = self.decoder(batch_seq, encoder_out, training=training)
        return output

    def get_config(self):
        """Serialize the sub-models alongside the base model config."""
        config = super().get_config()
        config.update({
            "cnn_model": serialize_keras_object(self.cnn_model),
            "encoder": serialize_keras_object(self.encoder),
            "decoder": serialize_keras_object(self.decoder),
            "num_captions_per_image": self.num_captions_per_image,
            "image_aug": serialize_keras_object(self.image_aug) if self.image_aug is not None else None,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model from ``get_config`` output.

        Every serialized sub-object is deserialized, including ``image_aug``;
        leaving it as a raw config dict would make ``train_step`` fail when it
        tries to call it.
        """
        config = dict(config)  # do not mutate the caller's dictionary
        cnn_model = deserialize_keras_object(config.pop("cnn_model"))
        encoder = deserialize_keras_object(config.pop("encoder"))
        decoder = deserialize_keras_object(config.pop("decoder"))
        image_aug = config.pop("image_aug", None)
        if isinstance(image_aug, dict):
            image_aug = deserialize_keras_object(image_aug)
        return cls(cnn_model=cnn_model, encoder=encoder, decoder=decoder, image_aug=image_aug, **config)


# Assemble the captioning pipeline: frozen CNN -> encoder -> decoder.
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(
    embed_dim=EMBED_DIM,
    dense_dim=FF_DIM,
    num_heads=2,
)
decoder = TransformerDecoderBlock(
    embed_dim=EMBED_DIM,
    ff_dim=FF_DIM,
    num_heads=3,
)
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)


CNN input shape: (None, 299, 299, 3)
CNN output shape: (None, 64, 2048)

Encoder input ---> Dense layer shape: (None, 64, 2048) ---> (None, 64, 512)
Encoder output shape: (None, 64, 512)

Decoder input 1 (Caption) ---> Positional Embedding shape: (None, 23) ---> (None, 23, 512)
Decoder input 2 (Embedded image features) shape: (None, 64, 512)
Decoder output (MH Cross-Attention) shape: (None, 23, 512)
Decoder prediction (Dense layer) shape: (None, 23, 13000)


## Compiling the model

In [9]:
# Learning Rate Scheduler for the optimizer
@tf.keras.utils.register_keras_serializable()
class LRSchedule(keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up to a target learning rate, constant afterwards.

    Args:
        post_warmup_learning_rate: Rate used once warm-up has finished.
        warmup_steps: Number of steps over which the rate ramps up linearly.
    """

    def __init__(self, post_warmup_learning_rate, warmup_steps):
        super().__init__()
        self.warmup_steps = warmup_steps
        self.post_warmup_learning_rate = post_warmup_learning_rate

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step_f = tf.cast(step, tf.float32)
        total_warmup = tf.cast(self.warmup_steps, tf.float32)
        # The completed fraction of the warm-up scales the target rate.
        ramped_rate = self.post_warmup_learning_rate * (step_f / total_warmup)

        def _during_warmup():
            return ramped_rate

        def _after_warmup():
            return self.post_warmup_learning_rate

        return tf.cond(step_f < total_warmup, _during_warmup, _after_warmup)

    def get_config(self):
        """Return the constructor arguments for serialization."""
        return dict(
            post_warmup_learning_rate=self.post_warmup_learning_rate,
            warmup_steps=self.warmup_steps,
        )

## Generating Captions

In [10]:

import tensorflow as tf
import pickle
from tensorflow import keras
from PIL import Image
import numpy as np
from keras.saving import register_keras_serializable
from keras.layers import TextVectorization
from keras import layers
from keras.applications import InceptionV3
from IPython.display import Audio, display


# ---- Hyperparameters for the inference cell ----
IMAGE_SIZE = (299, 299)  # spatial size of the images fed to the CNN
SEQ_LENGTH = 24          # fixed length allowed for any caption sequence
VOCAB_SIZE = 13000       # vocabulary size
EMBED_DIM = 512          # width of both image and token embeddings
FF_DIM = 512             # per-layer units in the feed-forward network
BATCH_SIZE = 64          # batch size

# Random flip / rotation / contrast pipeline for image batches.
image_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.2),
        layers.RandomContrast(0.3),
    ]
)


@register_keras_serializable()
def custom_standardization(input_string):
    """Lower-case the text and delete punctuation and digit characters.

    Angle brackets are not part of the strip set, so ``<`` and ``>`` are
    left in place.
    """
    chars_to_remove = "!\"#$%&'()*+,-./:;=?@[\\]^_`{|}~1234567890"
    # Character class matching any single character from the strip set.
    removal_pattern = "[%s]" % re.escape(chars_to_remove)
    return tf.strings.regex_replace(
        tf.strings.lower(input_string),
        removal_pattern,
        "",
    )

#Load vectorizer ---
# Read the saved vocabulary: one token per line, surrounding whitespace removed.
vocab = []
with open("vocab.txt", "r", encoding="utf-8") as vocab_file:
    for raw_line in vocab_file:
        vocab.append(raw_line.strip())

# Recreate the text vectorizer and give it the saved vocabulary.
vectorization = TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_sequence_length=SEQ_LENGTH,
    output_mode="int",
)
vectorization.set_vocabulary(vocab)



# Custom classes handed to Keras by name when the saved model is loaded.
custom_objects = dict(
    ImageCaptioningModel=ImageCaptioningModel,
    TransformerDecoderBlock=TransformerDecoderBlock,
    PositionalEmbedding=PositionalEmbedding,
)

caption_model = keras.models.load_model(
    "caption_model.keras",
    custom_objects=custom_objects,
)
# --- 4. Preprocess image ---
def decode_and_resize(img_path):
    """Load an image file and turn it into a one-image InceptionV3 input batch.

    Args:
        img_path: Path of the image file to read.

    Returns:
        Array with a leading batch axis of size 1 holding the resized,
        InceptionV3-preprocessed RGB image.
    """
    raw_bytes = tf.io.read_file(img_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    as_float = tf.image.convert_image_dtype(resized, tf.float32)
    scaled = tf.keras.applications.inception_v3.preprocess_input(as_float)
    # Prepend the batch axis expected by the model.
    return np.expand_dims(scaled, axis=0)

img_path = "Images/2097420505.jpg"
# decode_and_resize already adds the batch axis: shape (1, 299, 299, 3).
img = decode_and_resize(img_path)

# --- 5. Generate caption sequence ---
vocab = vectorization.get_vocabulary()
# Fall back to ids 1 / 2 when the marker tokens are missing from the vocabulary.
start_token = vocab.index("<start>") if "<start>" in vocab else 1
end_token = vocab.index("<end>") if "<end>" in vocab else 2
id_to_token = dict(enumerate(vocab))

# Caption buffer: the start token followed by padding (id 0).
caption_seq = np.zeros((1, SEQ_LENGTH), dtype=np.int32)
caption_seq[0, 0] = start_token

# The image features do not depend on the caption, so run the CNN and the
# encoder once instead of on every decoding step.
img_embed = caption_model.cnn_model(img, training=False)
encoder_out = caption_model.encoder(img_embed, training=False)

# --- Autoregressive (greedy) generation ---
for i in range(1, SEQ_LENGTH):
    preds = caption_model.decoder(caption_seq, encoder_out, training=False)

    # The prediction at position i-1 is the token for position i.
    next_token = np.argmax(preds[0, i - 1, :])
    caption_seq[0, i] = next_token

    # Stop as soon as the end marker has been produced.
    if next_token == end_token:
        break

# --- Decode caption ---
caption_tokens = [id_to_token.get(t, "") for t in caption_seq[0]]
caption_text = " ".join([t for t in caption_tokens if t not in ("<start>", "<end>", "")])
print("Generated caption:", caption_text)

# Synthesize the caption as speech, written next to the source image.
wav_path = f'{img_path}.wav'
print('Audio is being generated...')
model.save_wav(
    text=caption_text,
    speaker='en_0',
    sample_rate=48000,
    audio_path=wav_path,
)

choice = input("Play the audio (yes/no)? ").strip().lower()
if choice != "yes":
    # Declined: report it and delete the generated file.
    print("Audio not played.")
    os.remove(wav_path)
else:
    # Embed an auto-playing audio widget in the cell output.
    display(Audio(wav_path, autoplay=True))
    print("Audio playback finished.")


CNN input shape: (None, 299, 299, 3)
CNN output shape: (None, 64, 2048)

Encoder input ---> Dense layer shape: (None, 64, 2048) ---> (None, 64, 512)
Encoder output shape: (None, 64, 512)

Decoder input 1 (Caption) ---> Positional Embedding shape: (None, 23) ---> (None, 23, 512)
Decoder input 2 (Embedded image features) shape: (None, 64, 512)
Decoder output (MH Cross-Attention) shape: (None, 23, 512)
Decoder prediction (Dense layer) shape: (None, 23, 13000)

Generated caption: a man in a blue jacket and hat is standing in the snow
Audio is being generated...


Play the audio (yes/no)?  yes


Audio playback finished.
