# Preliminary settings

Libraries imported.

In [12]:
import os
import re
import json
import pickle
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import efficientnet
from tensorflow.keras.layers import TextVectorization

Set a randomizer.

In [13]:
from datetime import datetime
seed = int(round(datetime.now().timestamp()))
np.random.seed(seed)
tf.random.set_seed(seed)

Important constants.

In [14]:
# Desired image dimensions
IMAGE_SIZE = (299, 299)

# Vocabulary size
VOCAB_SIZE = 6000

# Fixed length allowed for any sequence
SEQ_LENGTH = 8

# Dimension for the image embeddings and token embeddings
EMBED_DIM = 256

# Per-layer units in the feed-forward network
FF_DIM = 256

# Setting parameters of the model

Set the version of the model and the path to the vocabulary of vectorised captions.

In [15]:
mdx = '231005'
dxx = f'/data/Vocab/{mdx}/'

Load the vocabulary.

In [16]:
filepath = f'{dxx}vocab_{mdx}'
try:
    print(f"Loading vocabulary from {filepath}")
    with open(filepath, "rb") as f:
        vx = pickle.load(f)
    print("Vocabulary loaded successfully")
        
    global VOCAB_SIZE
    VOCAB_SIZE = len(vx)
except Exception as e:
    print(f"Error loading vocabulary file: {e}")

Loading vocabulary from /data/Vocab/231005/vocab_231005
Vocabulary loaded successfully


Set the image augmentation and vectorisation functions to be used in the model.

In [17]:
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")


# Using raw string for strip_chars
strip_chars = r"!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
strip_chars = strip_chars.replace("<", "").replace(">", "")

# Initialize TextVectorization
try:
    if vx:
        vectorization = TextVectorization(
            max_tokens=VOCAB_SIZE,
            output_mode="int",
            output_sequence_length=SEQ_LENGTH,
            standardize=custom_standardization,
            vocabulary=vx,
        )
    else:
        raise ValueError("Vocabulary is not defined.")
except Exception as e:
    print(f"Error initializing TextVectorization: {e}")
    vectorization = None


# Data augmentation for image data
image_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1), # Reduced rotation for faster preprocessing
        layers.RandomContrast(0.2), # Lighter contrast adjustment
        # layers.RandomTranslation(0.1, 0.1),
    ]
)

Image loading and decoding function.

In [18]:
# Decode, resize, and preprocess images
def decode_and_resize(img_path):
    try:
        img = tf.io.read_file(img_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, IMAGE_SIZE)
        img = tf.image.convert_image_dtype(img, tf.float32)
        return img
    except Exception as e:
        print(f"Error in decoding and resizing image {img_path}: {e}")

# Transformer model

Create the same Transformer model as used in training.

In [19]:
# Defining the Model
# CNN
def get_cnn_model():
    try:
        base_model = efficientnet.EfficientNetB0(
            input_shape=(*IMAGE_SIZE, 3),
            include_top=False,
            weights="imagenet",
        )
        # We freeze our feature extractor
        base_model.trainable = False
        base_model_out = base_model.output
        
        # Reduce the sequence length using a pooling operation
        # Using GlobalAveragePooling2D to reduce the spatial dimensions
        base_model_out = layers.GlobalAveragePooling2D()(base_model_out)
        
        # Project the output to match the embedding size
        base_model_out = layers.Dense(EMBED_DIM)(base_model_out)
        
        cnn_model = keras.models.Model(base_model.input, base_model_out)
        
        # Print CNN Model Summary
        print("\nCNN Model Summary:")
        cnn_model.summary()
        
        return cnn_model
    except Exception as e:
        print(f"Error loading CNN model: {e}")


# Positional Encoding and Encoder/Decoder

# Encoder
class TransformerEncoderBlock(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.0 # previously 0.1
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.dense_1 = layers.Dense(embed_dim, activation="relu")

    def call(self, inputs, training, mask=None):
        # Input shape
        print(f"Encoder Input Shape: {inputs.shape}")

        print(f"Encoder Input Shape before LayerNorm: {inputs.shape}")
        inputs = self.layernorm_1(inputs)
        print(f"Encoder Input Shape after LayerNorm: {inputs.shape}")
        
        inputs = self.dense_1(inputs)

        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=None,
            training=training,
        )
        
        out_1 = self.layernorm_2(inputs + attention_output_1)

        # Output shape
        print(f"Encoder Output Shape: {out_1.shape}")
        return out_1


# Positional Encoding
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_scale = tf.math.sqrt(tf.cast(embed_dim, tf.float32))

    def call(self, inputs):
        print(f"Positional Embedding Input Shape: {inputs.shape}")
        
        # Get input shape and positions
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        
        # Embed tokens and positions
        embedded_tokens = self.token_embeddings(inputs)
        embedded_tokens = embedded_tokens * self.embed_scale # Apply scaling
        embedded_positions = self.position_embeddings(positions)
        
        # Embeddings shape
        print(f"Positional Embedding Output Shape: {embedded_tokens.shape}")
        print(f"embedded_tokens dtype: {embedded_tokens.dtype}")
        print(f"embedded_positions dtype: {embedded_positions.dtype}")
        
        # Return combined embeddings
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


# Decoder
class TransformerDecoderBlock(layers.Layer):
    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads
        
        # Attention layers
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )

        # Feed-forward layers
        self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
        self.ffn_layer_2 = layers.Dense(embed_dim)

        # Layer normalizations
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

        # Output layers
        self.embedding = PositionalEmbedding(
            embed_dim=EMBED_DIM, sequence_length=SEQ_LENGTH, vocab_size=VOCAB_SIZE
        )
        self.out = layers.Dense(VOCAB_SIZE, activation="softmax")

        # Dropout layers
        self.dropout_1 = layers.Dropout(0.3) # previously 0.1
        self.dropout_2 = layers.Dropout(0.5) # previously 0.1
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training, mask=None):
        """
        Args:
            inputs: Tokenized inputs to the decoder (batch_size, sequence_length).
            encoder_outputs: Outputs from the encoder (batch_size, seq_len, embed_dim).
            training: Boolean indicating whether it's training or inference.
            mask: Mask for padded tokens (batch_size, sequence_length).
        
        Returns:
            preds: Decoder output predictions (batch_size, seq_len, vocab_size).
        """
        print(f"Decoder Input Shape: {inputs.shape}")
        
        inputs = self.embedding(inputs)
        causal_mask = self.get_causal_attention_mask(inputs)

        if mask is not None:
            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            combined_mask = tf.minimum(combined_mask, causal_mask)

        # Self-attention
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=combined_mask,
            training=training,
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Cross-attention with encoder outputs
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
            training=training,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        # Feed-forward network
        ffn_out = self.ffn_layer_1(out_2)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)

        ffn_out = self.layernorm_3(ffn_out + out_2, training=training)
        ffn_out = self.dropout_2(ffn_out, training=training)
        preds = self.out(ffn_out)
        
        print(f"Decoder Output Shape: {preds.shape}")
        return preds

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)


# Model definition
class ImageCaptioningModel(keras.Model):
    def __init__(
        self, cnn_model, encoder, decoder, num_captions_per_image=2, image_aug=None,
    ):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.loss_tracker = keras.metrics.Mean(name="loss")
        self.acc_tracker = keras.metrics.Mean(name="accuracy")
        self.num_captions_per_image = num_captions_per_image
        self.image_aug = image_aug

    def calculate_loss(self, y_true, y_pred, mask):
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_true, y_pred, mask):
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def _compute_caption_loss_and_acc(self, img_embed, batch_seq, training=True):
        print(f"Image Embedding Input Shape before passing to Encoder: {img_embed.shape}")
        
        # batch_seq = tf.expand_dims(batch_seq, axis=1)
        print(f"Batch Sequence Input Shape before slicing: {batch_seq.shape}")
        
        encoder_out = self.encoder(img_embed, training=training)
        batch_seq_inp = batch_seq[:, :-1] # Input sequence (without the last token)

        print(f"Batch Sequence Input Shape before target sequence: {batch_seq_inp.shape}")
        
        batch_seq_true = batch_seq[:, 1:] # Target sequence (without the first token)
        mask = tf.math.not_equal(batch_seq_true, 0)
        
        print(f"Batch Sequence Input Shape: {batch_seq_inp.shape}")
        print(f"Batch Sequence True Shape: {batch_seq_true.shape}")
        
        batch_seq_pred = self.decoder(
            batch_seq_inp, encoder_out, training=training, mask=mask
        )

        print(f"Batch Sequence Predicted Shape: {batch_seq_pred.shape}")
        
        loss = self.calculate_loss(batch_seq_true, batch_seq_pred, mask)
        acc = self.calculate_accuracy(batch_seq_true, batch_seq_pred, mask)
        return loss, acc

    def train_step(self, batch_data):
        batch_img, batch_seq = batch_data

        # batch_seq = tf.expand_dims(batch_seq, axis=1)

        print(f"Training Image Batch Shape before passing to CNN: {batch_img.shape}")
        total_loss = 0
        total_acc = 0
    
        if self.image_aug:
            batch_img = self.image_aug(batch_img)

        print(f"Training Image Batch Shape: {batch_img.shape}")
        print(f"Training Sequence Batch Shape: {batch_seq.shape}")
        
        # 1. Get image embeddings from CNN
        img_embed = self.cnn_model(batch_img)
        print(f"Image Embeddings Shape: {img_embed.shape}")

        # 2. Reshape CNN output to (batch_size, 1, embedding_dim)
        img_embed = tf.expand_dims(img_embed, axis=1)  # It should be (None, 1, 1024)

        print(f"Reshaped Image Embeddings for Encoder: {img_embed.shape}")
        
        # 3. Make sure batch_seq has 3 dimensions
        if batch_seq.shape.ndims == 2:
            # Reshape the sequence to have a third dimension (e.g., 1 caption per image)
            batch_seq = tf.expand_dims(batch_seq, axis=1)
        
        print(f"Updated Sequence Shape: {batch_seq.shape}")

        # 4. Accumulate loss and accuracy for each caption
        with tf.GradientTape() as tape:
            # Loop through each caption (batch_seq should be (batch_size, num_captions, sequence_length))
            num_captions_per_image = batch_seq.shape[1] # Extract the num_captions dimension
            
            for i in range(self.num_captions_per_image):
                loss, acc = self._compute_caption_loss_and_acc(
                    img_embed, batch_seq[:, i, :], training=True
                )
                total_loss += loss
                total_acc += acc

            # 5. Compute the mean loss and accuracy
            avg_loss = total_loss / tf.cast(self.num_captions_per_image, dtype=tf.float32)
            avg_acc = total_acc / tf.cast(self.num_captions_per_image, dtype=tf.float32)

        # Backpropagation
        # 6. Get the list of all the trainable weights
        train_vars = self.encoder.trainable_variables + self.decoder.trainable_variables
        
        # 7. Get the gradients (from the accumulated loss)
        grads = tape.gradient(avg_loss, train_vars)
    
        # 8. Update the trainable weights
        self.optimizer.apply_gradients(zip(grads, train_vars))
    
        # 9. Update the trackers
        self.loss_tracker.update_state(avg_loss)
        self.acc_tracker.update_state(avg_acc)
    
        # 10. Return the loss and accuracy values
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    def test_step(self, batch_data):
        batch_img, batch_seq = batch_data
        print(f"Validation Image Batch Shape: {batch_img.shape}")
        print(f"Validation Sequence Batch Shape: {batch_seq.shape}")

        # batch_seq = tf.expand_dims(batch_seq, axis=1)

        batch_loss = 0
        batch_acc = 0

        # 1. Get image embeddings
        img_embed = self.cnn_model(batch_img)
        print(f"Image Embeddings Shape: {img_embed.shape}")
        img_embed = tf.expand_dims(img_embed, axis=1)
        print(f"Reshaped Image Embeddings Shape: {img_embed.shape}")

        # 2. Pass each of the captions one by one to the decoder
        # along with the encoder outputs and compute the loss as well as accuracy
        # for each caption.
        # Loop through captions
        for i in range(self.num_captions_per_image):
            batch_seq_inp = batch_seq[:, i, :-1]
            batch_seq_true = batch_seq[:, i, 1:]
            print(f"Validation Sequence Input Shape: {batch_seq_inp.shape}")
            print(f"Validation Sequence True Shape: {batch_seq_true.shape}")
        
            loss, acc = self._compute_caption_loss_and_acc(
                img_embed, batch_seq[:, i, :], training=False
            )

            # 3. Update batch loss and batch accuracy
            batch_loss += loss
            batch_acc += acc

        batch_acc /= float(self.num_captions_per_image)

        # 4. Update the trackers
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_acc)

        # 5. Return the loss and accuracy values
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    @property
    def metrics(self):
        # We need to list our metrics here so the `reset_states()` can be
        # called automatically.
        return [self.loss_tracker, self.acc_tracker]

# Model construction

# Initialize components
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)


CNN Model Summary:
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
rescaling_1 (Rescaling)         (None, 299, 299, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
normalization_1 (Normalization) (None, 299, 299, 3)  7           rescaling_1[0][0]                
__________________________________________________________________________________________________
stem_conv_pad (ZeroPadding2D)   (None, 301, 301, 3)  0           normalization_1[0][0]            
________________________________________________________________________

Instantiate the model.

In [20]:
# Create the ImageCaptioningModel
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model, encoder=encoder, decoder=decoder, image_aug=image_augmentation,
)

# Model testing

Load the desired model weights.

In [21]:
mdx = '231005'  # Sets the version
tmpx = f'/data/Model_weights/{mdx}/Temp/'

# Check for the files
try:
    # Loading weights
    weight_path = f'{tmpx}imgcap_{mdx}'
    fls = os.listdir(tmpx)

    # Look for specific weight files (like .index or .data-00000-of-00001)
    checkpoint_files = [f for f in fls if "imgcap_" in f]
    
    if len(checkpoint_files) > 0:
        print("Found saved weights, loading them now...")
        caption_model.load_weights(weight_path)
        print("Saved weights loaded successfully")
    else:
        print("No saved weights found")
except FileNotFoundError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Found saved weights, loading them now...
Saved weights loaded successfully


Function to generate caption.

In [22]:
try:
    vocab = vectorization.get_vocabulary() if vectorization else None
    if vocab:
        index_lookup = dict(zip(range(len(vocab)), vocab))
        print(f"Vocabulary size: {len(vocab)}")
    else:
        index_lookup = None
        print("Vocabulary is missing. Captions may not generate correctly.")
except AttributeError as e:
    index_lookup = None
    print(f"Failed to retrieve vocabulary: {e}")
except Exception as e:
    index_lookup = None
    print(f"Unexpected error while initializing vocabulary: {e}")

max_decoded_sentence_length = SEQ_LENGTH - 1

def generate_caption(caption_model, image_path, display_image=False):
    """
    Generate a caption for the given image.

    Args:
    - caption_model: Trained image captioning model.
    - image_path: Path to the input image.
    - display_image (bool): Whether to display the image. Default is False.

    Returns:
    - str: The generated caption or an error message.
    """
    try:
        # Decode and resize the image
        sample_img = decode_and_resize(image_path)
        print(sample_img)
        if sample_img is None:
            print(f"Error: Image could not be processed for {image_path}.")
            return "Image could not be processed."

        img = sample_img.numpy().clip(0, 255).astype(np.uint8)

        # Display the image if required
        if display_image:
            plt.axis('off')
            plt.imshow(img)
            plt.show()
            plt.close()

        # Process the image
        # Pass the image to the CNN
        img_tensor = tf.expand_dims(sample_img, 0)
        print(f"Image tensor shape: {img_tensor.shape}")
        img_features = caption_model.cnn_model(img_tensor)
        print(f"Features shape after CNN: {img_features.shape}")
        
        # Expand dimensions to make it compatible with the encoder
        img_features = tf.expand_dims(img_features, 1)

        # Encode the image
        # Pass the image features to the Transformer encoder
        encoded_img = caption_model.encoder(img_features, training=False)

        # Decode the caption
        # Generate the caption using the Transformer decoder
        decoded_caption = "<start> "
        for i in range(max_decoded_sentence_length):
            if vectorization is None:
                print("Error: Caption generation unavailable.")
                return "Caption generation unavailable."

            # Ensure decoded_caption is passed as a list of strings
            tokenized_caption = vectorization(tf.constant([decoded_caption]))[:, :-1]
            if tokenized_caption is None or not tf.is_tensor(tokenized_caption):
                print(f"Tokenization failed for caption: {decoded_caption}")
                return "Error: Tokenization failed."
            print(f"Tokenized caption shape: {tokenized_caption.shape}")
            
            # Create mask for the tokenized caption
            mask = tf.math.not_equal(tokenized_caption, 0)
            if mask is None or not tf.is_tensor(mask):
                print("Mask creation failed.")
                return "Error: Mask creation failed."
            print(f"Mask shape: {mask.shape}")
            
            predictions = caption_model.decoder(
                tokenized_caption, encoded_img, training=False, mask=mask
            )
            if predictions is None or not tf.is_tensor(predictions):
                print("Decoder predictions failed.")
                return "Error: Decoder predictions failed."
            print(f"Predictions shape: {predictions.shape}")

            # Get the predicted token
            sampled_token_index = np.argmax(predictions[0, -1, :])

            # Check if token index is valid
            if sampled_token_index >= len(vocab):
                print(f"Warning: Token index {sampled_token_index} out of range.")
                continue
            
            sampled_token = index_lookup[sampled_token_index]
            
            if sampled_token == "[UNK]":
                continue  # Skip unknown tokens

            # Handle noisy or unknown tokens
            if sampled_token in ("[UNK]", ""):
                print(f"Warning: Encountered noisy token '{sampled_token}'. Skipping.")
                continue  # Skip this token

            if sampled_token == "<end>":
                break
            
            decoded_caption += " " + sampled_token

        # Clean the caption
        decoded_caption = (
            decoded_caption.replace("<start> ", "")
            .replace(" <end>", "")
            .replace("[UNK]", "")
            .strip()
        )
        print(f"Generated caption for image {image_path}: {decoded_caption}")
        return decoded_caption
    except Exception as e:
        print(f"Error generating caption for image {image_path}: {e}")
        return "Error generating caption."

Vocabulary size: 3138


Choose the desired images to test.

In [23]:
# Main loop to process test images
dataset_directory = "/data/test/rxxch9vw59-2/"
test_image_filenames = os.listdir(dataset_directory+"images")

# Verify the number of test images
print(f"Number of test images: {len(test_image_filenames)}")

Number of test images: 1442


Display the image and the generated caption.

In [24]:
# Display the image and the generated caption.
captions = {}
for image_name in test_image_filenames[:5]:
    image_path = os.path.join(dataset_directory, "images", image_name)
    caption = generate_caption(caption_model, image_path, display_image=True)
    if caption:
        captions[image_path] = caption
        print(f"Image: {image_name}\nPredicted Caption: {caption}\n")

tf.Tensor(
[[[  5.         13.         16.       ]
  [  5.         13.         16.       ]
  [  5.5267563  13.526756   16.526756 ]
  ...
  [  8.216848   20.216848   32.216846 ]
  [  8.458194   20.458195   32.458195 ]
  [  8.458194   20.458195   32.458195 ]]

 [[  5.6254177  13.625418   16.625418 ]
  [  5.6254177  13.625418   16.625418 ]
  [  5.8227315  13.822731   16.822731 ]
  ...
  [  8.921408   20.921408   32.921406 ]
  [  9.250835   21.250835   33.250835 ]
  [  9.250835   21.250835   33.250835 ]]

 [[  6.         14.         17.       ]
  [  6.         14.         17.       ]
  [  6.         14.         17.       ]
  ...
  [ 10.137707   22.137707   34.137707 ]
  [ 10.29097    22.29097    34.29097  ]
  [ 10.29097    22.29097    34.29097  ]]

 ...

 [[158.42093   142.65898   123.44271  ]
  [153.82774   136.65384   120.270935 ]
  [167.63065   152.48405   136.14848  ]
  ...
  [172.70146   156.42607   132.61723  ]
  [165.04141   144.6434    119.84241  ]
  [204.62683   180.8509    152.98

2024-12-23 03:10:46.714216: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8401
2024-12-23 03:10:47.854927: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-12-23 03:10:47.856346: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-12-23 03:10:47.856386: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2024-12-23 03:10:47.857319: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-12-23 03:10:47.857399: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1

In [25]:
# Dictionary to store generated captions
gen_cap = {}

In [26]:
# Save captions to JSON
output_directory = "/results/Captions"
os.makedirs(output_directory, exist_ok=True)  # Ensure the output directory exists

In [27]:
# Generate captions for all images in the dataset
for idx, filename in enumerate(test_image_filenames):
    path = f"{dataset_directory}images/{filename}"
    caption = generate_caption(caption_model, path, display_image=False)
    gen_cap[filename] = caption
    print(f"Processed {idx + 1}/{len(test_image_filenames)}: {filename} -> {caption}")

tf.Tensor(
[[[  5.         13.         16.       ]
  [  5.         13.         16.       ]
  [  5.5267563  13.526756   16.526756 ]
  ...
  [  8.216848   20.216848   32.216846 ]
  [  8.458194   20.458195   32.458195 ]
  [  8.458194   20.458195   32.458195 ]]

 [[  5.6254177  13.625418   16.625418 ]
  [  5.6254177  13.625418   16.625418 ]
  [  5.8227315  13.822731   16.822731 ]
  ...
  [  8.921408   20.921408   32.921406 ]
  [  9.250835   21.250835   33.250835 ]
  [  9.250835   21.250835   33.250835 ]]

 [[  6.         14.         17.       ]
  [  6.         14.         17.       ]
  [  6.         14.         17.       ]
  ...
  [ 10.137707   22.137707   34.137707 ]
  [ 10.29097    22.29097    34.29097  ]
  [ 10.29097    22.29097    34.29097  ]]

 ...

 [[158.42093   142.65898   123.44271  ]
  [153.82774   136.65384   120.270935 ]
  [167.63065   152.48405   136.14848  ]
  ...
  [172.70146   156.42607   132.61723  ]
  [165.04141   144.6434    119.84241  ]
  [204.62683   180.8509    152.98



tf.Tensor(
[[[37.943142  37.03406   35.03406  ]
  [56.902664  55.902664  53.902664 ]
  [64.23653   60.23653   59.23653  ]
  ...
  [ 1.9027153  1.9027153  1.9027153]
  [ 1.310671   1.310671   1.310671 ]
  [ 2.2380776  2.2380776  2.2380776]]

 [[36.993656  36.7878    34.7878   ]
  [58.189594  57.189594  55.189594 ]
  [69.07358   65.07358   64.07358  ]
  ...
  [ 1.9205163  1.9205163  1.9205163]
  [ 1.9854096  1.9854096  1.9854096]
  [ 1.9148855  1.9148855  1.9148855]]

 [[38.697117  38.626884  36.626884 ]
  [64.63187   63.631863  61.631863 ]
  [70.73442   66.73442   65.73442  ]
  ...
  [ 2.0086935  2.0086935  2.0086935]
  [ 1.         1.         1.       ]
  [ 2.3246777  2.3246777  2.3246777]]

 ...

 [[12.044451  10.044451  11.834295 ]
  [24.707317  23.707317  20.357738 ]
  [28.226036  27.234726  23.375168 ]
  ...
  [ 3.505604   3.505604   3.505604 ]
  [ 2.6237884  2.6237884  2.6237884]
  [ 1.2909546  1.2909546  1.2909546]]

 [[16.731155  16.307913  13.851886 ]
  [21.362497  20.004618  1



Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape:



Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape:



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



tf.Tensor(
[[[ 68.53177    70.606125   62.349884 ]
  [ 66.266525   69.182144   61.76717  ]
  [ 63.247494   63.518654   53.992798 ]
  ...
  [127.62373   128.09851   135.16083  ]
  [128.04398   158.49713   184.16583  ]
  [ 92.80844   110.74371   125.75707  ]]

 [[ 66.2408     65.05917    54.136868 ]
  [ 95.895294   89.6653     73.07873  ]
  [139.00694   131.16284   112.144325 ]
  ...
  [ 81.97166    60.123135   77.61887  ]
  [147.23576   147.07298   156.2862   ]
  [166.98819   184.07281   201.65599  ]]

 [[149.66066   142.52148   130.96808  ]
  [189.469     186.28171   179.27167  ]
  [203.37509   202.53331   201.09879  ]
  ...
  [ 55.23409    49.30836    56.725433 ]
  [104.529724   90.62129   106.46822  ]
  [168.9671    181.21451   193.3866   ]]

 ...

 [[ 36.35813     1.9771024  24.469032 ]
  [ 41.25008     2.453308   30.663292 ]
  [ 41.25261     4.615384   33.835075 ]
  ...
  [ 52.908413   64.90842    78.90842  ]
  [ 51.615356   61.769287   78.38464  ]
  [ 50.002872   60.13739    74.75



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[184.32297   193.32297   198.32297  ]
  [185.68964   194.68964   199.68964  ]
  [184.49164   193.49164   198.49164  ]
  ...
  [184.77094   197.77094   206.77094  ]
  [184.        197.        205.26257  ]
  [184.        197.        205.       ]]

 [[186.50836   195.95218   199.62071  ]
  [188.        197.        202.       ]
  [186.2291    195.2291    200.2291   ]
  ...
  [184.77094   197.77094   206.       ]
  [184.        197.        205.26257  ]
  [184.        197.        205.       ]]

 [[185.31886   193.77705   196.54796  ]
  [188.        196.19565   199.75418  ]
  [187.7709    195.2291    201.       ]
  ...
  [184.77094   197.77094   204.81137  ]
  [184.        197.        205.26257  ]
  [184.        197.        205.       ]]

 ...

 [[ 88.         82.         68.       ]
  [ 88.48173    82.48173    68.48173  ]
  [ 85.17665    79.17665    65.17665  ]
  ...
  [ 36.646812   28.646812    8.45813  ]
  [ 33.924736   25.924736    6.924736 ]
  [ 32.         24.          5.  



tf.Tensor(
[[[1.1269445e-02 1.1269445e-02 1.1269445e-02]
  [1.9039549e+00 1.9039549e+00 1.9039549e+00]
  [6.3108891e-01 6.3108891e-01 1.4896030e+00]
  ...
  [9.8043782e-01 9.8043782e-01 9.8043782e-01]
  [2.4200289e+00 2.4200289e+00 2.4200289e+00]
  [2.5312572e+00 2.5312572e+00 2.5312572e+00]]

 [[3.1409040e-02 3.1409040e-02 3.1409040e-02]
  [8.6782038e-01 8.3161259e-01 8.3161259e-01]
  [4.9483788e-01 0.0000000e+00 0.0000000e+00]
  ...
  [1.0500172e+00 1.0500172e+00 1.0500172e+00]
  [2.9074268e+00 2.9074268e+00 2.9074268e+00]
  [1.0862411e+00 1.0862411e+00 1.0862411e+00]]

 [[2.1739125e-02 2.1739125e-02 2.1739125e-02]
  [2.3307407e+00 1.3307407e+00 6.4693987e-01]
  [1.3258900e+01 1.0720367e+01 8.0884056e+00]
  ...
  [8.0508301e+01 8.1100281e+01 8.2284225e+01]
  [1.0902250e+01 1.0902250e+01 1.0902250e+01]
  [1.1582203e+00 1.1582203e+00 1.1582203e+00]]

 ...

 [[0.0000000e+00 0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.000000



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[ 52.45652   65.68405   45.00146 ]
  [ 52.663563  67.66357   45.238815]
  [ 23.42528   34.425278  19.231302]
  ...
  [ 12.        12.        10.      ]
  [ 16.958252  16.958252  14.958252]
  [ 94.84244   95.3669    60.482285]]

 [[ 42.68124   55.68124   38.544117]
  [ 32.52962   44.767796  30.284275]
  [ 31.03854   39.449913  26.202263]
  ...
  [ 13.725752  13.725752  11.725752]
  [ 18.958252  19.821129  14.405647]
  [105.502914  99.929886  61.294746]]

 [[ 16.060732  23.060732  15.498859]
  [ 29.272655  37.272655  24.189043]
  [ 22.265202  30.265202  17.265202]
  ...
  [ 14.        14.        12.      ]
  [ 26.692467  27.692467  21.734215]
  [ 59.10511   58.263416  30.155832]]

 ...

 [[167.2936   154.05827  126.564156]
  [133.70363  126.265396 101.58009 ]
  [136.74301  129.74301  101.74301 ]
  ...
  [ 44.753048  46.753048  35.753048]
  [ 39.889603  45.889603  35.889603]
  [ 44.51713   46.51713   32.836468]]

 [[158.50938  149.50938  120.50938 ]
  [149.21204  142.21204  1



tf.Tensor(
[[[ 65.        70.        64.      ]
  [ 65.        70.        64.      ]
  [ 65.        70.        64.      ]
  ...
  [ 16.730442  22.730442  20.730442]
  [ 15.377927  17.377926  16.377926]
  [ 51.269604  49.269604  50.269604]]

 [[ 65.431435  70.431435  64.431435]
  [ 65.431435  70.431435  64.431435]
  [ 65.431435  70.431435  64.431435]
  ...
  [ 17.769861  23.769861  21.769861]
  [ 15.240998  17.241     16.241   ]
  [ 36.158348  34.158348  35.158348]]

 [[ 66.        71.        65.      ]
  [ 66.        71.        65.      ]
  [ 66.        71.        65.      ]
  ...
  [ 17.913414  23.913414  21.913414]
  [ 16.996721  18.996721  17.996721]
  [ 25.922733  23.922733  24.922733]]

 ...

 [[190.11717   56.891514  52.14293 ]
  [201.82799   48.959522  37.558933]
  [204.31477   38.07283   18.75493 ]
  ...
  [ 32.42151   40.42151   59.42151 ]
  [ 42.0676    50.0676    71.41055 ]
  [ 57.730595  65.7306    88.7306  ]]

 [[160.89514   39.337566  32.137684]
  [192.64738   45.89906   



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[ 5.  1.  2.]
  [ 5.  1.  2.]
  [ 5.  1.  2.]
  ...
  [ 4.  4.  4.]
  [ 4.  4.  4.]
  [ 4.  4.  4.]]

 [[ 5.  1.  2.]
  [ 5.  1.  2.]
  [ 5.  1.  2.]
  ...
  [ 4.  4.  4.]
  [ 4.  4.  4.]
  [ 4.  4.  4.]]

 [[ 7.  3.  4.]
  [ 7.  3.  4.]
  [ 7.  3.  4.]
  ...
  [ 4.  4.  4.]
  [ 4.  4.  4.]
  [ 4.  4.  4.]]

 ...

 [[18. 11. 19.]
  [18. 11. 19.]
  [18. 11. 19.]
  ...
  [ 1.  1.  1.]
  [ 1.  1.  1.]
  [ 1.  1.  1.]]

 [[18. 13. 20.]
  [17. 12. 19.]
  [16. 11. 18.]
  ...
  [ 1.  1.  1.]
  [ 1.  1.  1.]
  [ 1.  1.  1.]]

 [[18. 13. 20.]
  [17. 12. 19.]
  [16. 11. 18.]
  ...
  [ 1.  1.  1.]
  [ 1.  1.  1.]
  [ 1.  1.  1.]]], shape=(299, 299, 3), dtype=float32)
Image tensor shape: (1, 299, 299, 3)
Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Posit



tf.Tensor(
[[[ 50.        54.        53.      ]
  [ 50.377922  54.377922  53.377922]
  [ 47.        51.        50.      ]
  ...
  [ 76.07142   76.07142   76.07142 ]
  [ 79.76063   79.76063   79.76063 ]
  [ 80.37625   80.25084   82.62709 ]]

 [[ 55.        59.        58.      ]
  [ 48.762535  52.762535  51.762535]
  [ 48.66388   52.66388   51.66388 ]
  ...
  [196.77539  205.77539  202.77539 ]
  [ 64.759346  73.75422   70.7538  ]
  [ 78.71827   79.71827   74.71827 ]]

 [[ 51.        55.        54.      ]
  [ 51.99833   55.99833   54.99833 ]
  [ 48.33612   52.33612   51.33612 ]
  ...
  [ 54.57173   58.57173   61.57173 ]
  [ 73.49228   78.49228   72.4957  ]
  [ 60.863945  65.863945  59.863945]]

 ...

 [[143.65993  141.65993  129.65993 ]
  [145.37639  143.37639  131.37639 ]
  [132.63531  130.63531  118.635315]
  ...
  [149.04541  153.04541  152.04541 ]
  [155.88428  159.88599  158.88086 ]
  [158.12413  163.12413  157.12413 ]]

 [[140.87312  138.87312  126.87313 ]
  [129.995    127.99499  1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[236.57631  236.57631  244.99771 ]
  [235.00168  235.00168  245.00168 ]
  [236.05519  236.05527  245.94481 ]
  ...
  [129.0967   128.20862  125.88614 ]
  [132.24962  127.256744 132.35583 ]
  [132.56058  126.7776   136.13487 ]]

 [[235.26985  235.26985  243.68915 ]
  [236.04884  236.04884  246.0388  ]
  [237.07251  237.12602  244.96548 ]
  ...
  [122.945694 125.01398  121.79604 ]
  [123.50151  122.77946  125.11038 ]
  [121.78693  119.80699  122.76686 ]]

 [[246.92168  246.92168  254.92168 ]
  [245.70895  245.70895  253.70895 ]
  [240.90858  240.97002  248.78484 ]
  ...
  [122.77899  127.77063  123.81333 ]
  [118.13278  120.16094  115.134125]
  [121.33954  123.35979  118.288055]]

 ...

 [[ 45.230873  44.239204  49.21421 ]
  [ 46.75496   45.76329   50.738297]
  [ 41.223076  40.231407  45.206413]
  ...
  [ 32.3388    18.74417   37.229576]
  [ 32.861668  27.427227  37.861668]
  [ 37.273243  40.23644   42.27982 ]]

 [[ 41.543533  40.543533  46.53853 ]
  [ 39.82687   38.82687   



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[124.26923   94.26923  120.26923 ]
  [123.358826  93.358826 119.358826]
  [122.842804  92.842804 118.842804]
  ...
  [104.96985   88.96985  115.96985 ]
  [103.82036   87.82036  113.82036 ]
  [104.26923   88.26923  115.04682 ]]

 [[124.        94.       120.      ]
  [123.80769   93.80769  119.80769 ]
  [121.898186  91.94128  117.91973 ]
  ...
  [105.        89.       116.      ]
  [105.66724   89.66724  115.85954 ]
  [104.04277   88.04277  114.71359 ]]

 [[122.        92.       118.      ]
  [122.        92.       118.      ]
  [119.765884  91.765884 116.765884]
  ...
  [104.        88.       115.      ]
  [105.        89.       116.      ]
  [104.07699   88.07699  114.2994  ]]

 ...

 [[130.5686   127.87622  164.87622 ]
  [135.98825  131.52628  165.8848  ]
  [140.41966  136.41966  169.41966 ]
  ...
  [145.34619  148.23419  189.23419 ]
  [147.34619  150.34619  193.34619 ]
  [146.077    149.077    192.077   ]]

 [[130.03015  126.03014  163.03015 ]
  [133.66722  127.66722  1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]]

 [[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]]

 [[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]]

 ...

 [[206.31802  154.46834   78.33775 ]
  [199.13782  146.61746   77.12106 ]
  [178.9903   126.84353   68.15709 ]
  ...
  [185.7292   158.0803    84.731285]
  [198.96434  171.43466   96.39688 ]
  [227.22656  200.38731  123.35714 ]]

 [[188.44951  134.44951   61.57342 ]
  [184.26274  129.4485    



tf.Tensor(
[[[202.53236  198.81758  220.8932  ]
  [214.01437  225.31877  249.11774 ]
  [220.92863  233.55362  254.63605 ]
  ...
  [123.65273  107.22689   82.53882 ]
  [115.97967  112.83234   88.07793 ]
  [106.50559   85.74356   67.07411 ]]

 [[184.09824  188.43452  210.21306 ]
  [211.52818  218.79918  237.36586 ]
  [220.10347  236.00534  252.66739 ]
  ...
  [ 99.68117   88.90027   77.619316]
  [119.24325  111.52515   82.85126 ]
  [109.74301  108.89077   75.95623 ]]

 [[172.7286   174.50491  194.88791 ]
  [204.15541  211.99213  228.33629 ]
  [213.36455  230.3455   249.54515 ]
  ...
  [ 83.42026   70.96993   55.90398 ]
  [ 92.44017   81.437965  56.78196 ]
  [125.11804  114.97679   82.448616]]

 ...

 [[ 48.413464  55.617626  53.619373]
  [ 54.31772   55.37624   52.503315]
  [ 52.694775  57.537148  55.73091 ]
  ...
  [ 64.39136   64.15586   52.94682 ]
  [ 62.043644  61.043644  57.135494]
  [ 74.315605  71.01343   58.86395 ]]

 [[ 55.098843  50.031048  53.92269 ]
  [ 54.345448  54.04626   



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[174.62407   167.62407   175.62407  ]
  [176.37195   170.88557   178.12875  ]
  [174.84282   169.84282   176.84282  ]
  ...
  [ 15.655384   16.192308   12.245538 ]
  [ 24.870043   23.870043   19.870043 ]
  [ 26.611668   25.970554   25.73508  ]]

 [[174.22241   167.22241   175.22241  ]
  [176.38461   168.8077    177.       ]
  [176.8077    169.8077    177.8077   ]
  ...
  [ 15.677533   16.453535   10.677533 ]
  [ 18.430876   17.558863   13.494869 ]
  [ 21.531023   21.531023   19.531023 ]]

 [[173.14542   165.70917   173.85458  ]
  [174.6923    164.6923    173.6923   ]
  [175.03879   168.03879   176.03879  ]
  ...
  [  9.937848   10.713849    6.0498476]
  [  6.689285    7.689285    3.3821838]
  [ 19.453115   21.230703   16.230703 ]]

 ...

 [[119.58949   122.88942   125.4666   ]
  [100.91117   112.7685    119.60124  ]
  [131.93512   132.62793   132.28152  ]
  ...
  [ 84.26954    70.75651    60.48806  ]
  [100.85128    78.1589     64.85128  ]
  [ 98.759384   74.2342     59.72



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



tf.Tensor(
[[[196.23077  193.23077  186.23077 ]
  [197.93336  194.93336  187.93336 ]
  [199.80783  196.80783  189.80783 ]
  ...
  [208.9466   205.9466   200.9466  ]
  [208.51666  205.51666  200.51666 ]
  [209.97403  206.97403  201.97403 ]]

 [[197.29959  194.29959  187.29959 ]
  [198.15384  195.15384  188.15384 ]
  [199.87625  196.87625  189.87625 ]
  ...
  [208.64752  205.64752  200.64752 ]
  [207.6423   204.6423   199.6423  ]
  [206.4269   203.4269   198.4269  ]]

 [[199.24916  196.24916  189.24916 ]
  [200.15384  197.15384  190.15384 ]
  [199.29263  196.29263  189.29263 ]
  ...
  [205.21577  202.21577  197.19441 ]
  [208.85257  205.85257  200.69873 ]
  [209.17218  206.17218  201.01834 ]]

 ...

 [[ 21.814503  20.814503  18.814503]
  [ 21.923035  20.923035  18.923035]
  [ 20.343874  19.343874  17.343874]
  ...
  [ 26.2029    25.341755  23.341755]
  [ 28.05639   28.093592  25.98199 ]
  [ 55.805878  54.805878  50.805878]]

 [[ 19.400381  18.400381  16.400381]
  [ 22.22814   21.22814   



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



tf.Tensor(
[[[255.       255.       255.      ]
  [255.       255.       255.      ]
  [255.       255.       255.      ]
  ...
  [ 88.12011   84.19369   73.34084 ]
  [ 89.738045  85.81162   76.81162 ]
  [ 90.22577   86.22577   74.29935 ]]

 [[255.       255.       255.      ]
  [255.       255.       255.      ]
  [255.       255.       255.      ]
  ...
  [ 84.247086  81.247086  72.247086]
  [ 86.219025  83.219025  74.219025]
  [ 86.078606  82.078606  71.078606]]

 [[255.       255.       255.      ]
  [255.       255.       255.      ]
  [255.       255.       255.      ]
  ...
  [ 81.896324  80.        71.07383 ]
  [ 84.33608   82.43976   73.07187 ]
  [ 83.784294  80.15219   69.88797 ]]

 ...

 [[ 85.94189   58.14382   30.15021 ]
  [ 80.1162    53.63211   29.22407 ]
  [ 78.14571   51.01887   28.605684]
  ...
  [251.32799  218.7259   189.53966 ]
  [243.12988  196.79196  132.23468 ]
  [239.35631  171.05144   98.23757 ]]

 [[ 88.397995  59.62759   29.742388]
  [ 80.09708   53.65728   



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[117.43848   118.82908   122.73143  ]
  [110.481476  114.365776  115.15569  ]
  [112.12881   111.672195  113.99499  ]
  ...
  [243.18896   248.18896   251.18896  ]
  [243.29295   248.29295   251.29295  ]
  [243.30032   248.30032   251.30032  ]]

 [[117.643814  119.649445  123.32472  ]
  [112.386116  115.65811   115.43404  ]
  [110.9295    109.66375   113.08903  ]
  ...
  [243.96335   248.96335   251.96335  ]
  [243.58621   248.58621   251.58621  ]
  [243.96854   248.96854   251.96854  ]]

 [[114.510216  119.50181   122.50181  ]
  [109.0095    114.004616  117.51285  ]
  [109.19331   113.35028   116.35652  ]
  ...
  [243.81104   248.81104   251.81104  ]
  [243.01004   248.01004   251.01004  ]
  [243.169     248.169     251.169    ]]

 ...

 [[  4.160574    4.160574    6.160574 ]
  [  4.4814515   3.9680736   7.508208 ]
  [  2.811037    2.811037    4.433111 ]
  ...
  [ 10.623973   10.623973   12.623973 ]
  [  9.00517     9.00517    11.00517  ]
  [ 10.837769   10.837769   12.83



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[ 2.1677666   2.2516499   2.6577778 ]
  [ 2.407213    2.3244593   1.9238012 ]
  [ 2.          2.          1.825455  ]
  ...
  [ 2.5705998   2.5705998   2.5705998 ]
  [ 2.1688962   2.1688962   2.1688962 ]
  [ 2.581709    2.581709    2.581709  ]]

 [[ 7.7814374   8.278093    4.5297875 ]
  [ 1.7415715   1.4998658   3.2252517 ]
  [ 2.          2.          1.5240829 ]
  ...
  [ 3.          3.          2.5101964 ]
  [ 3.          3.          1.9866219 ]
  [ 2.7483253   2.7483253   2.7483253 ]]

 [[12.947291   13.449595    9.2859    ]
  [ 2.731269    3.1450367   0.80412424]
  [ 2.          1.5918813   3.6428776 ]
  ...
  [ 2.5918305   2.5918305   2.4414947 ]
  [ 2.1555185   2.1555185   0.98309386]
  [ 2.0782719   2.0782719   2.0782719 ]]

 ...

 [[ 2.078279    2.078279    2.078279  ]
  [ 2.          2.          2.        ]
  [ 3.0803595   3.0803595   3.0803595 ]
  ...
  [ 4.          2.          3.        ]
  [ 4.          2.          3.        ]
  [ 2.580542    1.4194579   2.   



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[169.93047  154.93047  135.93047 ]
  [186.4497   173.4497   154.4497  ]
  [223.19083  210.84468  189.88313 ]
  ...
  [ 73.38469   86.07707   99.73088 ]
  [206.69897  227.92993  240.73767 ]
  [208.69186  236.23044  253.19057 ]]

 [[213.99557  200.99557  181.99557 ]
  [183.98964  170.98964  151.98964 ]
  [170.85799  158.51184  137.5503  ]
  ...
  [ 83.511795  96.20418  109.85799 ]
  [211.23297  232.46393  245.0542  ]
  [212.18748  239.72606  254.8032  ]]

 [[217.53403  204.53403  185.53403 ]
  [224.64201  211.64201  192.64201 ]
  [211.64645  199.3003   178.33876 ]
  ...
  [ 91.835396 104.52778  118.18159 ]
  [211.51266  232.74362  244.74362 ]
  [212.16823  239.7068   254.69965 ]]

 ...

 [[124.25504  106.25504   70.255035]
  [123.84836  104.84836   71.84836 ]
  [121.90525  102.90525   70.5591  ]
  ...
  [113.321465 106.321465  99.62908 ]
  [116.21722  109.21722   99.60174 ]
  [105.069275 101.069275  89.069275]]

 [[125.38019  107.38019   71.38019 ]
  [122.002884 103.002884  



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



tf.Tensor(
[[[182.       206.       218.      ]
  [183.       207.       219.      ]
  [184.       208.       220.      ]
  ...
  [ 96.07361   78.07361   68.07361 ]
  [ 93.48828   80.26758   68.75586 ]
  [ 91.        81.        69.      ]]

 [[182.       206.       218.      ]
  [183.       207.       219.      ]
  [184.       208.       220.      ]
  ...
  [ 96.63046   78.63046   68.63046 ]
  [ 93.90919   79.84667   68.75586 ]
  [ 92.11371   81.        69.      ]]

 [[182.       206.       218.      ]
  [183.       207.       219.      ]
  [184.       208.       220.      ]
  ...
  [ 97.07361   79.07361   69.07361 ]
  [ 94.470726  79.738304  68.982445]
  [ 93.54318   80.45682   69.      ]]

 ...

 [[180.11716  184.11716  187.11716 ]
  [186.0026   190.0026   193.0026  ]
  [190.20685  194.20685  197.20685 ]
  ...
  [161.90233  167.32306  170.18282 ]
  [158.82109  166.87546  169.74007 ]
  [156.35965  165.35965  170.35965 ]]

 [[186.56409  190.56409  193.56409 ]
  [190.15825  194.15825  1



tf.Tensor(
[[[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [ 64.35449   65.35449   49.043457]
  [ 93.42648   94.42648   76.42648 ]
  [ 88.195496  90.02661   69.533264]]

 [[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [ 78.930786  79.930786  63.61975 ]
  [100.06044  101.06044   83.06044 ]
  [110.04862  111.87974   91.38639 ]]

 [[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [ 81.90692   82.06578   78.05391 ]
  [ 62.280712  62.439575  59.58004 ]
  [ 62.570736  62.86163   61.004166]]

 ...

 [[108.81433  106.49664   81.178955]
  [109.02803  105.69697   80.45976 ]
  [109.230484 103.35797   78.90947 ]
  ...
  [ 48.        27.        22.      ]
  [ 48.414955  27.414957  22.414957]
  [ 48.973175  27.973173  22.973173]]

 [[107.35457  103.35457   76.35457 ]
  [106.01678  101.0034    



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tok



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[ 47.425674  47.425674  47.425674]
  [ 46.791546  46.791546  46.791546]
  [ 46.45358   46.45358   46.45358 ]
  ...
  [102.831924 102.831924 102.831924]
  [ 96.73268   96.73268   96.73268 ]
  [ 77.86354   77.86354   77.86354 ]]

 [[ 49.772     49.772     49.772   ]
  [ 47.409756  47.409756  47.409756]
  [ 47.717392  47.717392  47.717392]
  ...
  [103.18825  103.18825  103.18825 ]
  [101.83203  101.83203  101.83203 ]
  [ 95.80542   95.80542   95.80542 ]]

 [[ 46.20194   46.20194   46.20194 ]
  [ 46.712837  46.712837  46.712837]
  [ 49.148827  49.148827  49.148827]
  ...
  [102.24651  102.24651  102.24651 ]
  [101.30506  101.30506  101.30506 ]
  [ 94.46446   94.46446   94.46446 ]]

 ...

 [[ 46.        46.        46.      ]
  [ 46.        46.        46.      ]
  [ 44.938126  44.938126  44.938126]
  ...
  [ 46.272583  46.272583  46.272583]
  [ 52.362854  52.362854  52.362854]
  [ 59.56306   59.56306   59.56306 ]]

 [[ 47.276173  47.276173  47.276173]
  [ 44.816216  44.816216  



tf.Tensor(
[[[2.00768982e+02 2.05768982e+02 2.08768982e+02]
  [2.01915634e+02 2.05915634e+02 2.08915634e+02]
  [2.02010849e+02 2.06010849e+02 2.08686432e+02]
  ...
  [8.70034561e+01 8.42629852e+01 7.51764755e+01]
  [9.07469482e+01 8.77469482e+01 7.97869873e+01]
  [7.66193924e+01 7.50996170e+01 6.64329529e+01]]

 [[2.00993301e+02 2.04993301e+02 2.07993301e+02]
  [2.01486618e+02 2.05486618e+02 2.08486618e+02]
  [2.00727936e+02 2.05227036e+02 2.08227036e+02]
  ...
  [8.50937958e+01 8.26271210e+01 7.20271454e+01]
  [9.43393326e+01 9.13393326e+01 8.43393326e+01]
  [7.83795547e+01 7.53795547e+01 6.73527985e+01]]

 [[2.01397568e+02 2.06397568e+02 2.09397568e+02]
  [2.01389252e+02 2.06389252e+02 2.10389252e+02]
  [2.01378387e+02 2.06378387e+02 2.10378387e+02]
  ...
  [8.88666992e+01 8.58666992e+01 7.68666992e+01]
  [9.49304199e+01 9.19304199e+01 8.47338867e+01]
  [8.42599487e+01 8.42599487e+01 7.44514084e+01]]

 ...

 [[1.96173508e+02 5.04161060e-01 9.86621857e-01]
  [1.88899384e+02 0.00000000



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[ 92.91739  101.91739   98.91739 ]
  [100.86921  109.86921  106.350815]
  [103.70238  112.80104  107.50505 ]
  ...
  [ 97.73628  111.73628   98.73628 ]
  [ 94.25341  108.25341   95.25341 ]
  [ 90.642685 104.642685  91.642685]]

 [[ 95.36269  104.36269  101.36269 ]
  [ 99.01565  108.01565  104.49725 ]
  [101.170685 110.26935  104.97336 ]
  ...
  [101.596596 114.93439  102.927704]
  [ 95.47443  108.81222   96.80553 ]
  [ 92.93817  106.27596   94.26927 ]]

 [[ 96.41973  105.41973  102.41973 ]
  [ 97.5912   106.5912   103.0728  ]
  [ 99.29599  108.39465  103.09866 ]
  ...
  [ 95.314735 106.21106   96.7629  ]
  [ 92.976585 103.8729    94.42474 ]
  [ 94.04185  104.93817   95.49001 ]]

 ...

 [[ 47.188103  44.188103  61.188103]
  [ 47.707348  44.966545  61.188953]
  [ 48.546814  46.546814  60.44815 ]
  ...
  [ 37.60547   37.60547   39.60547 ]
  [ 37.963135  37.963135  39.963135]
  [ 36.678955  36.678955  38.678955]]

 [[ 47.807865  44.807865  61.807865]
  [ 48.259197  45.518394  



tf.Tensor(
[[[169.61705  178.61705  195.61705 ]
  [180.8266   192.8266   208.8266  ]
  [185.0602   197.0602   213.0602  ]
  ...
  [170.49231  154.01227  192.12793 ]
  [174.64577  169.25099  177.43532 ]
  [189.82784  184.82784  191.82784 ]]

 [[174.39966  183.39966  200.39966 ]
  [181.57846  193.57846  209.57846 ]
  [181.88293  193.88293  209.88293 ]
  ...
  [171.97473  155.49469  193.06519 ]
  [178.3078   172.91302  180.79787 ]
  [192.98167  187.98167  194.7091  ]]

 [[173.97324  182.97324  199.97324 ]
  [179.62039  191.62039  207.62039 ]
  [179.19899  191.19899  207.19899 ]
  ...
  [174.00404  157.524    193.63966 ]
  [182.28706  176.89229  183.97792 ]
  [196.59036  191.59036  197.59036 ]]

 ...

 [[ 37.922215  30.709812  23.771824]
  [ 30.903654  24.328459  17.178068]
  [115.885574 111.2741   103.46914 ]
  ...
  [252.7876   253.2124   253.      ]
  [252.7876   253.2124   253.      ]
  [252.7876   253.2124   253.      ]]

 [[ 34.42347   28.437988  29.799185]
  [ 31.213749  26.865906  



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[1.0785953  0.         0.07859528]
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  ...
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[1.3960022  0.         0.39600214]
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  ...
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[2.0785952  0.07859528 1.0785953 ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  ...
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 ...

 [[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  ...
  [1.0678349  0.1461544  0.6069946 ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[0.         0.         0.        ]
  [0.         0.         0



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [133.67638  123.39712  122.83173 ]
  [109.33064  100.33064  104.86599 ]
  [ 76.243355  67.243355  71.792885]]

 [[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [120.067955 109.78869  107.52493 ]
  [ 98.727234  89.727234  91.37602 ]
  [ 81.75173   72.75173   74.4474  ]]

 [[254.       254.       254.      ]
  [254.       254.       254.      ]
  [254.       254.       254.      ]
  ...
  [101.6535    92.02351   86.07523 ]
  [ 97.2338    88.20705   86.20705 ]
  [ 94.66541   85.63866   83.63866 ]]

 ...

 [[170.826    168.826    154.85274 ]
  [164.73526  162.73526  148.762   ]
  [158.86493  156.86493  142.89166 ]
  ...
  [148.28757  144.44977  129.1705  ]
  [149.54504  147.03168  133.00494 ]
  [152.56848  150.05511  136.02838 ]]

 [[159.6107   157.6107   142.6107  ]
  [159.33087  157.33087  1



tf.Tensor(
[[[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  ...
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  ...
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.10224348 0.10224348 0.10224348]
  ...
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 ...

 [[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  ...
  [0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[0.         0.         0.        ]
  [0.         0.         0



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[3.79915786e+00 1.67991581e+01 0.00000000e+00]
  [3.49273515e+00 1.56225939e+01 1.23301768e+00]
  [4.60240984e+00 1.06818714e+01 6.42140508e-01]
  ...
  [1.63755463e+02 1.67419754e+02 1.76419754e+02]
  [1.63041290e+02 1.61781586e+02 1.69185730e+02]
  [1.44124023e+02 1.43699219e+02 1.45911621e+02]]

 [[5.68342686e+00 1.96834259e+01 3.09254956e+00]
  [1.47156715e-01 1.21471567e+01 9.37570333e-02]
  [2.39269519e+00 1.21308985e+01 1.00715339e+00]
  ...
  [1.60151703e+02 1.66930969e+02 1.75004547e+02]
  [1.61321167e+02 1.60958313e+02 1.68232605e+02]
  [1.41255554e+02 1.41255554e+02 1.43255554e+02]]

 [[8.99163914e+00 1.86509666e+01 2.98454714e+00]
  [9.37199116e-01 1.20714426e+01 4.21404839e-01]
  [8.54512215e-01 1.44832773e+01 2.23739102e-01]
  ...
  [1.55306000e+02 1.64306000e+02 1.71306000e+02]
  [1.55559937e+02 1.59261429e+02 1.64898575e+02]
  [1.38700912e+02 1.38700912e+02 1.40700912e+02]]

 ...

 [[9.80868530e+01 7.98761597e+01 7.58761597e+01]
  [9.98240967e+01 8.16134033



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[136.       136.       236.      ]
  [126.352844 131.35284  225.35284 ]
  [130.74916  134.74916  233.74916 ]
  ...
  [252.54567  251.94191  249.56221 ]
  [ 74.533646  50.14418   15.086664]
  [ 15.279053  14.279053  12.279053]]

 [[133.88293  136.       235.94147 ]
  [128.80214  135.89244  227.      ]
  [128.82799  134.82799  232.71092 ]
  ...
  [254.       254.       254.      ]
  [159.5879   127.83899   89.86901 ]
  [ 16.762297  14.925138  13.204191]]

 [[129.06584  134.06584  234.06584 ]
  [136.       131.       223.      ]
  [130.30287  133.77444  226.53865 ]
  ...
  [203.6751   180.2303   137.51321 ]
  [ 66.745514  44.51696   16.783792]
  [ 10.        10.         8.      ]]

 ...

 [[ 37.484894  25.484896  13.484896]
  [ 39.83779   27.837793  15.837793]
  [ 31.349728  20.349728  14.349727]
  ...
  [175.46136  149.46136  126.46137 ]
  [174.87593  149.87593  127.87594 ]
  [173.       147.       124.      ]]

 [[ 32.500065  19.500065  13.500065]
  [ 33.95498   20.95498   



tf.Tensor(
[[[227.24248  226.24248  224.24248 ]
  [229.1745   228.1745   226.1745  ]
  [228.25084  227.25084  225.25084 ]
  ...
  [ 38.235966  40.235966  39.235966]
  [ 44.89647   46.89647   45.89647 ]
  [ 43.62317   45.62317   44.62317 ]]

 [[228.578    227.578    225.578   ]
  [231.01672  230.01672  228.01672 ]
  [229.71533  228.71533  226.71533 ]
  ...
  [ 39.099533  41.099533  40.099533]
  [ 39.585445  41.585445  40.585445]
  [ 44.926975  46.926975  45.926975]]

 [[229.57024  228.57024  226.57024 ]
  [231.85826  230.85826  228.85826 ]
  [231.       230.       228.      ]
  ...
  [113.702934 115.702934 114.702934]
  [ 72.38756   74.38756   73.38756 ]
  [ 46.561794  48.561794  47.561794]]

 ...

 [[131.79097  131.79097  139.79097 ]
  [136.3683   136.3683   144.3683  ]
  [134.16891  134.16891  142.16891 ]
  ...
  [ 49.296143  49.296143  49.296143]
  [ 33.120647  35.69914   34.69914 ]
  [ 28.398216  34.398216  32.398216]]

 [[129.10155  131.18346  138.48949 ]
  [132.44162  134.52353  1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask 



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[35.       39.       42.      ]
  [34.252857 38.252857 41.252857]
  [33.73743  37.73743  40.73743 ]
  ...
  [44.090275 45.090275 40.090275]
  [44.       45.       40.      ]
  [44.       45.       40.      ]]

 [[35.       39.       42.      ]
  [35.008987 39.008987 42.008987]
  [35.76756  39.76756  42.76756 ]
  ...
  [45.27083  46.27083  41.27083 ]
  [45.       46.       41.      ]
  [45.       46.       41.      ]]

 [[38.       39.       43.      ]
  [38.011707 39.011707 43.011707]
  [39.       40.       44.      ]
  ...
  [48.279266 49.279266 44.279266]
  [48.279266 49.279266 44.279266]
  [48.279266 49.279266 44.279266]]

 ...

 [[15.       15.       17.      ]
  [16.       16.       18.      ]
  [17.       17.       19.      ]
  ...
  [64.083534 52.083534 52.083534]
  [60.279297 49.26758  47.291016]
  [60.279297 49.279297 47.279297]]

 [[16.       16.       18.      ]
  [18.011705 18.011705 20.011705]
  [19.585264 19.585264 21.585264]
  ...
  [55.414734 44.414734 40.4



tf.Tensor(
[[[ 17.516722   13.516723   14.516723 ]
  [ 15.861204   11.861204   12.861204 ]
  [ 19.32237    15.3223715  16.32237  ]
  ...
  [ 66.14465    66.14465    66.14465  ]
  [ 59.63118    59.63118    59.63118  ]
  [ 67.46261    67.46261    67.46261  ]]

 [[ 20.776812   16.776812   17.776812 ]
  [ 22.59398    18.59398    19.59398  ]
  [ 25.350056   21.350056   22.350056 ]
  ...
  [ 67.53394    67.53394    67.53394  ]
  [ 56.536625   56.536625   56.536625 ]
  [ 68.22713    68.22713    68.22713  ]]

 [[ 20.214418   16.214418   17.214418 ]
  [ 25.45931    21.45931    22.45931  ]
  [ 27.932922   23.932922   24.932922 ]
  ...
  [ 66.28222    66.28222    66.28222  ]
  [ 54.691624   54.691624   54.691624 ]
  [ 71.097435   71.097435   71.097435 ]]

 ...

 [[136.6964    136.6964    136.6964   ]
  [137.16052   137.16052   137.16052  ]
  [142.43198   142.43198   142.43198  ]
  ...
  [103.06777   102.06777   107.06777  ]
  [ 94.94611    93.94611    98.94611  ]
  [ 77.314995   76.314995   81.31



tf.Tensor(
[[[127.33895  123.41555   72.59249 ]
  [108.43375  110.91079   43.04386 ]
  [112.75811  118.63925   53.92372 ]
  ...
  [147.43692  147.43846  128.85477 ]
  [128.62141  127.01942  108.48168 ]
  [ 96.41044   87.01779   60.539608]]

 [[101.13229   97.31554   68.19681 ]
  [112.46238  109.219574  57.431057]
  [118.67796  119.82178   53.53524 ]
  ...
  [130.49847  129.0754   113.88309 ]
  [152.31448  148.94373  128.34344 ]
  [142.18872  133.548    101.437645]]

 [[ 78.815796  73.71714   45.722214]
  [111.85992  104.52714   61.959476]
  [ 79.01569   78.38629   28.871561]
  ...
  [109.69293  105.96524   89.12647 ]
  [109.95076  101.50221   90.3254  ]
  [161.33849  151.09923  121.97581 ]]

 ...

 [[151.46161  111.66722   93.42319 ]
  [154.52744  117.39992   95.99051 ]
  [159.16112  117.58056   96.41944 ]
  ...
  [252.67027  224.43608  208.30609 ]
  [249.75447  233.21756  201.93509 ]
  [227.67673  202.74551  174.88608 ]]

 [[127.901825  96.34621   84.32461 ]
  [147.84724  110.52671   



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[ 55.476624 106.9716   104.71909 ]
  [ 48.737965  97.97664   94.220375]
  [ 57.099373 103.83683   98.62446 ]
  ...
  [112.7408   135.7408   127.7408  ]
  [115.45938  137.72946  127.245804]
  [120.00334  140.75081  126.50841 ]]

 [[ 50.490067 101.97882   99.720024]
  [ 55.06035  102.03527   99.02143 ]
  [ 56.53355  103.26229   98.04359 ]
  ...
  [114.73746  137.72742  129.73244 ]
  [118.2111   138.96863  129.24246 ]
  [122.00248  142.7387   128.50882 ]]

 [[ 47.55228   97.80056   94.29766 ]
  [ 61.08731  109.04049  103.29133 ]
  [ 58.22293  103.22293   96.74802 ]
  ...
  [114.729095 135.7247   128.7269  ]
  [117.953125 138.71066  129.22574 ]
  [122.46571  140.98328  129.23581 ]]

 ...

 [[109.51423  106.77046   93.762535]
  [101.24116  103.76277   83.970856]
  [100.57106  100.74118   75.42451 ]
  ...
  [134.183     83.64623   63.383686]
  [142.92784   96.02586   75.268326]
  [113.55002   92.26709   72.26709 ]]

 [[101.28373  100.03377   84.29    ]
  [100.51854  103.78733   



tf.Tensor(
[[[139.       122.        92.      ]
  [139.71071  122.7107    92.7107  ]
  [140.85117  123.85117   93.85117 ]
  ...
  [124.30484   79.13762   46.13762 ]
  [120.88138   75.71415   42.71415 ]
  [117.4837    72.31648   39.316483]]

 [[139.       122.        92.      ]
  [139.71071  122.7107    92.7107  ]
  [140.85117  123.85117   93.85117 ]
  ...
  [134.9719    85.09396   52.093964]
  [131.70862   81.830696  48.830696]
  [128.87498   78.997055  45.99706 ]]

 [[139.       122.        92.      ]
  [139.71071  122.7107    92.7107  ]
  [140.85117  123.85117   93.85117 ]
  ...
  [136.64217   79.806046  46.534805]
  [133.71075   76.87463   43.54468 ]
  [131.1192    74.28308   40.894386]]

 ...

 [[185.04507  177.04507  174.1349  ]
  [187.68752  176.89653  174.957   ]
  [187.86249  172.34273  172.16483 ]
  ...
  [216.48296  191.85591  181.6469  ]
  [209.66718  182.45816  174.6221  ]
  [198.15816  168.94914  162.35146 ]]

 [[182.74953  174.74953  171.74953 ]
  [186.1254   175.1254   1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tok



tf.Tensor(
[[[ 26.9599    50.939835  88.92478 ]
  [109.648926 123.603775 152.58873 ]
  [ 30.408947  45.50928   76.55945 ]
  ...
  [ 51.        46.        42.      ]
  [ 51.        46.        42.      ]
  [ 51.        46.        42.      ]]

 [[ 26.485151  50.465084  88.450035]
  [ 93.18652  107.14137  136.12633 ]
  [ 28.452593  43.552925  74.603096]
  ...
  [ 51.        46.        42.      ]
  [ 51.        46.        42.      ]
  [ 51.        46.        42.      ]]

 [[ 22.996822  46.976753  84.96171 ]
  [ 77.413376  91.368225 120.35317 ]
  [ 27.974916  43.07525   74.12542 ]
  ...
  [ 51.        46.        42.      ]
  [ 51.        46.        42.      ]
  [ 51.        46.        42.      ]]

 ...

 [[ 22.994984  14.010034  17.005016]
  [ 20.506163  14.506163  16.506163]
  [ 29.057573  25.10774   26.10774 ]
  ...
  [ 69.40535   37.43044   16.606037]
  [ 81.125725  40.050346  11.974966]
  [ 80.400314  44.957024  23.012749]]

 [[ 23.        14.01505   17.010033]
  [ 20.992626  14.992625  



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



Features shape after CNN: (1, 256)
Encoder Input Shape: (1, 1, 256)
Encoder Input Shape before LayerNorm: (1, 1, 256)
Encoder Input Shape after LayerNorm: (1, 1, 256)
Encoder Output Shape: (1, 1, 256)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1



tf.Tensor(
[[[202.01698  219.01698  247.01698 ]
  [204.05096  219.60623  247.1638  ]
  [200.79097  217.79097  244.97826 ]
  ...
  [ 90.73719  103.5499   102.36261 ]
  [ 98.128876 103.128876 106.25412 ]
  [101.57337  107.386086 113.386086]]

 [[201.2099   221.2099   248.2099  ]
  [204.2998   221.2998   247.68298 ]
  [202.21906  217.6572   246.      ]
  ...
  [ 82.08148  101.94414   90.82672 ]
  [132.25455  184.21967  187.31146 ]
  [ 62.77709   73.33896   79.119896]]

 [[201.93512  218.93512  246.93512 ]
  [201.24057  220.8643   247.98972 ]
  [202.58194  220.20903  248.      ]
  ...
  [ 86.54362  101.60716   96.52512 ]
  [ 63.516544  76.14277   73.64179 ]
  [ 96.531425 130.09659  132.33894 ]]

 ...

 [[122.925125 120.925125 133.92513 ]
  [138.57375  135.69917  145.26675 ]
  [159.98672  155.       151.19576 ]
  ...
  [ 67.076744  66.076744  72.076744]
  [ 66.38368   65.38368   71.38368 ]
  [ 56.923862  56.923862  58.923862]]

 [[123.80034  119.36235  132.92436 ]
  [145.17226  143.85532  1



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Predictions shape: (1, 7, 3138)
Tokenized caption shape: (1, 7)
Mask shape: (1, 7)
Decoder Input Shape: (1, 7)
Positional Embedding Input Shape: (1, 7)
Positional Embedding Output Shape: (1, 7, 256)
embedded_tokens dtype: <dtype: 'float32'>
embedded_positions dtype: <dtype: 'float32'>
Decoder Output Shape: (1, 7, 3138)
Pr



tf.Tensor(
[[[249.        248.085     252.82997  ]
  [249.51004   249.51004   251.51004  ]
  [250.        250.        250.96655  ]
  ...
  [  0.         34.         33.       ]
  [  0.         34.         33.       ]
  [  0.         34.         33.       ]]

 [[249.        248.75499   251.49002  ]
  [249.51004   249.51004   251.51004  ]
  [250.        250.        250.96655  ]
  ...
  [  0.         34.         33.       ]
  [  0.         34.         33.       ]
  [  0.         34.         33.       ]]

 [[249.        249.        251.       ]
  [249.9238    249.9238    251.9238   ]
  [250.        250.        251.83928  ]
  ...
  [  0.         34.         33.       ]
  [  0.         34.         33.       ]
  [  0.         34.         33.       ]]

 ...

 [[127.70799   189.92737   190.35349  ]
  [124.77125   183.94371   185.35747  ]
  [129.1752    192.11157   192.43933  ]
  ...
  [ 11.516663   32.516663   33.516663 ]
  [  9.51001    30.51001    31.51001  ]
  [  6.4250245  30.425024   30.42



tf.Tensor(
[[[76.75418  72.75418  60.75418 ]
  [76.75418  72.75418  60.75418 ]
  [74.82007  70.82007  58.820072]
  ...
  [58.       54.       45.      ]
  [56.516663 52.516663 43.516663]
  [53.17218  49.17218  40.17218 ]]

 [[80.       76.       64.      ]
  [80.       76.       64.      ]
  [80.226105 76.226105 64.226105]
  ...
  [58.       54.       45.      ]
  [56.516663 52.516663 43.516663]
  [53.17218  49.17218  40.17218 ]]

 [[77.45819  73.45819  61.458195]
  [77.45819  73.45819  61.458195]
  [78.98331  74.98331  62.983303]
  ...
  [58.       54.       45.      ]
  [56.516663 52.516663 43.516663]
  [53.17218  49.17218  40.17218 ]]

 ...

 [[44.39652  51.224277 10.741001]
  [45.864788 51.864788 17.864788]
  [49.1805   53.1805   27.902906]
  ...
  [19.478256 20.339401 16.033676]
  [19.08374  19.08374  19.08374 ]
  [19.08374  19.08374  19.08374 ]]

 [[54.45514  61.282898 20.79962 ]
  [47.973083 53.973083 19.973083]
  [57.31298  61.31298  36.035385]
  ...
  [17.72229  18.583435 14.2

In [28]:
# Save the generated captions to a JSON file
output_file = f"{output_directory}/captions.json"
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(gen_cap, outfile, ensure_ascii=False, indent=4)

In [29]:
# Generate captions for specific test cases
# Random image from the test set
random_image = np.random.choice(test_image_filenames)
random_image_path = f"{dataset_directory}images/{random_image}"
print("\nRandom Image Caption:")
generate_caption(caption_model, random_image_path, display_image=True)


Random Image Caption:
tf.Tensor(
[[[ 93.14159   55.53576   36.63048 ]
  [137.33943  100.54367   82.45816 ]
  [149.46815  119.63482  102.687965]
  ...
  [117.84178  151.84178  188.84178 ]
  [137.4588   171.4588   208.4588  ]
  [123.55076  157.55075  194.55075 ]]

 [[100.51919   66.48073   50.49411 ]
  [134.56404  104.24228   86.74597 ]
  [143.06754  116.39922  101.22865 ]
  ...
  [105.74366  139.84901  176.95436 ]
  [109.452515 143.55786  180.66321 ]
  [112.58829  146.69363  183.79898 ]]

 [[169.9792   146.83481  134.28763 ]
  [190.98409  169.56436  156.44229 ]
  [126.92689  108.2001    96.32568 ]
  ...
  [103.71895  138.71895  176.71895 ]
  [103.54364  138.54364  176.54364 ]
  [107.88373  142.88373  180.88373 ]]

 ...

 [[ 38.16083   64.7454    91.93715 ]
  [ 34.130356  61.65715   88.800446]
  [ 33.054672  60.581467  88.60353 ]
  ...
  [ 17.616941  50.327946  79.78784 ]
  [ 19.666641  50.893555  82.169495]
  [ 25.86962   56.23697   90.11452 ]]

 [[ 39.009693  63.7939    89.628494]
  [

'একটি অনেক মানুষ আছে। আছে। ও একটি'

In [30]:
# Caption for the last image in the dataset
last_image_path = f"{dataset_directory}images/{test_image_filenames[-1]}"
print("\nLast Image Caption:")
generate_caption(caption_model, last_image_path, display_image=True)


Last Image Caption:
tf.Tensor(
[[[ 47.         96.        137.       ]
  [ 48.316055   97.316055  138.31606  ]
  [ 49.526756   98.52676   139.52676  ]
  ...
  [  2.5267334   0.5267334   1.5267334]
  [  1.          1.          1.       ]
  [  1.          1.          1.       ]]

 [[ 47.71874    96.71874   137.71873  ]
  [ 48.         97.        138.       ]
  [ 49.20229    98.20229   139.2023   ]
  ...
  [  2.5267334   0.5267334   1.5267334]
  [  1.          1.          1.       ]
  [  1.          1.          1.       ]]

 [[ 48.7241     97.7241    138.7241   ]
  [ 48.         97.        138.       ]
  [ 48.670486   97.67049   138.67049  ]
  ...
  [  2.5267334   0.5267334   1.5267334]
  [  1.          1.          1.       ]
  [  1.          1.          1.       ]]

 ...

 [[ 11.584052   11.584052   13.584052 ]
  [ 14.828167   14.828167   16.828167 ]
  [  8.892918    8.892918   10.892918 ]
  ...
  [ 18.4198     17.4198     22.4198   ]
  [ 20.         19.         24.       ]
  [ 20.     

'একজন বসে কাজ একজন বসে কাজ করছে।'

In [31]:
# Caption for a specific index
specific_index = 8  # Example index, ensure it's within the range
if specific_index < len(test_image_filenames):
    specific_image_path = f"{dataset_directory}images/{test_image_filenames[specific_index]}"
    print(f"\nCaption for Image at Index {specific_index}:")
    generate_caption(caption_model, specific_image_path, display_image=True)
else:
    print(f"Index {specific_index} is out of range for the test dataset.")


Caption for Image at Index 8:
tf.Tensor(
[[[224.65395    215.65395    208.65395   ]
  [231.98117    222.98117    215.98117   ]
  [223.44647    214.44647    207.44647   ]
  ...
  [254.0316     250.75432    247.04155   ]
  [254.54437    248.70796    241.67383   ]
  [246.70575    238.46898    228.57965   ]]

 [[214.52689    205.52689    198.52689   ]
  [230.86845    221.86845    214.86845   ]
  [228.6269     219.6269     213.90115   ]
  ...
  [254.69112    254.17944    251.30489   ]
  [255.         252.30154    246.54446   ]
  [249.55525    244.1033     235.83038   ]]

 [[225.91718    217.04091    210.10278   ]
  [223.84102    214.96475    208.02663   ]
  [225.97913    217.10287    212.041     ]
  ...
  [254.08693    254.05267    253.7024    ]
  [254.37753    253.4573     249.25601   ]
  [250.33418    248.72058    242.5198    ]]

 ...

 [[  0.           0.           0.        ]
  [  0.           0.           0.        ]
  [  0.           0.           0.        ]
  ...
  [  1.           1