In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import einops


#HyperParameters :
#seting hyperparameters:

device = 'cuda' if torch.cuda.is_available() else 'cpu'

patch_size = 16
latent_size = 512
n_channels = 3
num_heads = 12
num_layers = 12
dropout = 0.1
num_classes = 10
size = 224

epoch = 30
base_lr = 10e-3
weight_decay = 0.03
batch_size = 16





#Multi Head Attention Mechanism:

class Head_normal(nn.Module):
    # One head of self-attention (without masking)
    
    def __init__(self,latent_size = latent_size,dropout = dropout,num_heads = num_heads ):
        super().__init__()

        head_size = latent_size // num_heads
        self.key_normal = nn.Linear(latent_size, head_size, bias=False)
        self.query_normal = nn.Linear(latent_size, head_size, bias=False)
        self.value_normal = nn.Linear(latent_size, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, xk,xq,xv):

        # print('     Entered Head Normal\n')
        
        k = self.key_normal(xk)  # (B, T, head_size)
        q = self.query_normal(xq)   # (B, T, head_size)

        # Compute attention scores (affinities)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # No masking applied
        wei = self.dropout(wei)
        # print(wei.shape)
        # Perform weighted aggregation of values
        v = self.value_normal(xv)  # (B, T, head_size)
        # print(v.shape)
        out = wei @ v  # (B, T, head_size)
        # print('     Exited Head Normal\n')
        return out 

class MultiHeadAttention(nn.Module):
    # Multiple heads of self-attention in parallel (Unmasked)
    
    def __init__(self,latent_size = latent_size, num_head = num_heads, dropout = dropout):
        super().__init__()
        head_size = latent_size // num_head
        self.heads = nn.ModuleList([Head_normal() for _ in range(num_head)])  # Multiple heads
        self.proj = nn.Linear(head_size * num_head, latent_size)  # Projection layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, xk,xq,xv):
        # print(' Entered MultiHead\n')
        out = torch.cat([h(xk,xq,xv) for h in self.heads], dim=-1)  # Concatenate outputs from all heads
        out = self.dropout(self.proj(out))  # Apply projection and dropout
        # print(' Exited Multihead\n')
        return out
    

#Input Embedding for Vision Transformer:
# 1. Create a class which subclasses nn.Module
class InputEmbedding(nn.Module):
    """Turns a 2D input image into a 1D sequence learnable embedding vector.

    Args:
        in_channels (int): Number of color channels for the input images. Defaults to 3.
        patch_size (int): Size of patches to convert input image into. Defaults to 16.
        embedding_dim (int): Size of embedding to turn image into. Defaults to 768.
    """
    # 2. Initialize the class with appropriate variables
    def __init__(self,patch_size =patch_size,n_channels =n_channels,device= device,latent_size = latent_size,batch_size= batch_size):
        super().__init__()


        self.patch_size = patch_size
        self.latent_size =latent_size
        self.n_channels = n_channels
        self.device = device
        self.batch_size = batch_size
        self.input_size = self.patch_size *self.patch_size * self.n_channels

        # 3. Create a layer to turn an image into patches
        self.patcher = nn.Conv2d(in_channels=3,
                                 out_channels=self.latent_size,
                                 kernel_size=16,
                                 stride=16,
                                 padding=0)

        # 4. Create a layer to flatten the patch feature maps into a single dimension
        self.flatten = nn.Flatten(start_dim=2, # only flatten the feature map dimensions into a single vector
                                  end_dim=3)
        

        # class token
        self.class_token = nn.Parameter(torch.randn(self.batch_size,1,self.latent_size)).to(self.device)
        # print(self.class_token.shape)
        # position embedding
        self.pos_embedding = nn.Parameter(torch.randn(self.batch_size,1,self.latent_size)).to(self.device)

    # 5. Define the forward method
    def forward(self, x):
        # Create assertion to check that inputs are the correct shape
        image_resolution = x.shape[-1]
        assert image_resolution % patch_size == 0, f"Input image size must be divisible by patch size, image shape: {image_resolution}, patch size: {patch_size}"

        # Perform the forward pass
        x_patched = self.patcher(x)
        # print("X_patches size:",x_patched.shape)
        x_flattened = self.flatten(x_patched)

        # print("x_flatted normal:", x_flattened.shape)

        x_flattened = x_flattened.permute(0, 2, 1).to(self.device)
        b , n, _ = x_flattened.shape

        # print("x_flattened permuted:", x_flattened.shape)

        liner_projection = torch.cat((self.class_token, x_flattened), dim=1)
        # print(liner_projection.shape)
        pos_embedding = einops.repeat(self.pos_embedding, 'b 1 d -> b m d', m = n+1)

        liner_projection = liner_projection +pos_embedding
        
        return liner_projection# adjust so the embedding is on the final dimension [batch_size, P^2•C, N] -> [batch_size, N, P^2•C]
    

#Encoder Block For ViT:
class EncoderBlock(nn.Module):
    def __init__(self,latent_size = latent_size, num_heads = num_heads, device = device, dropout = dropout):
        super(EncoderBlock,self).__init__()

        self.latent_size = latent_size
        self.num_heads = num_heads
        self.device = device
        self.dropout = dropout

        #nOrm layer

        self.norm = nn.LayerNorm(self.latent_size)

        #MULTIHEADATTENTION

        # self.multihead = nn.MultiheadAttention(
        #     self.latent_size, self.num_heads, self.dropout
        # )

        self.multihead = MultiHeadAttention()

        # nn.MultiheadAttention()

        self.enc_MLP = nn.Sequential(
            nn.Linear(self.latent_size, self.latent_size *4),
            nn.GELU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.latent_size *4 , self.latent_size),
            nn.Dropout(self.dropout)
            
        )

    
    def forward(self, embedded_patches):

        first_norm = self.norm(embedded_patches)
        attention_out = self.multihead(first_norm,first_norm,first_norm)[0]

        # first_residuloa_connetion

        first_added = attention_out + embedded_patches

        second_norm =self.norm(first_added)

        ff_output = self.enc_MLP(second_norm)


        return ff_output + first_added
    

#puting everything toggether:

class VitModel(nn.Module):
    def __init__(self,num_encoders = num_layers, latent_size = latent_size, device = device,num_classes = num_classes, dropout = dropout):
        super(VitModel,self).__init__()

        self.num_encoders = num_encoders
        self.latent_size =latent_size
        self.device = device
        self.dropout = dropout
        self.num_classes = num_classes

        self.embd = InputEmbedding()

        self.encstack = nn.ModuleList([ EncoderBlock() for i in range(self.num_encoders)])


    def forward(self, test_input):

        enc_output = self.embd(test_input)

        for enc_layer in self.encstack:
            enc_output = enc_layer.forward(enc_output)
        
        

        return enc_output
        

    

# model = VitModel().to(device)
# test_input = torch.randn((16,3,224,224)).to(device)
# print(model(test_input).shape)


# print(sum(p.numel() for p in model.parameters() if p.requires_grad))

In [3]:
import torch
import torch.nn as nn
import math

class LayerNormalization(nn.Module):

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class InputEmbeddings(nn.Module):

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x.long()) * math.sqrt(self.d_model)

    
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)

class ResidualConnection(nn.Module):
    
        def __init__(self, features: int, dropout: float) -> None:
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            self.norm = LayerNormalization(features)
    
        def forward(self, x, sublayer):
            return x + self.dropout(sublayer(self.norm(x)))

class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e4)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)  
        return self.w_o(x)

class EncoderBlock_1(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x
    
class Encoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class DecoderBlock_1(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x
    
class Decoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

class ProjectionLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)
    
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, tgt_embed: InputEmbeddings, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.tgt_embed = tgt_embed
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        return self.encoder(src, src_mask)
    
    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    
    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)
    
def build_transformer(tgt_vocab_size: int, tgt_seq_len: int, d_model: int=512, N: int=12, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    # Create the embedding layers
    
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)
    
    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock_1(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock_1(d_model, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)
    
    # Create the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    
    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)
    
    # Create the transformer
    transformer = Transformer(encoder, decoder, tgt_embed, tgt_pos, projection_layer)
    
    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    
    return transformer


# model = build_transformer(15698,197,768,12,8,0.1,9216)

# print(sum(p.numel() for p in model.parameters() if p.requires_grad))

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# from Vit_Model import VitModel
# from transformer_model import build_transformer
from PIL import Image
import torchvision.transforms as transforms
# from data_loader import causal_mask



batch_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'

device = torch.device(device)
# print(device)

def print_examples(model, device, dataset):
    transform = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

    model.eval()
    test_img1 = transform(Image.open("/kaggle/input/example/dog.jpeg").convert("RGB")).unsqueeze(
        0
    )
    print("Example 1 CORRECT: Dog on a beach by the ocean")
    print(
        "Example 1 OUTPUT: "
        + " ".join(model.Caption_Generation(test_img1.to(device), dataset.vocab))
    )
    test_img2 = transform(
        Image.open("/kaggle/input/example/child_frisbee.jpeg").convert("RGB")
    ).unsqueeze(0)
    print("Example 2 CORRECT: Child holding red frisbee outdoors")
    print(
        "Example 2 OUTPUT: "
        + " ".join(model.Caption_Generation(test_img2.to(device), dataset.vocab))
    )
    test_img3 = transform(Image.open("/kaggle/input/example/bus_car.jpeg").convert("RGB")).unsqueeze(
        0
    )
    print("Example 3 CORRECT: Bus driving by parked cars")
    print(
        "Example 3 OUTPUT: "
        + " ".join(model.Caption_Generation(test_img3.to(device), dataset.vocab))
    )
    test_img4 = transform(
        Image.open("/kaggle/input/example/small_boat.jpeg").convert("RGB")
    ).unsqueeze(0)
    print("Example 4 CORRECT: A small boat in the ocean")
    print(
        "Example 4 OUTPUT: "
        + " ".join(model.Caption_Generation(test_img4.to(device), dataset.vocab))
    )
    test_img5 = transform(
        Image.open("/kaggle/input/example/cow_boy.jpeg").convert("RGB")
    ).unsqueeze(0)
    print("Example 5 CORRECT: A cowboy riding a horse in the desert")
    print(
        "Example 5 OUTPUT: "
        + " ".join(model.Caption_Generation(test_img5.to(device), dataset.vocab))
    )




class VisionWithTransformer(nn.Module):
    def __init__(self,vocab_size):
        super(VisionWithTransformer,self).__init__()


        self.vocab_size = vocab_size

        self.vision_model = VitModel()

        self.transformer_model = build_transformer(tgt_vocab_size=self.vocab_size,tgt_seq_len=100,d_model=512,N=12,h=8,dropout=0.1,d_ff=2048)

        self.params = list(self.vision_model.parameters()) + list(self.transformer_model.parameters())

    def forward(self,imgs,encoder_mask,decoder_input,decoder_mask):
        
        


        encoder_input = self.vision_model(imgs)

        encoder_output = self.transformer_model.encode(encoder_input,encoder_mask)
        decoder_output = self.transformer_model.decode(encoder_output,encoder_mask,decoder_input,decoder_mask)
        proj_output = self.transformer_model.project(decoder_output)


        return proj_output
    

    def Caption_Generation(self,imgs,vocab,max_length = 50):

        sos_idx = vocab.stoi["<SOS>"]
        eos_idx = vocab.stoi["<EOS>"]

        # print("Image Size:",imgs.shape)

        input = []
        input = torch.tensor(input).to(device)
        for _ in range(batch_size):
            input=torch.cat((input,imgs),dim=0)

        # print("Input Size:",input.shape)
        source = self.vision_model(input)
        # print("Source Size:",source.shape)

        source = source[0,:,:].unsqueeze(0)
        # print("Source Size:",source.shape)
        # print("Source Type:", type(source))
        encoder_mask = None
        decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)
        # print("Decoder Input Shape:",decoder_input.shape)
        encoder_output = self.transformer_model.encode(source,encoder_mask)
        # print("Encoder Output Size:",encoder_output.shape)
        while True:
            if decoder_input.size(1) == max_length:
                break

            # build mask for target
            decoder_mask = causal_mask(decoder_input.size(1)).type_as(source).to(device)
            # print("Decoder Maskk size:", decoder_mask.shape)
            decoder_output = self.transformer_model.decode(encoder_output,encoder_mask,decoder_input,decoder_mask)
            # print("Decoder output size:", decoder_output.shape)
            

            prob = self.transformer_model.project(decoder_output[:,-1])
            # print("Final output size:", prob.size)
            _, next_word = torch.max(prob, dim=1)

            decoder_input = torch.cat(
            [decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1
            )



            if next_word == eos_idx:
                break
        # print(decoder_input.int())
        return [vocab.itos[int(idx)] for idx in decoder_input.view(-1).tolist()]






In [5]:
import os  # when loading file paths
import pandas as pd  # for lookup in annotation file
import spacy  # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence  # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image  # Load img
import torchvision.transforms as transforms


# We want to convert text -> numerical values
# 1. We need a Vocabulary mapping each word to a index
# 2. We need to setup a Pytorch dataset to load the data
# 3. Setup padding of every batch (all examples should be
#    of same seq_len and setup dataloader)
# Note that loading the image is very easy compared to the text!

# Download with: python -m spacy download en
spacy_eng = spacy.load("en_core_web_sm")

seq_len = 100

class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(str(text))]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0


class FlickrDataset(Dataset):
    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file)
        self.transform = transform

        # Get img, caption columns
        self.imgs = self.df["image"]
        self.captions = self.df["caption"]

        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())


    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        caption = self.captions[index]
        img_id = self.imgs[index]

        self.pad_token = self.vocab.stoi["<PAD>"]



        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        decoder_input = [self.vocab.stoi["<SOS>"]]
        decoder_input += self.vocab.numericalize(caption)
        decoder_input.append(self.vocab.stoi["<EOS>"])

        num_pad_tokens_input = seq_len - len(decoder_input)

        for _ in range(num_pad_tokens_input):
            decoder_input.append(self.pad_token)

        decoder_input = torch.tensor(decoder_input)

        tgt_mask = ((decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0))).clone().detach()

        label = []

        label = label + self.vocab.numericalize(caption)
        label.append(self.vocab.stoi["<EOS>"])

        num_pad_tokens_label = seq_len - len(label)

        for _ in range(num_pad_tokens_label):
            label.append(self.pad_token)

        label = torch.tensor(label)

        return img, decoder_input, tgt_mask, label

def get_loader(
    root_folder,
    annotation_file,
    transform,
    batch_size=16,
    num_workers=,
    shuffle=True,
    pin_memory=True,
):
    dataset = FlickrDataset(root_folder, annotation_file, transform=transform)

    pad_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        
        pin_memory=pin_memory,
        
        drop_last=True
    )

    return loader, dataset


In [6]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("adityajn105/flickr30k")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/flickr30k


In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

# from model_imagecaptioning import VisionWithTransformer,print_examples
# from data_loader import get_loader
from tqdm import tqdm

# Ensure that the script runs properly on Windows
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using Device:", device)
    if device == "cuda":
        print(f"Device name: {torch.cuda.get_device_name(0)}")
        print(f"Device memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3} GB")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")

    device = torch.device(device)

    transform = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

    # DataLoader
    loader, dataset = get_loader(
        "/kaggle/input/flickr8k/Images", "/kaggle/input/flickr8k/captions.txt", transform=transform
    )

    vocab_size = len(dataset.vocab)
    print("Total number of words in vocab:", vocab_size)
    # print("vOCAB:", dataset.vocab)
    model = VisionWithTransformer(vocab_size=vocab_size).to(device)

    params = model.parameters()

    loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"], label_smoothing=0.1).to(device)
    optimizer = optim.AdamW(params, lr=3e-4, weight_decay=1e-4, betas=(0.9, 0.98))
    print("Total number of learnable parameters:",sum(p.numel() for p in model.parameters() if p.requires_grad))

    # model_filename = "/kaggle/working/saved_model.pt"
    # if model_filename:
    #     print(f'Preloading model {model_filename}')
    #     state = torch.load(model_filename)
    #     model.load_state_dict(state['model_state_dict'])
    #     # initial_epoch = state['epoch'] + 1
    #     optimizer.load_state_dict(state['optimizer_state_dict'])
    #     # global_step = state['global_step']
    # else:
    #     print('No model to preload, starting from scratch')

    

    # # scaler = torch.amp.GradScaler('cuda')

    for epoch in range(30):
        print_examples(model,device,dataset)
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(loader, desc=f"Processing Epoch {epoch:02d}")
        for imgs, captions, tgt_masks, labels in batch_iterator:
            # print_examples(model,device,dataset)
            # break
            
            optimizer.zero_grad(set_to_none=True)
            imgs = imgs.to(device)
            
            captions = captions.to(device)
            
            tgt_masks = tgt_masks.to(device)
            
            labels = labels.to(device)
            

            
            

            
                
            output = model(imgs,None,captions,tgt_masks)
                
            loss = loss_fn(output.view(-1, vocab_size), labels.view(-1))
            
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            
            loss.backward()
            
            optimizer.step()


            

        

        model_filename = "/kaggle/working/saved_model.pt" 
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # 'global_step': global_step
        }, model_filename)
        print("Model Saved successfully")
        

Using Device: cuda
Device name: Tesla P100-PCIE-16GB
Device memory: 15.887939453125 GB




Total number of words in vocab: 2994
Total number of learnable parameters: 129269170
Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> laid hugs laid laid laid laid laid robot robot lot robot lot robot watery laid laid laid laid laid laid laid laid laid laid laid laid watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery watery
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> laid laid laid laid laid laid laid laid whilst laid whilst laid laid laid laid laid laid laid laid laid laid laid laid laid laid laid laid laid watery watery watery watery watery watery watery watery watery watery telephone telephone watery watery watery watery laid watery telephone laid watery
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> laid laid laid laid laid laid laid whilst formal whilst formal laid laid laid laid laid laid laid laid la

Processing Epoch 00: 100%|██████████| 2528/2528 [24:03<00:00,  1.75it/s, loss=4.069]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a man in a red shirt is climbing a rock . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt is climbing a rock . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a red shirt is standing on a sidewalk . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man in a red shirt is climbing a rock . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a red shirt is climbing a rock . <EOS>


Processing Epoch 01: 100%|██████████| 2528/2528 [24:03<00:00,  1.75it/s, loss=3.750]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a man in a blue shirt is standing in front of a large building . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a white shirt and white shirt is standing in front of a large building . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a white shirt is standing in front of a large building . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man in a white jacket is standing in front of a large building . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt is standing in front of a large building . <EOS>


Processing Epoch 02: 100%|██████████| 2528/2528 [24:04<00:00,  1.75it/s, loss=3.298]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a dog is running through the water . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a brown dog is running through a field . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a blue shirt is standing on a rock . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is running through the snow . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a dog is running through the water . <EOS>


Processing Epoch 03: 100%|██████████| 2528/2528 [24:06<00:00,  1.75it/s, loss=3.555]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young boy is swimming in a pool . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a dog is running through a field . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a blue shirt is standing in front of a mountain . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is standing in the ocean . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man is standing on a rock in front of a mountain . <EOS>


Processing Epoch 04: 100%|██████████| 2528/2528 [24:07<00:00,  1.75it/s, loss=3.466]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young boy is jumping into the air on a beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt is standing on a rock in a field . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a blue shirt is standing on a rock with a rope in the background . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is climbing a rock wall . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man is climbing a rock wall . <EOS>


Processing Epoch 05: 100%|██████████| 2528/2528 [24:10<00:00,  1.74it/s, loss=3.681]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young girl in a blue bathing suit is jumping into the water . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a football player in a red jersey is running with the ball . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man is riding a bicycle on a dirt road . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is surfing on a wave . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man is standing on a beach with a dog . <EOS>


Processing Epoch 06: 100%|██████████| 2528/2528 [24:09<00:00,  1.74it/s, loss=3.484]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a boy in a blue shirt is jumping into the ocean . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt and white shorts is jumping on the grass . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a yellow shirt and a woman in a black jacket is standing on a rock overlooking a waterfall . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man in a wetsuit is surfing . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt and a woman in a black shirt is walking on the beach . <EOS>


Processing Epoch 07: 100%|██████████| 2528/2528 [24:08<00:00,  1.75it/s, loss=3.497]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a boy is jumping into the air on the sand . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt and black shorts is jumping a ramp . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a blue shirt and a woman in a green jacket sit on a bench . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a black dog running through the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man is jumping a ramp on his skateboard . <EOS>


Processing Epoch 08: 100%|██████████| 2528/2528 [24:07<00:00,  1.75it/s, loss=3.480]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a dog is jumping into the water . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt is climbing a rock . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man is standing on a rocky mountain . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is standing on top of a mountain . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man is standing on top of a mountain . <EOS>


Processing Epoch 09: 100%|██████████| 2528/2528 [24:13<00:00,  1.74it/s, loss=3.287]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young boy is jumping into the sand at the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt is riding a dirt bike through a forest . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a yellow shirt is standing on a rock overlooking a waterfall . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man in a black wetsuit is riding a wave . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt is standing on a rock in front of a building . <EOS>


Processing Epoch 10: 100%|██████████| 2528/2528 [24:13<00:00,  1.74it/s, loss=3.514]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young boy is jumping in the sand on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt and white shorts is riding a bike on a dirt road . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man is standing on a hill looking at a woman . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is standing on a cliff overlooking the ocean . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man is standing on the beach with a dog . <EOS>


Processing Epoch 11: 100%|██████████| 2528/2528 [24:12<00:00,  1.74it/s, loss=2.976]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a girl in a blue shirt is running on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt and black pants is climbing a rock face . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man and a woman are walking down a city street . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is surfing on a wave . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt is walking along a mountain path . <EOS>


Processing Epoch 12: 100%|██████████| 2528/2528 [24:12<00:00,  1.74it/s, loss=2.963]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a girl in a blue bathing suit is jumping into the air on the sand . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt and helmet is climbing a rock face . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a yellow jacket and black pants is sitting on a rock in front of a waterfall . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is standing on a cliff with a fishing pole in the background . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a black jacket and hat is standing on a rocky ledge overlooking the ocean . <EOS>


Processing Epoch 13: 100%|██████████| 2528/2528 [24:14<00:00,  1.74it/s, loss=3.182]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a girl in a blue bathing suit is jumping into the sand at the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a little girl in a red shirt and blue shorts is swinging on a rope swing . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a boy on a skateboard is airborne . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is swimming in the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a black shirt is climbing a mountain . <EOS>


Processing Epoch 14: 100%|██████████| 2528/2528 [24:14<00:00,  1.74it/s, loss=3.119]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a girl is jumping in the sand at the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a boy in a red shirt is riding a bicycle on a dirt track . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a skateboarder in the air above a ramp . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a man is standing on top of a mountain with his arms out to the right . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a group of people are standing on the sand at the beach . <EOS>


Processing Epoch 15: 100%|██████████| 2528/2528 [24:10<00:00,  1.74it/s, loss=3.074]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a girl in a white shirt is running along a beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a grey shirt and a white hat is riding a bike . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a yellow shirt and blue jeans is standing on a rock overlooking a waterfall . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is running through the snow . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a white shirt and jeans is riding a bicycle on a <UNK> street . <EOS>


Processing Epoch 16: 100%|██████████| 2528/2528 [24:05<00:00,  1.75it/s, loss=3.076]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a little girl is running on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a dirt biker rides through the dirt . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a man in a yellow jacket stands on a rock in the mountains . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a brown dog is running through the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt and shorts is standing on a rock in front of a mountain . <EOS>


Processing Epoch 17: 100%|██████████| 2528/2528 [24:05<00:00,  1.75it/s, loss=2.971]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a woman in a white shirt and a woman in a blue bikini running on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red helmet and helmet is riding a dirt bike on a dirt path . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a skateboarder does a trick on a ramp . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a black dog is swimming in water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a white shirt and jeans walks along a mountain . <EOS>


Processing Epoch 18: 100%|██████████| 2528/2528 [24:11<00:00,  1.74it/s, loss=3.188]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young boy is running along a beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red and white uniform is riding a dirt bike . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a boy is doing a trick on his skateboard in midair . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is swimming in the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt is riding a brown horse . <EOS>


Processing Epoch 19: 100%|██████████| 2528/2528 [24:13<00:00,  1.74it/s, loss=2.645]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> two women in bathing suits play in the sand . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a group of children are playing with a red and white dog . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a skateboarder is doing a trick in the air . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog swimming in the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> two women in bikinis are walking on the sand . <EOS>


Processing Epoch 20: 100%|██████████| 2528/2528 [24:13<00:00,  1.74it/s, loss=2.539]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young girl is running on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red and white suit is riding a bike on a dirt path . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> two people are sitting on a rock overlooking a river . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog swimming in the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt is riding a brown horse on a beach . <EOS>


Processing Epoch 21: 100%|██████████| 2528/2528 [24:14<00:00,  1.74it/s, loss=2.804]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> two women play volleyball on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red helmet riding a dirt bike on a dirt path . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a young boy skateboards at a skate park . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is jumping up at the man wearing a blue shirt . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a black dog is running across a grassy field . <EOS>


Processing Epoch 22: 100%|██████████| 2528/2528 [24:13<00:00,  1.74it/s, loss=2.731]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young boy is running along a sandy beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red shirt and helmet is riding a dirt bike . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a boy skateboards off of a cement ramp . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is swimming in the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt is standing in the sand on a pink snowboard . <EOS>


Processing Epoch 23: 100%|██████████| 2528/2528 [24:13<00:00,  1.74it/s, loss=2.638]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a young girl is running on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red jacket and white helmet is riding a dirt bike on a path . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> two people are walking across a street that is snow covered . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog swims through water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a grey shirt is standing in front of a mountain range . <EOS>


Processing Epoch 24: 100%|██████████| 2528/2528 [24:10<00:00,  1.74it/s, loss=2.617]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> two people are standing on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red helmet riding a dirt bike . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> two people are sitting in a bus stop . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a brown and white dog is jumping in the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man and a woman walk their dogs . <EOS>


Processing Epoch 25: 100%|██████████| 2528/2528 [24:05<00:00,  1.75it/s, loss=2.366]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a group of people are standing on a beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man on a bike is riding on a dirt bike . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> two people sit in a pool on a hillside . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is swimming in a lake with a stick in his mouth . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a dog is running through the sand . <EOS>


Processing Epoch 26: 100%|██████████| 2528/2528 [24:05<00:00,  1.75it/s, loss=2.583]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> two women play beach volleyball on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red shirt is riding a dirt bike over dirt . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a boy does tricks on his skateboard . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a person jumping off of a dock into a lake . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt and dark shorts is doing a back flip on a beach . <EOS>


Processing Epoch 27: 100%|██████████| 2528/2528 [24:09<00:00,  1.74it/s, loss=2.168]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> a boy is leaping into the air whilst playing in the sand . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a person in a red and white suit rides a bike down a hill in the woods . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a boy does a skateboard trick off of a graffiti covered wall . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a dog is jumping through the water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a brown shirt holding a camera on a mountain . <EOS>


Processing Epoch 28: 100%|██████████| 2528/2528 [24:12<00:00,  1.74it/s, loss=2.247]


Example 1 CORRECT: Dog on a beach by the ocean
Example 1 OUTPUT: <SOS> two women play volleyball on the beach . <EOS>
Example 2 CORRECT: Child holding red frisbee outdoors
Example 2 OUTPUT: <SOS> a man in a red shirt and helmet riding a bike down a dirt path . <EOS>
Example 3 CORRECT: Bus driving by parked cars
Example 3 OUTPUT: <SOS> a boy does a skateboard trick while a crowd watches . <EOS>
Example 4 CORRECT: A small boat in the ocean
Example 4 OUTPUT: <SOS> a large black and brown dog is swimming in a deep water . <EOS>
Example 5 CORRECT: A cowboy riding a horse in the desert
Example 5 OUTPUT: <SOS> a man in a blue shirt is riding a skateboard up a ramp . <EOS>


Processing Epoch 29:  66%|██████▌   | 1663/2528 [15:56<08:17,  1.74it/s, loss=2.310]