## Prior Code (Data Fetch and Prep)

In [None]:
######################################################################################################################
#                               Import Python Files for Sentence & Annotations Extraction                            #
#                                        Provided as a Simple API on Github                                          #
#                                https://github.com/BryanPlummer/flickr30k_entities                                  #
######################################################################################################################


# Install the `imagesize` package; this only needs to run once per session (comment the line out afterwards)
!pip install imagesize

import os
os.chdir(r"/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/")

import Utils.flickr30k_entities_utils
from Utils.flickr30k_entities_utils import get_sentence_data, get_annotations

import Utils.helper_functions
from Utils.helper_functions import *

import random 
import seaborn as sb



######################################################################################################################
#                                                                                                                    #
#                                                   Mapping Function                                                 #
#                                                                                                                    #
######################################################################################################################
"""
The Mapping function does the following:
    - takes a list of image names as input and fetches the Sentences & Annotations (which contain the bounding
    boxes) of all those images
    - passes those Sentences & Annotations to the function phrase_Id_to_Bbox and gets the bounding boxes for all
    phrases in every image
    - also passes those Sentences & Annotations to the function phrase_Id_to_Phrases and extracts the phrases
    contained in all images
    
    A typical look of the outputs would be:
    
        _Image_Train_Phrase_Id_to_Bbox -----> {'image_id_1' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                'image_id_2' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                              
                                                              
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              
                                                              
                                                'image_id_n' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                              }
                                                              
                                                              
        _Image_Train_Phrase_Id_to_Phrase -----> {'image_id_1' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                'image_id_2' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                              
                                                              
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              
                                                              
                                                'image_id_n' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                              }
        

NOTE: Adjust the folder paths for Images, Sentences and Annotations (phrases & bounding boxes) in the helper-functions file to match your setup


"""



from collections import defaultdict
def Mapping(_Image_Names, _paths_dict):
    """Build per-image lookups from phrase id to bounding boxes and to phrases.

    For every image name, the sentence and annotation files are located with
    ``get_Paths``, parsed with ``get_sentence_data`` / ``get_annotations``, and
    the parsed data is handed to ``phrase_Id_to_Bbox`` and
    ``phrase_Id_to_Phrases``.

    Args:
        _Image_Names: iterable of image names to process.
        _paths_dict: path configuration forwarded unchanged to ``get_Paths``.

    Returns:
        A pair ``(bbox_by_image, phrase_by_image)``; both are keyed by image
        name and hold whatever the respective helper returned for that image.
    """
    # No default factory is given, so these behave like plain dicts.
    bbox_by_image, phrase_by_image = defaultdict(), defaultdict()

    for image_name in tqdm(_Image_Names):
        # The third value returned by get_Paths (the image path) is not needed here.
        sentences_path, annotations_path, _ = get_Paths(image_name, _paths_dict)
        sentence_data = get_sentence_data(sentences_path)
        annotation_data = get_annotations(annotations_path)

        bbox_by_image[image_name] = phrase_Id_to_Bbox(sentence_data, annotation_data)
        phrase_by_image[image_name] = phrase_Id_to_Phrases(sentence_data, annotation_data)

    return bbox_by_image, phrase_by_image




######################################################################################################################
#                                                                                                                    #
#                                                 All Important Paths                                                #
#                                                                                                                    #
######################################################################################################################
# Locations of the sentence files, the annotation files and the image folder;
# this dict is what gets passed to get_Paths / Mapping.
_paths_dict = dict(
    _sentences_path=r'/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/annotations/Sentences',
    _annotations_path=r'/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/annotations/Annotations',
    _image_folder_path=r'/kaggle/input/flickr30k/flickr30k_images',
)

# Number of training images to use; set to len(_trainimg) to use the whole split.
_train_len = 1000




######################################################################################################################
#                                                                                                                    #
#                       Enter path for train, val & test split in their respective variables                         #
#                                                                                                                    #
######################################################################################################################
"""
train.txt, val.txt and test.txt are text files that contain the predefined splits, i.e. each file lists the image
names that belong to its split.

train.txt contains all image names as strings, that should be used for training
val.txt contains all image names as strings, that should be used for validation
test.txt contains all image names as strings, that should be used for testing
"""

# Load the predefined train / val / test image-name lists, in that order,
# one load_Splits call per split file.
_trainimg, _vlimg, _tsimg = (
    load_Splits(split_file)
    for split_file in (
        '/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/Splits/train.txt',
        '/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/Splits/val.txt',
        '/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/Splits/test.txt',
    )
)


# VISION TRANSFORMER


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import vit_b_16 as vit_model
from torchvision.models import ViT_B_16_Weights as vit_model_weights
from tqdm.notebook import tqdm
import PIL 
from PIL import Image

In [None]:

class ViT(nn.Module):
    """Frozen, pretrained ViT-B/16 used purely as an image feature extractor.

    A forward hook on the encoder's final LayerNorm (``model.encoder.ln``)
    stores, for every forward pass, the slice ``output[:, 0, :]`` of that
    layer's output (index 0 along dim 1 -- the class token in torchvision's
    ViT, see the torchvision docs).  ``forward`` returns those stored vectors
    instead of the classification logits.

    Attributes:
        ViT_Weights: the torchvision weights enum member that is loaded.
        ViT_Transforms: preprocessing pipeline shipped with those weights.
        model: the underlying torchvision ViT, frozen and in eval mode.
        Features: dict filled by the hook; key ``'last_hidden_state'``.
        hook: handle of the registered forward hook.
    """

    #Constructor
    def __init__(self):
        super().__init__()

        #Load the Pretrained Model
        self.ViT_Weights = vit_model_weights.DEFAULT
        self.ViT_Transforms = self.ViT_Weights.transforms()
        self.model = vit_model(weights=self.ViT_Weights)

        #Does not require training, so set the Gradients to False
        for param in self.model.parameters():
            param.requires_grad = False
        # Feature extraction only: keep the backbone in inference mode.
        self.model.eval()

        #Hook Variables
        self.Features = {}
        self.hook = self.model.encoder.ln.register_forward_hook(self.get_features('last_hidden_state'))

    #Hook Registration
    def get_features(self, name):
        """Return a forward hook that stores ``output[:, 0, :]`` under ``name``.

        The stored tensor is detached from the autograd graph and overwrites
        any previous entry for ``name`` in ``self.Features``.
        """
        def hook(model, input, output):
            self.Features[name] = output[:, 0, :].detach()
        return hook

    @staticmethod
    def _load_rgb(img_path):
        """Open the image at ``img_path`` and return an in-memory RGB copy.

        The file handle is closed before returning; ``convert`` yields a
        fully loaded image, so it stays usable afterwards.
        """
        with Image.open(img_path) as handle:
            return handle.convert('RGB')

    def _embed(self, image_tensors):
        """Run a stacked batch through the model and return the hooked features."""
        _ = self.model(image_tensors)
        return self.Features['last_hidden_state']

    def forward(self, batches, paths_dict, _image=None, training = True):
        """Compute image embeddings for batches of image names or a single image.

        Args:
            batches: iterable of batches, each an iterable of image names
                (only used when ``training`` is true).
            paths_dict: path configuration forwarded to ``get_Paths`` to
                resolve each image name (only used when ``training`` is true).
            _image: a single, already loaded image (only used when
                ``training`` is false).
            training: selects the batch branch (true) or the single-image
                branch (false).  It does not switch the model's train/eval mode.

        Returns:
            A tensor of hooked features: all batches stacked row-wise in the
            batch branch, or the features of the single image (with a leading
            batch dimension of 1) otherwise.

        Raises:
            ValueError: if ``training`` is false and ``_image`` is ``None``,
                or if ``batches`` yields no images at all.
        """
        # Parameters are frozen, so no autograd graph is ever needed.
        with torch.no_grad():
            if training:
                Vision_Embeddings = []
                for batch in tqdm(batches):
                    image_batch = []
                    for image in batch:
                        _, _, img_path = get_Paths(image_name= image, _paths= paths_dict)
                        image_batch.append(self.ViT_Transforms(self._load_rgb(img_path)))

                    if not image_batch:
                        # torch.stack cannot handle an empty list; nothing to embed.
                        continue
                    Vision_Embeddings.append(self._embed(torch.stack(image_batch)))

                if not Vision_Embeddings:
                    raise ValueError("`batches` did not contain any images to embed")
                Vision_Embeddings = torch.vstack(Vision_Embeddings)
            else:
                if _image is None:
                    raise ValueError("`_image` must be provided when training=False")
                # The preprocessing normalises 3 channels; bring PIL inputs to RGB first.
                if isinstance(_image, Image.Image) and _image.mode != 'RGB':
                    _image = _image.convert('RGB')
                image_transformed = self.ViT_Transforms(_image).unsqueeze(0)
                Vision_Embeddings = self._embed(image_transformed)

        return Vision_Embeddings


In [None]:



# Restrict training to the first `_train_len` images, then cut every split
# into consecutive batches of `batch_size` image names.
_trainimg_temp = _trainimg[:_train_len]
batch_size = 32
# Slice the truncated list itself: slicing `_trainimg` here would let the last
# batch run past `_train_len` whenever it is not a multiple of `batch_size`.
train_batches = [_trainimg_temp[i:i+batch_size] for i in range(0,len(_trainimg_temp),batch_size)]
val_batches = [_vlimg[i:i+batch_size] for i in range(0,len(_vlimg),batch_size)]
test_batches = [_tsimg[i:i+batch_size] for i in range(0,len(_tsimg),batch_size)]
del _trainimg_temp

# Smoke test: embed the first training image through the single-image branch.
obj = ViT()
_,_,img_path = get_Paths(image_name= train_batches[0][0], _paths= _paths_dict)
one_instance = Image.open(img_path)
# Call the module itself (not .forward) so nn.Module's call machinery runs.
emb = obj(batches = None, paths_dict = None, _image = one_instance, training = False)