### Importing Necessary Libraries for Data Preparation & Util Functions

In [1]:
######################################################################################################################
#                               Import Python Files for Sentence & Annotations Extraction                            #
#                                        Provided as a Simple API on Github                                          #
#                                https://github.com/BryanPlummer/flickr30k_entities                                  #
######################################################################################################################

# Uncomment the below line once in order for the code to run smoothly
!pip install imagesize

import os
os.chdir(r"/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/")

import Utils.flickr30k_entities_utils
from Utils.flickr30k_entities_utils import get_sentence_data, get_annotations

import Utils.helper_functions
from Utils.helper_functions import *

import random 
import seaborn as sb

import pickle
import torch

import ast
import torch
import pickle
import torch.nn as nn
import torch.optim
import torch.utils.data.distributed
from torch.utils.data import Dataset,DataLoader
from torchvision.ops import box_iou,generalized_box_iou_loss





Collecting imagesize
  Downloading imagesize-1.4.1-py2.py3-none-any.whl (8.8 kB)
Installing collected packages: imagesize
Successfully installed imagesize-1.4.1
[0m

### Driver Functions

In [2]:
######################################################################################################################
#                                                                                                                    #
#                                                   Mapping Function                                                 #
#                                                                                                                    #
######################################################################################################################
"""
Mapping Function does the following,
    - takes list of Image names as i/p and fetch Sentences & Annotations (contains bounding boxes)of all those Images
    - passes those Sentences & Annotations to the func Phrase_Id_to_Bbox & gets Bounding Boxes for all phrases
    in every image.
    - also, passes those Sentences & Annotations to the func Phrase_Id_to_Phrases & extracts phrases contained in all
    images.
    
    A typical look of the outputs would be:
    
        _Image_Train_Phrase_Id_to_Bbox -----> {'image_id_1' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                'image_id_2' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                              
                                                              
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              
                                                              
                                                'image_id_n' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                              }
                                                              
                                                              
        _Image_Train_Phrase_Id_to_Phrase -----> {'image_id_1' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                'image_id_2' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                              
                                                              
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              
                                                              
                                                'image_id_n' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                              }
        

NOTE: Please alter any folder paths for Images, Sentences and Annotations (Phrase & Bounding Boxes) in Helper Function File


"""



from collections import defaultdict
def Mapping(_Image_Names, _paths_dict):
    _Phrase_Id_to_Bbox = defaultdict()
    _Phrase_Id_to_Phrase = defaultdict()

    for _img in tqdm(_Image_Names):
        _img_sentences_path, _img_annotations_path, _img_absolute_path = get_Paths(_img, _paths_dict)
        sents = get_sentence_data(_img_sentences_path)
        anns = get_annotations(_img_annotations_path)
        _Phrase_Id_to_Bbox[_img] = phrase_Id_to_Bbox(sents, anns)
        _Phrase_Id_to_Phrase[_img] = phrase_Id_to_Phrases(sents, anns)
        
        
    return _Phrase_Id_to_Bbox, _Phrase_Id_to_Phrase




### Driver Code

In [3]:
_paths_dict = {
                '_sentences_path' : r'/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/annotations/Sentences',
                '_annotations_path' : r'/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/annotations/Annotations',
                '_image_folder_path' : r'/kaggle/input/flickr30k/flickr30k_images'
                }
_train_len = 5000 #len(_trainimg)



In [4]:
"""
######################################################################################################################
#                                                                                                                    #
#                       Enter path for train, val & test split in their respective variables                         #
#                                                                                                                    #
######################################################################################################################


train.txt, val.txt and test.txt are text files that contains predefined splits, i.e each file contains the split it
belongs to.

train.txt contains all image names as strings, that should be used for training
val.txt contains all image names as strings, that should be used for validation
test.txt contains all image names as strings, that should be used for testing

"""

_trainimg = load_Splits('/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/Splits/train.txt')
_vlimg = load_Splits('/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/Splits/val.txt')
_tsimg = load_Splits('/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/Splits/test.txt')


In [5]:
"""
######################################################################################################################
#                                                                                                                    #
#                                                 Call to the Mapping Functions                                      #
#                                                                                                                    #
######################################################################################################################
"""

_fractional_trainimg = _trainimg[:_train_len]
_Image_Train_Phrase_Id_to_Bbox, _Image_Train_Phrase_Id_to_Phrase = Mapping(_fractional_trainimg, _paths_dict)
_Image_Val_Phrase_Id_to_Bbox, _Image_Val_Phrase_Id_to_Phrase = Mapping(_vlimg, _paths_dict)
_Image_Test_Phrase_Id_to_Bbox, _Image_Test_Phrase_Id_to_Phrase = Mapping(_tsimg, _paths_dict)



  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [6]:
import pandas as pd
def prepare_DataFrame(Phrase_Dict, Bbox_Dict):
    Final_DF = pd.DataFrame()
    for Image_Id in tqdm(Phrase_Dict.keys()):
        
        Phrase_DF = pd.DataFrame.from_dict(Phrase_Dict[Image_Id], orient = 'index')
        Phrase_DF = pd.DataFrame(Phrase_DF.stack(level=0)).reset_index().drop('level_1', axis = 1)

        Bbox_DF = pd.DataFrame.from_dict(Bbox_Dict[Image_Id], orient = 'index')
        Bbox_DF = pd.DataFrame(Bbox_DF.stack(level=0)).reset_index().drop('level_1', axis = 1)
        Bbox_DF = Bbox_DF.groupby(['level_0'])[0].apply(list)
        

        Merged_DF = pd.merge(Phrase_DF, Bbox_DF, on = 'level_0', how='inner')
        Merged_DF['Image_Id'] = Image_Id

        Final_DF = pd.concat([Final_DF, Merged_DF], axis = 0)

    Final_DF = Final_DF.rename(columns = {'level_0' : 'Phrase_Id', '0_x': 'Phrase', '0_y':'Bounding_Box'})
    Final_DF = Final_DF[['Image_Id', 'Phrase_Id', 'Phrase', 'Bounding_Box']]
    Final_DF.reset_index(drop = True, inplace = True)
    print("Local Function Called......")
    return Final_DF



"""***************************************************************************************************************"""


'***************************************************************************************************************'

In [7]:
"""
######################################################################################################################
#                                                                                                                    #
#                                                Converting to DataFrames.                                           #
#                                                                                                                    #
######################################################################################################################
"""

_Fractional_Train_Set_Pid_to_P = {img : _Image_Train_Phrase_Id_to_Phrase[img] for img in _trainimg[:_train_len]}
_Fractional_Train_Set_Pid_to_B = {img : _Image_Train_Phrase_Id_to_Bbox[img] for img in _trainimg[:_train_len]}

Train_Frame = prepare_DataFrame(_Fractional_Train_Set_Pid_to_P, _Fractional_Train_Set_Pid_to_B)
Test_Frame = prepare_DataFrame(_Image_Test_Phrase_Id_to_Phrase, _Image_Test_Phrase_Id_to_Bbox)
Val_Frame = prepare_DataFrame(_Image_Val_Phrase_Id_to_Phrase, _Image_Val_Phrase_Id_to_Bbox)


Train_Frame.Phrase = Train_Frame.Phrase.str.lower()
Val_Frame.Phrase = Val_Frame.Phrase.str.lower()
Test_Frame.Phrase = Test_Frame.Phrase.str.lower()


  0%|          | 0/5000 [00:00<?, ?it/s]

Local Function Called......


  0%|          | 0/1000 [00:00<?, ?it/s]

Local Function Called......


  0%|          | 0/1000 [00:00<?, ?it/s]

Local Function Called......


# Retrieval Code for ViT Embeddings

In [8]:
"""
######################################################################################################################
#                                                                                                                    #
#                    image_index corresponding to image_id is the index of its embedding                             #
#                                                                                                                    #
######################################################################################################################
"""


_v_train_Image_Indices = pd.DataFrame(_trainimg[:_train_len], columns = ['Image_Id']).reset_index().rename(columns = {'index':'image_index'})
_v_val_Image_Indices = pd.DataFrame(_vlimg[:_train_len], columns = ['Image_Id']).reset_index().rename(columns = {'index':'image_index'})
_v_test_Image_Indices = pd.DataFrame(_tsimg[:_train_len], columns = ['Image_Id']).reset_index().rename(columns = {'index':'image_index'})


Train_Frame = Train_Frame.merge(_v_train_Image_Indices, on = 'Image_Id', how='left')
Val_Frame = Val_Frame.merge(_v_val_Image_Indices, on = 'Image_Id', how='left')
Test_Frame = Test_Frame.merge(_v_test_Image_Indices, on = 'Image_Id', how='left')

Vision_Embeddings_train = torch.load('/kaggle/input/embeddings-7k/v_train_embeds.pt')
Vision_Embeddings_val = torch.load('/kaggle/input/embeddings-7k/v_val_embeds.pt')
Vision_Embeddings_test = torch.load('/kaggle/input/embeddings-7k/v_test_embeds.pt')


# Retrieval Code for BERT Embeddings

In [9]:
"""
######################################################################################################################
#                                                                                                                    #
#               image_index corresponding to unique Phrase is the index of its embedding                             #
#                                                                                                                    #
######################################################################################################################
"""
with open('/kaggle/input/embeddings-7k/_train_Phrase_to_Index_Map.pkl', 'rb') as fp:
    _train_Phrase_to_Index_Map = pickle.load(fp)

with open('/kaggle/input/embeddings-7k/_val_Phrase_to_Index_Map.pkl', 'rb') as fp:
    _val_Phrase_to_Index_Map = pickle.load(fp)
    
with open('/kaggle/input/embeddings-7k/_test_Phrase_to_Index_Map.pkl', 'rb') as fp:
    _test_Phrase_to_Index_Map = pickle.load(fp)
    
    
    
_t_train_Image_Indices = pd.DataFrame(_train_Phrase_to_Index_Map.items(), columns = ['Phrase', 'text_index'])
_t_val_Image_Indices = pd.DataFrame(_val_Phrase_to_Index_Map.items(), columns = ['Phrase', 'text_index'])
_t_test_Image_Indices = pd.DataFrame(_test_Phrase_to_Index_Map.items(), columns = ['Phrase', 'text_index'])

Train_Frame = Train_Frame.merge(_t_train_Image_Indices, on = 'Phrase', how = 'left')
Val_Frame = Val_Frame.merge(_t_val_Image_Indices, on = 'Phrase', how = 'left')
Test_Frame = Test_Frame.merge(_t_test_Image_Indices, on = 'Phrase', how = 'left')


Textual_Embeddings_train = torch.load('/kaggle/input/embeddings-7k/t_train_embeds.pt')
Textual_Embeddings_val = torch.load('/kaggle/input/embeddings-7k/t_val_embeds.pt')
Textual_Embeddings_test = torch.load('/kaggle/input/embeddings-7k/t_test_embeds.pt')

# Actual Dataframes Look Like

In [10]:
just_to_see = ['Image_Id', 'Phrase_Id', 'Phrase']
necessary_columns = ['image_index', 'text_index', 'Bounding_Box']
train = Train_Frame[just_to_see + necessary_columns] 
val = Val_Frame[just_to_see + necessary_columns] 
test = Test_Frame[just_to_see + necessary_columns] 

## Training Set

In [11]:
train.head(30)

Unnamed: 0,Image_Id,Phrase_Id,Phrase,image_index,text_index,Bounding_Box
0,3359636318,112630,two people,0,1657,"[[46, 182, 105, 333], [143, 165, 207, 333]]"
1,3359636318,112632,the video game shop,0,15491,"[[0, 54, 168, 307]]"
2,3359636318,112631,the mobile phone store,0,15492,"[[191, 0, 498, 230]]"
3,3359636318,112625,people,0,0,"[[46, 182, 105, 333], [143, 165, 207, 333], [2..."
4,3359636318,112625,a group of people,0,15493,"[[46, 182, 105, 333], [143, 165, 207, 333], [2..."
5,3359636318,112625,several people,0,1658,"[[46, 182, 105, 333], [143, 165, 207, 333], [2..."
6,3359636318,112627,some stores,0,1659,"[[191, 0, 498, 230], [1, 0, 190, 307]]"
7,3359636318,112626,a sidewalk,0,1660,"[[2, 212, 499, 333]]"
8,6959556104,262504,many people of all races,1,18720,"[[5, 70, 103, 314], [120, 54, 206, 172], [197,..."
9,6959556104,262504,a series of spectators,1,15494,"[[5, 70, 103, 314], [120, 54, 206, 172], [197,..."


## Validation Set

In [12]:
val.head(30)

Unnamed: 0,Image_Id,Phrase_Id,Phrase,image_index,text_index,Bounding_Box
0,100652400,197,a man,0,616,"[[52, 44, 109, 202]]"
1,100652400,197,a construction worker,0,2825,"[[52, 44, 109, 202]]"
2,100652400,198,a blue hard hat,0,4767,"[[58, 43, 87, 65]]"
3,100652400,198,hard hat,0,617,"[[58, 43, 87, 65]]"
4,100652400,198,a hard hat,0,2826,"[[58, 43, 87, 65]]"
5,100652400,199,a reflective vest,0,2828,"[[61, 68, 97, 118]]"
6,100652400,199,orange safety vest,0,2827,"[[61, 68, 97, 118]]"
7,100652400,199,bright vest,0,618,"[[61, 68, 97, 118]]"
8,100652400,199,a caution vest,0,2829,"[[61, 68, 97, 118]]"
9,100652400,200,an intersection,0,619,"[[0, 89, 373, 499]]"


## Testing Set

In [13]:
test.head(30)

Unnamed: 0,Image_Id,Phrase_Id,Phrase,image_index,text_index,Bounding_Box
0,1016887272,547,several climbers,0,599,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
1,1016887272,547,a group of people,0,4644,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
2,1016887272,547,seven climbers,0,600,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
3,1016887272,547,a collage of one person,0,5381,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
4,1016887272,548,a rock climbing wall,0,4645,"[[0, 53, 332, 499]]"
5,1016887272,548,a cliff,0,603,"[[0, 53, 332, 499]]"
6,1016887272,548,a rock face,0,2731,"[[0, 53, 332, 499]]"
7,1016887272,548,the rock,0,601,"[[0, 53, 332, 499]]"
8,1016887272,548,a rock,0,602,"[[0, 53, 332, 499]]"
9,1016887272,549,one man,0,606,"[[73, 301, 180, 499]]"


# Prepare DataLoaders

In [14]:
num_hid_dims = 0
def _the_Collate(batch):
    batch_size = len(batch)
    #print(batch_size)
    image_index_tensor = []
    text_index_tensor = []
    image_emb_tensor = []
    phrase_emb_tensor = []
    bbox_tensor = []
    
    for idx, (im_idx, t_idx, im_emb, p_emb, bbox) in enumerate(batch):
        image_index_tensor.append(im_idx)
        text_index_tensor.append(t_idx)
        image_emb_tensor.append(im_emb)
        phrase_emb_tensor.append(p_emb)
        bbox_tensor.append(bbox[0])
        
        
    """pad = [[0, 0, 0, 0]] * num_hid_dims
    for index, _ in enumerate(bbox_tensor):
        temp_pad = pad
        temp_pad[:len(bbox_tensor[index])] = bbox_tensor[index]
        bbox_tensor[index] = torch.tensor(temp_pad)"""
    
    image_index_tensor = torch.tensor(image_index_tensor)
    text_index_tensor = torch.tensor(text_index_tensor)
    image_emb_tensor = torch.stack(image_emb_tensor)
    phrase_emb_tensor = torch.stack(phrase_emb_tensor)
    bbox_tensor = torch.tensor(bbox_tensor)
    #print(bbox_tensor)
    
    return image_index_tensor, text_index_tensor, image_emb_tensor, phrase_emb_tensor, bbox_tensor

In [15]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, img_emb,text_emb):
        self.dataframe = dataframe
        self.image_emb = img_emb
        self.text_emb = text_emb

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        image_index = self.dataframe['image_index'][index]
        text_index = self.dataframe['text_index'][index]
        image_embedding = self.image_emb[image_index]
        phrase_embedding = self.text_emb[text_index]
        bounding_boxes = self.dataframe['Bounding_Box'][index]
        return image_index, text_index, image_embedding, phrase_embedding, bounding_boxes

In [16]:

train_dataset = CustomDataset(train, img_emb=Vision_Embeddings_train, text_emb=Textual_Embeddings_train)
val_dataset = CustomDataset(val, img_emb=Vision_Embeddings_val, text_emb=Textual_Embeddings_val)
test_dataset = CustomDataset(test,img_emb=Vision_Embeddings_test,text_emb=Textual_Embeddings_test)


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn= _the_Collate,drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True, collate_fn= _the_Collate,drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn= _the_Collate,drop_last=True)



In [17]:
"""for idx, (im_idx, t_idx, im_emb, p_emb, bbox) in enumerate(train_loader):
    print("Batch number : ", idx)
    print(im_idx)
    print(t_idx)
    print(im_emb)
    print(p_emb)
    print(bbox.size())
    break"""


'for idx, (im_idx, t_idx, im_emb, p_emb, bbox) in enumerate(train_loader):\n    print("Batch number : ", idx)\n    print(im_idx)\n    print(t_idx)\n    print(im_emb)\n    print(p_emb)\n    print(bbox.size())\n    break'

In [18]:
class PositionalEncoding(nn.Module):
    """
    A module that adds positional encoding to each of the token's features.
    So that the Transformer is position aware.
    """
    def __init__(self, input_dim: int, max_len: int=10000):
        """
        Inputs:
        - input_dim: Input dimension about the features for each token
        - max_len: The maximum sequence length
        """
        super(PositionalEncoding, self).__init__()
        
        self.input_dim = input_dim
        self.max_len = max_len
        
        
    def forward(self, x):
        """
        Compute the positional encoding and add it to x.
        
        Input:
        - x: Tensor of the shape BxLxC, where B is the batch size, L is the sequence length,
          and C is the channel dimension
          
        Return:
        - x: Tensor of the shape BxLxC, with the positional encoding added to the input
        """
        seq_len = x.shape[0]
        input_dim = x.shape[1]
        
        pe = None
        ###########################################################################
        # TODO: Compute the positional encoding                                   #
        # Check Section 3.5 for the definition (https://arxiv.org/pdf/1706.03762.pdf)
        #                                                                         #
        # It's a bit messy, but the definition is provided for your here for your #
        # convenience (in LaTex).                                                 #
        # PE_{(pos,2i)} = sin(pos / 10000^{2i/\dmodel})                           #
        # PE_{(pos,2i+1)} = cos(pos / 10000^{2i/\dmodel})                         #
        #                                                                         #
        # You should replace 10000 with max_len here.
        ###########################################################################
        #Finding even values
        even = torch.arange(0, self.input_dim, 2)
        #Since odd den = even den
        den = self.max_len ** (even/self.input_dim)
        #Pos as ascending values from 0 to seq_len
        pos = torch.arange(seq_len).reshape(seq_len, 1)
        even_pe = torch.sin(pos / den)
        odd_pe = torch.cos(pos / den)
        #Stacking odd and even together
        even_odd_stack = torch.stack([even_pe, odd_pe], dim=2)
        pe = torch.flatten(even_odd_stack, 1, 2)
        x = x + pe
        
        
        return x

In [19]:
class MultiHeadAttention(nn.Module):
    """
    A module that computes multi-head attention given query, key, and value tensors.
    """
    def __init__(self, input_dim: int, num_heads: int):
        """
        Constructor.
        
        Inputs:
        - input_dim: Dimension of the input query, key, and value. Here we assume they all have
          the same dimensions. But they could have different dimensions in other problems.
        - num_heads: Number of attention heads
        """
        super(MultiHeadAttention, self).__init__()
        
        assert input_dim % num_heads == 0
        
        self.input_dim = input_dim
        self.num_heads = num_heads
        self.dim_per_head = input_dim // num_heads
        
        ###########################################################################
        # TODO: Define the linear transformation layers for key, value, and query.#
        # Also define the output layer.
        ###########################################################################
        self.key = nn.Linear(self.input_dim,self.input_dim)
        self.value = nn.Linear(self.input_dim,self.input_dim)
        self.query = nn.Linear(self.input_dim,self.input_dim)
        self.output = nn.Linear(self.input_dim,self.input_dim)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        
    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: torch.Tensor=None):
        """
        Compute the attended feature representations.
        
        Inputs:
        - query: Tensor of the shape BxLxC, where B is the batch size, L is the sequence length,
          and C is the channel dimension
        - key: Tensor of the shape BxLxC
        - value: Tensor of the shape BxLxC
        - mask: Tensor indicating where the attention should *not* be performed
        """
        b = query.shape[0]        
        
        dot_prod_scores = None
        ###########################################################################
        # TODO: Compute the scores based on dot product between transformed query,#
        # key, and value. You may find torch.matmul helpful, whose documentation  #
        # can be found at                                                         #
        # https://pytorch.org/docs/stable/generated/torch.matmul.html#torch.matmul#
        # Remember to devide the doct product similarity scores by square root of #
        # the channel dimension per head.   
        #                                                                         #
        # Since no for loops are allowed here, think of how to use tensor reshape #
        # to process multiple attention heads at the same time.                   #
        ###########################################################################
        
        q = self.query(query)
        q = torch.reshape(q,(b, -1,self.num_heads,self.dim_per_head)) 
        q = q.transpose(1,2)
        #print('qshape',q.shape)
        k = self.key(key)
        k = torch.reshape(k,(b, -1,self.num_heads,self.dim_per_head)) 
        k = k.transpose(1,2)
        #print('kshape',k.shape)
        v = self.value(value)
        v = torch.reshape(v,(b, -1,self.num_heads,self.dim_per_head)) 
        v = v.transpose(1,2)
        #print('vshape',v.shape)
        
        key_t = k.transpose(3,2)
        #print(key_t.shape)
        dot_prod_scores = torch.matmul(q,key_t)
        dot_prod_scores = dot_prod_scores / math.sqrt(self.dim_per_head)
        #print(dot_prod_scores.shape)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        if mask is not None:
            # We simply set the similarity scores to be near zero for the positions
            # where the attention should not be done. Think of why we do this.
            dot_prod_scores = dot_prod_scores.masked_fill(mask == 0, -1e9)

        out = None
        ###########################################################################
        # TODO: Compute the attention scores, which are then used to modulate the #
        # value tensor. Finally concate the attended tensors from multiple heads  #
        # and feed it into the output layer. You may still find torch.matmul      #
        # helpful.                                                                #
        #                                                                         #
        # Again, think of how to use reshaping tensor to do the concatenation.    #
        ###########################################################################
        S = torch.nn.Softmax(dim = -1)
        scores = S(dot_prod_scores)
        #print(scores.shape)
        scores = torch.matmul(scores, v)
        #print(scores.shape)
        out = scores.transpose(1,2)
        out = torch.reshape(out,(b,-1,self.num_heads*self.dim_per_head))
        out = self.output(out)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return out

In [20]:
class Text_Guided_Self_Attention(nn.Module):
    """
    A module that computes multi-head attention given query, key, and value tensors.
    """
    def __init__(self, input_dim: int, num_heads: int):
        """
        Constructor.
        
        Inputs:
        - input_dim: Dimension of the input query, key, and value. Here we assume they all have
          the same dimensions. But they could have different dimensions in other problems.
        - num_heads: Number of attention heads
        """
        super(Text_Guided_Self_Attention, self).__init__()
        
        assert input_dim % num_heads == 0
        
        self.input_dim = input_dim
        self.num_heads = num_heads
        self.dim_per_head = input_dim // num_heads
        
        ###########################################################################
        # TODO: Define the linear transformation layers for key, value, and query.#
        # Also define the output layer.
        ###########################################################################
        self.key = nn.Linear(self.input_dim,self.input_dim)
        self.value = nn.Linear(self.input_dim,self.input_dim)
        self.query = nn.Linear(self.input_dim,self.input_dim)
        self.y_text = nn.Linear(self.input_dim,self.input_dim)
        self.output = nn.Linear(self.input_dim,self.input_dim)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        
    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, y_text: torch.Tensor, mask: torch.Tensor=None):
        """
        Compute the attended feature representations.
        
        Inputs:
        - query: Tensor of the shape BxLxC, where B is the batch size, L is the sequence length,
          and C is the channel dimension
        - key: Tensor of the shape BxLxC
        - value: Tensor of the shape BxLxC
        - mask: Tensor indicating where the attention should *not* be performed
        """
        b = query.shape[0]        
        
        dot_prod_scores = None
        ###########################################################################
        # TODO: Compute the scores based on dot product between transformed query,#
        # key, and value. You may find torch.matmul helpful, whose documentation  #
        # can be found at                                                         #
        # https://pytorch.org/docs/stable/generated/torch.matmul.html#torch.matmul#
        # Remember to devide the doct product similarity scores by square root of #
        # the channel dimension per head.   
        #                                                                         #
        # Since no for loops are allowed here, think of how to use tensor reshape #
        # to process multiple attention heads at the same time.                   #
        ###########################################################################
        
        q = self.query(query)
        q = torch.reshape(q,(b, -1,self.num_heads,self.dim_per_head)) 
        q = q.transpose(1,2)
        
        #q_cap
        y_t = self.y_text(y_text)
        y_t = torch.reshape(y_t,(b, -1,self.num_heads,self.dim_per_head)) 
        y_t = y_t.transpose(1,2)
        y_t_t = y_t.transpose(3,2)
        q_temp = torch.matmul(q,y_t_t)
        q_temp = q_temp / math.sqrt(self.dim_per_head)
        S = torch.nn.Softmax(dim = -1)
        q_cap = S(q_temp)
        q_cap = torch.matmul(q_cap, y_t)
        
        #print('qshape',q.shape)
        k = self.key(key)
        k = torch.reshape(k,(b, -1,self.num_heads,self.dim_per_head)) 
        k = k.transpose(1,2)
        #print('kshape',k.shape)
        v = self.value(value)
        v = torch.reshape(v,(b, -1,self.num_heads,self.dim_per_head)) 
        v = v.transpose(1,2)
        #print('vshape',v.shape)
        
        key_t = k.transpose(3,2)
        #print(key_t.shape)
        dot_prod_scores = torch.matmul(q_cap,key_t)
        dot_prod_scores = dot_prod_scores / math.sqrt(self.dim_per_head)
        #print(dot_prod_scores.shape)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        if mask is not None:
            # We simply set the similarity scores to be near zero for the positions
            # where the attention should not be done. Think of why we do this.
            dot_prod_scores = dot_prod_scores.masked_fill(mask == 0, -1e9)

        out = None
        ###########################################################################
        # TODO: Compute the attention scores, which are then used to modulate the #
        # value tensor. Finally concate the attended tensors from multiple heads  #
        # and feed it into the output layer. You may still find torch.matmul      #
        # helpful.                                                                #
        #                                                                         #
        # Again, think of how to use reshaping tensor to do the concatenation.    #
        ###########################################################################
        S = torch.nn.Softmax(dim = -1)
        scores = S(dot_prod_scores)
        #print(scores.shape)
        scores = torch.matmul(scores, v)
        #print(scores.shape)
        out = scores.transpose(1,2)
        out = torch.reshape(out,(b,-1,self.num_heads*self.dim_per_head))
        out = self.output(out)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return out

In [21]:
class FeedForwardNetwork(nn.Module):
    """
    A simple feedforward network. Essentially, it is a two-layer fully-connected
    neural network.
    """
    def __init__(self, input_dim, ff_dim, dropout):
        """
        Inputs:
        - input_dim: Input dimension
        - ff_dim: Hidden dimension
        """
        super(FeedForwardNetwork, self).__init__()
        
        ###########################################################################
        # TODO: Define the two linear layers and a non-linear one.
        ###########################################################################
        self.ll1 = nn.Linear(input_dim, ff_dim)
        self.nl = nn.ReLU()
        self.ll2 = nn.Linear(ff_dim, input_dim)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
    def forward(self, x: torch.Tensor):
        """
        Input:
        - x: Tensor of the shape BxLxC, where B is the batch size, L is the sequence length,
         and C is the channel dimension
          
        Return:
        - y: Tensor of the shape BxLxC
        """
        
        y = None
        ###########################################################################
        # TODO: Process the input.                                                #
        ###########################################################################
        l1_out = self.ll1(x)
        n_out = self.nl(l1_out)
        y = self.ll2(n_out)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return y
        

In [22]:
class TextEncoderCell(nn.Module):
    """
    A single cell (unit) for the Transformer encoder.
    """
    def __init__(self, word_emb_dim: int, num_heads: int, ff_dim: int, dropout: float):
        """
        Inputs:
        - input_dim: Input dimension for each token in a sequence
        - num_heads: Number of attention heads in a multi-head attention module
        - ff_dim: The hidden dimension for a feedforward network
        - dropout: Dropout ratio for the output of the multi-head attention and feedforward
          modules.
        """
        super(TextEncoderCell, self).__init__()
        
        ###########################################################################
        # TODO: A single Transformer encoder cell consists of 
        # 1. A multi-head attention module
        # 2. Followed by dropout
        # 3. Followed by layer norm (check nn.LayerNorm)
        # https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html#torch.nn.LayerNorm
        #                                                                         #
        # At the same time, it also has
        # 1. A feedforward network
        # 2. Followed by dropout
        # 3. Followed by layer norm
        ###########################################################################
        self.textual_self_attention = MultiHeadAttention(word_emb_dim,num_heads)
        self.dropout1_text = torch.nn.Dropout(dropout)
        self.norm1_text = torch.nn.LayerNorm(word_emb_dim)
        self.feed_forward_text = FeedForwardNetwork(word_emb_dim,ff_dim,dropout)
        self.dropout2_text = torch.nn.Dropout(dropout)
        self.norm2_text = torch.nn.LayerNorm(word_emb_dim)
        

        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
    def forward(self,text:torch.Tensor, mask: torch.Tensor=None):
        """
        Inputs:
        - x: Tensor of the shape BxLxC, where B is the batch size, L is the sequence length,
          and C is the channel dimension
        - mask: Tensor for multi-head attention
        """
        
        y_text = None
        ###########################################################################
        # TODO: Get the output of the multi-head attention part (with dropout     #
        # and layer norm), which is used as input to the feedforward network (    #
        # again, followed by dropout and layer norm).                             #
        #                                                                         #
        # Don't forget the residual connections for both parts.                   #
        ###########################################################################

        attention_op_text = self.textual_self_attention(text,text,text,mask)
        dropout1_op_text = self.dropout1_text(attention_op_text)
        norm1_op_text = self.norm1_text(text + dropout1_op_text)

        feed_forward_op_text = self.feed_forward_text(norm1_op_text)
        dropout2_op_text = self.dropout2_text(feed_forward_op_text)
        norm2_op_text = self.norm2_text(norm1_op_text + dropout2_op_text)
        y_text = norm2_op_text
        
        
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return y_text

In [23]:
class ImgEncoderCell(nn.Module):
    """
    A single cell (unit) for the Transformer encoder.
    """
    def __init__(self, img_emb_dim: int, num_heads: int, ff_dim: int, dropout: float):
        """
        Inputs:
        - input_dim: Input dimension for each token in a sequence
        - num_heads: Number of attention heads in a multi-head attention module
        - ff_dim: The hidden dimension for a feedforward network
        - dropout: Dropout ratio for the output of the multi-head attention and feedforward
          modules.
        """
        super(ImgEncoderCell, self).__init__()
        
        ###########################################################################
        # TODO: A single Transformer encoder cell consists of 
        # 1. A multi-head attention module
        # 2. Followed by dropout
        # 3. Followed by layer norm (check nn.LayerNorm)
        # https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html#torch.nn.LayerNorm
        #                                                                         #
        # At the same time, it also has
        # 1. A feedforward network
        # 2. Followed by dropout
        # 3. Followed by layer norm
        ###########################################################################
        self.text_guided_self_attention = Text_Guided_Self_Attention(img_emb_dim,num_heads)
        self.dropout1_img = torch.nn.Dropout(dropout)
        self.norm1_img = torch.nn.LayerNorm(img_emb_dim)
        self.feed_forward_img = FeedForwardNetwork(img_emb_dim,ff_dim,dropout)
        self.dropout2_img = torch.nn.Dropout(dropout)
        self.norm2_img = torch.nn.LayerNorm(img_emb_dim)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
    def forward(self, img: torch.Tensor, y_text:torch.Tensor, mask: torch.Tensor=None):
        """
        Inputs:
        - x: Tensor of the shape BxLxC, where B is the batch size, L is the sequence length,
          and C is the channel dimension
        - mask: Tensor for multi-head attention
        """
        
       
        y_img = None
        ###########################################################################
        # TODO: Get the output of the multi-head attention part (with dropout     #
        # and layer norm), which is used as input to the feedforward network (    #
        # again, followed by dropout and layer norm).                             #
        #                                                                         #
        # Don't forget the residual connections for both parts.                   #
        ###########################################################################
        
        attention_op_img = self.text_guided_self_attention(img,img,img,y_text,mask)
        dropout1_op_img = self.dropout1_img(attention_op_img)
        norm1_op_img = self.norm1_img(img + dropout1_op_img)

        feed_forward_op_img = self.feed_forward_img(norm1_op_img)
        dropout2_op_img = self.dropout2_img(feed_forward_op_img)
        norm2_op_img = self.norm2_img(norm1_op_img + dropout2_op_img)
        y_img = norm2_op_img
        
        
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return y_img

In [24]:
class GroundingEncoder(nn.Module):
    """
    A full encoder consisting of a set of TransformerEncoderCell.
    """
    def __init__(self, img_emb_dim: int, word_emb_dim: int, num_heads: int, ff_dim: int=768, dropout: float=0.1):
        """
        Inputs:
        - input_dim: Input dimension for each token in a sequence
        - num_heads: Number of attention heads in a multi-head attention module
        - ff_dim: The hidden dimension for a feedforward network
        - num_cells: Number of TransformerEncoderCells
        - dropout: Dropout ratio for the output of the multi-head attention and feedforward
          modules.
        """
        super(GroundingEncoder, self).__init__()
        
        self.norm = None
        ###########################################################################
        # TODO: Construct a nn.ModuleList to store a stack of                     #
        # TranformerEncoderCells. Check the documentation here of how to use it   #
        # https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html#torch.nn.ModuleList
        
        # At the same time, define a layer normalization layer to process the     #
        # output of the entire encoder.                                           #
        ###########################################################################
        self.text_cell = TextEncoderCell(word_emb_dim, num_heads, ff_dim, dropout)
        self.img_cell = ImgEncoderCell(img_emb_dim,num_heads, ff_dim, dropout) 
        self.norm = torch.nn.LayerNorm(img_emb_dim)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
    def forward(self, x_img_pe: torch.Tensor,x_text: torch.Tensor, mask: torch.Tensor=None):
        """
        Inputs:
        - x: Tensor of the shape BxLxC, where B is the batch size, L is the sequence length,
          and C is the channel dimension
        - mask: Tensor for multi-head attention
        
        Return:
        - y: Tensor of the shape of BxLxC, which is the normalized output of the encoder
        """
        
        y = None
        ###########################################################################
        # TODO: Feed x into the stack of TransformerEncoderCells and then         #
        # normalize the output with layer norm.                                   #
        ###########################################################################
        x_text = self.text_cell(x_text,mask)
        x_img = self.img_cell(x_img_pe,x_text,mask)
        y_text = self.norm(x_text)
        y_img = self.norm(x_img)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return y_text,y_img
        

In [25]:
class VGEncoder(nn.Module):
    """
    A Transformer-based text classifier.
    """
    def __init__(self, 
            img_emb_dim:int, word_emb_dim: int, num_heads: int, trx_ff_dim: int, 
         dropout: float=0.1, pad_token: int=0
        ):
        """
        Inputs:
        - vocab_size: Vocabulary size, indicating how many tokens we have in total.
        - embed_dim: The dimension of word embeddings
        - num_heads: Number of attention heads in a multi-head attention module
        - trx_ff_dim: The hidden dimension for a feedforward network
        - num_trx_cells: Number of TransformerEncoderCells
        - dropout: Dropout ratio
        - pad_token: The index of the padding token.
        """
        super(VGEncoder, self).__init__()
        
        self.img_emb_dim = img_emb_dim
        self.word_emb_dim = word_emb_dim
        
        ###########################################################################
        # TODO: Define a module for positional encoding, Transformer encoder, and #
        # a output layer                                                          #
        ###########################################################################
        self.positional_encoding = PositionalEncoding(img_emb_dim)
        self.grounding_encoder = GroundingEncoder(word_emb_dim, img_emb_dim, num_heads, trx_ff_dim, dropout )
        self.output_layer = torch.nn.Linear(img_emb_dim,768)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################

    def forward(self, img_emb, text_emb, mask=None):
        """
        Inputs:
        - text: Tensor with the shape of BxLxC.
        - mask: Tensor for multi-head attention
        
        Return:
        - logits: Tensor with the shape of BxK, where K is the number of classes
        """
        
        ###########################################################################
        # TODO: Apply positional embedding to the input, which is then fed into   #
        # the encoder. Average pooling is applied then to all the features of all #
        # tokens. Finally, the logits are computed based on the pooled features.  #
        ###########################################################################
        positional_encoded_img = self.positional_encoding(img_emb)
        grounding_encoder_text, grounding_encoder_img = self.grounding_encoder(positional_encoded_img,text_emb)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return grounding_encoder_text, grounding_encoder_img

In [26]:
class GroundingDecoderCell(nn.Module):
    """
    A single cell (unit) of the Transformer decoder.
    """
    def __init__(self, input_dim: int, num_heads: int, ff_dim: int=768, dropout: float=0.1):
        """
        Inputs:
        - input_dim: Input dimension for each token in a sequence
        - num_heads: Number of attention heads in a multi-head attention module
        - ff_dim: The hidden dimension for a feedforward network
        - dropout: Dropout ratio for the output of the multi-head attention and feedforward
          modules.
        """
        super(GroundingDecoderCell, self).__init__()
        
        ###########################################################################
        # TODO: Similar to the TransformerEncoderCell, define two                 #
        # MultiHeadAttention modules. One for processing the tokens on the        # 
        # decoder side. The other for getting the attention across the encoder.   #
        # and the decoder. Also define a feedforward network. Don't forget the    #
        # Dropout and Layer Norm layers.                                          #
        ###########################################################################
        self.grounding_query_self_attention = MultiHeadAttention(input_dim,num_heads)
        self.dropout1 = torch.nn.Dropout(dropout)
        self.norm1 = torch.nn.LayerNorm(input_dim)
        self.encoder_decoder_self_attention = MultiHeadAttention(input_dim,num_heads)
        self.dropout2 = torch.nn.Dropout(dropout)
        self.norm2 = torch.nn.LayerNorm(input_dim)
        self.feed_forward = FeedForwardNetwork(input_dim,ff_dim,dropout)
        self.dropout3 = torch.nn.Dropout(dropout)
        self.norm3 = torch.nn.LayerNorm(input_dim)
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
    def forward(self, grounding_query: torch.Tensor, encoder_img: torch.Tensor, src_mask=None, tgt_mask=None):            
        """
        Inputs: 
        - x: Tensor of BxLdxC, word embeddings on the decoder side
        - encoder_output: Tensor of BxLexC, word embeddings on the encoder side
        - src_mask: Tensor, masks of the tokens on the encoder side
        - tgt_mask: Tensor, masks of the tokens on the decoder side
        
        Return:
        - y: Tensor of BxLdxC. Attended features for all tokens on the decoder side.
        """
        
        y = None
        ###########################################################################
        # TODO: Compute the self-attended features for the tokens on the decoder  #
        # side. Then compute the corss-attended features for the tokens on the    #
        # decoder side to the encoded features, which are finally feed into the   #
        # feedforward network                                                     #
        ###########################################################################
        attention_op1 = self.grounding_query_self_attention(grounding_query,grounding_query,grounding_query,tgt_mask)
        attention_residual_op1 = attention_op1 + grounding_query
        dropout1_op = self.dropout1(attention_residual_op1)
        norm1_op = self.norm1(dropout1_op)
        
        attention_op2 = self.encoder_decoder_self_attention(grounding_query,encoder_img,encoder_img,src_mask)
        attention_residual_op2 = attention_op2 + norm1_op
        dropout2_op = self.dropout2(attention_residual_op2)
        norm2_op = self.norm2(dropout2_op)
        
        feed_forward_op = self.feed_forward(norm2_op)
        feed_forward_residual_op = feed_forward_op + norm2_op
        dropout3_op = self.dropout3(feed_forward_residual_op)
        norm3_op = self.norm3(dropout3_op)
        y = norm3_op
        ###########################################################################
        #                             END OF YOUR CODE                            #
        ###########################################################################
        
        return y

In [27]:
class VGDecoder(nn.Module):
   
    def __init__(self, input_dim: int, num_heads: int, ff_dim: int, dropout=0.1):
        
        super(VGDecoder, self).__init__()
        self.d_cell = GroundingDecoderCell(input_dim, num_heads, ff_dim, dropout)
        self.norm_l = torch.nn.LayerNorm(input_dim)
        
    
    def forward(self, grounding_query: torch.Tensor, encoder_img: torch.Tensor, src_mask=None, tgt_mask=None):            
        y = None
        cell_output = self.d_cell(grounding_query,encoder_img,src_mask,tgt_mask)
        y = self.norm_l(cell_output)
        
        return y

In [28]:
class MainModel(nn.Module):

    def __init__(self,img_emb_dim:int, word_emb_dim: int, num_heads: int, trx_ff_dim: int
                 ,hidden_dim, dropout: float=0.1, pad_token: int=0):
        super().__init__()
        self.visual_grounding_encoder = VGEncoder(img_emb_dim, word_emb_dim, num_heads, trx_ff_dim,dropout=0.1)
        self.visual_grounding_decoder = VGDecoder(word_emb_dim, num_heads, trx_ff_dim, dropout=0.1)
        
        self.prediction_head = nn.Sequential(
            nn.Linear(word_emb_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim,4)
        )

    def forward(self, img_emb, word_emb):

        grounding_encoder_text, grounding_encoder_img = self.visual_grounding_encoder(img_emb, word_emb)
        #print(grounding_encoder_img.size())
        transformed_emb = self.visual_grounding_decoder(grounding_encoder_text, grounding_encoder_img)
        #print(transformed_emb.size())
        pred = self.prediction_head(transformed_emb).sigmoid()
        #print(pred.size())
        pred = pred.squeeze(1)
        return pred
 

In [29]:

import random
import numpy as np
import torch
import torch.nn.functional as F



class AverageMeter(object):
    """Computes and stores the average and current value"""
    
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.
        self.avg = 0.
        self.sum = 0.
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val*n
        self.count += n
        self.avg = self.sum / self.count
def xywh2xyxy(x):  # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2]
    y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape)
    y[:, 0] = (x[:, 0] - x[:, 2] / 2)
    y[:, 1] = (x[:, 1] - x[:, 3] / 2)
    y[:, 2] = (x[:, 0] + x[:, 2] / 2)
    y[:, 3] = (x[:, 1] + x[:, 3] / 2)
    return y

In [30]:
class Criterion(nn.Module):
    def __init__(self):
        super(Criterion, self).__init__()
        self.loss_weight = [3, 1]
        self.MSELoss = torch.nn.MSELoss(reduction='none')
    def forward(self, pred, gt, img_size=256):
        """
        :param pred:  (bs, 4)
        :param gt: (bs, 4)
        :return:
        """
        bs = pred.shape[0]
        gt = gt / img_size

        loss_bbox = F.l1_loss(pred, gt, reduction='none')
        loss_bbox = loss_bbox.sum() / bs

        loss_giou = 1 - torch.diag(generalized_box_iou_loss(
                                   self.box_cxcywh_to_xyxy(pred),
                                   self.box_cxcywh_to_xyxy(gt)))

        loss_giou = loss_giou.sum() / bs
        loss = 5 * loss_bbox + loss_giou * 2
        return loss, 5 * loss_bbox, loss_giou * 2
    
    def box_cxcywh_to_xyxy(self, x):
        x_c, y_c, w, h = x.unbind(-1)
        b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
             (x_c + 0.5 * w), (y_c + 0.5 * h)]
        return torch.stack(b, dim=-1)
    

In [31]:
import time
import logging
import numpy as np
from torch.autograd import Variable


def train_epoch(train_loader, model, optimizer, epoch, criterion=None, img_size=512):
    bs =32
    batch_time = AverageMeter()
    losses = AverageMeter()

    losses_bbox = AverageMeter()
    losses_giou = AverageMeter()

    acc = AverageMeter()
    miou = AverageMeter()

    model.train()
    end = time.time()

    for batch_idx, batch in enumerate(train_loader):
        imgs = batch[0]
        word_id = batch[1]
        img_emb = batch[2]
        word_emb =batch[3]
        bbox = batch[4]
        #bbox = torch.clamp(bbox, min=0, max=(512 - 1))
        image_emb = Variable(img_emb.unsqueeze(1))
        word_emb = Variable(word_emb.unsqueeze(1))
        bbox = Variable(bbox)

        norm_bbox = torch.zeros_like(bbox)

        norm_bbox[:, 0] = (bbox[:, 0] + bbox[:, 2]) / 2.0  # x_center
        norm_bbox[:, 1] = (bbox[:, 1] + bbox[:, 3]) / 2.0  # y_center
        norm_bbox[:, 2] = bbox[:, 2] - bbox[:, 0]   # w
        norm_bbox[:, 3] = bbox[:, 3] - bbox[:, 1]    # h
        #print(norm_bbox)

        # forward
        pred_box = model(image_emb, word_emb)
        #print(pred_box.size())# [bs, C, H, W]
        loss, loss_box, loss_giou = criterion(pred_box, norm_bbox, img_size=img_size)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # pred-box
        pred_bbox = pred_box.detach()
        pred_bbox = pred_bbox * img_size
        pred_box = xywh2xyxy(pred_bbox)

        losses.update(loss.item(), bs)
        losses_bbox.update(loss_box.item(), bs)
        losses_giou.update(loss_giou.item(), bs)

        target_bbox = bbox
        iou = box_iou(pred_box, target_bbox.data)
#         print("in here")
        
        accu = np.sum(np.array((iou.data.numpy() > 0.5), dtype=float)) / bs

        # metrics
        miou.update(torch.mean(iou).item(), image_emb.size(0))
        acc.update(accu, image_emb.size(0))

        batch_time.update(time.time() - end)
        end = time.time()

        if (batch_idx%300)== 0 :
            print_str = 'Epoch: [{0}][{1}/{2}]\t' \
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                        'Loss_bbox {loss_box.val:.4f} ({loss_box.avg:.4f})\t' \
                        'Loss_giou {loss_giou.val:.4f} ({loss_giou.avg:.4f})\t' \
                        'Accu {acc.val:.4f} ({acc.avg:.4f})\t' \
                        'Mean_iu {miou.val:.4f} ({miou.avg:.4f})\t' \
                .format(epoch+1, batch_idx+1, len(train_loader),
                        batch_time=batch_time,
                        loss=losses,
                        loss_box=losses_bbox,
                        loss_giou=losses_giou,
                        acc=acc,
                        miou=miou)

            print(print_str)
            

def validate_epoch(val_loader, model, train_epoch, img_size=512):
    bs=32
    batch_time = AverageMeter()
    acc = AverageMeter()
    miou = AverageMeter()

    model.eval()
    end = time.time()

    for batch_idx,batch in enumerate(val_loader):
        imgs = batch[0]
        word_id = batch[1]
        img_emb = batch[2]
        word_emb =batch[3]
        bbox = batch[4]
        #bbox = torch.clamp(bbox, min=0, max=(512 - 1))
        
        image_emb = Variable(img_emb.unsqueeze(1))
        word_emb = Variable(word_emb.unsqueeze(1))
        bbox = Variable(bbox)

        norm_bbox = torch.zeros_like(bbox)

        norm_bbox[:, 0] = (bbox[:, 0] + bbox[:, 2]) / 2.0  # x_center
        norm_bbox[:, 1] = (bbox[:, 1] + bbox[:, 3]) / 2.0  # y_center
        norm_bbox[:, 2] = bbox[:, 2] - bbox[:, 0]   # w
        norm_bbox[:, 3] = bbox[:, 3] - bbox[:, 1]    # h

        with torch.no_grad():
            pred_box = model(image_emb, word_emb)  # [bs, C, H, W]
            

        pred_bbox = pred_box.detach()
        pred_bbox = pred_bbox * img_size
        pred_bbox = xywh2xyxy(pred_bbox)

        # constrain
        pred_bbox[pred_bbox < 0.0] = 0.0
        pred_bbox[pred_bbox > img_size-1] = img_size-1

        target_bbox = bbox
        # metrics
        iou = box_iou(pred_bbox, target_bbox.data)
        # accu = np.sum(np.array((iou.data.cpu().numpy() > 0.5), dtype=float)) / args.batch_size
        accu = np.sum(np.array((iou.data.cpu().numpy() > 0.5), dtype=float)) / bs

        acc.update(accu, bs)
        miou.update(torch.mean(iou).item(), bs)

        batch_time.update(time.time() - end)
        end = time.time()

        if (batch_idx%100) == 0:
            print_str = 'Validate: [{0}/{1}]\t' \
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})  ' \
                        'Acc {acc.val:.4f} ({acc.avg:.4f})  ' \
                        'Mean_iu {miou.val:.4f} ({miou.avg:.4f})  ' \
                .format(batch_idx+1, len(val_loader), batch_time=batch_time, acc=acc, miou=miou)

            print(print_str)
            
    print(f"Train_epoch {train_epoch+1}  Validate Result:  Acc {acc.avg}, MIoU {miou.avg}.")


    return acc.avg, miou.avg

def test_epoch(test_loader, model, img_size=512):
    bs = 32
    acc = AverageMeter()
    miou = AverageMeter()
    model.eval()

    for batch_idx, batch in enumerate(test_loader):
        imgs = batch[0]
        word_id = batch[1]
        img_emb = batch[2]
        word_emb =batch[3]
        bbox = batch[4]
        #bbox = torch.clamp(bbox, min=0, max=(512 - 1))
        image_emb = Variable(img_emb.unsqueeze(1))
        word_emb = Variable(word_emb.unsqueeze(1))
        bbox = Variable(bbox)
        norm_bbox = torch.zeros_like(bbox)

        norm_bbox[:, 0] = (bbox[:, 0] + bbox[:, 2]) / 2.0  # x_center
        norm_bbox[:, 1] = (bbox[:, 1] + bbox[:, 3]) / 2.0  # y_center
        norm_bbox[:, 2] = bbox[:, 2] - bbox[:, 0]   # w
        norm_bbox[:, 3] = bbox[:, 3] - bbox[:, 1]    # h

        with torch.no_grad():
            pred_box = model(image_emb, word_emb)  # [bs, C, H, W]

        pred_bbox = pred_box.detach()
        pred_bbox = pred_bbox * img_size
        pred_bbox = xywh2xyxy(pred_bbox)

        # constrain
        pred_bbox[pred_bbox < 0.0] = 0.0
        pred_bbox[pred_bbox > img_size-1] = img_size-1

        target_bbox = bbox
        # metrics
        iou = box_iou(pred_bbox, target_bbox.data)
        accu = np.sum(np.array((iou.data.cpu().numpy() > 0.5), dtype=float)) / bs

        acc.update(accu, bs)
        miou.update(torch.mean(iou).item(), bs)
    print(f"Test Result:  Acc {acc.avg}, MIoU {miou.avg}.")

In [32]:

import matplotlib as mpl
import math
import torch.nn.parallel
import torch.optim
import torch.utils.data.distributed

epochs = 7
hidden_dim = 32
img_emb_dim = 768
word_emb_dim = 768
num_heads = 8
trx_ff_dim = 50
bs = 32
model = MainModel(img_emb_dim, word_emb_dim, num_heads, trx_ff_dim
                 ,hidden_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=10e-4, weight_decay=10e-3)

# get criterion
criterion = Criterion()
best_accu = -float('Inf')

# train
for epoch in range(epochs):
    model.train()
    train_epoch(train_loader, model, optimizer, epoch,criterion, 512)
    model.eval()
    accu_new, miou_new = validate_epoch(val_loader, model, epoch, 512)

    is_best = accu_new > best_accu
    best_accu = max(accu_new, best_accu)
    

print(f'Best Acc: {best_accu}.')



Epoch: [1][1/1843]	Time 0.565 (0.565)	Loss 66.9724 (66.9724)	Loss_bbox 4.9187 (4.9187)	Loss_giou 62.0537 (62.0537)	Accu 0.4688 (0.4688)	Mean_iu 0.1325 (0.1325)	
Epoch: [1][301/1843]	Time 0.244 (0.250)	Loss 65.2144 (65.1017)	Loss_bbox 3.5418 (3.2841)	Loss_giou 61.6727 (61.8176)	Accu 0.2500 (0.5257)	Mean_iu 0.0766 (0.1173)	
Epoch: [1][601/1843]	Time 0.255 (0.252)	Loss 64.9198 (65.0466)	Loss_bbox 2.9135 (3.2318)	Loss_giou 62.0063 (61.8149)	Accu 1.7812 (0.5519)	Mean_iu 0.1676 (0.1159)	
Epoch: [1][901/1843]	Time 0.268 (0.261)	Loss 64.9652 (64.9955)	Loss_bbox 3.2436 (3.1818)	Loss_giou 61.7217 (61.8137)	Accu 0.3125 (0.5413)	Mean_iu 0.0994 (0.1147)	
Epoch: [1][1201/1843]	Time 0.374 (0.266)	Loss 64.5128 (64.9548)	Loss_bbox 2.7934 (3.1355)	Loss_giou 61.7194 (61.8193)	Accu 0.0312 (0.5574)	Mean_iu 0.1032 (0.1154)	
Epoch: [1][1501/1843]	Time 1.504 (0.392)	Loss 64.7178 (64.9240)	Loss_bbox 3.0097 (3.1004)	Loss_giou 61.7082 (61.8235)	Accu 0.0625 (0.5734)	Mean_iu 0.0961 (0.1163)	
Epoch: [1][1801/1843]	

# Test

In [33]:
torch.save(model, '/kaggle/working/model_7ep.pth')

In [34]:
model.eval()
test_epoch(test_loader, model,512)

Test Result:  Acc 0.9088255494505495, MIoU 0.1353150097100617.
