In [1]:
######################################################################################################################
#                               Import Python Files for Sentence & Annotations Extraction                            #
#                                        Provided as a Simple API on Github                                          #
#                                https://github.com/BryanPlummer/flickr30k_entities                                  #
######################################################################################################################

# Run the install below once per fresh environment; it can be commented out on later runs.
!pip install imagesize
import os
os.chdir(r"/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/")
import Utils.flickr30k_entities_utils
from Utils.flickr30k_entities_utils import get_sentence_data, get_annotations
import Utils.helper_functions
from Utils.helper_functions import *
import random 
import seaborn as sb
import pickle




######################################################################################################################
#                                                                                                                    #
#                                                   Mapping Function                                                 #
#                                                                                                                    #
######################################################################################################################
"""
The Mapping function does the following:
    - takes a list of image names as input and fetches the Sentences & Annotations (which contain the bounding
    boxes) of all those images
    - passes those Sentences & Annotations to the function phrase_Id_to_Bbox and gets the bounding boxes for all
    phrases in every image
    - also passes those Sentences & Annotations to the function phrase_Id_to_Phrases and extracts the phrases
    contained in all images
    
    A typical look of the outputs would be:
    
        _Image_Train_Phrase_Id_to_Bbox -----> {'image_id_1' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                'image_id_2' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                              
                                                              
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              
                                                              
                                                'image_id_n' : {'Phrase_id_1' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_2' : [Bbox1, Bbox2 ... Bboxn],
                                                              'Phrase_id_3' : [Bbox1, Bbox2 ... Bboxn],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Bbox1, Bbox2 ... Bboxn]}
                                                              
                                                              }
                                                              
                                                              
        _Image_Train_Phrase_Id_to_Phrase -----> {'image_id_1' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                'image_id_2' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                              
                                                              
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              .
                                                              
                                                              
                                                'image_id_n' : {'Phrase_id_1' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_2' : [Phrase1, Phrase2.... Phrase_n],
                                                              'Phrase_id_3' : [Phrase1, Phrase2.... Phrase_n],
                                                              .
                                                              .
                                                              .
                                                              'Phrase_id_n' : [Phrase1, Phrase2.... Phrase_n]}
                                                              
                                                              }
        

NOTE: Please alter any folder paths for Images, Sentences and Annotations (Phrase & Bounding Boxes) in Helper Function File


"""



from collections import defaultdict
def Mapping(_Image_Names, _paths_dict):
    """Build the per-image phrase lookups for a list of image names.

    For every name in ``_Image_Names`` the sentence and annotation file paths
    are resolved with ``get_Paths``, loaded with ``get_sentence_data`` and
    ``get_annotations``, and handed to ``phrase_Id_to_Bbox`` and
    ``phrase_Id_to_Phrases``.

    Args:
        _Image_Names: iterable of image names; each becomes a key of both
            returned mappings.
        _paths_dict: path configuration forwarded unchanged to ``get_Paths``.

    Returns:
        A pair ``(boxes_by_image, phrases_by_image)`` where each maps an
        image name to the value returned by ``phrase_Id_to_Bbox`` and
        ``phrase_Id_to_Phrases`` respectively.
    """
    # A defaultdict without a factory behaves like a plain dict: missing
    # keys still raise KeyError.
    boxes_by_image = defaultdict()
    phrases_by_image = defaultdict()

    for image_name in tqdm(_Image_Names):
        # get_Paths yields three paths; the third (absolute image path) is
        # not needed here.
        sentences_file, annotations_file, _ = get_Paths(image_name, _paths_dict)

        sentence_data = get_sentence_data(sentences_file)
        annotation_data = get_annotations(annotations_file)

        boxes_by_image[image_name] = phrase_Id_to_Bbox(sentence_data, annotation_data)
        phrases_by_image[image_name] = phrase_Id_to_Phrases(sentence_data, annotation_data)

    return boxes_by_image, phrases_by_image


# Locations of the Flickr30k Entities sentence/annotation files and of the
# image folder; this dict is passed to Mapping().
_paths_dict = dict(
    _sentences_path=r'/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/annotations/Sentences',
    _annotations_path=r'/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/annotations/Annotations',
    _image_folder_path=r'/kaggle/input/flickr30k/flickr30k_images',
)

# Number of training images to process (use len(_trainimg) for the full split).
_train_len = 1000




"""
######################################################################################################################
#                                                                                                                    #
#                       Enter path for train, val & test split in their respective variables                         #
#                                                                                                                    #
######################################################################################################################


train.txt, val.txt and test.txt are text files that contain the predefined splits, i.e. each file lists the
images that belong to its split.

train.txt contains all image names as strings, that should be used for training
val.txt contains all image names as strings, that should be used for validation
test.txt contains all image names as strings, that should be used for testing

"""

# Folder holding the predefined train/val/test split files.
_splits_dir = '/kaggle/input/vgrutils/Visual Grounding RefEx/Flickr30/Data/Splits'

_trainimg = load_Splits(f'{_splits_dir}/train.txt')
_vlimg = load_Splits(f'{_splits_dir}/val.txt')
_tsimg = load_Splits(f'{_splits_dir}/test.txt')


"""
######################################################################################################################
#                                                                                                                    #
#                                                 Call to the Mapping Functions                                      #
#                                                                                                                    #
######################################################################################################################
"""

# Only the first `_train_len` training images are mapped; the validation and
# test splits are mapped in full.
_fractional_trainimg = _trainimg[0:_train_len]

(_Image_Train_Phrase_Id_to_Bbox,
 _Image_Train_Phrase_Id_to_Phrase) = Mapping(_Image_Names=_fractional_trainimg, _paths_dict=_paths_dict)

(_Image_Val_Phrase_Id_to_Bbox,
 _Image_Val_Phrase_Id_to_Phrase) = Mapping(_Image_Names=_vlimg, _paths_dict=_paths_dict)

(_Image_Test_Phrase_Id_to_Bbox,
 _Image_Test_Phrase_Id_to_Phrase) = Mapping(_Image_Names=_tsimg, _paths_dict=_paths_dict)




"""
######################################################################################################################
#                                                                                                                    #
#                                                 Part of Helper Functions.                                          #
#                                                Needs to be updated there.                                          #
#                                                                                                                    #
######################################################################################################################
"""
import pandas as pd
def prepare_DataFrame(Phrase_Dict, Bbox_Dict):
    """Flatten the per-image phrase and bounding-box dicts into one DataFrame.

    Args:
        Phrase_Dict: mapping ``image_id -> {phrase_id: [phrase, ...]}``.
        Bbox_Dict: mapping ``image_id -> {phrase_id: [bbox, ...]}``; it must
            contain every image id present in ``Phrase_Dict``.

    Returns:
        A DataFrame with the columns ``Image_Id``, ``Phrase_Id``, ``Phrase``
        and ``Bounding_Box``: one row per (image, phrase id, phrase), where
        ``Bounding_Box`` holds the list of boxes of that phrase id. When
        ``Phrase_Dict`` is empty, an empty frame with those columns is
        returned.

    Raises:
        KeyError: if an image id of ``Phrase_Dict`` is missing in ``Bbox_Dict``.
    """
    output_columns = ['Image_Id', 'Phrase_Id', 'Phrase', 'Bounding_Box']

    # Per-image frames are collected and concatenated once at the end;
    # concatenating inside the loop re-copies all previous rows every time.
    per_image_frames = []
    for Image_Id in tqdm(Phrase_Dict.keys()):

        # One row per (phrase id, phrase): stack the wide frame, then drop the
        # positional level that stack() adds.
        Phrase_DF = pd.DataFrame.from_dict(Phrase_Dict[Image_Id], orient = 'index')
        Phrase_DF = pd.DataFrame(Phrase_DF.stack(level=0)).reset_index().drop('level_1', axis = 1)

        # Same flattening for the boxes, then regroup them into one list per
        # phrase id.
        Bbox_DF = pd.DataFrame.from_dict(Bbox_Dict[Image_Id], orient = 'index')
        Bbox_DF = pd.DataFrame(Bbox_DF.stack(level=0)).reset_index().drop('level_1', axis = 1)
        Bbox_DF = Bbox_DF.groupby(['level_0'])[0].apply(list)

        # Inner join on the phrase id: only ids present on both sides survive.
        Merged_DF = pd.merge(Phrase_DF, Bbox_DF, on = 'level_0', how='inner')
        Merged_DF['Image_Id'] = Image_Id

        per_image_frames.append(Merged_DF)

    if per_image_frames:
        Final_DF = pd.concat(per_image_frames, axis = 0)
        Final_DF = Final_DF.rename(columns = {'level_0' : 'Phrase_Id', '0_x': 'Phrase', '0_y':'Bounding_Box'})
        Final_DF = Final_DF[output_columns]
        Final_DF.reset_index(drop = True, inplace = True)
    else:
        # Nothing to concatenate: return an empty frame with the same schema
        # instead of failing on the column selection.
        Final_DF = pd.DataFrame(columns = output_columns)

    print("Local Function Called......")
    return Final_DF



"""
######################################################################################################################
#                                                                                                                    #
#                                                Converting to DataFrames.                                           #
#                                                                                                                    #
######################################################################################################################
"""


# One flat frame per split, built from that split's phrase and box lookups.
Train_Frame = prepare_DataFrame(
    Phrase_Dict=_Image_Train_Phrase_Id_to_Phrase,
    Bbox_Dict=_Image_Train_Phrase_Id_to_Bbox,
)
Val_Frame = prepare_DataFrame(
    Phrase_Dict=_Image_Val_Phrase_Id_to_Phrase,
    Bbox_Dict=_Image_Val_Phrase_Id_to_Bbox,
)
Test_Frame = prepare_DataFrame(
    Phrase_Dict=_Image_Test_Phrase_Id_to_Phrase,
    Bbox_Dict=_Image_Test_Phrase_Id_to_Bbox,
)


"""
######################################################################################################################
#                                                                                                                    #
#                                    Lower Case Phrases & Phrase to Index Map                                        #
#                                                                                                                    #
######################################################################################################################
"""

def sort_by_number_of_words(phrase):
    """Sort key: the number of whitespace-separated words in ``phrase``."""
    words = phrase.split()
    return len(words)

# Lower-case the Phrase column of every split.
for _split_frame in (Train_Frame, Val_Frame, Test_Frame):
    _split_frame.Phrase = _split_frame.Phrase.str.lower()
del _split_frame

# For each split: the distinct phrases ordered by word count (stable sort),
# and a phrase -> position lookup into that ordered list.
unique_Phrases_Train = sorted(Train_Frame.Phrase.unique(), key=sort_by_number_of_words)
_train_Phrase_to_Index_Map = {phrase: position for position, phrase in enumerate(unique_Phrases_Train)}

unique_Phrases_Val = sorted(Val_Frame.Phrase.unique(), key=sort_by_number_of_words)
_val_Phrase_to_Index_Map = {phrase: position for position, phrase in enumerate(unique_Phrases_Val)}

unique_Phrases_Test = sorted(Test_Frame.Phrase.unique(), key=sort_by_number_of_words)
_test_Phrase_to_Index_Map = {phrase: position for position, phrase in enumerate(unique_Phrases_Test)}






[0m

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Local Function Called......


  0%|          | 0/1000 [00:00<?, ?it/s]

Local Function Called......


  0%|          | 0/1000 [00:00<?, ?it/s]

Local Function Called......


In [2]:
"""
######################################################################################################################
#                                                                                                                    #
#                                                Printing the DataFrames.                                            #
#                                                                                                                    #
######################################################################################################################
"""

# Notebook display: first 10 rows of the training-split frame.
Train_Frame.head(10)

Unnamed: 0,Image_Id,Phrase_Id,Phrase,Bounding_Box
0,3359636318,112630,two people,"[[46, 182, 105, 333], [143, 165, 207, 333]]"
1,3359636318,112632,the video game shop,"[[0, 54, 168, 307]]"
2,3359636318,112631,the mobile phone store,"[[191, 0, 498, 230]]"
3,3359636318,112625,several people,"[[46, 182, 105, 333], [143, 165, 207, 333], [2..."
4,3359636318,112625,people,"[[46, 182, 105, 333], [143, 165, 207, 333], [2..."
5,3359636318,112625,a group of people,"[[46, 182, 105, 333], [143, 165, 207, 333], [2..."
6,3359636318,112627,some stores,"[[191, 0, 498, 230], [1, 0, 190, 307]]"
7,3359636318,112626,a sidewalk,"[[2, 212, 499, 333]]"
8,6959556104,262504,the crowd,"[[5, 70, 103, 314], [120, 54, 206, 172], [197,..."
9,6959556104,262504,a small crowd,"[[5, 70, 103, 314], [120, 54, 206, 172], [197,..."


In [3]:
"""
######################################################################################################################
#                                                                                                                    #
#                                                Printing the DataFrames.                                            #
#                                                                                                                    #
######################################################################################################################
"""

# Notebook display: first 10 rows of the test-split frame.
Test_Frame.head(10)

Unnamed: 0,Image_Id,Phrase_Id,Phrase,Bounding_Box
0,1016887272,547,a collage of one person,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
1,1016887272,547,a group of people,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
2,1016887272,547,several climbers,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
3,1016887272,547,seven climbers,"[[193, 369, 230, 453], [207, 303, 255, 383], [..."
4,1016887272,548,the rock,"[[0, 53, 332, 499]]"
5,1016887272,548,a rock climbing wall,"[[0, 53, 332, 499]]"
6,1016887272,548,a rock face,"[[0, 53, 332, 499]]"
7,1016887272,548,a rock,"[[0, 53, 332, 499]]"
8,1016887272,548,a cliff,"[[0, 53, 332, 499]]"
9,1016887272,549,another man,"[[73, 301, 180, 499]]"


In [4]:
"""
######################################################################################################################
#                                                                                                                    #
#                                                Printing the DataFrames.                                            #
#                                                                                                                    #
######################################################################################################################
"""

# Notebook display: first 10 rows of the validation-split frame.
Val_Frame.head(10)

Unnamed: 0,Image_Id,Phrase_Id,Phrase,Bounding_Box
0,100652400,197,a construction worker,"[[52, 44, 109, 202]]"
1,100652400,197,a man,"[[52, 44, 109, 202]]"
2,100652400,198,hard hat,"[[58, 43, 87, 65]]"
3,100652400,198,a blue hard hat,"[[58, 43, 87, 65]]"
4,100652400,198,a hard hat,"[[58, 43, 87, 65]]"
5,100652400,199,a reflective vest,"[[61, 68, 97, 118]]"
6,100652400,199,a caution vest,"[[61, 68, 97, 118]]"
7,100652400,199,orange safety vest,"[[61, 68, 97, 118]]"
8,100652400,199,bright vest,"[[61, 68, 97, 118]]"
9,100652400,200,an intersection,"[[0, 89, 373, 499]]"


# BERT Transformers

In [5]:
!pip install pytorch-pretrained-bert

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM



[0m

In [7]:

class Bert(nn.Module):
    """Inference-only phrase encoder built on pretrained ``bert-base-uncased``.

    A forward hook on the second-to-last encoder layer pools that layer's
    output over the token axis and stores the result in
    ``self.Features['second_last_hidden_state']``; ``forward`` returns those
    pooled vectors as the phrase embeddings.
    """

    # Constructor
    def __init__(self):
        super(Bert, self).__init__()

        # Load the pretrained tokenizer and model.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.model = BertModel.from_pretrained('bert-base-uncased')
        # The model is only used to extract features: eval mode switches off
        # dropout so the same phrase always yields the same embedding.
        self.model.eval()

        # Hook features
        self.Features = {}
        # Attention mask of the batch currently being encoded (None when the
        # input has no padding); read by the hook to skip padded positions.
        self._pool_mask = None
        self.hook = self.model.encoder.layer[-2].register_forward_hook(self.get_features('second_last_hidden_state'))

    # Hook registration
    def get_features(self, name):
        """Return a forward hook that stores the pooled layer output under ``name``.

        The hook averages the layer output over dim 1 (assumed to be the token
        axis of a (batch, tokens, hidden) tensor). When ``self._pool_mask`` is
        set, positions whose mask value is 0 are excluded from the average so
        that padding does not dilute the embedding.
        """
        def hook(module, inputs, output):
            mask = self._pool_mask
            if mask is None:
                pooled = torch.mean(output, dim=1)
            else:
                weights = mask.unsqueeze(-1).to(output.dtype)
                # clamp guards against a division by zero for an all-padding row.
                pooled = (output * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
            self.Features[name] = pooled.detach()
        return hook

    def preprocess_Phrases(self, phrase):
        """Wrap ``phrase`` in [CLS]/[SEP], tokenize it and return the token ids."""
        marked_phrase = "[CLS] " + phrase + " [SEP]"
        tokenized_phrase = self.tokenizer.tokenize(marked_phrase)
        return self.tokenizer.convert_tokens_to_ids(tokenized_phrase)

    def preprocess_Batch(self, batch, padding_id):
        """Tokenize a batch of phrases and right-pad them to a common length.

        Args:
            batch: non-empty sequence of phrase strings.
            padding_id: token id appended to the shorter sequences.

        Returns:
            ``(token_ids, segment_ids)``: two long tensors of shape
            (len(batch), longest sequence); ``segment_ids`` is all ones.

        Raises:
            ValueError: if ``batch`` is empty.
        """
        token_Ids = [self.preprocess_Phrases(phrase) for phrase in batch]
        max_len = max(len(ids) for ids in token_Ids)
        padded_Ids = [ids + [padding_id] * (max_len - len(ids)) for ids in token_Ids]

        token_Ids_Tensor = torch.tensor(padded_Ids)
        # NOTE(review): segment ids are all ones, as in the original code;
        # BertModel falls back to all zeros when none are given - confirm
        # which one is intended.
        segment_Ids_Tensor = torch.ones(token_Ids_Tensor.shape, dtype=torch.long)
        return token_Ids_Tensor, segment_Ids_Tensor

    def _embed(self, token_ids, segment_ids, attention_mask=None):
        """Run the model once and return the pooled features stored by the hook."""
        self._pool_mask = attention_mask
        try:
            # Gradients are never used (the hook detaches its result), so do
            # not build an autograd graph.
            with torch.no_grad():
                if attention_mask is None:
                    self.model(token_ids, segment_ids)
                else:
                    self.model(token_ids, segment_ids, attention_mask=attention_mask)
        finally:
            self._pool_mask = None
        return self.Features['second_last_hidden_state']

    def forward(self, batches, _phrase = None, training = True):
        """Embed phrases with the second-to-last BERT layer.

        Args:
            batches: iterable of phrase batches (each a list of strings); only
                used when ``training`` is true.
            _phrase: a single phrase string; only used when ``training`` is
                false.
            training: selects the input mode (batches vs. single phrase). It
                does not toggle the module's train/eval state.

        Returns:
            With ``training`` true, the row-wise stack of the pooled vectors
            of every phrase of every batch, in order. Otherwise the pooled
            vector of ``_phrase`` with a leading batch dimension of 1.
        """
        if training:
            # Pad with the dedicated [PAD] token, not the ordinary word "pad".
            pad = self.tokenizer.vocab['[PAD]']
            Textual_Embeddings = []
            for batch in tqdm(batches):
                tk, sg = self.preprocess_Batch(batch, pad)
                # 1 for real tokens, 0 for padding: keeps padded positions out
                # of both self-attention and the pooled average.
                mask = (tk != pad).long()
                Textual_Embeddings.append(self._embed(tk, sg, mask))
            return torch.vstack(Textual_Embeddings)

        tk_ids = self.preprocess_Phrases(_phrase)
        print(tk_ids)
        tk = torch.tensor(tk_ids).reshape(1,-1)
        sg = torch.ones(tk.shape, dtype = torch.long).reshape(1,-1)
        # A single phrase has no padding, so no mask is required.
        return self._embed(tk, sg)


    

batch_size = 64


def _split_into_batches(phrases, size):
    """Slice ``phrases`` into consecutive lists of at most ``size`` items."""
    return [phrases[start:start + size] for start in range(0, len(phrases), size)]


train_batches = _split_into_batches(unique_Phrases_Train, batch_size)
val_batches = _split_into_batches(unique_Phrases_Val, batch_size)
test_batches = _split_into_batches(unique_Phrases_Test, batch_size)


Obj = Bert()

# Single-phrase smoke test of the encoder before the full run.
emb = Obj.forward(None, _phrase = "Almost There", training=False)

# Embed every unique training phrase and persist the result together with
# the phrase -> row-index lookup.
t_train_embeds = Obj.forward(train_batches)
torch.save(t_train_embeds,"/kaggle/working/t_train_embeds.pt")

with open('/kaggle/working/_train_Phrase_to_Index_Map.pkl', 'wb') as fp:
    pickle.dump(_train_Phrase_to_Index_Map, fp)
    

[101, 2471, 2045, 102]


  0%|          | 0/93 [00:00<?, ?it/s]