In [146]:
# Load the annotation file
# Consider a particular annotation
# Load the corresponding gif
# Track the bounding boxes
# Repurpose the IKEA-ASM feature extraction code to extract the features
# Will need to implement the code for I3D network or repurpose the code 

In [147]:
# Load the annotation file
anno_path = '/workspace/work/O2ONet/data/annotations_minus_unavailable_yt_vids.pkl'

import pickle as pkl

f = open(anno_path, 'rb')
anno = pkl.load(f)
f.close()

In [148]:

def tracker(frames, main_bbox_tb):
    import cv2
    import sys
    
    image_height, image_width,_ = frames[0].shape

    
    main_bbox_wh = (main_bbox_tb[0], main_bbox_tb[1], main_bbox_tb[2]-main_bbox_tb[0], main_bbox_tb[3]-main_bbox_tb[1])

    (major_ver, minor_ver, subminor_ver) = cv2.__version__.split('.')


    # Set up tracker.
    # Instead of MIL, you can also use

    tracker_types = ['BOOSTING', 'MIL','KCF', 'TLD', 'MEDIANFLOW', 'GOTURN', 'MOSSE', 'CSRT']
    tracker_type = tracker_types[-1]

    if int(minor_ver) < 3:
        tracker = cv2.Tracker_create(tracker_type)
    else:
        if tracker_type == 'BOOSTING':
            tracker = cv2.TrackerBoosting_create()
            tracker_rev = cv2.TrackerBoosting_create()
        if tracker_type == 'MIL':
            tracker = cv2.TrackerMIL_create()
            tracker_rev = cv2.TrackerMIL_create()
        if tracker_type == 'KCF':
            tracker = cv2.TrackerKCF_create()
            tracker_rev = cv2.TrackerKCF_create()
        if tracker_type == 'TLD':
            tracker = cv2.TrackerTLD_create()
            tracker_rev = cv2.TrackerTLD_create()
        if tracker_type == 'MEDIANFLOW':
            tracker = cv2.TrackerMedianFlow_create()
            tracker_rev = cv2.TrackerMedianFlow_create()
        if tracker_type == 'GOTURN':
            tracker = cv2.TrackerGOTURN_create()
            tracker_rev = cv2.TrackerGOTURN_create()
        if tracker_type == 'MOSSE':
            tracker = cv2.TrackerMOSSE_create()
            tracker_rev = cv2.TrackerMOSSE_create()
        if tracker_type == "CSRT":
            tracker = cv2.TrackerCSRT_create()
            tracker_rev = cv2.TrackerCSRT_create()

    num_frames = len(frames)

    central_index = int((num_frames - 1)/2)
    window_size = int(num_frames/2)

    central_frame = frames[central_index]

    # Initialize tracker with first frame and bounding box

    ok = tracker.init(central_frame, main_bbox_wh)
    bboxes_forward = []

    for i in range(window_size):

        # Read a new frame
        frame = frames[central_index + 1 + i]        

        # Update tracker
        ok, bbox_wh = tracker.update(frame)

        # add to the bbox list
        if ok:
            bbox_tb = [ bbox_wh[0], bbox_wh[1], bbox_wh[0] + bbox_wh[2], bbox_wh[1] + bbox_wh[3] ]
            import numpy as np

            bbox_tb[0], bbox_tb[2] = np.clip(bbox_tb[0],0, image_width-1), np.clip(bbox_tb[2],0, image_width-1)
            bbox_tb[1], bbox_tb[3] = np.clip(bbox_tb[1],0, image_height-1), np.clip(bbox_tb[3],0, image_height-1)

            bboxes_forward.append(bbox_tb)
            # # Tracking success
            # p1 = (int(bbox[0]), int(bbox[1]))
            # p2 = (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3]))
            # cv2.rectangle(frame, p1, p2, (255,0,0), 2, 1)
        else :
            print("Tracking Failure")
            return 0
            # Tracking failure
            # cv2.putText(frame, "Tracking failure detected", (100,80), cv2.FONT_HERSHEY_SIMPLEX, 0.75,(0,0,255),2)

    # Initialize tracker with first frame and bounding box
    ok = tracker_rev.init(central_frame, main_bbox_wh)
    bboxes_backward = []
    for i in range(window_size):
        
        # Read a new frame
        frame = frames[central_index - 1 - i]        
        
        # Update tracker
        ok, bbox_wh = tracker_rev.update(frame)

        # Add to the bbox list
        if ok:
            bbox_tb = [ bbox_wh[0], bbox_wh[1], bbox_wh[0] + bbox_wh[2], bbox_wh[1] + bbox_wh[3] ]

            bbox_tb[0], bbox_tb[2] = np.clip(bbox_tb[0],0, image_width-1), np.clip(bbox_tb[2],0, image_width-1)
            bbox_tb[1], bbox_tb[3] = np.clip(bbox_tb[1],0, image_height-1), np.clip(bbox_tb[3],0, image_height-1)

            bboxes_backward.append(bbox_tb)
            # # Tracking success
            # p1 = (int(bbox[0]), int(bbox[1]))
            # p2 = (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3]))
            # cv2.rectangle(frame, p1, p2, (255,0,0), 2, 1)
        else:
            print("Tracking Failure")
            return 0
            # Tracking failure
            # cv2.putText(frame, "Tracking failure detected", (100,80), cv2.FONT_HERSHEY_SIMPLEX, 0.75,(0,0,255),2)

    bboxes_backward_reversed = bboxes_backward[-1::-1]
    all_bbox = bboxes_backward_reversed + [main_bbox_tb] + bboxes_forward
    
    return all_bbox

In [149]:
def visualise_tracking(frames, bboxes):

        import cv2
        vis_frames = []
        
        for i, frame in enumerate(frames):

            bbox = bboxes[i]
            p1 = ( int(bbox[0]), int(bbox[1]) )
            p2 = ( int(bbox[2]), int(bbox[3]) )
            temp_frame = cv2.rectangle(frame, p1, p2, (255,0,0), 2, 1)
            rgb_frame = cv2.cvtColor(temp_frame, cv2.COLOR_BGR2RGB)

            vis_frames.append(rgb_frame)
        
        import imageio
        fps = 4
        imageio.mimsave( './visualisation.gif', vis_frames, fps=4)

def track_bbox(anno, gif_folder):

    bbox = anno['bboxes']['1']['bbox']
    
    yt_id = anno['metadata']['yt_id']
    frame_index = anno['metadata']['frame no.']
    window_size = 5
    
    filename = yt_id + '_' + str(frame_index) + '_' + str(window_size) + '.gif'
    import os
    file_location = os.path.join(gif_folder, filename)
    import cv2
    vid = cv2.VideoCapture(file_location)
    frames = []

    frame_count = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))

    for i in range(frame_count):
        success, frame = vid.read()
        frames.append(frame)

    bboxes = tracker(frames, bbox)
    visualise_tracking(frames, bboxes)
    return

gif_path = '/workspace/data/data_folder/o2o/gifs_11'
track_bbox(anno[0], gif_path)

to generate features what we need now is the code to do it.
the code works according to x,y,w,h
what should be targeted - feature generation code for one gif.
the feature is a dictionary. has the following fields
metadata, relations, bboxes.

image metadata has to be included
then there are other keys: relative features, vgg_feature, bbox_features, motion features, i3d features 
how to go about doing this.

All of the relative features need to be generated - like ikea asm.


create a function which takes an annotation and generates it's features and returns it,

In [150]:
def geometric_feature(bbox, im_width, im_height):
    '''
    In Modeling Context Between Objects for Referring Expression Understanding, ECCV 2016
    [x_min/W, y_min/H, x_max/W, y_max/H, bbox_area/image_area]
    
    The annotation are given in Image Coordinate system (X is horizontal & Y is vertical ,(0,0) top left)
    The features are calculated in Image Coordinate System as well
    '''
    x_min = bbox[0]   
    y_min = bbox[1]

    x_max = bbox[2]
    y_max = bbox[3]

    bbox_width = x_max - x_min
    bbox_height = y_max - y_min
    
    area_bbox = bbox_width * bbox_height
    area_image = im_width * im_height
    
    feature = [x_min/im_width, y_min/im_height, x_max/im_width, y_max/im_height, area_bbox/area_image]
    import numpy as np
    import torch
    feature = np.asarray(feature, dtype=np.float32)
    feature = torch.from_numpy(feature)
    return feature

In [151]:
# Doesn't need correction
import torch.nn as nn

class ImageFeatureExtractor(nn.Module):
    
    """
    Object feature extractor
    """
    
    def __init__(self, submodule, layer, device, deep_net):
    
        """
        input the object detector module and the layer
        number on which we want to extract features
        """
        
        super(ImageFeatureExtractor, self).__init__()
        
        self.pretrain_model = submodule
        self.layer = layer
        
        if deep_net == 'resnet50_fpn':
    
            self.layer_list = list(self.pretrain_model._modules['backbone']._modules['body']._modules.keys())
            layer_generator = self.pretrain_model._modules['backbone']._modules['body'].named_children()
            self.transform_module = self.pretrain_model._modules['transform']

        if deep_net == 'vgg16':
    
            self.layer_list = list(self.pretrain_model._modules['features']._modules.keys())
            layer_generator = self.pretrain_model._modules['features'].named_children()
            self.transform_module = None

        output_layer = self.layer_list[self.layer]
        # just change the number of the layer to get the output
        self.children_list = []
        

        for (name, comp_layer) in layer_generator:
            self.children_list.append(comp_layer)
            if name == output_layer:
                break
        
        self.feature_extrac_net = nn.Sequential(*self.children_list).to(device)
        self.pretrain_model = None
        
    def forward(self, image):
        
        if self.transform_module:

            transformation = self.transform_module(image)[0]
            shape = transformation.image_sizes[0]
            transformed_image = transformation.tensors
            image = transformed_image[:,:,:shape[0], :shape[1]]
        
        feature = self.feature_extrac_net(image)

        return feature

from time import time
import numpy as np
import torch

# Doesn't need correction
def extract_image_deep_feature_faster(image, feature_extractor, device):
    
    image = np.swapaxes(image, 0, 2)
    image = np.swapaxes(image, 1, 2)
    image = np.expand_dims(image, 0)
    
    image_tensor = torch.from_numpy(image).to(device)
    image_feature = feature_extractor(image_tensor)

    return image_feature


import torchvision

# Doesn't need correction
def roi_align(feature_map, boxes):
    
    pooler = torchvision.ops.RoIAlign(output_size=(7, 7), spatial_scale = 1.0, sampling_ratio=1)
    boxes_list = [boxes]
    output = pooler(feature_map, boxes_list)

    return output

import torch.nn.functional as F 
import torch

# Corrected
def extract_bbox_deep_features_faster(image, bboxes, im_shape, feature_extractor, device):
    '''
    bboxes: tensor with bbox coords
    '''
    image = image.astype('float32')
    fmap = extract_image_deep_feature_faster(image, feature_extractor, device)
    
    im_width_annotation = im_shape[0]
    im_height_annotation = im_shape[1]

    # boxes_list = []
    # num_boxes = int(bboxes.shape[0])
    # for i in range(num_boxes):
    #     temp_bbox = bboxes[i]
    #     boxes_list.append(temp_bbox)

    fmap_device = fmap.device

    im_height, im_width, _ = image.shape
    # im_scale_width, im_scale_height = (im_width*1.0)/im_width_annotation, (im_height*1.0)/im_height_annotation
    
    _, fmap_height, fmap_width, __ = fmap.shape
    fmap_scale_width, fmap_scale_height = (fmap_width*1.0)/im_width_annotation, (fmap_height*1.0)/im_height_annotation
    im_size = (im_width, im_height)

    # boxes = np.asarray(boxes_list, dtype='float32')
    # boxes = torch.from_numpy(boxes)

    # print(annotation['objects_coco'])

    # scaling of bbox coordinates according to the resized fmap
    from copy import copy as copy
    boxes = copy(bboxes)
    
    boxes[:,0] *= fmap_scale_width
    boxes[:,2] *= fmap_scale_width
    boxes[:,1] *= fmap_scale_height
    boxes[:,3] *= fmap_scale_height
    
    boxes = boxes.to(fmap_device)
    
    bbox_features = roi_align(fmap, boxes)
    bbox_features = F.avg_pool2d(bbox_features, (7,7)).squeeze(2).squeeze(2)
        
    return bbox_features


In [152]:
from shapely.geometry import Polygon

# Corrected
def calculate_iou(box_1, box_2):

    '''
    boxes in [min_x, min_y, max_x, max_y] format
    '''
    # if torch.sum(box_1 == box_2) == 4:
    # return 1

    b1_min_x, b1_min_y = box_1[0], box_1[1]
    b1_max_x, b1_max_y = box_1[2], box_2[3]

    b2_min_x, b2_min_y = box_2[0], box_2[1]
    b2_max_x, b2_max_y = box_2[2], box_2[3]


    b1 = [[b1_min_x, b1_min_y], [b1_min_x, b1_max_y], [b1_max_x, b1_max_y], [b1_max_x, b1_min_y]]
    b2 = [[b2_min_x, b2_min_y], [b2_min_x, b2_max_y], [b2_max_x, b2_max_y], [b2_max_x, b2_min_y]]

    poly_1 = Polygon(b1)
    poly_2 = Polygon(b2)

    i_area = poly_1.intersection(poly_2).area
    u_area = poly_1.union(poly_2).area
    
    iou = i_area / u_area
    
    return iou

# Corrected
def calculate_distance_normalized(box_1, box_2, im_width, im_height):
    
    '''
    boxes in [min_x, min_y, max_x, max_y] format
    '''

    b1_c_x = (box_1[0] + box_1[2]) * 0.5
    b1_c_y = (box_1[1] + box_1[3]) * 0.5

    b2_c_x = (box_2[0] + box_2[2]) * 0.5
    b2_c_y = (box_2[1] + box_2[3]) * 0.5

    b1_x, b1_y = b1_c_x/im_width, b1_c_y/im_height
    b2_x, b2_y = b2_c_x/im_width, b2_c_y/im_height
    
    # normalized distance in 0 to 1
    dis = np.sqrt( (b1_x-b2_x)**2 + (b1_y-b2_y)**2 ) / np.sqrt(2)

    return dis

In [156]:
import numpy as np
import torch
from shapely.geometry import Polygon

# Corrected
def box_deltas(subject_box, object_box):
    '''
    boxes in [centre_x, centre_y, width, height] format
    '''

    s_width = subject_box[2] - subject_box[0]
    s_height = subject_box[3] - subject_box[1]
    
    o_width = object_box[2] - object_box[0]
    o_height = object_box[3] - object_box[1]

    s_centre_x = subject_box[0] + (s_width/2)
    s_centre_y = subject_box[1] + (s_height/2)

    o_centre_x = object_box[0] + (o_width/2)
    o_centre_y = object_box[1] + (o_height/2)
    
    t_so_x = (s_centre_x - o_centre_x)/s_width
    t_so_y = (s_centre_y - o_centre_y)/s_height
    
    t_so_w = torch.log(s_width/o_width)
    t_so_h = torch.log(s_height/o_height)
    
    t_os_x = (o_centre_x - s_centre_x)/o_width
    t_os_y = (o_centre_y - s_centre_y)/o_height
    
    data = [t_so_x, t_so_y, t_so_w, t_so_h, t_os_x, t_os_y]

    return torch.FloatTensor(data)


def get_union_box(box_1, box_2):

    '''
    boxes in [min_x, min_y, max_x, max_y] format
    '''

    b1_min_x, b1_min_y = box_1[0], box_1[1]
    b1_max_x, b1_max_y = box_1[2], box_2[3]

    b2_min_x, b2_min_y = box_2[0], box_2[1]
    b2_max_x, b2_max_y = box_2[2], box_2[3]

    bu_min_x, bu_min_y = min(b1_min_x, b2_min_x), min(b1_min_y, b2_min_y)
    bu_max_x, bu_max_y = max(b1_max_x, b2_max_x), max(b1_max_y, b2_max_y)
  
    return [bu_min_x, bu_min_y, bu_max_x, bu_max_y]

def calculate_distance(box_1, box_2):
    
    '''
    boxes in [min_x, min_y, max_x, max_y] format
    '''

    b1_c_x = (box_1[0] + box_1[2]) * 0.5
    b1_c_y = (box_1[1] + box_1[3]) * 0.5

    b2_c_x = (box_2[0] + box_2[2]) * 0.5
    b2_c_y = (box_2[1] + box_2[3]) * 0.5

    dis = np.sqrt( (b1_c_x-b2_c_x)**2 + (b1_c_y-b2_c_y)**2 )

    return dis

# Corrected
def relative_spatial_features_old(image_annotation, pad_dimension=15):
    
    bbox_coordinates = []
    num_objects = len(image_annotation['objects_coco'])

    im_height = image_annotation['image_metadata']['height']
    im_width = image_annotation['image_metadata']['width']


    for obj in image_annotation['objects_coco']:

        # Normalizing bbox coordinates    

        x0, y0, x1, y1 = obj['bbox']

        x0_n = x0/im_width
        y0_n = y0/im_height
        x1_n = x1/im_width
        y1_n = y1/im_height
        
        bbox_coordinates.append([x0_n, y0_n, x1_n, y1_n])
    
    bbox_coordinates = torch.from_numpy(np.asarray(bbox_coordinates, dtype='float32'))
    relative_features = torch.zeros(pad_dimension, pad_dimension, 20, dtype=torch.float32)
    
    for i in range(num_objects):
        for j in range(num_objects):

            # To make the edge feature matrix symmetric
            if (i<=j):
                subject_box = bbox_coordinates[i]
                object_box = bbox_coordinates[j]
                union_box = get_union_box(subject_box, object_box)
            else:
                subject_box = bbox_coordinates[j]
                object_box = bbox_coordinates[i]
                union_box = get_union_box(subject_box, object_box)

            relative_features[i,j,:6] = box_deltas(subject_box=subject_box, object_box=object_box)
            relative_features[i,j,6:12] = box_deltas(subject_box=subject_box, object_box=union_box)
            relative_features[i,j,12:18] = box_deltas(subject_box=object_box, object_box=union_box)
            relative_features[i,j,18] = calculate_iou(subject_box, object_box)
            # check implementation for semantics of div = 1 
            relative_features[i,j,19] = calculate_distance_normalized(subject_box, object_box)
    
    return relative_features









# Corrected
def relative_spatial_features(bbox_1, bbox_2, im_width, im_height):
    
    bbox_1[0]/=im_width
    bbox_1[2]/=im_width
    bbox_1[1]/=im_height
    bbox_1[3]/=im_height

    bbox_2[0]/=im_width
    bbox_2[2]/=im_width
    bbox_2[1]/=im_height
    bbox_2[3]/=im_height
    
    relative_features = torch.zeros(20, dtype=torch.float32)
    
    subject_box = bbox_1
    object_box = bbox_2

    union_box = get_union_box(subject_box, object_box)

    relative_features[:6] = box_deltas(subject_box=subject_box, object_box=object_box)
    relative_features[6:12] = box_deltas(subject_box=subject_box, object_box=union_box)
    relative_features[12:18] = box_deltas(subject_box=object_box, object_box=union_box)
    relative_features[18] = calculate_iou(subject_box, object_box)
    relative_features[19] = calculate_distance(subject_box, object_box)
    
    return relative_features




In [159]:
import torch
def master_feature_generator(annotation, gif_folder, cnn_feature_extractor, device):

    # Getting details to load the GIF
    yt_id = annotation['metadata']['yt_id']
    frame_index = annotation['metadata']['frame no.']

    temp = int(int(gif_folder.split('_')[-1])/2)
    window_size = temp

    # Loading the gif    
    # getting the file location
    filename = yt_id + '_' + str(frame_index) + '_' + str(window_size) + '.gif'
    import os
    file_location = os.path.join(gif_folder, filename)
    import cv2

    # getting the frames
    vid = cv2.VideoCapture(file_location)
    frames = []
    frame_count = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    for i in range(frame_count):
        success, frame = vid.read()
        frames.append(frame)
    central_frame = frames[window_size]

    # Sanity Check    
    assert window_size == (len(frames) - 1)/2, "Possible issue, please check"

    # Output Dictionary
    output = {}
    output['legend'] = {}
    
    # Adding the metadata
    output['metadata'] = annotation['metadata']
    im_height, im_width, _ = frames[0].shape    # NumPy has num rows, num cols which is height and width according to opencv conventions
    output['metadata']['frame_width'] = im_width
    output['metadata']['frame_height'] = im_height
    
    # total num of annotated objects 
    output['num_obj'] = len(list(annotation['bboxes'].keys()))

    # saving bbox co-ordinates of objects according to their key in bboxes field
    max_num_obj = 12

    # bounding box coordinates. not normalized. for image width and height see the metadata
    output['bboxes'] = torch.zeros(max_num_obj,len(frames),4, dtype=torch.float)
    bbox_keys = annotation['bboxes'].keys()
    
    for key in bbox_keys:
        key_val = int(key)
        temp_bbox = annotation['bboxes'][key]['bbox']
        
        tracked_bboxes = tracker(frames, temp_bbox)
        tracked_bboxes = torch.from_numpy(np.asarray(tracked_bboxes, dtype=float))
        output['bboxes'][key_val,:,:] = tracked_bboxes


    # saving relations in tensors

    # maps to transform text to indices
    cr_map = {'Contact': 0, 'No Contact': 1, 'None of these': 2, '': 2}
    lr_map = {'Below/Above': 0, 'Behind/Front': 1, 'Left/Right': 2, 'Inside': 3, 'None of these': 4, '': 4}
    mr_map = {'Holding': 0, 'Carrying': 1, 'Adjusting': 2, 'Rubbing': 3, 'Sliding': 4, 'Rotating': 5, 'Twisting': 6,
              'Raising': 7, 'Lowering': 8, 'Penetrating': 9, 'Moving Toward': 10, 'Moving Away': 11, 
              'Negligible Relative Motion': 12, 'None of these': 13, '': 13}

    max_num_rels = 15

    # tensor storing relations between objects at the corresponding index in object_pairs key
    output['lr'] = torch.zeros(max_num_rels, 5)
    output['mr'] = torch.zeros(max_num_rels, 13)
    output['cr'] = torch.zeros(max_num_rels, 2)
    
    # object indices between which the corresponding relation is annotated
    output['object_pairs'] = torch.zeros(max_num_rels, 2)
    
    # reading relations and saving them to the tensors
    for i, rel in enumerate(annotation['relations']):

        object_pairs = rel[0]

        mr = rel[1]['mr']
        lr = rel[1]['lr']
        cr = rel[1]['scr']

        for r in mr:
            temp_val = mr_map[r]
            output['mr'][i, temp_val] = 1
        for r in lr:
            temp_val = lr_map[r]
            output['lr'][i, temp_val] = 1
        for r in cr:
            temp_val = cr_map[r]
            output['cr'][i, temp_val] = 1

        output['object_pairs'][i] = torch.from_numpy(np.asarray(object_pairs,dtype=float))

    # total number of relations and hence the total number of object pairs as well
    output['num_relation'] = len(annotation['relations'])

    # Now we have bounding boxes, metadata, relations, number of objects, number of relations
    
    # image features - cnn features for bboxes, bbox coordinate based features, relative feature, miou, distance,
    
    # bbox coordinate based features
    output['geometric_feature'] = torch.zeros(max_num_obj, len(frames), 5, dtype=float)
    
    for f in range(len(frames)):
        for i in range( int(output['num_obj']) ):
            temp_bbox = output['bboxes'][i, f]
            output['geometric_feature'][i, f] = geometric_feature(temp_bbox, im_width, im_height)

    # 2d cnn based features for the bounding boxes of central frame
    output['cnn_bbox_feature'] = torch.zeros(max_num_obj, 64, dtype=float)
    
    # window_size is also the index of the central frame
    central_frame_bboxes = output['bboxes'][window_size, :output['num_obj']]
    temp = extract_bbox_deep_features_faster(central_frame, central_frame_bboxes, [im_width, im_height], cnn_feature_extractor, device )
    output['cnn_bbox_feature'][:output['num_obj'],:] = temp

    # miou and distance of bounding boxes
    output['iou'] = torch.zeros(max_num_obj, max_num_obj, len(frames))

    for f in range(len(frames)):
        for i in range( int(output['num_obj']) ):
                for j in range( int(output['num_obj']) ):
                    
                    temp_box_1 = output['bboxes'][i, f]
                    temp_box_2 = output['bboxes'][j, f]
                    output['iou'][i, j, f] = calculate_iou(temp_box_1, temp_box_2)
                        

    output['distance'] = torch.zeros(max_num_obj, max_num_obj, len(frames))

    for f in range(len(frames)):
        for i in range( int(output['num_obj']) ):
                for j in range( int(output['num_obj']) ):

                    temp_box_1 = output['bboxes'][i, f]
                    temp_box_2 = output['bboxes'][j, f]
                    output['distance'][i, j, f] = calculate_distance_normalized(temp_box_1, temp_box_2, im_width, im_height)
    
    # relative features
    output['relative_spatial_feature'] = torch.zeros(max_num_obj, max_num_obj, len(frames), 20, dtype=float)

    for f in range(len(frames)):
        for i in range( int(output['num_obj']) ):
                for j in range( int(output['num_obj']) ):
                    
                    if i<j:
                        temp_box_1 = output['bboxes'][i, f]
                        temp_box_2 = output['bboxes'][j, f]

                    # To keep the features symmetric

                    if i>=j:
                        temp_box_2 = output['bboxes'][i, f]
                        temp_box_1 = output['bboxes'][j, f]

                    output['relative_spatial_feature'][i, j, f] = relative_spatial_features(temp_box_1, temp_box_2, im_width, im_height)
    
    # video features - i3d features, motion features, others? 
    
    
    
    return output


In [160]:
# For unit testing

annotation = anno[0]
gif_folder = '/workspace/data/data_folder/o2o/gifs_11'

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
deep_net = 'vgg16'
layer_no = 4

if deep_net == 'vgg16':
    model = torchvision.models.vgg16(pretrained=True)

model.to(device)
model.eval()

cnn_feature_extractor = ImageFeatureExtractor(model, layer_no, device, deep_net)

feature_dict = master_feature_generator(annotation, gif_folder, cnn_feature_extractor, device)

In [161]:
feature_dict.keys()

dict_keys(['legend', 'metadata', 'num_obj', 'bboxes', 'lr', 'mr', 'cr', 'object_pairs', 'num_relation', 'geometric_feature', 'cnn_bbox_feature', 'iou', 'distance', 'relative_spatial_feature'])

In [162]:
feature_dict['relative_spatial_feature']

tensor([[[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            1.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            1.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            1.0000e+00,  0.0000e+00],
          ...,
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            1.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            1.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            1.0000e+00,  0.0000e+00]],

         [[-2.8696e+06, -9.1008e+05, -1.2603e+01,  ..., -3.2722e-01,
            0.0000e+00,  9.4535e-01],
          [-2.8087e+06, -8.8466e+05, -1.2646e+01,  ..., -3.1771e-01,
            0.0000e+00,  9.4427e-01],
          [-2.6977e+06, -8.2620e+05, -1.2544e+01,  ..., -3.2258e-01,
            0.0000e+00,  9.4148e-01],
          ...,
     

In [86]:
annotation

{'metadata': {'activity name': 'AssembleCabinet',
  'yt_id': '0dqx7VOjiJI',
  'frame no.': '1572',
  'frame_width': 720,
  'frame_height': 1280},
 'bboxes': {'0': {'class': 'generic_object', 'bbox': [35, 176, 1277, 343]},
  '1': {'class': 'hand', 'bbox': [1113, 116, 1233, 181]},
  '2': {'class': 'hand', 'bbox': [663, 81, 843, 176]}},
 'relations': [[[2, 0],
   {'scr': ['Contact'],
    'lr': ['Below/Above'],
    'mr': ['Negligible Relative Motion']}],
  [[1, 0],
   {'scr': ['Contact'],
    'lr': ['Below/Above', 'Behind/Front'],
    'mr': ['Holding', 'Adjusting']}]]}