In [1]:
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision.transforms import Resize, Compose, ToTensor, Normalize
import random
# from scipy.misc import imread
from imageio import imread
import numpy as np
import pickle
import os
# from fasterRCNN.lib.model.utils.blob import prep_im_for_blob, im_list_to_blob

In [15]:
class MG(Dataset):

    def __init__(self, mode, datasize, data_path=None, filter_nonperson_box_frame=True, filter_small_box=False):

        root_path = data_path
        self.frames_path = os.path.join(root_path, 'frames/')

        # collect the object classes
        self.object_classes = ['__background__']
        with open(os.path.join(root_path, 'annotations/object_classes.txt'), 'r') as f:
            for line in f.readlines():
                line = line.strip('\n')
                self.object_classes.append(line)
        f.close()
        self.object_classes[9] = 'closet/cabinet'
        self.object_classes[11] = 'cup/glass/bottle'
        self.object_classes[23] = 'paper/notebook'
        self.object_classes[24] = 'phone/camera'
        self.object_classes[31] = 'sofa/couch'

        # collect relationship classes
        self.relationship_classes = []
        with open(os.path.join(root_path, 'annotations/relationship_classes.txt'), 'r') as f:
            for line in f.readlines():
                line = line.strip('\n')
                self.relationship_classes.append(line)
        f.close()
        self.relationship_classes[0] = 'looking_at'
        self.relationship_classes[1] = 'not_looking_at'
        self.relationship_classes[5] = 'in_front_of'
        self.relationship_classes[7] = 'on_the_side_of'
        self.relationship_classes[10] = 'covered_by'
        self.relationship_classes[11] = 'drinking_from'
        self.relationship_classes[13] = 'have_it_on_the_back'
        self.relationship_classes[15] = 'leaning_on'
        self.relationship_classes[16] = 'lying_on'
        self.relationship_classes[17] = 'not_contacting'
        self.relationship_classes[18] = 'other_relationship'
        self.relationship_classes[19] = 'sitting_on'
        self.relationship_classes[20] = 'standing_on'
        self.relationship_classes[25] = 'writing_on'

        self.attention_relationships = self.relationship_classes[0:3]
        self.spatial_relationships = self.relationship_classes[3:9]
        self.contacting_relationships = self.relationship_classes[9:]


#         print('-------loading annotations---------slowly-----------')

#         if filter_small_box:
#             with open(root_path + 'annotations/person_bbox.pkl', 'rb') as f:
#                 person_bbox = pickle.load(f)
#             f.close()
#             with open('dataloader/object_bbox_and_relationship_filtersmall.pkl', 'rb') as f:
#                 object_bbox = pickle.load(f)
#         else:
#             with open(root_path + 'annotations/person_bbox.pkl', 'rb') as f:
#                 person_bbox = pickle.load(f)
#             f.close()
#             with open(root_path+'annotations/object_bbox_and_relationship.pkl', 'rb') as f:
#                 object_bbox = pickle.load(f)
#             f.close()
#         print('--------------------finish!-------------------------')

#         if datasize == 'mini':
#             small_person = {}
#             small_object = {}
#             for i in list(person_bbox.keys())[:80000]:
#                 small_person[i] = person_bbox[i]
#                 small_object[i] = object_bbox[i]
#             person_bbox = small_person
#             object_bbox = small_object


        # collect valid frames
        video_dict = {}
        for video_name in os.listdir(root_path + "videos"):
#             print(video_name)
            path_to_frames = os.path.join(root_path, "frames", video_name)
            for frame_path in os.listdir(os.path.join(root_path, "frames", video_name)):
                if video_name in video_dict.keys():
                    video_dict[video_name].append(frame_path)
                else:
                    video_dict[video_name] = [frame_path]
            print(video_name)
            print(video_dict[video_name])
            
#         for i in person_bbox.keys():
#             if object_bbox[i][0]['metadata']['set'] == mode: #train or testing?
#                 frame_valid = False
#                 for j in object_bbox[i]: # the frame is valid if there is visible bbox
#                     if j['visible']:
#                         frame_valid = True
#                 if frame_valid:
#                     video_name, frame_num = i.split('/')
#                     if video_name in video_dict.keys():
#                         video_dict[video_name].append(i)
#                     else:
#                         video_dict[video_name] = [i]

        self.video_list = []
        self.video_size = [] # (w,h)
        self.gt_annotations = []
        self.non_gt_human_nums = 0
        self.non_heatmap_nums = 0
        self.non_person_video = 0
        self.one_frame_video = 0
        self.valid_nums = 0

        '''
        filter_nonperson_box_frame = True (default): according to the stanford method, remove the frames without person box both for training and testing
        filter_nonperson_box_frame = False: still use the frames without person box, FasterRCNN may find the person
        '''
        print(video_dict.keys())
        for i in video_dict.keys():
            video = []
            gt_annotation_video = []
            for j in video_dict[i]:
                if filter_nonperson_box_frame:
                    if person_bbox[j]['bbox'].shape[0] == 0:
                        self.non_gt_human_nums += 1
                        continue
                    else:
                        video.append(j)
                        self.valid_nums += 1

#                 print(person_bbox)
#                 gt_annotation_frame = [{'person_bbox': person_bbox[j]['bbox']}]
                # each frames's objects and human
#                 for k in object_bbox[j]:
#                     if k['visible']:
#                         assert k['bbox'] != None, 'warning! The object is visible without bbox'
#                         k['class'] = self.object_classes.index(k['class'])
#                         k['bbox'] = np.array([k['bbox'][0], k['bbox'][1], k['bbox'][0]+k['bbox'][2], k['bbox'][1]+k['bbox'][3]]) # from xywh to xyxy
#                         k['attention_relationship'] = torch.tensor([self.attention_relationships.index(r) for r in k['attention_relationship']], dtype=torch.long)
#                         k['spatial_relationship'] = torch.tensor([self.spatial_relationships.index(r) for r in k['spatial_relationship']], dtype=torch.long)
#                         k['contacting_relationship'] = torch.tensor([self.contacting_relationships.index(r) for r in k['contacting_relationship']], dtype=torch.long)
#                         gt_annotation_frame.append(k)
#                 gt_annotation_video.append(gt_annotation_frame)

            if len(video) > 2:
                self.video_list.append(video)
                self.video_size.append(person_bbox[j]['bbox_size'])
                self.gt_annotations.append(gt_annotation_video)
            elif len(video) == 1:
                self.one_frame_video += 1
            else:
                self.non_person_video += 1

        print('x'*60)
        if filter_nonperson_box_frame:
            print('There are {} videos and {} valid frames'.format(len(self.video_list), self.valid_nums))
            print('{} videos are invalid (no person), remove them'.format(self.non_person_video))
            print('{} videos are invalid (only one frame), remove them'.format(self.one_frame_video))
            print('{} frames have no human bbox in GT, remove them!'.format(self.non_gt_human_nums))
        else:
            print('There are {} videos and {} valid frames'.format(len(self.video_list), self.valid_nums))
            print('{} frames have no human bbox in GT'.format(self.non_gt_human_nums))
            print('Removed {} of them without joint heatmaps which means FasterRCNN also cannot find the human'.format(non_heatmap_nums))
        print('x' * 60)

    def __getitem__(self, index):

        frame_names = self.video_list[index]
        processed_ims = []
        im_scales = []

        for idx, name in enumerate(frame_names):
            im = imread(os.path.join(self.frames_path, name)) # channel h,w,3
            im = im[:, :, ::-1] # rgb -> bgr
            im, im_scale = prep_im_for_blob(im, [[[102.9801, 115.9465, 122.7717]]], 600, 1000) #cfg.PIXEL_MEANS, target_size, cfg.TRAIN.MAX_SIZE
            im_scales.append(im_scale)
            processed_ims.append(im)

        blob = im_list_to_blob(processed_ims)
        im_info = np.array([[blob.shape[1], blob.shape[2], im_scales[0]]],dtype=np.float32)
        im_info = torch.from_numpy(im_info).repeat(blob.shape[0], 1)
        img_tensor = torch.from_numpy(blob)
        img_tensor = img_tensor.permute(0, 3, 1, 2)

        gt_boxes = torch.zeros([img_tensor.shape[0], 1, 5])
        num_boxes = torch.zeros([img_tensor.shape[0]], dtype=torch.int64)

        return img_tensor, im_info, gt_boxes, num_boxes, index

    def __len__(self):
        return len(self.video_list)

In [13]:
def cuda_collate_fn(batch):
    """
    don't need to zip the tensor

    """
    return batch[0]

In [16]:
# conf = Config()
# print(conf)
# for i in conf.args:
#     print(i,':', conf.args[i])
datasize = 'large'
ag_data_path = "../../ActionGenome/dataset/ag/"
mg_data_path = "../../TER_MovieGraph/scene_library/"
mode = 'sgdet'


# AG_dataset = AG(mode="test", datasize=datasize, data_path=ag_data_path, filter_nonperson_box_frame=True, 
#                 filter_small_box=False if mode == 'predcls' else True)

MG_dataset = MG(mode ="test", datasize = datasize, data_path = mg_data_path, filter_nonperson_box_frame=False, filter_small_box=False if mode == 'predcls' else True)

# AG_dataset = AG(mode="test", datasize=conf.datasize, data_path=conf.data_path, filter_nonperson_box_frame=True, 
#                 filter_small_box=False if conf.mode == 'predcls' else True)

dict_keys(['scene_43_valid.mp4', 'scene_17_valid.mp4', 'scene_6_valid.mp4', 'scene_34_valid.mp4', 'scene_51_valid.mp4', 'scene_7_valid.mp4', 'scene_19_valid.mp4', 'scene_64_valid.mp4', 'scene_46_valid.mp4', 'scene_62_valid.mp4', 'scene_39_invalid.mp4', 'scene_20_valid.mp4', 'scene_53_valid.mp4', 'scene_11_valid.mp4', 'scene_15_valid.mp4', 'scene_67_valid.mp4', 'scene_37_valid.mp4', 'scene_12_valid.mp4', 'scene_32_valid.mp4', 'scene_35_valid.mp4', 'scene_44_valid.mp4', 'scene_4_valid.mp4', 'scene_27_valid.mp4', 'scene_66_valid.mp4', 'scene_26_valid.mp4', 'scene_8_valid.mp4', 'scene_31_valid.mp4', 'scene_21_valid.mp4', 'scene_22_valid.mp4', 'scene_48_valid.mp4', 'scene_28_valid.mp4', 'scene_60_valid.mp4', 'scene_49_valid.mp4', 'scene_1_invalid.mp4', 'scene_10_valid.mp4', 'scene_59_valid.mp4', 'scene_63_valid.mp4', 'scene_56_valid.mp4', 'scene_38_valid.mp4', 'scene_65_valid.mp4', 'scene_14_valid.mp4', 'scene_54_valid.mp4', 'scene_3_valid.mp4', 'scene_18_valid.mp4', 'scene_29_valid.mp4', '

NameError: name 'object_bbox' is not defined