In [None]:
import numpy as np
np.set_printoptions(precision=4)
import copy
import torch
import importlib

import dataloader
from dataloader.action_genome import AG, cuda_collate_fn
from dataloader.movie_graph import MG

from lib.config import Config
from lib.evaluation_recall import BasicSceneGraphEvaluator
from lib.evaluation_recall_mg import MGSceneGraphEvaluator
from lib.object_detector import detector
from lib.sttran import STTran

In [2]:
# conf = Config()
# print(conf)
# for i in conf.args:
#     print(i,':', conf.args[i])

# Dataset configuration for this notebook.
datasize = 'large'
ag_data_path = "../ActionGenome/dataset/ag/"
mg_data_path = "../TER_MovieGraph/scene_library/"

# Dataset split. It has its own name so that re-running this cell does not
# overwrite the global `mode`, which later cells use for the scene-graph task
# mode ('sgdet').
dataset_split = 'test'

# Action Genome alternative (disabled):
# AG_dataset = AG(mode="test", datasize=datasize, data_path=ag_data_path, filter_nonperson_box_frame=True,
#                 filter_small_box=True)

# filter_small_box: the previous expression (`False if mode == 'predcls' else True`)
# compared the split name 'test' with the task mode 'predcls', so it always
# evaluated to True. The effective value is now written out explicitly.
MG_dataset = MG(
    mode=dataset_split,
    datasize=datasize,
    data_path=mg_data_path,
    filter_nonperson_box_frame=False,
    filter_small_box=True,
)


In [3]:
# Device and scene-graph task mode shared by the detector and the model cells.
gpu_device = torch.device('cuda:0')
mode = 'sgdet'

# Iterate the MovieGraph dataset in its stored order, loading in the main process.
mg_dataloader = torch.utils.data.DataLoader(
    MG_dataset,
    shuffle=False,
    num_workers=0,
    collate_fn=cuda_collate_fn,
)

# Build the detector for the MovieGraph object classes and place it on the GPU.
object_detector = detector(
    train=False,
    object_classes=MG_dataset.object_classes,
    use_SUPPLY=True,
    mode=mode,
).to(device=gpu_device)

# Last expression of the cell: switches to eval mode and displays the module repr.
object_detector.eval()

detector(
  (fasterRCNN): resnet(
    (RCNN_rpn): _RPN(
      (RPN_Conv): Conv2d(1024, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (RPN_cls_score): Conv2d(512, 24, kernel_size=(1, 1), stride=(1, 1))
      (RPN_bbox_pred): Conv2d(512, 48, kernel_size=(1, 1), stride=(1, 1))
      (RPN_proposal): _ProposalLayer()
      (RPN_anchor_target): _AnchorTargetLayer()
    )
    (RCNN_proposal_target): _ProposalTargetLayer()
    (RCNN_roi_pool): ROIPool(output_size=(7, 7), spatial_scale=0.0625)
    (RCNN_roi_align): ROIAlign(output_size=(7, 7), spatial_scale=0.0625, sampling_ratio=0)
    (RCNN_base): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64

In [4]:
# STTran architecture settings and the checkpoint to restore.
enc_layer = 1
dec_layer = 3
model_path = "pretrained_models/sgdet.tar"

model = STTran(mode=mode,
               attention_class_num=len(MG_dataset.attention_relationships),
               spatial_class_num=len(MG_dataset.spatial_relationships),
               contact_class_num=len(MG_dataset.contacting_relationships),
               obj_classes=MG_dataset.object_classes,
               enc_layer_num=enc_layer,
               dec_layer_num=dec_layer).to(device=gpu_device)

model.eval()

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=gpu_device)

# strict=False tolerates key mismatches between checkpoint and model. Report
# them instead of dropping them silently, so a partially initialised model is
# noticed. getattr keeps this working if load_state_dict returns no result object.
load_result = model.load_state_dict(ckpt['state_dict'], strict=False)
missing_keys = list(getattr(load_result, 'missing_keys', []))
unexpected_keys = list(getattr(load_result, 'unexpected_keys', []))
if missing_keys:
    print('WARNING: {} model parameters were not found in the checkpoint: {}'.format(
        len(missing_keys), missing_keys))
if unexpected_keys:
    print('WARNING: {} checkpoint entries were not used by the model: {}'.format(
        len(unexpected_keys), unexpected_keys))

print('*'*50)
print('CKPT {} is loaded'.format(model_path))

word vector location: data data
loading word vectors from data/data/glove.6B.200d.pt
word vector location: /home/cong/Dokumente/neural-motifs-master/data /hom
loading word vectors from data//home/cong/Dokumente/neural-motifs-master/data/glove.6B.200d.pt
__background__ -> __background__ 
fail on __background__
**************************************************
CKPT pretrained_models/sgdet.tar is loaded


In [5]:
def makeSTTranPrediction(model, data):
    """Run the object detector and then `model` on one dataloader batch.

    Args:
        model: callable applied to the detector output (the STTran network in
            this notebook).
        data: batch indexed by position. Items 0-3 are tensors that are moved
            to CUDA device 0 (bound here as image data, image info,
            ground-truth boxes and box counts); item 5 is the scene id.

    Returns:
        A tuple ``(entry, pred, scene_id)``: the detector output, the model
        output for that entry, and the scene id taken from the batch.

    Note:
        The detector is not a parameter; the module-level ``object_detector``
        must already be defined when this function is called.
    """
    # Move the four tensor inputs to GPU 0 and deep-copy each of them.
    im_data, im_info, gt_boxes, num_boxes = (
        copy.deepcopy(data[pos].cuda(0)) for pos in range(4)
    )
    scene_id = data[5]

    print(f"Prediction and Scene ID: {scene_id}")
    print(im_data.shape, im_info.shape, gt_boxes.shape, num_boxes.shape)

    # All-zero annotation tensor with one row per entry along im_data's first axis.
    gt_annotation = torch.zeros([im_data.shape[0], 1, 5])

    entry = object_detector(im_data, im_info, gt_boxes, num_boxes, gt_annotation, im_all=None)
    pred = model(entry)
    return entry, pred, scene_id

In [None]:
# Per-scene model outputs and detector outputs, keyed by scene id.
predictions, obj_entries = {}, {}

with torch.no_grad():
    for b, data in enumerate(mg_dataloader):
        # Only the first 19 batches are processed.
        if b >= 19:
            break
        obj_entry, pred, scene_id = makeSTTranPrediction(model, data)
        predictions[scene_id] = pred
        obj_entries[scene_id] = obj_entry

In [7]:
# Pick one scene and pull out its model prediction and detector entry.
scene_id = 14
# scene_id = 18
pred, obj = predictions[scene_id], obj_entries[scene_id]
print(pred.keys())

dict_keys(['boxes', 'scores', 'distribution', 'pred_labels', 'features', 'fmaps', 'im_info', 'pred_scores', 'pair_idx', 'im_idx', 'human_idx', 'union_feat', 'union_box', 'spatial_masks', 'attention_distribution', 'spatial_distribution', 'contacting_distribution'])


In [8]:
# List every entry of the prediction next to the shape of its value;
# names are left-aligned in a 25-character column.
for k in pred:
    print(f"{k:<25}", pred[k].shape)

boxes                     torch.Size([178, 5])
scores                    torch.Size([185])
distribution              torch.Size([178, 36])
pred_labels               torch.Size([178])
features                  torch.Size([178, 2048])
fmaps                     torch.Size([23, 1024, 38, 67])
im_info                   torch.Size([])
pred_scores               torch.Size([178])
pair_idx                  torch.Size([155, 2])
im_idx                    torch.Size([155])
human_idx                 torch.Size([23, 1])
union_feat                torch.Size([155, 1024, 7, 7])
union_box                 torch.Size([155, 5])
spatial_masks             torch.Size([155, 2, 27, 27])
attention_distribution    torch.Size([155, 3])
spatial_distribution      torch.Size([155, 6])
contacting_distribution   torch.Size([155, 17])


In [9]:
idx = 0

# Select the pairs whose im_idx equals `idx` and copy them to a NumPy array
# once, instead of repeating the mask and the device-to-host transfer for
# every part of the concatenation.
frame_pairs = pred['pair_idx'][pred['im_idx'] == idx].cpu().clone().numpy()

rels_i = np.concatenate((frame_pairs,            # attention
                         frame_pairs[:, ::-1],   # spatial (pair columns reversed)
                         frame_pairs),           # unlabeled in the original; presumably contacting - TODO confirm
                        axis=0)

## Convert Tensor Predictions to Readable Triplets

In [10]:
# Predicate vocabularies taken from the MovieGraph dataset, grouped for readability.
predicate_vocab = dict(
    AG_all_predicates=MG_dataset.relationship_classes,
    AG_attention_predicates=MG_dataset.attention_relationships,
    AG_spatial_predicates=MG_dataset.spatial_relationships,
    AG_contacting_predicates=MG_dataset.contacting_relationships,
)

# NOTE(review): `constraint` is passed as the string 'True', not a boolean -
# confirm this is the value MGSceneGraphEvaluator expects.
evaluator = MGSceneGraphEvaluator(
    mode=mode,
    AG_object_classes=MG_dataset.object_classes,
    iou_threshold=0.5,
    constraint='True',
    **predicate_vocab,
)

In [11]:
# Show the relationship vocabulary; tripletsToWords indexes into this list
# to turn a relation index into its name.
print(MG_dataset.relationship_classes)

['looking_at', 'not_looking_at', 'unsure', 'above', 'beneath', 'in_front_of', 'behind', 'on_the_side_of', 'in', 'carrying', 'covered_by', 'drinking_from', 'eating', 'have_it_on_the_back', 'holding', 'leaning_on', 'lying_on', 'not_contacting', 'other_relationship', 'sitting_on', 'standing_on', 'touching', 'twisting', 'wearing', 'wiping', 'writing_on']


In [12]:
def tripletsToWords(triplets, object_classes, relationship_classes):
    """Translate index triplets into triplets of class names.

    Args:
        triplets: iterable of 3-item entries ``(first, relation, second)``,
            each item being an index.
        object_classes: sequence indexed by the first and third item.
        relationship_classes: sequence indexed by the middle item.

    Returns:
        A list with one ``[object_name, relationship_name, object_name]``
        list per input triplet, in input order.
    """
    return [
        [object_classes[first_idx], relationship_classes[rel_idx], object_classes[second_idx]]
        for first_idx, rel_idx, second_idx in triplets
    ]

In [13]:
# Run the evaluator on the selected scene's prediction. The next cell
# iterates over the result with one entry per frame.
triplets_by_frame = evaluator.evaluate_scene_graph(pred)
# Leftover from an earlier version; `triplets_in_word` and `triplet_boxes`
# are not defined at this point in the notebook.
# for i,t in enumerate(triplets_in_word):
#     print(triplet_boxes[i][0], t)

In [14]:
# Print the predicted triplets of each frame as words.
for i, frame in enumerate(triplets_by_frame):
    # frame[0] is handed to tripletsToWords as the index triplets. The former
    # `boxes = frame[0]` line was an unused duplicate of this value and has
    # been removed.
    triplets = frame[0]

    # NOTE(review): the label is offset by one, so the first frame prints as
    # -1. Kept as-is to match the recorded output - confirm the intended numbering.
    print(i-1)

    for trip in tripletsToWords(triplets, MG_dataset.object_classes, MG_dataset.relationship_classes):
        print(trip)
    print()

-1
['doorknob', 'on_the_side_of', 'person']
['person', 'holding', 'doorknob']
['person', 'not_looking_at', 'doorknob']
['person', 'touching', 'clothes']
['clothes', 'in', 'person']
['doorknob', 'on_the_side_of', 'person']
['person', 'holding', 'doorknob']
['person', 'not_looking_at', 'doorknob']
['person', 'not_looking_at', 'clothes']
['doorknob', 'on_the_side_of', 'person']
['person', 'holding', 'doorknob']
['person', 'not_looking_at', 'doorknob']
['doorknob', 'on_the_side_of', 'person']
['person', 'holding', 'doorknob']
['person', 'not_looking_at', 'doorknob']
['doorway', 'behind', 'person']
['person', 'not_contacting', 'doorway']
['person', 'not_looking_at', 'doorway']
['door', 'behind', 'person']
['person', 'not_contacting', 'door']
['person', 'not_looking_at', 'door']
['person', 'not_contacting', 'door']
['door', 'on_the_side_of', 'person']
['person', 'not_looking_at', 'door']

0
['person', 'not_contacting', 'table']
['table', 'in_front_of', 'person']
['shelf', 'on_the_side_of', '

In [3]:
# Load the MovieGraph annotations through the project's MGAnnotations wrapper.
# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'GraphClasses'". Presumably the
# 'GraphClasses' module has to be importable as a top-level module before this
# runs - confirm and fix the import path.
# NOTE(review): this import sits mid-notebook, and the cell's execution count
# is out of order with its neighbours; consider moving the import to the
# import cell at the top once it works.
from moviegraph.MGAnnotations import MGAnnotations

# Path to the annotation file (the ".pkl" suffix suggests a pickle - only load trusted files).
annotations_path = "MGAnnotations/2017-11-02-51-7637_py3.pkl"
annotations = MGAnnotations(annotations_path)

ModuleNotFoundError: No module named 'GraphClasses'

In [None]:
# For every stored scene, convert its prediction to word triplets and print
# each one next to the first item of its box entry.
for k, scene_pred in predictions.items():
    triplets_in_word, triplet_boxes = evaluator.pred_to_word_triplets(scene_pred)
    print(f"\nTriplets for scene id {k}, number of boxes were {len(predictions[k]['boxes'])}:")
    for i, t in enumerate(triplets_in_word):
        first_box = triplet_boxes[i][0]
        print(first_box, t)