In [1]:
import numpy as np
np.set_printoptions(precision=4)
import copy
import torch
import importlib

import dataloader
from dataloader.action_genome import AG, cuda_collate_fn
from dataloader.movie_graph import MG

from lib.config import Config
from lib.evaluation_recall import BasicSceneGraphEvaluator
from lib.evaluation_recall_mg import MGSceneGraphEvaluator
from lib.object_detector import detector
from lib.sttran import STTran

In [2]:
# Dataset configuration.
datasize = 'large'
ag_data_path = "../ActionGenome/dataset/ag/"
mg_data_path = "../TER_MovieGraph/scene_library/"

# Dataset split passed to the dataset class as `mode`. It gets its own name
# because a later cell assigns the task mode ('sgdet') to `mode`.
dataset_split = 'test'

# To load ActionGenome instead of MovieGraph:
# AG_dataset = AG(mode=dataset_split, datasize=datasize, data_path=ag_data_path,
#                 filter_nonperson_box_frame=True, filter_small_box=True)

# filter_small_box: the previous expression (`False if mode == 'predcls' else True`)
# compared the dataset split 'test' with the task mode 'predcls', so it always
# evaluated to True. The value is now spelled out; pass False when running 'predcls'.
MG_dataset = MG(
    mode=dataset_split,
    datasize=datasize,
    data_path=mg_data_path,
    filter_nonperson_box_frame=False,
    filter_small_box=True,
)



In [3]:
# Iterate the MovieGraph dataset in its stored order (no shuffling, no worker processes).
mg_dataloader = torch.utils.data.DataLoader(
    MG_dataset,
    shuffle=False,
    num_workers=0,
    collate_fn=cuda_collate_fn,
)

gpu_device = torch.device('cuda:0')

# From this point on `mode` is the task mode handed to the detector and to STTran.
mode = 'sgdet'

object_detector = detector(
    train=False,
    object_classes=MG_dataset.object_classes,
    use_SUPPLY=True,
    mode=mode,
)
object_detector = object_detector.to(device=gpu_device)
# Last expression of the cell: switches to eval mode and displays the module.
object_detector.eval()

detector(
  (fasterRCNN): resnet(
    (RCNN_rpn): _RPN(
      (RPN_Conv): Conv2d(1024, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (RPN_cls_score): Conv2d(512, 24, kernel_size=(1, 1), stride=(1, 1))
      (RPN_bbox_pred): Conv2d(512, 48, kernel_size=(1, 1), stride=(1, 1))
      (RPN_proposal): _ProposalLayer()
      (RPN_anchor_target): _AnchorTargetLayer()
    )
    (RCNN_proposal_target): _ProposalTargetLayer()
    (RCNN_roi_pool): ROIPool(output_size=(7, 7), spatial_scale=0.0625)
    (RCNN_roi_align): ROIAlign(output_size=(7, 7), spatial_scale=0.0625, sampling_ratio=0)
    (RCNN_base): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64

In [4]:
# Encoder / decoder layer counts passed to STTran, and the checkpoint to restore.
enc_layer = 1
dec_layer = 3
model_path = "pretrained_models/sgdet.tar"

model = STTran(mode=mode,
               attention_class_num=len(MG_dataset.attention_relationships),
               spatial_class_num=len(MG_dataset.spatial_relationships),
               contact_class_num=len(MG_dataset.contacting_relationships),
               obj_classes=MG_dataset.object_classes,
               enc_layer_num=enc_layer,
               dec_layer_num=dec_layer).to(device=gpu_device)

model.eval()

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=gpu_device)

# strict=False tolerates keys that do not line up between checkpoint and model.
# Capture the result so such mismatches are reported instead of silently ignored.
load_result = model.load_state_dict(ckpt['state_dict'], strict=False)
# getattr keeps this working if load_state_dict returns None (older PyTorch).
missing_keys = list(getattr(load_result, 'missing_keys', None) or [])
unexpected_keys = list(getattr(load_result, 'unexpected_keys', None) or [])

print('*'*50)
print('CKPT {} is loaded'.format(model_path))
if missing_keys:
    print('WARNING: {} model key(s) missing from the checkpoint (not restored): {}'.format(
        len(missing_keys), missing_keys))
if unexpected_keys:
    print('WARNING: {} checkpoint key(s) not used by the model: {}'.format(
        len(unexpected_keys), unexpected_keys))

word vector location: data data
loading word vectors from data/data/glove.6B.200d.pt
word vector location: /home/cong/Dokumente/neural-motifs-master/data /hom
loading word vectors from data//home/cong/Dokumente/neural-motifs-master/data/glove.6B.200d.pt
__background__ -> __background__ 
fail on __background__
**************************************************
CKPT pretrained_models/sgdet.tar is loaded


In [5]:
def makeSTTranPrediction(model, data):
    """Run the object detector and then `model` on one item from the data loader.

    `data` is read positionally: entries 0-3 (image data, image info,
    ground-truth boxes, box counts) are moved to CUDA device 0 and deep-copied,
    entry 5 is the scene id. The detector is the notebook-level
    `object_detector`, not an argument.

    Returns:
        tuple: (detector output, model output for it, scene id).
    """
    def _gpu_copy(item):
        # Move to CUDA device 0, then take an independent copy.
        return copy.deepcopy(item.cuda(0))

    im_data = _gpu_copy(data[0])
    im_info = _gpu_copy(data[1])
    gt_boxes = _gpu_copy(data[2])
    num_boxes = _gpu_copy(data[3])
    _index = data[4]  # read as in the original cell; not used afterwards
    scene_id = data[5]
    print(f"Prediction and Scene ID: {scene_id}")

    # All-zero annotation tensor, one (1, 5) slice per leading entry of im_data.
    leading_dim = im_data.shape[0]
    gt_annotation = torch.zeros([leading_dim, 1, 5])

    entry = object_detector(im_data, im_info, gt_boxes, num_boxes, gt_annotation, im_all=None)
    return entry, model(entry), scene_id

In [6]:
# Upper bound on how many loader items are run through the pipeline.
MAX_SCENES = 15

predictions = {}  # scene id -> STTran output
obj_entries = {}  # scene id -> object detector output
with torch.no_grad():
    # zip() asks range() first, so once MAX_SCENES items have been processed the
    # loop ends without fetching (and discarding) one more item from the loader.
    for b, data in zip(range(MAX_SCENES), mg_dataloader):
        obj_entry, pred, scene_id = makeSTTranPrediction(model, data)
        predictions[scene_id] = pred
        obj_entries[scene_id] = obj_entry

Prediction and Scene ID: 18


  "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead."


Prediction and Scene ID: 142
Prediction and Scene ID: 14
Prediction and Scene ID: 216
Prediction and Scene ID: 163
Prediction and Scene ID: 217
Prediction and Scene ID: 213
Prediction and Scene ID: 160
Prediction and Scene ID: 11
Prediction and Scene ID: 165
Prediction and Scene ID: 103
Prediction and Scene ID: 135
Prediction and Scene ID: 68
Prediction and Scene ID: 221
Prediction and Scene ID: 154


In [7]:
# Select one of the scene ids collected in `predictions` / `obj_entries`
# for closer inspection.
scene_id = 68
# scene_id = 18
pred = predictions[scene_id]  # STTran output stored for this scene
obj = obj_entries[scene_id]  # detector output stored for the same scene
print(pred.keys())

dict_keys(['boxes', 'scores', 'distribution', 'pred_labels', 'features', 'fmaps', 'im_info', 'pred_scores', 'pair_idx', 'im_idx', 'human_idx', 'union_feat', 'union_box', 'spatial_masks', 'attention_distribution', 'spatial_distribution', 'contacting_distribution'])


In [8]:
# Explore the elements returned with a prediction: one line per key,
# showing the key (padded to 25 characters) and the shape of its value.
for key, value in pred.items():
    print(key.ljust(25), value.shape)

boxes                     torch.Size([795, 5])
scores                    torch.Size([706])
distribution              torch.Size([795, 36])
pred_labels               torch.Size([795])
features                  torch.Size([795, 2048])
fmaps                     torch.Size([73, 1024, 38, 67])
im_info                   torch.Size([])
pred_scores               torch.Size([795])
pair_idx                  torch.Size([722, 2])
im_idx                    torch.Size([722])
human_idx                 torch.Size([73, 1])
union_feat                torch.Size([722, 1024, 7, 7])
union_box                 torch.Size([722, 5])
spatial_masks             torch.Size([722, 2, 27, 27])
attention_distribution    torch.Size([722, 3])
spatial_distribution      torch.Size([722, 6])
contacting_distribution   torch.Size([722, 17])


In [9]:
idx = 0
# Rows of pair_idx whose im_idx equals `idx`, copied to the CPU a single time
# (the original repeated the same mask/index/copy three times).
pairs_i = pred['pair_idx'][pred['im_idx'] == idx].cpu().clone().numpy()
# Three stacked blocks: as-is, column-reversed, as-is.
rels_i = np.concatenate((pairs_i,           # attention
                         pairs_i[:, ::-1],  # spatial
                         pairs_i),          # unlabelled in the original; presumably contacting
                        axis=0)
print(pred['im_idx'])

tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
         2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  3.,  3.,
         3.,  3.,  3.,  3.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
         4.,  4.,  4.,  4.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,
         5.,  5.,  5.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
         6.,  6.,  6.,  6.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,
         7.,  7.,  7.,  7.,  8.,  8.,  8.,  8.,  8.,  8.,  9.,  9., 10., 10.,
        10., 10., 10., 10., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
        11., 11., 11., 12., 12., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 14., 14., 14., 14., 14., 14., 14., 14.,
        14., 15., 15., 15., 15., 15., 15., 15., 15., 16., 16., 16., 16., 16.,
        16., 17., 17., 17., 17., 18., 18., 18., 18., 19., 19., 1

## Convert Tensor Predictions to Readable Triplets

In [10]:
# Build the MovieGraph evaluator for the current task mode. The keyword names
# carry an AG_ prefix but are fed the class lists of MG_dataset.
# NOTE(review): `constraint` is the string 'False', not the boolean False —
# confirm which values MGSceneGraphEvaluator actually expects.
evaluator = MGSceneGraphEvaluator(
    mode=mode,
    AG_object_classes=MG_dataset.object_classes,
    AG_all_predicates=MG_dataset.relationship_classes,
    AG_attention_predicates=MG_dataset.attention_relationships,
    AG_spatial_predicates=MG_dataset.spatial_relationships,
    AG_contacting_predicates=MG_dataset.contacting_relationships,
    iou_threshold=0.5,
    constraint='False')

In [11]:
def tripletsToWords(triplets, object_classes, relationship_classes):
    """Translate index triplets into their class names.

    Every item of `triplets` must unpack into three indices. The first and
    third are looked up in `object_classes`, the second in
    `relationship_classes`.

    Returns:
        list: one three-element list of looked-up values per input triplet,
        in input order.
    """
    return [
        [object_classes[first], relationship_classes[relation], object_classes[second]]
        for first, relation, second in triplets
    ]

In [12]:
# Convert the selected prediction into word triplets.
# NOTE(review): the output recorded for this cell ends with
# "TypeError: cannot unpack non-iterable NoneType object". This line does no
# unpacking itself, so the error presumably originates inside
# pred_to_word_triplets — confirm in lib/evaluation_recall_mg.
trplets_by_frame = evaluator.pred_to_word_triplets(pred)
# for i,t in enumerate(triplets_in_word):
#     print(triplet_boxes[i][0], t)

this is pred entry bellow, the shape of rel scores is (45, 26)
[[ 0  1]
 [ 0  2]
 [ 0  3]
 [ 0  4]
 [ 0  5]
 [ 0  6]
 [ 0  7]
 [ 0  8]
 [ 0  9]
 [ 0 10]
 [ 0 11]
 [ 0 12]
 [ 0 13]
 [ 0 14]
 [ 0 15]
 [ 1  0]
 [ 2  0]
 [ 3  0]
 [ 4  0]
 [ 5  0]
 [ 6  0]
 [ 7  0]
 [ 8  0]
 [ 9  0]
 [10  0]
 [11  0]
 [12  0]
 [13  0]
 [14  0]
 [15  0]
 [ 0  1]
 [ 0  2]
 [ 0  3]
 [ 0  4]
 [ 0  5]
 [ 0  6]
 [ 0  7]
 [ 0  8]
 [ 0  9]
 [ 0 10]
 [ 0 11]
 [ 0 12]
 [ 0 13]
 [ 0 14]
 [ 0 15]]
[[ 0  1]
 [ 0  2]
 [ 0  3]
 [ 0  4]
 [ 0  5]
 [ 0  6]
 [ 0  7]
 [ 0  8]
 [ 0  9]
 [ 0 10]
 [ 0 11]
 [ 0 12]
 [ 0 13]
 [ 0 14]
 [ 0 15]
 [ 1  0]
 [ 2  0]
 [ 3  0]
 [ 4  0]
 [ 5  0]
 [ 6  0]
 [ 7  0]
 [ 8  0]
 [ 9  0]
 [10  0]
 [11  0]
 [12  0]
 [13  0]
 [14  0]
 [15  0]
 [ 0  1]
 [ 0  2]
 [ 0  3]
 [ 0  4]
 [ 0  5]
 [ 0  6]
 [ 0  7]
 [ 0  8]
 [ 0  9]
 [ 0 10]
 [ 0 11]
 [ 0 12]
 [ 0 13]
 [ 0 14]
 [ 0 15]]
Params
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  2  3  4  5  6  7  8  9
 10 11 12 13 14 15  0  0  0  0  0  0  0  0  0

this is pred entry bellow, the shape of rel scores is (12, 26)
[[443 444]
 [443 445]
 [443 446]
 [443 447]
 [444 443]
 [445 443]
 [446 443]
 [447 443]
 [443 444]
 [443 445]
 [443 446]
 [443 447]]
[[443 444]
 [443 445]
 [443 446]
 [443 447]
 [444 443]
 [445 443]
 [446 443]
 [447 443]
 [443 444]
 [443 445]
 [443 446]
 [443 447]]
Params
[443 443 443 443 444 445 446 447 443 443 443 443] [444 445 446 447 443 443 443 443 444 445 446 447]


this is pred entry bellow, the shape of rel scores is (45, 26)
[[448 449]
 [448 450]
 [448 451]
 [448 452]
 [448 453]
 [448 454]
 [448 455]
 [448 456]
 [448 457]
 [448 458]
 [448 459]
 [448 460]
 [448 461]
 [448 462]
 [448 463]
 [449 448]
 [450 448]
 [451 448]
 [452 448]
 [453 448]
 [454 448]
 [455 448]
 [456 448]
 [457 448]
 [458 448]
 [459 448]
 [460 448]
 [461 448]
 [462 448]
 [463 448]
 [448 449]
 [448 450]
 [448 451]
 [448 452]
 [448 453]
 [448 454]
 [448 455]
 [448 456]
 [448 457]
 [448 458]
 [448 459]
 [448 460]
 [448 461]
 [448 462]
 [448 463]]
[[4

Params
[671 671 671 671 671 672 673 674 675 676 671 671 671 671 671] [672 673 674 675 676 671 671 671 671 671 672 673 674 675 676]


this is pred entry bellow, the shape of rel scores is (33, 26)
[[677 678]
 [677 679]
 [677 680]
 [677 681]
 [677 682]
 [677 683]
 [677 684]
 [677 685]
 [677 686]
 [677 687]
 [677 688]
 [678 677]
 [679 677]
 [680 677]
 [681 677]
 [682 677]
 [683 677]
 [684 677]
 [685 677]
 [686 677]
 [687 677]
 [688 677]
 [677 678]
 [677 679]
 [677 680]
 [677 681]
 [677 682]
 [677 683]
 [677 684]
 [677 685]
 [677 686]
 [677 687]
 [677 688]]
[[677 678]
 [677 679]
 [677 680]
 [677 681]
 [677 682]
 [677 683]
 [677 684]
 [677 685]
 [677 686]
 [677 687]
 [677 688]
 [678 677]
 [679 677]
 [680 677]
 [681 677]
 [682 677]
 [683 677]
 [684 677]
 [685 677]
 [686 677]
 [687 677]
 [688 677]
 [677 678]
 [677 679]
 [677 680]
 [677 681]
 [677 682]
 [677 683]
 [677 684]
 [677 685]
 [677 686]
 [677 687]
 [677 688]]
Params
[677 677 677 677 677 677 677 677 677 677 677 678 679 680 681 682 683 

this is pred entry bellow, the shape of rel scores is (51, 26)
[[769 770]
 [769 771]
 [769 772]
 [769 773]
 [769 774]
 [769 775]
 [769 776]
 [769 777]
 [769 778]
 [769 779]
 [769 780]
 [769 781]
 [769 782]
 [769 783]
 [769 784]
 [769 785]
 [769 786]
 [770 769]
 [771 769]
 [772 769]
 [773 769]
 [774 769]
 [775 769]
 [776 769]
 [777 769]
 [778 769]
 [779 769]
 [780 769]
 [781 769]
 [782 769]
 [783 769]
 [784 769]
 [785 769]
 [786 769]
 [769 770]
 [769 771]
 [769 772]
 [769 773]
 [769 774]
 [769 775]
 [769 776]
 [769 777]
 [769 778]
 [769 779]
 [769 780]
 [769 781]
 [769 782]
 [769 783]
 [769 784]
 [769 785]
 [769 786]]
[[769 770]
 [769 771]
 [769 772]
 [769 773]
 [769 774]
 [769 775]
 [769 776]
 [769 777]
 [769 778]
 [769 779]
 [769 780]
 [769 781]
 [769 782]
 [769 783]
 [769 784]
 [769 785]
 [769 786]
 [770 769]
 [771 769]
 [772 769]
 [773 769]
 [774 769]
 [775 769]
 [776 769]
 [777 769]
 [778 769]
 [779 769]
 [780 769]
 [781 769]
 [782 769]
 [783 769]
 [784 769]
 [785 769]
 [786 769]
 

TypeError: cannot unpack non-iterable NoneType object

In [None]:
# Inspect the container types, shapes and first ten boxes of the word triplets.
# NOTE(review): no visible cell before this one assigns `triplets_in_word` or
# `triplet_boxes` at notebook level, so on a fresh kernel this cell presumably
# fails with NameError — confirm and define them first.
print(type(triplets_in_word), type(triplet_boxes))
print(triplets_in_word.shape, triplet_boxes.shape)
print(triplet_boxes[:10])

In [None]:
# For every stored prediction, convert it to word triplets and print each
# triplet next to the first element of its matching `triplet_boxes` entry.
# NOTE(review): assumes pred_to_word_triplets returns a 2-item result — confirm.
for k, scene_pred in predictions.items():
    triplets_in_word, triplet_boxes = evaluator.pred_to_word_triplets(scene_pred)
    print(f"\nTriplets for scene id {k}, number of boxes were {len(predictions[k]['boxes'])}:")
    for i, t in enumerate(triplets_in_word):
        box_entry = triplet_boxes[i]
        print(box_entry[0], t)

In [None]:
class PredictionEvaluator:
    def __init__(self, sttran_pred, mg_pred):
        self.mapping = {
            "lookingat": {"looks at", "watches", "observes", "glances at", "sees", "stares at", "spots"},
            "notlookingat": {"ignores", "averts gaze", "turns away from", "looks away from"},
            "unsure": {"uncertain", "unsure", "confused", "puzzled", "doubtful", "perplexed", "hesitant", "unclear", "unconvinced", "indecisive"},
            "above": {"above", "over", "higher than", "superior", "on top of", "up"},
            "beneath": {"beneath", "below", "lower than", "inferior", "under", "down"},
            "infrontof": {"in front of", "facing", "ahead of", "before", "forefront"},
            "behind": {"behind", "at the back of", "trailing", "rear"},
            "onthesideof": {"beside", "next to", "adjacent to", "alongside", "on the side of"},
            "in": {"inside", "within", "in", "in the middle of", "in the center of", "in the midst of", "amidst"},
            "carrying": {"carrying", "transporting", "holding", "bearing"},
            "coveredby": {"covered by", "hidden under", "camouflaged by", "obscured by"},
            "drinkingfrom": {"drinking from", "sipping from", "imbibing from", "swigging from", "quaffing from"},
            "eating": {"eating", "chewing", "consuming", "devouring"},
            "haveitontheback": {"wearing on the back", "carrying on the back", "having on the back", "sporting on the back"},
            "holding": {"holding", "gripping", "clutching", "grasping", "embracing"},
            "leaningon": {"leaning on", "resting on", "supported by", "propped up by", "relying on"},
            "lyingon": {"lying on", "resting on", "stretched out on", "lying prone on"},
            "notcontacting": {"not contacting", "not touching", "not reaching", "out of reach of", "too far away from"},
            "otherrelationship": {"related in other ways", "associated in other ways", "connected in other ways", "linked in other ways", "affiliated in other ways"},
            "sittingon": {"sitting on", "perched on", "astride", "straddling", "seated on"},
            "standingon": {"standing on", "perched on", "upright on", "balanced on"},
            "touching": {"touching", "in contact with", "grazing", "brushing", "pressing against"},
            "twisting": {"twisting", "contorting", "wringing", "writhing", "torquing"},
            "wearing": {"wearing", "sporting", "having on", "clad in", "dressed in"},
            "wiping": {"wiping", "drying", "cleaning", "clearing", "sponging"}
        }