In [40]:
import torch
import torchvision
from torch import nn, optim
import cv2
import numpy as np
import json

from model.cnn_model import HOCNN

In [6]:
# Checkpoint selection: which trained variant and which epoch to load.
# NOTE: `model` here is only a path fragment; the name is rebound to the
# network object in the checkpoint-loading cell.
model = "HICO/v5/"
epoch_num = "10"
checkpoint_path = f"checkpoint_{epoch_num}_epoch.pth"

# `model` already ends with a slash, so no separator is inserted after it.
PATH = f"checkpoints/{model}epoch_train/{checkpoint_path}"

In [16]:
# Choose the device first so the checkpoint can be remapped onto it while loading.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# map_location lets a checkpoint that was saved from a GPU load on a CPU-only host.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
checkpoint = torch.load(PATH, map_location=device)

model = HOCNN()
model.load_state_dict(checkpoint['state_dict'])
# The input tensors are moved to `device`, so the weights must live there too;
# otherwise inference fails with a device mismatch on CUDA machines.
model.to(device)
# Switch to evaluation mode for inference.
model.eval()

In [24]:
IMG_PATH = "datasets/hico/images/test2015/HICO_test2015_00000001.jpg"
# Each box is [x1, y1, x2, y2] in pixel coordinates. cv2.rectangle accepts any
# two opposite corners, so the corner order does not affect the filled region.
human_bboxes = [[319, 305, 358, 348],
                [269, 302, 310, 349]]
object_bboxes = [[147, 344, 375, 413]]
INPUT_SIZE = (64, 64)  # size passed to cv2.resize for every stream


def build_bbox_mask(image, bboxes):
    """Return a new array shaped like `image` that is 255 inside every box and 0 elsewhere."""
    mask = np.zeros_like(image)
    for x1, y1, x2, y2 in bboxes:
        cv2.rectangle(mask, (x1, y1), (x2, y2), (255, 255, 255), thickness=-1)
    return mask


def masked_image_to_tensor(image, mask):
    """Black out everything outside `mask`, resize to INPUT_SIZE, and move the result to `device`."""
    masked = cv2.bitwise_and(image, mask, mask=None)
    resized = cv2.resize(masked, INPUT_SIZE, interpolation=cv2.INTER_AREA)
    return torch.from_numpy(resized).to(device)


def to_batch_input(image_tensor):
    """Add a leading batch axis, move the last axis to position 1, and cast to float."""
    return image_tensor.unsqueeze(0).permute([0, 3, 1, 2]).float()


# cv2.imread returns None instead of raising when the file is missing or unreadable.
src = cv2.imread(IMG_PATH)
if src is None:
    raise FileNotFoundError(f"Could not read image: {IMG_PATH}")

# Build one independent mask per stream. The pairwise mask is the union of the
# human and object boxes; it is created separately so that drawing the object
# boxes does not leak into human_mask.
human_mask = build_bbox_mask(src, human_bboxes)
obj_mask = build_bbox_mask(src, object_bboxes)
pairwise_mask = build_bbox_mask(src, human_bboxes + object_bboxes)

# Masked, resized tensors on `device` (same layout as the resized image arrays).
human_bbox_img = masked_image_to_tensor(src, human_mask)
obj_bbox_img = masked_image_to_tensor(src, obj_mask)
pairwise_bbox_img = masked_image_to_tensor(src, pairwise_mask)

# Batched float inputs for the three streams.
# NOTE: a pose stream (pose_img) was sketched here previously but was disabled;
# only the human, object and pairwise inputs are prepared.
res_human_input = to_batch_input(human_bbox_img)
res_obj_input = to_batch_input(obj_bbox_img)
res_pairwise_input = to_batch_input(pairwise_bbox_img)

In [33]:
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    # Call the module itself rather than .forward() so registered hooks still run.
    outputs = model(res_human_input, res_obj_input, res_pairwise_input)

    # Index of the highest-scoring class for each item in the batch.
    preds = torch.argmax(outputs, dim=1)

preds

tensor([245])


In [44]:
with open('datasets/processed/hico/hoi_list.json') as f:
    hoi_list = json.load(f)

# NOTE(review): the -1 offset presumes the predicted class index is 1-based
# relative to hoi_list -- confirm against the label encoding used in training.
hoi_index = preds.item() - 1
# A predicted class of 0 would give index -1, which Python silently wraps to the
# last entry; fail loudly instead of reporting an unrelated HOI label.
if not 0 <= hoi_index < len(hoi_list):
    raise IndexError(
        f"Predicted class {preds.item()} has no entry in hoi_list (size {len(hoi_list)})"
    )
prediction = hoi_list[hoi_index]
print(prediction)

{'id': '245', 'object': 'bench', 'verb': 'lie_on'}
