In [1]:
import sys
sys.path.insert(0, 'InternImage')
sys.path.insert(1, 'SOLIDER-REID')
import cv2
import mmcv
from mmdet.apis import inference_detector, init_detector
import mmcv_custom
import mmdet_custom
import torch
import numpy as np
from model.make_model import make_model
from config import cfg
from utils.metrics import Postprocessor

In [2]:
# Build the person detector from its mmdet config file and checkpoint.
# NOTE(review): the variable is called RT_DETR, but both paths name a
# Cascade InternImage-L model -- the name looks historical; confirm before renaming.
RT_DETR = init_detector(
    'detection/work_dirs/cascade_internimage_l_fpn_3x_coco_custom/cascade_internimage_l_fpn_3x_coco_custom.py',
    'detection/work_dirs/cascade_internimage_l_fpn_3x_coco_custom/InternImage-L epoch_12 stripped.pth',
)

In [3]:
# Overlay the experiment settings from the YAML file onto the imported default cfg.
cfg.merge_from_file('SOLIDER-REID/TIL.yml')
# Build the re-identification network from the merged config.
REID = make_model(cfg, num_class=2, camera_num=1, view_num=1, semantic_weight=cfg.MODEL.SEMANTIC_WEIGHT)
REID.classifier = torch.nn.Identity()  # remove the classifier layer
# Load the trained weights; note this runs after the classifier has been replaced above.
REID.load_param('SOLIDER-REID/log_SGD_500epoch_continue_1e-4LR_expanded/transformer_21_map0.941492492396344_acc0.8535950183868408.pth')
# Run on the GPU, in evaluation mode.
REID.to('cuda')
REID.eval()
# Re-ranking is disabled: in the finals it cannot be used because the distance
# threshold would change with the gallery size.
# num_query is 1 because there is a single suspect image; it could become 4 if
# rotated copies of the suspect are added as extra queries.
postprocessor = Postprocessor(num_query=1, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM, reranking=False)

using Transformer_type: swin_base_patch4_window7_224 as a backbone




Loading pretrained model from SOLIDER-REID/log_SGD_continue_1e-4/transformer_29_map0.9159278553764464_acc0.5036038160324097.pth


In [4]:
def obj_det(img_path: str, confidence_threshold=0.99):
    """Detect objects in an image and keep only the confident boxes.

    Args:
        img_path: Path of the image handed to ``inference_detector``.
        confidence_threshold: Detections whose score (column 4) is not strictly
            greater than this value are dropped.

    Returns:
        A 2-D array containing the rows of the detector output that passed the
        threshold; column 4 of each row is the score.
    """
    # Use the detector created in the setup cell (RT_DETR). The previous code
    # referenced an undefined global `model`, which fails on a fresh kernel.
    # NOTE(review): the [0][0] indexing presumably selects the bbox results and
    # then the first class -- confirm against the detector's output format.
    result = inference_detector(RT_DETR, img_path)[0][0]
    boxes = result[result[:, 4] > confidence_threshold]
    return boxes

def load_img(img: np.ndarray):  # for REID only
    """Turn an image into a normalised float32 array for the REID model.

    The image is resized to 224x224, its last axis is moved to the front,
    values are divided by 255, and the first three planes are normalised with
    ``cfg.INPUT.PIXEL_MEAN`` / ``cfg.INPUT.PIXEL_STD``.

    Args:
        img: Image array with three axes (assumed height, width, channel --
            TODO confirm the channel order expected by the caller).

    Returns:
        The processed array cast to ``np.float32``.
    """
    resized = cv2.resize(img, (224, 224))
    # Dividing by 255.0 allocates a new floating-point array, so the in-place
    # normalisation below never touches the caller's image.
    planes = np.transpose(resized, (2, 0, 1)) / 255.0
    for idx in range(3):
        plane = planes[idx]  # view into `planes`; in-place ops update it directly
        plane -= cfg.INPUT.PIXEL_MEAN[idx]
        plane /= cfg.INPUT.PIXEL_STD[idx]
    return planes.astype(np.float32)

In [5]:
def predict(suspect: np.ndarray, image_path: str):
    """Match the suspect image(s) against every person detected in a scene.

    Args:
        suspect: Iterable of suspect images; each element is passed to
            ``load_img``. The module-level ``postprocessor`` is built with
            ``num_query=1`` and only row 0 of the distance matrix is used, so
            this is expected to hold a single image.
        image_path: Path of the scene image. It is given to the detector and
            is also read here to crop the detected boxes.

    Returns:
        A list of ``(box, flag)`` tuples, one per detected box, where ``flag``
        is 1 when the distance to the first query is below
        ``cfg.TEST.THRESHOLD`` and 0 otherwise. Empty when nothing is detected.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read by OpenCV.
    """
    boxes = obj_det(image_path)
    if len(boxes) == 0:
        # Nothing to compare against; skip the REID forward pass entirely.
        return []

    # The scene has to be loaded here: the previous code cropped from an
    # undefined global `image`.
    scene = cv2.imread(image_path)
    if scene is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # cv2.imread yields BGR; the query images are converted to RGB before being
    # passed in, so the gallery crops are converted to match.
    scene = cv2.cvtColor(scene, cv2.COLOR_BGR2RGB)
    height, width = scene.shape[:2]

    # query is the suspect
    query = [load_img(q) for q in suspect]

    # gallery is cropped out boxes
    gallery = []
    for box in boxes:
        # Only the first four columns are coordinates; any score column is ignored.
        x1, y1, x2, y2 = (int(v) for v in box[:4])
        # Clamp to the image so negative coordinates cannot wrap around, and
        # keep every crop at least one pixel wide/high so the resize cannot fail.
        x1 = min(max(x1, 0), width - 1)
        y1 = min(max(y1, 0), height - 1)
        x2 = min(max(x2, x1 + 1), width)
        y2 = min(max(y2, y1 + 1), height)
        gallery.append(scene[y1:y2, x1:x2])

    inputs = query + [load_img(crop) for crop in gallery]
    inputs = np.stack(inputs, axis=0)  # stack the query and gallery images as batch dim

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        features = REID(torch.from_numpy(inputs).to('cuda'))[0]

    postprocessor.update(features.detach())  # postprocessor expects Torch tensor as it uses torch to compute stuff
    dist_mat = postprocessor.compute()
    postprocessor.reset()  # reset the postprocessor for next query
    # perform thresholding to determine which gallery image, if any, are matches with the query
    dist_mat = (dist_mat < cfg.TEST.THRESHOLD).astype(int)  # 0/1 flags
    return [(box, dist_mat[0][i]) for i, box in enumerate(boxes)]

In [6]:
# DSTA will likely provide the suspect image as raw bytes, so the data-loading
# code is kept separate here and will be replaced once the details of the finals
# are known. The same applies to the scene image.
# It is also unclear whether DSTA supplies images in RGB or BGR:
# RT-DETR expects BGR but REID expects RGB.
query_paths = ["RT-DETR/dataset/reid/test_old/query/image_0000.png"]
# Read each query image and convert it from OpenCV's BGR to RGB in one pass.
query = [cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB) for path in query_paths]
results = predict(suspect=query, image_path="soccer.jpg")
results

The test feature is normalized
=> Computing DistMat with euclidean_distance


	addmm_(Number beta, Number alpha, Tensor mat1, Tensor mat2)
Consider using one of the following signatures instead:
	addmm_(Tensor mat1, Tensor mat2, *, Number beta, Number alpha) (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\python_arg_parser.cpp:1485.)
  dist_mat.addmm_(1, -2, qf, gf.t())


[(array([3.2186508e-04, 7.0947578e+01, 9.9753395e+01, 4.9974039e+02],
        dtype=float32),
  0),
 (array([192.18567, 205.47726, 429.48993, 604.02997], dtype=float32), 0),
 (array([428.35492,  88.00606, 562.4648 , 359.00613], dtype=float32), 0),
 (array([595.018  , 143.83371, 876.8564 , 579.2175 ], dtype=float32), 0),
 (array([833.2748 , 168.45245, 988.9851 , 509.23865], dtype=float32), 0),
 (array([444.6331 , 536.9818 , 513.374  , 604.39264], dtype=float32), 0),
 (array([545.7633 ,  98.86127, 675.3297 , 350.4768 ], dtype=float32), 0),
 (array([1118.1732 ,  110.49107, 1164.643  ,  198.30702], dtype=float32), 0),
 (array([171.85945, 118.10051, 205.18661, 210.4709 ], dtype=float32), 0),
 (array([928.2529 , 114.02906, 967.8399 , 225.78555], dtype=float32), 0),
 (array([873.6314 , 104.24173, 938.08466, 229.7576 ], dtype=float32), 0),
 (array([219.74422 , 118.318665, 251.05542 , 210.4469  ], dtype=float32), 0),
 (array([1072.0616 ,  107.99488, 1104.1064 ,  196.92587], dtype=float32), 0)]