<a href="https://colab.research.google.com/github/abrichr/visual-contact-tracing/blob/master/Visual_Contact_Tracing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install dependencies. The cu101 wheels are used because Colab has CUDA 10.1.
!pip install -U torch==1.5 torchvision==0.6 -f https://download.pytorch.org/whl/cu101/torch_stable.html 
!pip install cython pyyaml==5.1
!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
# torch is imported here; a later cell relies on this import (torch.device).
import torch, torchvision
# Show the installed torch version and whether CUDA is usable.
print(torch.__version__, torch.cuda.is_available())
!gcc --version
# opencv is pre-installed on colab
# install detectron2:
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/index.html
# Clone the detectron2 repository to get its config files; a later cell reads
# a config from the detectron2/configs/ directory of this checkout.
!git clone https://github.com/facebookresearch/detectron2

In [0]:
# May need to restart your runtime prior to this to let installation take effect

# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import numpy as np
import cv2
import random
from google.colab import files
from google.colab.patches import cv2_imshow

from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog



Upload a video

In [0]:
# Optional manual upload through google.colab's files.upload(). The branch is
# disabled by the constant-false guard; change `if 0` to a truthy value to use it.
if 0:
  uploaded = files.upload()
  # Report the name and byte length of every uploaded file.
  for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn]))
    )

Or load from Google Drive

In [0]:
import requests

def download_file_from_google_drive(file_id, file_name):
  # download a file from the Google Drive link
  !rm -f ./cookie
  !curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=$file_id" > /dev/null
  confirm_text = !awk '/download/ {print $NF}' ./cookie
  confirm_text = confirm_text[0]
  !curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=$confirm_text&id=$file_id" -o $file_name
  with open(file_name, 'rb') as f:
    data = f.read()
    print('downloaded', len(data), 'bytes to', video_filename)


# Google Drive id of the input video and the local file name it is saved under.
# video_filename is reused by the re-encoding cell that follows.
file_id = '0Bzf1l8WmTwu0eUluQ2h1NWZQRjQ'
video_filename = 'salsa_cpp_cam4.avi'
download_file_from_google_drive(file_id, video_filename)

Re-encode

In [0]:
import subprocess as sp

import imageio
                                                                    
# Locate an ffmpeg binary through imageio.
ffmpeg = imageio.plugins.ffmpeg
try:
    # Best effort: a failing download() is reported but is not fatal, because
    # get_exe() below may still find a usable binary.
    ffmpeg.download()
except Exception as download_error:
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate.
    print('ffmpeg.download() skipped:', download_error)
ffmpeg_exe = ffmpeg.get_exe()

# Cut a short clip out of the downloaded video and re-encode it as H.264.
video_path = video_filename
start_time_seconds = 1
duration_seconds = 10
video_filename_reenc = video_filename + '-reenc.avi'
cmd_parts = [
  ffmpeg_exe,
  '-i', video_path,
  '-vcodec', 'h264',
  '-acodec', 'aac',
  #'-c', 'copy',
  '-strict',
  '-2',
  '-ss', str(start_time_seconds),
  '-t', str(duration_seconds),
  '-y',  # overwrite an existing output file
  '-loglevel', 'debug',
  '-an',  # drop the audio stream
  video_filename_reenc
]
print('Running cmd:\n', ' '.join(cmd_parts))
# stderr is merged into stdout so the whole ffmpeg log is printed together.
result = sp.run(
    cmd_parts, stdout=sp.PIPE, stderr=sp.STDOUT, universal_newlines=True
)
print('Result:\n', result.stdout)
# Fail here, next to the ffmpeg log, instead of letting later cells work on a
# missing or truncated file.
if result.returncode != 0:
    raise RuntimeError(
        'ffmpeg exited with status {}'.format(result.returncode)
    )

with open(video_filename_reenc, 'rb') as f:
  data = f.read()
print('wrote', len(data), 'bytes')

In [0]:
import argparse
import glob
import multiprocessing as mp
import os
import time
import cv2
import tqdm

from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import ColorMode
from detectron2.utils.video_visualizer import VideoVisualizer

# Default detectron2 config file (keypoint_rcnn_R_50_FPN_3x), read from the
# detectron2 checkout cloned by the install cell.
DEFAULT_CONFIG = 'detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml'
# Default score threshold that setup_cfg writes into the config.
DEFAULT_CONF_THRESH = 0.7
# Config overrides passed to cfg.merge_from_list: point MODEL.WEIGHTS at the
# model-zoo checkpoint for the same keypoint config.
DEFAULT_OPTS = [
  'MODEL.WEIGHTS',
  model_zoo.get_checkpoint_url("COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"),
]

# The defaults bundled together; in this notebook the list is only logged.
default_args = [DEFAULT_CONFIG, DEFAULT_OPTS, DEFAULT_CONF_THRESH]

def setup_cfg(config=DEFAULT_CONFIG, opts=DEFAULT_OPTS, conf_thresh=DEFAULT_CONF_THRESH):
    """Build a frozen detectron2 config.

    Args:
        config: path of the config file to merge.
        opts: override list handed to ``merge_from_list``.
        conf_thresh: score threshold written to the RetinaNet, ROI-heads and
            panoptic-FPN settings.

    Returns:
        The frozen config node.
    """
    node = get_cfg()
    # File values first, then the override list on top of them.
    node.merge_from_file(config)
    node.merge_from_list(opts)
    # Apply the same threshold to every builtin model family.
    node.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = conf_thresh
    node.MODEL.ROI_HEADS.SCORE_THRESH_TEST = conf_thresh
    node.MODEL.RETINANET.SCORE_THRESH_TEST = conf_thresh
    node.freeze()
    return node

# Logging and multiprocessing setup.
mp.set_start_method("spawn", force=True)
setup_logger(name="fvcore")
logger = setup_logger()
logger.info("Arguments: " + str(default_args))

# Build the predictor from the default config.
cfg = setup_cfg()
predictor = DefaultPredictor(cfg)

# Input is the re-encoded clip; the output name is derived from it.
video_input = video_filename_reenc
print('video_input:', video_input)
output = f'{video_filename_reenc}-out.mp4'
print('output:', output)

# Open the input video and read its basic properties.
video = cv2.VideoCapture(video_input)
print('video:', video)
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_per_second = video.get(cv2.CAP_PROP_FPS)
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
# NOTE(review): basename is not referenced anywhere else in the notebook.
basename = os.path.basename(video_input)

# Create a writer with the same frame size and frame rate as the input.
output_fname = './' + output
print('output_fname:', output_fname)
#assert not os.path.isfile(output_fname), output_fname
output_file = cv2.VideoWriter(
    filename=output_fname,
    # some installation of opencv may not support x264 (due to its license),
    # you can try other format (e.g. MPEG)
    #fourcc=cv2.VideoWriter_fourcc(*"x264"),
    #fourcc=cv2.VideoWriter_fourcc(*"MPEG"),
    fourcc=cv2.VideoWriter_fourcc(*'mp4v'),
    fps=float(frames_per_second),
    frameSize=(width, height),
    isColor=True,
)
print('output_file:', output_file)
# NOTE(review): this existence check runs only after the capture and the writer
# have already been created from video_input.
assert os.path.isfile(video_input)

# Visualization helpers shared by the drawing functions below.
# Metadata comes from the first test dataset in the config, or from the
# placeholder name "__unused" when the config lists none.
metadata = MetadataCatalog.get(
    cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
)
# Predictions are moved to this device before they are drawn or converted.
cpu_device = torch.device("cpu")
instance_mode = ColorMode.IMAGE
video_visualizer = VideoVisualizer(metadata, instance_mode)

def process_predictions(frame, predictions):
    """Draw a predictor's output on top of a video frame.

    Args:
        frame: image to draw on. The caller passes frames read with
            cv2.VideoCapture, i.e. in OpenCV's BGR channel order.
        predictions: mapping produced by the predictor. It must contain one of
            "panoptic_seg", "instances" or "sem_seg"; they are checked in that
            order and the first one present is drawn.

    Returns:
        The visualized frame as an image in OpenCV's BGR channel order.

    Raises:
        ValueError: if ``predictions`` contains none of the supported keys
            (previously this surfaced as an UnboundLocalError).
    """
    supported_keys = ("panoptic_seg", "instances", "sem_seg")
    if not any(key in predictions for key in supported_keys):
        raise ValueError(
            "predictions must contain one of {}; got keys {}".format(
                supported_keys, list(predictions)
            )
        )

    # Swap the channel order for the visualizer. COLOR_BGR2RGB is the same
    # conversion code as COLOR_RGB2BGR; it is spelled this way because the
    # input is BGR.
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    if "panoptic_seg" in predictions:
        panoptic_seg, segments_info = predictions["panoptic_seg"]
        vis_frame = video_visualizer.draw_panoptic_seg_predictions(
            frame, panoptic_seg.to(cpu_device), segments_info
        )
    elif "instances" in predictions:
        predictions = predictions["instances"].to(cpu_device)
        vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
    else:
        # Only "sem_seg" is left, because of the key check above.
        vis_frame = video_visualizer.draw_sem_seg(
            frame, predictions["sem_seg"].argmax(dim=0).to(cpu_device)
        )

    # Converts Matplotlib RGB format to OpenCV BGR format
    vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
    return vis_frame

def _frame_from_video(video):
    while video.isOpened():
        success, frame = video.read()
        if success:
            yield frame
        else:
            break

# Run the predictor on every frame, keeping the raw frames, the predictions and
# the visualized frames in memory for the cells that follow.
frame_gen = _frame_from_video(video)
from google.colab.patches import cv2_imshow
frames = []
all_predictions = []
vis_frames = []
for i, frame in enumerate(tqdm.tqdm(frame_gen, total=num_frames)):

  frames.append(frame)
  predictions = predictor(frame)
  all_predictions.append(predictions)

  # TODO: do this after detecting infections
  vis_frame = process_predictions(frame, predictions)
  vis_frames.append(vis_frame)

  # Only the first frame is displayed inline.
  if i < 1:
    print('displaying frame', i)
    cv2_imshow(vis_frame)
  # NOTE(review): writing is disabled, so output_file never receives a frame.
  #output_file.write(vis_frame)

# Convert the list of per-frame prediction results into a numpy array.
all_predictions = np.array(all_predictions)
print('all_predictions.shape:', all_predictions.shape)

video.release()
output_file.release()
print('Done.')

# Prints the size of the output file; because no frames were written above,
# this is not the size of an annotated video.
with open(output_fname, 'rb') as f:
  print('wrote', len(f.read()), 'bytes')


In [0]:
#! pip install cython_bbox
#from cython_bbox import bbox_overlaps

from scipy.spatial.distance import pdist, squareform

# https://github.com/facebookresearch/detectron2/issues/754#issuecomment-579463185
JOINT_NAMES = [
  "nose",
  "left_eye", "right_eye",
  "left_ear", "right_ear",
  "left_shoulder", "right_shoulder",
  "left_elbow", "right_elbow",
  "left_wrist", "right_wrist",
  "left_hip", "right_hip",
  "left_knee", "right_knee",
  "left_ankle", "right_ankle"
]

# https://github.com/facebookresearch/detectron2/blob/04958b93e1232935e126c2fd9e6ccd3f57c3a8f3/detectron2/utils/visualizer.py#L32
KEYPOINT_THRESHOLD = 0.05

# TODO: smoothing e.g. kalman or savgol
# https://stackoverflow.com/a/52450682/95989
max_num_instances = 0
all_keypoints = []
all_boxes = []
for predictions in all_predictions:
  #predictions['instances'].get_fields().keys()
  #'pred_boxes', 'scores', 'pred_classes', 'pred_keypoints'
  instances = predictions['instances'].to(cpu_device)
  #import pdb; pdb.set_trace()
  keypoints = np.asarray(instances.pred_keypoints)
  boxes = np.asarray(instances.pred_boxes.tensor)
  #print('keypoints:', keypoints.shape)
  all_keypoints.append(keypoints)
  all_boxes.append(boxes)
  num_instances = keypoints.shape[0]
  max_num_instances = max(max_num_instances, num_instances)
print('max_num_instances:', max_num_instances)
print('len(all_keypoints):', len(all_keypoints))

# https://gist.github.com/meyerjo/dd3533edc97c81258898f60d8978eddc
def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union (IoU) of two axis-aligned boxes.

    Args:
        boxA, boxB: indexable boxes whose first four items are
            (x1, y1, x2, y2).

    Returns:
        The IoU as a floating-point value. Boxes that do not overlap give
        ``0.0``; this used to be the int ``0``, which made a cost matrix built
        only from disjoint boxes integer-typed.
    """
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # compute the area of the intersection rectangle; a negative extent means
    # the boxes do not overlap along that axis, so it is clamped to zero
    interArea = max(xB - xA, 0) * max(yB - yA, 0)
    if interArea == 0:
        return 0.0

    # compute the area of both rectangles
    boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))

    # intersection over union: the intersection area divided by the sum of
    # both areas minus the intersection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou


# https://github.com/facebookresearch/DetectAndTrack/blob/d66734498a4331cd6fde87d8269499b8577a2842/lib/core/tracking_engine.py#L106
def compute_pairwise_iou(a, b):
  """
  Compute the pairwise IoU *cost* between two sets of boxes.

  a, b (np.ndarray) of shape Nx4 and Mx4.
  The output is a float array of shape NxM holding ``1 - IoU`` for each
  combination of boxes, so 0 means identical boxes and 1 means no overlap.

  The result is always floating point and always two-dimensional, including
  when ``a`` or ``b`` is empty; an integer matrix could not take the
  ``np.inf`` markers written by the matching step.
  """
  # TODO: replace with cython_bbox
  costs = [
    [1 - bb_intersection_over_union(box_a, box_b) for box_b in b]
    for box_a in a
  ]
  return np.array(costs, dtype=float).reshape(len(a), len(b))


def compute_distance_matrix(prev_boxes, cur_boxes):
  """Return the cost matrix between previous and current boxes.

  Delegates to compute_pairwise_iou, so rows correspond to ``prev_boxes`` and
  columns to ``cur_boxes``.
  """
  # TODO: consider keypoint distance?
  return compute_pairwise_iou(prev_boxes, cur_boxes)


# https://github.com/facebookresearch/DetectAndTrack/blob/d66734498a4331cd6fde87d8269499b8577a2842/lib/core/tracking_engine.py#L184
def bipartite_matching_greedy(C, prev_tracks):
    """
    Computes the bipartite matching between the rows and columns, given the
    cost matrix, C.

    The cheapest remaining (row, column) pair is taken repeatedly. After a
    pair is taken, its column and every row that belongs to the same track as
    the chosen row are removed from further consideration, so each track and
    each column is matched at most once.

    Args:
        C: cost matrix with one row per previous box and one column per
            current box. It is not modified.
        prev_tracks: track id of each row of ``C``.

    Returns:
        ``(prev_ids, cur_ids)``: two equally long lists where row
        ``prev_ids[k]`` is matched to column ``cur_ids[k]``.
    """
    # Work on a float copy: the caller's matrix stays untouched, and an
    # integer matrix (e.g. all costs equal to 1) can hold the np.inf markers
    # written below instead of raising OverflowError.
    C = np.array(C, dtype=float)
    prev_ids = []
    cur_ids = []
    while not (C == np.inf).all():
        # Find the lowest cost element
        i, j = np.unravel_index(C.argmin(), C.shape)

        # Add to results
        prev_ids.append(i)
        cur_ids.append(j)

        # Remove from cost matrix: the matched column, and every row that
        # belongs to the matched track (a track may own several rows).
        track = prev_tracks[i]
        track_idxs = [
            idx for idx in range(len(prev_tracks))
            if prev_tracks[idx] == track
        ]
        C[:, j] = np.inf
        C[track_idxs, :] = np.inf

    return prev_ids, cur_ids


def compute_matches(prev_boxes, cur_boxes, prev_tracks):
  """Match every current box to the index of a previous box.

  Returns an int32 array with one entry per current box: the index of the
  matched box in ``prev_boxes``, or -1 when the box was not matched. With no
  previous boxes every entry is -1.
  """
  assert len(prev_boxes) == len(prev_tracks)
  matches = -np.ones((len(cur_boxes), ), dtype=np.int32)
  if prev_boxes.size:
    cost = compute_distance_matrix(prev_boxes, cur_boxes)
    prev_inds, next_inds = bipartite_matching_greedy(cost, prev_tracks)
    assert(len(prev_inds) == len(next_inds))
    # Record, for each matched current box, which previous box it came from.
    for pair_idx in range(len(prev_inds)):
      cur_idx = next_inds[pair_idx]
      matches[cur_idx] = prev_inds[pair_idx]
  return matches

def get_frame_tracks(matches, prev_tracks, next_track_id):
  """Turn per-box match indices into per-box track ids.

  A box whose match is -1, or whose match index lies beyond ``prev_tracks``,
  starts a new track taken from the one-element counter ``next_track_id``
  (updated in place, wrapping at MAX_TRACK_IDS). Any other box inherits the
  track of the previous box it was matched to.
  """
  frame_tracks = []
  for match in matches:
    has_previous = match != -1 and match < len(prev_tracks)
    if has_previous:
      frame_tracks.append(prev_tracks[match])
      continue
    # didn't match to any: hand out a fresh id
    frame_tracks.append(next_track_id[0])
    next_track_id[0] += 1
    if next_track_id[0] >= MAX_TRACK_IDS:
      # TODO: handle this
      print('Exceeded max track ids')
      next_track_id[0] %= MAX_TRACK_IDS
  print('frame_tracks:\t', frame_tracks, 'len(frame_tracks):', len(frame_tracks))
  return frame_tracks

def visualize_predictions(frame, predictions):
    """Draw the instance predictions of one frame.

    Reads ``predictions["instances"]``, moves it to the CPU device, draws it
    with the shared video visualizer and returns the drawn image after a
    final channel-order conversion for OpenCV.
    """
    canvas = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    cpu_instances = predictions["instances"].to(cpu_device)
    drawn = video_visualizer.draw_instance_predictions(canvas, cpu_instances)
    # Converts Matplotlib RGB format to OpenCV BGR format
    return cv2.cvtColor(drawn.get_image(), cv2.COLOR_RGB2BGR)

# compute tracks, inspired by:
# https://github.com/facebookresearch/DetectAndTrack/blob/d66734498a4331cd6fde87d8269499b8577a2842/lib/core/tracking_engine.py#L272
# Track ids wrap around once this many have been handed out.
MAX_TRACK_IDS = 999
# Boxes of the most recent frames (at most T of them) used for matching.
all_prev_boxes = []
T = 10
# video_tracks[f][k] is the track id of box k in frame f.
video_tracks = []
# One-element list so get_frame_tracks can advance the counter in place.
next_track_id = [0]
for frame_id, (frame, predictions) in enumerate(zip(frames, all_predictions)):
  print('\nframe_id:', frame_id)

  instances = predictions['instances'].to(cpu_device)
  cur_boxes = np.asarray(instances.pred_boxes.tensor)
  # Stack the boxes of the remembered frames, and the track ids of exactly the
  # same frames, so that row k of prev_boxes belongs to prev_tracks[k].
  prev_boxes = np.vstack(all_prev_boxes) if all_prev_boxes else np.array([])
  all_prev_tracks = video_tracks[
    max(0, frame_id - len(all_prev_boxes)) :
    max(0, frame_id)
  ]
  prev_tracks = np.hstack(all_prev_tracks) if all_prev_tracks else np.array([])

  matches = compute_matches(prev_boxes, cur_boxes, prev_tracks)
  print('matches:\t', matches)
  # matches[i] contains the index of the box in the previous frames
  # corresponding to the box with index i in the current frame

  frame_tracks = get_frame_tracks(matches, prev_tracks, next_track_id)
  # Every box in a frame must carry a distinct track id.
  assert len(np.unique(frame_tracks)) == len(frame_tracks), (len(np.unique(frame_tracks)), len(frame_tracks))
  video_tracks.append(frame_tracks)
  all_prev_boxes.append(cur_boxes)
  if len(all_prev_boxes) > T:
    all_prev_boxes = all_prev_boxes[1:]

  # Show the first and last three frames, plus every frame where a box was
  # left unmatched.
  if frame_id < 3 or frame_id >= len(frames) - 3 or any([match == -1 for match in matches]):
    vis_frame = visualize_predictions(frame, predictions)
    for box, frame_track in zip(cur_boxes, frame_tracks):
      # cv2.putText needs integer pixel coordinates; the boxes hold floats.
      # White label with a black copy offset by one pixel for contrast.
      cv2.putText(vis_frame, str(frame_track), (int(box[0]), int(box[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255))
      cv2.putText(vis_frame, str(frame_track), (int(box[0]+1), int(box[1]+1)), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0))
    cv2_imshow(vis_frame)

# TODO: filter out large position jumps that immediately return after one frame

In [0]:


# TODO: smoothing (e.g. https://github.com/facebookresearch/DetectAndTrack/blob/d66734498a4331cd6fde87d8269499b8577a2842/lib/core/tracking_engine.py#L600)
# TODO: in each frame, compute distances between every pair of wrists
# between different instances
# Two target joints closer than this (in keypoint coordinate units) count as
# a contact.
DISTANCE_THRESHOLD = 20
print(len(all_predictions), 'frames')
# Joints whose pairwise distances are checked for contact.
target_joint_names = [
  'left_wrist',
  'right_wrist'
]
# Track ids considered infected at the start.
infected_tracks = set(
  #[np.random.randint(0, max_num_instances)]
  [3]
)
print('len(all_keypoints):', len(all_keypoints))
print('len(video_tracks):', len(video_tracks))
assert(len(all_keypoints) == len(video_tracks))
for frame_id, (keypoints, frame_tracks) in enumerate(zip(all_keypoints, video_tracks)):
  print('*' * 40)
  print('frame_id:', frame_id)
  print('infected_tracks:', infected_tracks)
  print('frame_tracks:', frame_tracks)
  assert len(np.unique(frame_tracks)) == len(frame_tracks), (len(np.unique(frame_tracks)), len(frame_tracks))

  # A frame without detections has nothing to compare, and pdist rejects the
  # empty one-dimensional array it would otherwise receive.
  if len(keypoints) == 0:
    continue

  # Flatten the target joints of all instances into one list, instance by
  # instance: entry k belongs to instance k // len(target_joint_names).
  target_joint_vals = []
  target_joint_probs = []
  for keypoints_per_instance in keypoints:
    # https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/visualizer.py#L703
    # a tensor of shape (K, 3), where K is the number of keypoints
    # and the last dimension corresponds to (x, y, probability).

    for joint_name in target_joint_names:
      joint_idx = JOINT_NAMES.index(joint_name)
      joint_vals = keypoints_per_instance[joint_idx]
      x, y, prob = joint_vals
      target_joint_vals.append([x, y])
      # TODO: probabilities are collected but not used yet
      target_joint_probs.append(prob)

  target_joint_vals = np.array(target_joint_vals)

  # Square matrix of distances between every pair of target joints.
  distances = squareform(pdist(target_joint_vals))
  hit_mask = distances < DISTANCE_THRESHOLD

  frame_infected_tracks = set()
  frame_tracks = np.array(frame_tracks)
  for infected_track in infected_tracks:
    infected_idx = np.where(frame_tracks == infected_track)[0]
    if not infected_idx.size:
      # this track is not present in the current frame
      continue
    infected_idx = infected_idx[0]

    for i_joint in range(len(target_joint_names)):
      row = hit_mask[infected_idx * len(target_joint_names) + i_joint]
      # Map joint columns back to instance indices. Integer floor division on
      # the index array keeps an integer dtype even if there were no hits.
      hit_idxs = np.where(row)[0] // len(target_joint_names)
      hit_tracks = frame_tracks[hit_idxs]
      # ignore self
      hit_tracks = set(hit_tracks) - set([infected_track])
      frame_infected_tracks |= hit_tracks

  # vis if new hit
  new_infected_tracks = frame_infected_tracks - infected_tracks
  if new_infected_tracks:
    print('new_infected_tracks:', new_infected_tracks)
    frame = frames[frame_id]
    predictions = all_predictions[frame_id]
    instances = predictions['instances'].to(cpu_device)
    cur_boxes = np.asarray(instances.pred_boxes.tensor)
    vis_frame = visualize_predictions(frame, predictions)
    for box, frame_track in zip(cur_boxes, frame_tracks):
      # cv2.putText needs integer pixel coordinates; the boxes hold floats.
      cv2.putText(vis_frame, str(frame_track), (int(box[0]), int(box[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255))
      cv2.putText(vis_frame, str(frame_track), (int(box[0]+1), int(box[1]+1)), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0))

    # TODO: highlight the infected box and the boxes of the newly infected
    # tracks (cur_boxes[infected_idx] and, for each new track, the box at
    # np.where(frame_tracks == new_infected_track)[0][0]).

    cv2_imshow(vis_frame)

  infected_tracks |= new_infected_tracks

  # TODO: only use joints with prob >= KEYPOINT_THRESHOLD

print('final infected_tracks:', infected_tracks)

In [0]:
# TODO: Download the results
# Disabled by the constant-false guard; change `if 0` to a truthy value to
# download the output video through google.colab's files.download().
if 0:
  from google.colab import files
  files.download(output)  

Limitations / Future Work
- Working only in 2D image space results in false positives (due to lack of depth information) and false negatives (due to occlusion)
  - Lack of depth can be mitigated with depth estimation: https://roxanneluo.github.io/Consistent-Video-Depth-Estimation/
  - Occlusion can be mitigated by using multiple cameras: https://arxiv.org/pdf/2003.03972v2.pdf
- Only keypoints are considered, not semantic segmentation masks, which may contain more information about whether contact occurred.
- Doesn't track individuals between videos