# Social Distance Tool with depth

This tool combines two algorithms to accurately detect people who are violating the social distancing protocol:
- Facebook/Detectron2 (Faster R-CNN implementation) `https://github.com/facebookresearch/detectron2`
- "Digging into Self-Supervised Monocular Depth Prediction" `https://github.com/nianticlabs/monodepth2`

**Input:**
- A video sequence

**Output:**
- bounding boxes on all persons detected in the video
- highlighting people who are in close proximity
- depth map for accurate calculations 
***

## Code
**Import libraries for Detectron2**

In [14]:
# !python -m detectron2.utils.collect_env # to check if Detectron2 is working fine
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import cv2
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

**Import libraries and files for MonoDepth2 algorithm**

In [2]:
# Libraries for monodepth2
from __future__ import absolute_import, division, print_function
%matplotlib inline

import os
import PIL.Image as pil
import glob

import torch
from torchvision import transforms

import networks
from utils import download_model_if_doesnt_exist
from layers import disp_to_depth
import matplotlib as mpl
import matplotlib.cm as cm

**Convert Video to PNG Frames**

In [37]:
# Dump up to `frame_count` frames of the input video as numbered PNG files.
import os
import shutil

frames_folder = 'frames'
frame_count = 100

# Start from an empty output folder. Done in Python instead of `!rm` / `!mkdir`
# so the cell neither fails on a first run (folder missing) nor prints
# "File exists" on a re-run.
shutil.rmtree(frames_folder, ignore_errors=True)
os.makedirs(frames_folder, exist_ok=True)

#specify path to video
video = "sample.mp4"

#capture video
cap = cv2.VideoCapture(video)
cnt = 0
# Frame rate of the source video; reused later when the result video is written.
FPS = cap.get(cv2.CAP_PROP_FPS)
# Check if video file is opened successfully
if not cap.isOpened():
    print("Error opening video stream or file")

# NOTE(review): this read consumes the first frame of the video and the frame
# is never saved, so 0000.png is the *second* frame - confirm this is intended.
ret, first_frame = cap.read()

try:
    #Read until video is completed or `frame_count` frames have been saved
    with tqdm(total=frame_count) as pbar:
        while cap.isOpened() and cnt < frame_count:
            # Capture frame-by-frame
            ret, frame = cap.read()
            if not ret:
                # No more frames (or a read error): stop.
                break

            #save each frame to folder
            cv2.imwrite(frames_folder + '/{:04d}'.format(cnt) + '.png', frame)
            cnt = cnt + 1
            # Only count frames that were actually read and written.
            pbar.update(1)
finally:
    # Always give the video handle back, even if reading/writing raised.
    cap.release()

mkdir: cannot create directory ‘frames/’: File exists


100%|██████████| 100/100 [00:05<00:00, 18.95it/s]


**Loading MonoDepth2 pretrained models**

In [26]:
# Use the GPU when torch can see one; otherwise run everything on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make sure the pretrained model is available and locate its two checkpoints.
model_name = "mono_640x192"
download_model_if_doesnt_exist(model_name)
encoder_path, depth_decoder_path = (
    os.path.join("models", model_name, checkpoint)
    for checkpoint in ("encoder.pth", "depth.pth")
)

# ---- Encoder: load pretrained weights ----
encoder = networks.ResnetEncoder(18, False)
loaded_dict_enc = torch.load(encoder_path, map_location=device)

# The encoder checkpoint also carries the image size the model was trained
# with; inputs are resized to this size before prediction.
feed_height = loaded_dict_enc['height']
feed_width = loaded_dict_enc['width']

# Keep only the checkpoint entries that are keys of the encoder's state dict.
filtered_dict_enc = {
    key: tensor
    for key, tensor in loaded_dict_enc.items()
    if key in encoder.state_dict()
}
encoder.load_state_dict(filtered_dict_enc)
encoder.to(device)
encoder.eval()

# ---- Depth decoder: load pretrained weights ----
depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))
loaded_dict = torch.load(depth_decoder_path, map_location=device)
depth_decoder.load_state_dict(loaded_dict)
depth_decoder.to(device)
depth_decoder.eval();

**Performing depth estimation**

In [29]:
def findDepth(image_path,output_directory,ext):
    """Predict a disparity map for one image, or for every image in a folder.

    For each input image ``<name>.<ext>`` two files are written to
    ``output_directory``:

    * ``<name>_disp.npy``  - the first value returned by
      ``disp_to_depth(disp, 0.1, 100)`` for the network-resolution disparity.
    * ``<name>_disp.jpeg`` - the disparity resized to the input resolution and
      colour-mapped with matplotlib's ``magma`` colormap.

    Uses the notebook-level globals ``encoder``, ``depth_decoder``,
    ``feed_width``, ``feed_height`` and ``device``.

    Args:
        image_path: Path to a single image file, or to a directory to search.
        output_directory: Folder receiving the outputs; created if missing.
        ext: File extension (without the dot) matched when ``image_path`` is a
            directory.

    Raises:
        Exception: If ``image_path`` is neither an existing file nor directory.
    """
    # FINDING INPUT IMAGES
    if os.path.isfile(image_path):
        # Only testing on a single image
        candidates = [image_path]
    elif os.path.isdir(image_path):
        # Searching folder for images
        candidates = glob.glob(os.path.join(image_path, '*.{}'.format(ext)))
    else:
        raise Exception("Can not find image_path: {}".format(image_path))

    # Don't try to predict disparity for a disparity image. Outputs are saved
    # with a "_disp.jpeg" suffix, so that spelling must be skipped as well.
    # Filtering up front also keeps the progress-bar total accurate.
    paths = [p for p in candidates
             if not p.endswith(("_disp.jpg", "_disp.jpeg"))]

    # np.save / im.save do not create missing folders.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    to_tensor = transforms.ToTensor()

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        with tqdm(total=len(paths)) as pbar:
            for frame_path in paths:

                # Load image and preprocess: resize to the resolution the
                # network was trained with and add a batch dimension.
                input_image = pil.open(frame_path).convert('RGB')
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
                input_image = to_tensor(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                # Bring the prediction back to the original image size.
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width), mode="bilinear", align_corners=False)

                # Saving numpy file (computed from the un-resized disparity)
                output_name = os.path.splitext(os.path.basename(frame_path))[0]
                name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name))
                scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
                np.save(name_dest_npy, scaled_disp.cpu().numpy())

                # Saving colormapped depth image; the colour scale is clipped
                # at the 95th percentile of the resized disparity.
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
                im = pil.fromarray(colormapped_im)

                name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
                im.save(name_dest_im)
                pbar.update(1)


    

In [40]:
# Run depth estimation on every extracted PNG frame.
import os
import shutil

image_path = 'frames'
output_directory = 'results/depth'
ext = 'png'

# Start from an empty output folder. Unlike `!rm -r` + `!mkdir`, this works on
# a first run (creates the missing parent 'results' folder too) and does not
# print "File exists" on a re-run.
shutil.rmtree(output_directory, ignore_errors=True)
os.makedirs(output_directory, exist_ok=True)

findDepth(image_path,output_directory,ext)

mkdir: cannot create directory ‘results/depth’: File exists


100%|██████████| 100/100 [00:14<00:00,  7.14it/s]


**Download a pretrained model from Detectron2 Model Zoo**

In [20]:
# The config file and the pretrained checkpoint are both looked up in
# detectron2's model zoo under the same entry name.
model_zoo_entry = "COCO-Detection/faster_rcnn_R_50_C4_3x.yaml"

cfg = get_cfg()
# add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library
cfg.merge_from_file(model_zoo.get_config_file(model_zoo_entry))

# Find a model from detectron2's model zoo. You can use the https://dl.fbaipublicfiles... url as well
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(model_zoo_entry)

# set threshold for this model
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.9

predictor = DefaultPredictor(cfg)

## Define all the key functions

In [34]:
# define a function which return the bottom center of every bbox
def mid_point(img,person,idx,img_depth=0):
    """Draw detection ``idx`` on ``img`` and return its bottom-centre point.

    Draws, in place on ``img``, the bounding box of ``person[idx]``, a filled
    dot at the bottom centre of the box, and the index as a text label.

    Args:
        img: Image to draw on (modified in place).
        person: Indexable collection of boxes, each unpacking to
            ``(x1, y1, x2, y2)``.
        idx: Index of the box to draw.
        img_depth: Currently unused; kept for interface compatibility.

    Returns:
        ``(x_mid, y_mid)`` as ints: horizontal centre and bottom edge of the
        box.
    """
    #get the coordinates
    x1, y1, x2, y2 = person[idx]

    # The boxes presumably arrive as numpy floats (detector output). OpenCV's
    # drawing functions expect integer pixel coordinates and newer releases
    # reject non-integer points, so cast explicitly before drawing.
    top_left = (int(x1), int(y1))
    bottom_right = (int(x2), int(y2))
    cv2.rectangle(img, top_left, bottom_right, (0,0,255), 2)

    #compute bottom center of bbox (from the un-rounded coordinates)
    x_mid = int((x1+x2)/2)
    y_mid = int(y2)
    mid = (x_mid,y_mid)

    cv2.circle(img, mid, 5, (0, 0, 255), -1)
    cv2.putText(img, str(idx), mid, cv2.FONT_HERSHEY_SIMPLEX,1, (255, 255, 255), 2, cv2.LINE_AA)

    return mid

# define a function which computes euclidean distance between two midpoints
from scipy.spatial import distance
def compute_distance(midpoints,num):
    """Return a ``num`` x ``num`` matrix of pairwise Euclidean distances.

    Only the strict upper triangle is filled: entry ``[row][col]`` with
    ``row < col`` holds the distance between ``midpoints[row]`` and
    ``midpoints[col]``. The diagonal and lower triangle stay 0.
    """
    pairwise = np.zeros((num, num))
    for row in range(num):
        # Starting at row + 1 visits each unordered pair exactly once.
        for col in range(row + 1, num):
            pairwise[row][col] = distance.euclidean(midpoints[row], midpoints[col])
    return pairwise


# Finds pairs of people who are close together
def find_closest(dist,num,thresh):
    """Collect every index pair whose distance is at most ``thresh``.

    Scans ``dist[a][b]`` for ``a <= b < num`` and ignores the diagonal.

    Returns:
        Three parallel lists ``(first, second, gaps)``: the smaller index, the
        larger index and the distance of each selected pair, in scan order.
    """
    first, second, gaps = [], [], []
    for a in range(num):
        for b in range(a, num):
            gap = dist[a][b]
            within = gap <= thresh
            # A point is never paired with itself.
            if within and a != b:
                first.append(a)
                second.append(b)
                gaps.append(gap)
    return first, second, gaps


# Given pairs of people who are close, color them red
def change_2_red(img,person,p1,p2):
    """Mark pairs of people that are too close and return the image.

    For each pair ``(p1[k], p2[k])`` both detections are redrawn through
    ``mid_point`` and their bottom-centre points are joined by a line. Every
    detection that appears in any pair then gets its box drawn with the
    colour tuple ``(255, 0, 0)``.

    NOTE(review): ``(255, 0, 0)`` is blue in OpenCV's BGR channel order; it
    shows as red only after the BGR->RGB swap done when the result video is
    written - confirm this is the intended colour handling.

    Args:
        img: Image to draw on (modified in place).
        person: Indexable collection of boxes, each unpacking to
            ``(x1, y1, x2, y2)``.
        p1: Indices of the first member of each close pair.
        p2: Indices of the second member of each close pair.

    Returns:
        The same ``img`` object, annotated.
    """
    mid1 = [mid_point(img, person, p) for p in p1]
    mid2 = [mid_point(img, person, pp) for pp in p2]
    for start, end in zip(mid1, mid2):
        cv2.line(img, start, end, (0,255,0), thickness=2, lineType=8, shift=0)

    # Every index that takes part in at least one close pair, each drawn once.
    risky = sorted(set(p1) | set(p2))
    for i in risky:
        x1, y1, x2, y2 = person[i]
        # Cast to int: OpenCV's drawing functions expect integer pixel
        # coordinates, while the boxes presumably hold numpy floats.
        cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (255,0,0), 2)
    return img


# Main function to find closest people
def find_closest_people(name,thresh,savedir,framedir='frames'):
    """Detect people in one frame, mark those too close, and save the result.

    Runs the notebook-level ``predictor`` on ``framedir/name``, keeps the
    detections with class id 0 (treated as "person" here), measures pixel
    distances between the bottom-centre points of their boxes, highlights all
    pairs closer than or equal to ``thresh`` and writes the annotated image
    to ``savedir/name``.

    Args:
        name: File name of the frame.
        thresh: Maximum distance, in pixels, for two people to count as close.
        savedir: Existing folder that receives the annotated frame.
        framedir: Folder holding the input frames (defaults to ``'frames'``).

    Returns:
        Always 0.

    Raises:
        FileNotFoundError: If the input frame cannot be read.
        OSError: If the annotated frame cannot be written.
    """
    frame_path = framedir + '/' + name
    img = cv2.imread(frame_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("Could not read frame: {}".format(frame_path))

    instances = predictor(img)['instances']
    classes = instances.pred_classes.cpu().numpy()
    bbox = instances.pred_boxes.tensor.cpu().numpy()
    # Keep only the boxes predicted as class 0.
    person = bbox[classes == 0]

    midpoints = [mid_point(img,person,i) for i in range(len(person))]
    num = len(midpoints)
    dist = compute_distance(midpoints,num)
    p1,p2,d = find_closest(dist,num,thresh)
    img = change_2_red(img,person,p1,p2)

    out_path = savedir + '/' + name
    # cv2.imwrite returns False (e.g. when the folder is missing) instead of
    # raising; surface that instead of silently producing no output.
    if not cv2.imwrite(out_path, img):
        raise OSError("Could not write annotated frame: {}".format(out_path))
    return 0

**Fetch all the frames of the video sequence**

In [21]:
# Names of all PNG files in "frames/", in ascending name order.
frames = sorted(
    entry for entry in os.listdir("frames/") if entry.endswith(".png")
)

**Fetch all the frame depths of the video sequence**

In [36]:
# Names of all .npy files in "results/depth/", in ascending name order.
frame_depths = sorted(
    entry for entry in os.listdir("results/depth/") if entry.endswith(".npy")
)

**Main loop to get results**

In [35]:
# Annotate every extracted frame and store the result in `output_directory`.
import os

# Two people whose bottom-centre points are at most this many pixels apart
# are flagged as too close.
thresh = 100
output_directory = 'results/frames'

# cv2.imwrite does not create missing folders, so make sure the target exists.
os.makedirs(output_directory, exist_ok=True)

for frame_name in tqdm(frames):
    find_closest_people(frame_name, thresh, output_directory)

100%|██████████| 100/100 [00:38<00:00,  2.58it/s]


## Save results

In [None]:
%%time
# Assemble the annotated frames into a video.
import os

# The main loop writes its annotated frames to 'results/frames', so read them
# from there (the old 'frames2/' folder is not produced by this notebook).
results_folder = 'results/frames'
frames = sorted(
    entry for entry in os.listdir(results_folder) if entry.endswith(".png")
)

out = None
for frame_name in frames:
    #reading each file
    img = cv2.imread(os.path.join(results_folder, frame_name))
    if img is None:
        # cv2.imread returns None for unreadable files; skip them.
        print("Skipping unreadable frame: {}".format(frame_name))
        continue

    # NOTE(review): cv2.VideoWriter presumably expects BGR frames (what
    # cv2.imread returns), so this conversion swaps red and blue in the output
    # video - kept as in the original; confirm it is intended.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    if out is None:
        # Open the writer lazily: the frame size is only known once the first
        # frame has been read.
        height, width, layers = img.shape
        size = (width, height)
        out = cv2.VideoWriter('sample_output2.mp4', cv2.VideoWriter_fourcc(*'DIVX'), FPS, size)

    # Write each frame as soon as it is read instead of buffering the whole
    # video in memory first.
    out.write(img)

if out is None:
    print("No frames found in {}; no video written".format(results_folder))
else:
    out.release()