# Social Distance Tool with depth

This tool combines two algorithms to accurately detect people who are violating the social distancing protocol:
- Facebook/Detectron2 (Faster RCNN implementation)`https://github.com/facebookresearch/detectron2`
- "Digging into Self-Supervised Monocular Depth Prediction" `https://github.com/nianticlabs/monodepth2`

**Input:**
- A video sequence

**Output:**
- bounding boxes on all persons detected in the video
- highlighing people who are in close proximity
- depth map for accurate calculations 
***

## Code
**Import libraries for Detectron2**

In [1]:
# !python -m detectron2.utils.collect_env # to check if Detectron2 is working fine
# Some basic setup:
# Setup detectron2 logger
import detectron2
import open3d as o3d
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import cv2
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

**Import libraries and files for MonoDepth2 algorithm**

In [2]:
# Libraries for monodepth2
from __future__ import absolute_import, division, print_function
%matplotlib inline

import os
import PIL.Image as pil
import glob

import torch
from torchvision import transforms

import monodepth2.networks as networks
from monodepth2.utils import download_model_if_doesnt_exist
from monodepth2.layers import disp_to_depth
import matplotlib as mpl
import matplotlib.cm as cm

**Define key variables**

In [3]:
frames_folder = 'frames'
result_folder = 'results'
depths_folder = 'results/depth'
frame_count = 241 # Number of frames to consider in the video (use less for faster calculations) # None will take all frames
#video = "onwater_1.m4v"
img_ext = 'png'

**Convert Video to PNG Frames**

In [4]:
#!rm -r $frames_folder/*
#!mkdir $frames_folder/

#specify path to video

#capture video
"""
cap = cv2.VideoCapture(video)
cnt=0
FPS=cap.get(cv2.CAP_PROP_FPS)
# Check if video file is opened successfully
if (cap.isOpened()== False): 
  print("Error opening video stream or file")

ret,first_frame = cap.read()

#Read until video is completed
with tqdm(total=frame_count) as pbar:
    while(cap.isOpened()):

      # Capture frame-by-frame
      ret, frame = cap.read()
      pbar.update(1)
      if ret == True:

        #save each frame to folder        
        cv2.imwrite(frames_folder+'/{:04d}'.format(cnt)+'.png', frame)
        cnt=cnt+1
        if(cnt==frame_count) and frame_count != None:
          break
      # Break the loop
      else: 
        break
"""

mkdir: cannot create directory ‘frames/’: File exists


100%|██████████| 1/1 [00:00<00:00,  7.67it/s]


**Loading MonoDept2 pretrained model**

In [5]:

if torch.cuda.is_available(): # and not args.no_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

"""
model_name = "mono+stereo_640x192"
download_model_if_doesnt_exist(model_name)
encoder_path = os.path.join("monodepth2/models", model_name, "encoder.pth")
depth_decoder_path = os.path.join("monodepth2/models", model_name, "depth.pth")


# LOADING PRETRAINED MODEL
encoder = networks.ResnetEncoder(18, False)
loaded_dict_enc = torch.load(encoder_path, map_location=device)
filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
encoder.load_state_dict(filtered_dict_enc)

# extract the height and width of image that this model was trained with
feed_height = loaded_dict_enc['height']
feed_width = loaded_dict_enc['width']


encoder.to(device)
encoder.eval();

# LOADING PRETRAINED MODEL
depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))
loaded_dict = torch.load(depth_decoder_path, map_location=device)
depth_decoder.load_state_dict(loaded_dict)

depth_decoder.to(device)
depth_decoder.eval();
"""

  return torch._C._cuda_getDeviceCount() > 0


'\nmodel_name = "mono+stereo_640x192"\ndownload_model_if_doesnt_exist(model_name)\nencoder_path = os.path.join("monodepth2/models", model_name, "encoder.pth")\ndepth_decoder_path = os.path.join("monodepth2/models", model_name, "depth.pth")\n\n\n# LOADING PRETRAINED MODEL\nencoder = networks.ResnetEncoder(18, False)\nloaded_dict_enc = torch.load(encoder_path, map_location=device)\nfiltered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}\nencoder.load_state_dict(filtered_dict_enc)\n\n# extract the height and width of image that this model was trained with\nfeed_height = loaded_dict_enc[\'height\']\nfeed_width = loaded_dict_enc[\'width\']\n\n\nencoder.to(device)\nencoder.eval();\n\n# LOADING PRETRAINED MODEL\ndepth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))\nloaded_dict = torch.load(depth_decoder_path, map_location=device)\ndepth_decoder.load_state_dict(loaded_dict)\n\ndepth_decoder.to(device)\ndepth_decoder.eval();\n

**Performing depth estimation**

In [6]:
def findDepth(image_path,output_directory,ext):
    # FINDING INPUT IMAGES
    if os.path.isfile(image_path):
        # Only testing on a single image
        paths = [image_path]
        #output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(image_path, '*.{}'.format(ext)))
        #output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(image_path))
    #print("-> Predicting on {:d} test images".format(len(paths)))


    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        with tqdm(total=len(paths)) as pbar:
            for idx, image_path in (enumerate(paths)):

                if image_path.endswith("_disp.jpg"):
                    # don't try to predict disparity for a disparity image!
                    continue

                # Load image and preprocess
                input_image = pil.open(image_path).convert('RGB')
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width), mode="bilinear", align_corners=False)

                print("Output of network resized:")
                print(np.shape(disp_resized))
                # Saving numpy file
                output_name = os.path.splitext(os.path.basename(image_path))[0]
                name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name))
                _, scaled_disp = disp_to_depth(disp, 3.0, 5000)
                np.save(name_dest_npy, scaled_disp.cpu().numpy())
                
                print("Absolute depth size:")
                print(np.shape(scaled_disp))
                #print(np.shape(disp))
                #break
                
                # Saving colormapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                
                #print(disp_resized_np)
                
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
                
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
                
                #print(np.shape(colormapped_im))
                #break
                
                im = pil.fromarray(colormapped_im)
                
                name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
                im.save(name_dest_im)
                pbar.update(1)

In [7]:
#!rm -r $depths_folder/*
!mkdir $depths_folder
#findDepth(frames_folder,depths_folder,img_ext)

mkdir: cannot create directory ‘results/depth’: File exists


**Download a pretrained model from Detectron2 Model Zoo for Faster-RCNN**

In [8]:
cfg = get_cfg()

# add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library
cfg.merge_from_file(model_zoo.get_config_file("COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model

# Find a model from detectron2's model zoo. You can use the https://dl.fbaipublicfiles... url as well
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml")
cfg.MODEL.DEVICE='cpu'
predictor = DefaultPredictor(cfg)

INFO - 2021-02-05 14:17:11,043 - checkpoint - Loading checkpoint from https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/model_final_cafdb1.pkl
INFO - 2021-02-05 14:17:11,055 - file_io - URL https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/model_final_cafdb1.pkl cached in /home/aellaboudy/.torch/iopath_cache/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/model_final_cafdb1.pkl
INFO - 2021-02-05 14:17:14,454 - detection_checkpoint - Reading a file from 'Detectron2 Model Zoo'


### Define all the key functions

In [9]:
# define a function which return the bottom center of every bbox
def mid_point(img,img_depth,person,idx):
  #get the coordinates
  x1,y1,x2,y2 = person[idx]
  _ = cv2.rectangle(img, (x1, y1), (x2, y2), (0,0,255), 2)
  
  #compute bottom center of bbox
  x_mid = int((x1+x2)/2)
  y_mid = int(y2)
  mid   = (x_mid-1,y_mid-1)
  print("mid=",mid)
  print("img_depth",img_depth.shape)
  z_mid = img_depth[(y_mid-1,x_mid-1)]
  mid3d = (x_mid,y_mid,z_mid)
    
  _ = cv2.circle(img, mid, 5, (0, 0, 255), -1)
  cv2.putText(img, str(z_mid) + " m", mid, cv2.FONT_HERSHEY_SIMPLEX,1, (255, 255, 255), 2, cv2.LINE_AA)
  return mid3d

# define a function which computes euclidean distance between two midpoints
from scipy.spatial import distance
def compute_distance(midpoints,num):
  dist = np.zeros((num,num))
  for i in range(num):
    for j in range(i+1,num):
      if i!=j:
        dst = distance.euclidean(midpoints[i], midpoints[j])
        dist[i][j]=dst
  return dist


# Finds pairs of people who are close together
def find_closest(dist,num,thresh):
  p1=[]
  p2=[]
  d=[]
  for i in range(num):
    for j in range(i,num):
      if( (i!=j) & (dist[i][j]<=thresh)):
        p1.append(i)
        p2.append(j)
        d.append(dist[i][j])
  return p1,p2,d


# Given pairs of people who are close, color them red
def change_2_red(img,img_depth,person,p1,p2):
  mid1 = []
  mid2 = []
  for p in p1:
    mid1.append(mid_point(img,img_depth,person,p))
  for pp in p2:
    mid2.append(mid_point(img,img_depth,person,pp))
  for inx in range(len(mid1)):
      #print("mid1",mid1[inx][:2])
      _ = cv2.line(img, mid1[inx][:2], mid2[inx][:2], (0,255,0), thickness=2, lineType=8, shift=0)
  
  risky = np.unique(p1+p2)
  for i in risky:
    x1,y1,x2,y2 = person[i]
    _ = cv2.rectangle(img, (x1, y1), (x2, y2), (255,0,0), 2)  
  return img


# Main function to find closest people
def find_closest_people(name,name_depth,thresh,savedir):

  img = cv2.imread(name)
  depth = np.load(name_depth)
  print("NPY file shape")
  print(depth.shape)
  original_height, original_width,_ = img.shape # (1920,1080) #input_image.size
  print("Original width, original_height:")
  print(original_width)
  print(original_height)
  disp_resized = torch.nn.functional.interpolate(
                    torch.from_numpy(depth).unsqueeze(0).unsqueeze(0), (original_height, original_width), mode="bilinear", align_corners=False)
  img_depth = disp_resized.squeeze().cpu().numpy()

  print("Depth resized shape, input to algorithms")
  print(disp_resized.shape)
  outputs = predictor(img)
  classes=outputs['instances'].pred_classes.cpu().numpy()
  bbox=outputs['instances'].pred_boxes.tensor.cpu().numpy()
  ind = np.where(classes>-1)[0]
  person=bbox[ind]
  midpoints = [mid_point(img,img_depth,person,i) for i in range(len(person))]
  num = len(midpoints)
  dist= compute_distance(midpoints,num)
  p1,p2,d=find_closest(dist,num,thresh)
  img = change_2_red(img,img_depth,person,p1,p2)
  cv2.imwrite(savedir+'/'+name,img)
  #print(savedir+'/'+name)
  return 0

**Fetch all the frames of the video sequence**

In [10]:
frames=[]
for file in os.listdir(frames_folder):
    if file.endswith(".png"):
        frames.append(frames_folder+"/"+file)
frames.sort()

**Fetch all the frame depths of the video sequence**

In [11]:
frame_depths=[]
for file in os.listdir(depths_folder):
    if file.endswith(".npy"):
        frame_depths.append(depths_folder+"/"+file)
frame_depths.sort()


### Main loop to get results

In [12]:
#from tqdm import tqdm
thresh=110
with tqdm(total=len(frames)) as pbar:
    for i in range(len(frames)):
        find_closest_people(frames[i],frame_depths[i],thresh,result_folder)
        pbar.update(1)
    
#_ = [find_closest_people(frames[i],thresh,'frames2') for i in tqdm(range(len(frames))) ]

  0%|          | 0/1 [00:00<?, ?it/s]

NPY file shape
(1080, 1920)
Original width, original_height:
1920
1080
Depth resized shape, input to algorithms
torch.Size([1, 1, 1080, 1920])


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  filter_inds = filter_mask.nonzero()
100%|██████████| 1/1 [00:09<00:00,  9.41s/it]

mid= (1011, 1074)
img_depth (1080, 1920)
mid= (1681, 752)
img_depth (1080, 1920)
mid= (830, 625)
img_depth (1080, 1920)
mid= (1774, 639)
img_depth (1080, 1920)
mid= (274, 626)
img_depth (1080, 1920)
mid= (321, 644)
img_depth (1080, 1920)
mid= (1769, 639)
img_depth (1080, 1920)
mid= (1774, 639)
img_depth (1080, 1920)
mid= (274, 626)
img_depth (1080, 1920)
mid= (1769, 639)
img_depth (1080, 1920)
mid= (321, 644)
img_depth (1080, 1920)


100%|██████████| 1/1 [00:09<00:00,  9.42s/it]


## Save results

**Main file with highlighed results**

In [13]:
frames=[]
for file in os.listdir(result_folder+"/frames/"):
    if file.endswith(".png"):
        frames.append(file)
frames.sort()

frame_array=[]
with tqdm(total=len(frames)) as pbar:
    for i in range(len(frames)):

        #reading each files
        img = cv2.imread(result_folder+'/frames/'+frames[i])
        img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)

        height, width, layers = img.shape
        size = (width,height)

        #inserting the frames into an image array
        frame_array.append(img)
        pbar.update(1)

out = cv2.VideoWriter(result_folder+'/result.mp4',cv2.VideoWriter_fourcc(*'DIVX'), FPS, size)
 
with tqdm(total=len(frames)) as pbar:
    for i in range(len(frame_array)):
        # writing to a image array
        out.write(frame_array[i])
        pbar.update(1)
out.release()

100%|██████████| 10/10 [00:00<00:00, 12.77it/s]
100%|██████████| 10/10 [00:00<00:00, 37.75it/s]


**Depth map video**

In [14]:
frames_depth=[]
for file in os.listdir(depths_folder):
    if file.endswith(".jpeg"):
        frames_depth.append(file)
frames_depth.sort()

frame_array=[]
with tqdm(total=len(frames_depth)) as pbar:
    for i in range(len(frames_depth)):

        #reading each files
        img = cv2.imread(depths_folder+'/'+frames_depth[i])
        #img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
        

        height, width, layers = img.shape
        size = (width,height)

        #inserting the frames into an image array
        frame_array.append(img)
        pbar.update(1)

out = cv2.VideoWriter(result_folder+'/depth_result.mp4',cv2.VideoWriter_fourcc(*'DIVX'), FPS, size)
 
with tqdm(total=len(frames_depth)) as pbar:
    for i in range(len(frame_array)):
        # writing to a image array
        out.write(frame_array[i])
        pbar.update(1)
out.release()

0it [00:00, ?it/s]
0it [00:00, ?it/s]
