In [4]:
from torchvision.models.detection import FasterRCNN
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
import numpy as np
import cv2

In [6]:
# Load a pre-trained ResNet50 backbone
backbone = resnet_fpn_backbone('resnet50', pretrained=True)

num_classes = 91

# Create the Faster R-CNN model with ResNet50 backbone
model = FasterRCNN(backbone=backbone, num_classes=num_classes)

In [7]:
classes = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [8]:
transform = transforms.Compose([
    transforms.ToTensor(),
])
COLORS = np.random.uniform(0, 255, size=(len(classes), 3))

In [9]:
from torch.autograd import Variable
def predict(image, model, device, detection_threshold=0.8):
    """
    Predict the output of an image after forward pass through
    the model and return the bounding boxes, class names, and 
    class labels. 
    """
    # Transform the image to tensor.
    image = transform(image).to(device)
    # Add a batch dimension.
    image = image.unsqueeze(0) 
    
    # Get the predictions on the image.
    with torch.no_grad():
        outputs = model(image) 
    # Get score for all the predicted objects.
    pred_scores = outputs[0]['scores'].detach().cpu().numpy()
    # Get all the predicted bounding boxes.
    pred_bboxes = outputs[0]['boxes'].detach().cpu().numpy()
    # Get boxes above the threshold score.
    boxes = pred_bboxes[pred_scores >= detection_threshold].astype(np.int32)
    pred_scores=pred_scores[pred_scores >= detection_threshold].astype(np.int32)
    labels = outputs[0]['labels'][:len(boxes)]
    # Get all the predicited class names.
    # print(labels.cpu().numpy())
    pred_classes = [classes[i] for i in labels.cpu().numpy()]
    return boxes, pred_scores, pred_classes, labels
    # return outputs


def draw_boxes(boxes, classes, labels, image):
    """
    Draws the bounding box around a detected object.
    """
    lw = max(round(sum(image.shape) / 2 * 0.003), 2)  # Line width.
    tf = max(lw - 1, 1) # Font thickness.
    for i, box in enumerate(boxes):
        # print(box)
        color = COLORS[labels[i]]
        cv2.rectangle(
            img=image,
            pt1=(int(box[0]), int(box[1])),
            pt2=(int(box[2]), int(box[3])),
            color=color[::-1], 
            thickness=lw
        )
        cv2.putText(
            img=image, 
            text=classes[i], 
            org=(int(box[0]), int(box[1]-5)),
            fontFace=cv2.FONT_HERSHEY_SIMPLEX, 
            fontScale=lw / 3, 
            color=color[::-1], 
            thickness=tf, 
            lineType=cv2.LINE_AA
        )
    return image

In [10]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.eval().to(device)

In [11]:
def non_max_suppression(boxes, max_bbox_overlap, scores=None):
    """Suppress overlapping detections.
    Original code from [1]_ has been adapted to include confidence score.
    .. [1] http://www.pyimagesearch.com/2015/02/16/
           faster-non-maximum-suppression-python/
    Examples
    --------
        >>> boxes = [d.roi for d in detections]
        >>> scores = [d.confidence for d in detections]
        >>> indices = non_max_suppression(boxes, max_bbox_overlap, scores)
        >>> detections = [detections[i] for i in indices]
    Parameters
    ----------
    boxes : ndarray
        Array of ROIs (x, y, width, height).
    max_bbox_overlap : float
        ROIs that overlap more than this values are suppressed.
    scores : Optional[array_like]
        Detector confidence score.
    Returns
    -------
    List[int]
        Returns indices of detections that have survived non-maxima suppression.
    """
    if len(boxes) == 0:
        return []

    boxes = boxes.astype(np.float)
    pick = []

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2] #+ boxes[:, 0]
    y2 = boxes[:, 3] #+ boxes[:, 1]

    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    if scores is not None:
        idxs = np.argsort(scores)
    else:
        idxs = np.argsort(y2)

    while len(idxs) > 0:
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)

        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])

        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        overlap = (w * h) / area[idxs[:last]]

        idxs = np.delete(
            idxs, np.concatenate(
                ([last], np.where(overlap > max_bbox_overlap)[0])))

    return pick

In [12]:
# from PIL import Image
# image = Image.open("/content/car_new.jpeg").convert('RGB')
# # Create a BGR copy of the image for annotation.
# image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# # Detect outputs.
# with torch.no_grad():
#     boxes, scores, classes_, labels = predict(image, model, device)
# # Draw bounding boxes.
# # print(boxes)
# image = draw_boxes(boxes, classes_, labels, image_bgr)
# # save_name = f"{args['input'].split('/')[-1].split('.')[0]}_t{''.join(str(args['threshold']).split('.'))}_{args['model']}"
# # cv2.imshow('Image', image)
# cv2.imwrite("detection.jpg", image)


In [13]:
!pip install filterpy



In [16]:
import cv2

def cv2_imshow(image):
    cv2.imshow('Image', image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

from IPython.display import clear_output
from PIL import Image
# from utils import *
from sort_frcnn import *
tracker=Sort()
vid = cv2.VideoCapture("/content/video1.mp4")
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('output.mp4', fourcc, vid.get(cv2.CAP_PROP_FPS),(int(vid.get(3)),int(vid.get(4))))
uniqueCars={}
while(True):
    ret, frame_bgr = vid.read()
    if(ret==False):
      break
    frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    pilimg = Image.fromarray(frame)
    with torch.no_grad():
        boxes, scores, classes_, labels = predict(pilimg, model, device)
    
    detections=np.concatenate([boxes,scores.reshape(-1,1)],axis=1)
    indices = non_max_suppression(boxes, 0.4, scores)
    detections = transform(np.array([detections[i] for i in indices]))[0].to(device)
    classes_=np.array([classes_[i] for i in indices])
    labels=np.array([labels.cpu().numpy()[i] for i in indices])
    
    tracked_objects = tracker.update(detections.cpu())
    for i,t in enumerate(tracked_objects):
      x1,y1,x2,y2,id=t
      if(classes_[i]=="car"):
        uniqueCars[id]="car"
        bx=np.array([x1,y1,x2,y2]).reshape(1,-1)
        frame_bgr = draw_boxes(bx, np.array(["car-"+str(int(id))]),np.array([int(labels[i])]), frame_bgr)
    clear_output(wait=True)
    out.write(frame_bgr)
print(len(uniqueCars))
out.release()

0
