In [1]:
from base64 import b64encode
from PIL import Image
from tqdm.notebook import tqdm
import cv2
import numpy as np
import os
import torch
import random
import pafy

In [2]:
class ObjectDetection:
    def __init__(self, path, confidence, video):
        
        self.model = self.load_model()
        self.classes = self.model.names
        self.video = video
        self.path = path
        self.confidence = confidence
        self.pixels = {}
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        

    def load_model(self):

        model = torch.hub.load('ultralytics/yolov5', 'yolov5x',
                       pretrained=True, verbose=False)
        
        return model
    
    
    def generate_pixels_colors(self):
        n_red = random.randint(0,255)
        n_green = random.randint(0,255)
        n_blue = random.randint(0,255)
        
        return (n_red, n_green, n_blue)
    
    
           
    def detect_object_on_frame(self, img):
        '''Detect car on a frame and draw the rectangles and lines.'''
        self.model.to(self.device)
        
        results = self.model([img[:, :, ::-1]])  # Pass the frame through the model and get the boxes

        xyxy = results.xyxy[0].cpu().numpy() # img1 predictions (pandas) to numpy
        #      xmin    ymin    xmax   ymax  confidence  class    name
        # 0  749.50   43.50  1148.0  704.5    0.874023      0     xx
        # 1  433.50  433.50   517.5  714.5    0.687988     27     xx
        # 2  114.75  195.75  1095.0  708.0    0.624512      0     xx
        # 3  986.00  304.00  1028.0  420.0    0.286865     27     xx

        xyxy = xyxy[xyxy[:, 4] >= self.confidence]  # Filter desired confidence
        xyxy = xyxy[:, :6]
                
        for i, (x1, y1, x2, y2, conf, class_) in enumerate(xyxy):
            # Draw the boxes
            class_detected = self.classes[int(class_)]
            
            for ii in range(len(xyxy)):
                 if class_detected not in self.pixels.keys():
                    (r, g, b) = self.generate_pixels_colors()
                    self.pixels[class_detected] = (r, g, b)
            
            img = cv2.rectangle(img, (x1, y1), (x2, y2), (self.pixels[class_detected][:]), 1)
            cv2.putText(img, class_detected, (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.65, 
                        (self.pixels[class_detected]), 2)
            
            
        return img


    def __call__(self):
        '''Detect people on a video and draw the rectangles and lines.'''
        if self.video == True:
            # Capture video
            
            cap = cv2.VideoCapture(self.path)
             
            # Get video properties
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            # Define the codec and create VideoWriter object
            fourcc = cv2.VideoWriter_fourcc(*'XVID')
            if os.path.exists('output.avi'):
                os.remove('output.avi')
            out = cv2.VideoWriter('output.avi', fourcc, fps, (width, height))

            # Iterate through frames and detect people
            vidlen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            with tqdm(total=vidlen) as pbar:
                while cap.isOpened():
                    # Read a frame
                    ret, frame = cap.read()
                    # If it's ok
                    if ret == True:
                        frame = self.detect_object_on_frame(frame)
                        # Write new video
                        out.write(frame)
                        pbar.update(1)
                    else:
                        break

            # Release everything if job is finished
            cap.release()
            out.release()
            cv2.destroyAllWindows()
        
        else:
            
            img = cv2.imread(self.path)
     
            cv2.imshow('image', self.detect_object_on_frame(img))
            cv2.waitKey(0)

In [3]:
a = ObjectDetection(r"C:\Users\bruno\Desktop\avenue.mp4", 0.5, video=True)
a()

Fusing layers... 
Model Summary: 476 layers, 87730285 parameters, 0 gradients
Adding AutoShape... 
YOLOv5  2021-6-23 torch 1.9.0 CUDA:0 (NVIDIA GeForce GTX 1060 6GB, 6144.0MB)



  0%|          | 0/59717 [00:00<?, ?it/s]

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
  img = cv2.rectangle(img, (x1, y1), (x2, y2), (self.pixels[class_detected][:]), 1)
  cv2.putText(img, class_detected, (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.65,
