## YOLOv9 Detection + Yaw Estimation

In [10]:
video_name = 'video6'

In [11]:
from ultralytics import YOLO
from deepbox.deepbox import Deepbox

import cv2
import json
import torch
import numpy as np

# Load YOLOv9
device_name = 'cpu'
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device_name = '0'
    yolov9 = YOLO("yolov9c.pt").to('cuda')
    print('Detecting using GPU...')
else:
    device_name = 'cpu'
    yolov9 = YOLO("yolov9c.pt").to('cpu')
    print('Detecting using CPU...')

# Load Deepbox
deepbox = Deepbox()

# Read video file
input_file_name = f'video/{video_name}.mp4'
cap = cv2.VideoCapture(input_file_name)

# get height, width and frame count of the video
width, height = (
        int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    )
no_of_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
proc_frames = 0

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter()
output_file_name = f"output/{video_name}_yolov9.mp4"
out.open(output_file_name, fourcc, fps, (width, height), True)

frames = []
frames_detections = []
try:
    for f in range(no_of_frames):
        ret, frame = cap.read()
        if not ret:
            break
        
        # yolov9 detection
        result = yolov9.predict(frame)[0]
        frame_detections = []
        for i, box in enumerate(result.boxes):
            accepted_class_names = ['car', 'bus', 'truck']
            class_index = int(result.boxes.cls[i])
            conf = float(result.boxes.conf[i])
            xywh = result.boxes.xywh[i]

            x1 = int(xywh[0] - xywh[2]/2)
            y1 = int(xywh[1] - xywh[3]/2)
            x2 = int(xywh[0] + xywh[2]/2)
            y2 = int(xywh[1] + xywh[3]/2)

            det = xywh.tolist() +  [class_index, conf]
            if result.names[class_index] not in accepted_class_names:
                continue

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 3)
            cv2.putText(frame, f'{det[5]:.2f}', (int(det[0]), int(det[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
            
            frame_detections.append(det)
        
        # yaw estimation
        detc_2ds = []
        for det in frame_detections:
            class_name = result.names[det[4]]
            xywh = det
            x1 = int(xywh[0] - xywh[2]/2)
            y1 = int(xywh[1] - xywh[3]/2)
            x2 = int(xywh[0] + xywh[2]/2)
            y2 = int(xywh[1] + xywh[3]/2)
            detc_2ds.append([class_name, [x1, y1, x2, y2]])
        yaws = deepbox.predict(frame, detc_2ds)
        yaw_degrees = [np.rad2deg(yaw) for yaw in yaws]

        # convert to frame detections [x, y, w, h, yaw, class_index, conf]
        frame_detections = [[det[0], det[1], det[2], det[3], yaw_degrees[i], det[4], det[5]] for i, det in enumerate(frame_detections)]

        out.write(frame)
        frames_detections.append(frame_detections)

    # Add to frames detections
    with open(f'output/{video_name}_yolov9.json', 'w', encoding='utf-8') as f:
        json.dump({ "final_frames_detections": frames_detections }, f, ensure_ascii=False, indent=4)

except Exception:
    import traceback
    print(traceback.format_exc())
finally:
    # Release resources
    cap.release()
    out.release()

Detecting using GPU...

0: 384x640 1 person, 1 car, 400.7ms
Speed: 33.9ms preprocess, 400.7ms inference, 69.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 car, 370.6ms
Speed: 4.0ms preprocess, 370.6ms inference, 46.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 370.5ms
Speed: 4.1ms preprocess, 370.5ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 371.2ms
Speed: 4.0ms preprocess, 371.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 371.9ms
Speed: 3.0ms preprocess, 371.9ms inference, 3.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 car, 371.1ms
Speed: 4.2ms preprocess, 371.1ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 371.4ms
Speed: 3.5ms preprocess, 371.4ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 370.6ms
Speed: 4.2ms prepro

In [12]:
%%capture
!ffmpeg -i video/video6.mp4 -f mp3 -ab 192000 -vn output/video6_audio.mp3 -y
!ffmpeg -i output/video6_yolov9.mp4 -i output/video6_audio.mp3 -c:v libx264 -c:a copy -map 0:v:0 -map 1:a:0 output/video6_yolov9_audio.mp4 -y