## YOLOv9 Detection + Yaw Estimation

In [1]:
# Base name (no extension) of the clip to process; later cells read
# video/{video_name}.mp4 and write their results under output/{video_name}_*.
video_name = 'video1'

In [4]:
from ultralytics import YOLO
from yaen.yaen import YAEN
import cv2
import json
import torch
from yaen.models.baselines import *

# Load YOLOv9
# device_name is the device string handed to YAEN: '0' (first CUDA device) or 'cpu'.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device_name = '0'
    yolov9 = YOLO("yolov9c.pt").to('cuda')
    print('Detecting using GPU...')
else:
    device_name = 'cpu'
    yolov9 = YOLO("yolov9c.pt").to('cpu')
    print('Detecting using CPU...')

# Load YAEN
yaen = YAEN(device_name=device_name)

# Read video file
input_file_name = f'video/{video_name}.mp4'
cap = cv2.VideoCapture(input_file_name)

# get height, width and frame count of the video
width, height = (
    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
)
# The frame count is only the container's estimate; the loop below stops on
# the first failed read instead of relying on it.
no_of_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Keep the fractional frame rate (e.g. 29.97): truncating it to an int makes
# the written video run at a different speed than the source audio.
fps = cap.get(cv2.CAP_PROP_FPS)
proc_frames = 0

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter()
output_file_name = f"output/{video_name}_yolov9.mp4"
out.open(output_file_name, fourcc, fps, (width, height), True)

# Only detections whose YOLO class name is listed here are kept.
accepted_class_names = {'car', 'bus', 'truck'}

frames = []
# One entry per processed frame; each entry is a list of detections laid out
# as [cx, cy, w, h, yaw, class_index, conf].
frames_detections = []
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        result = yolov9.predict(frame)[0]
        frame_h, frame_w = frame.shape[:2]
        # Draw on a copy so that crops taken for later boxes in this frame do
        # not contain rectangles/text drawn for earlier ones.
        annotated = frame.copy()
        frame_detections = []
        boxes = result.boxes
        for cls, score, xywh in zip(boxes.cls, boxes.conf, boxes.xywh):
            class_index = int(cls)
            # Filter first: no point estimating yaw for a rejected class.
            if result.names[class_index] not in accepted_class_names:
                continue
            conf = float(score)
            cx, cy, w, h = xywh.tolist()

            # Centre/size -> corners, clamped to the image so a slightly
            # out-of-frame box cannot produce a negative (wrapping) slice index.
            x1 = max(0, int(cx - w / 2))
            y1 = max(0, int(cy - h / 2))
            x2 = min(frame_w, int(cx + w / 2))
            y2 = min(frame_h, int(cy + h / 2))
            cropped = frame[y1:y2, x1:x2]

            # Default yaw of 0 is kept when the crop is empty or YAEN returns
            # no prediction.
            yaw = 0
            if cropped.size > 0:
                yaw_predict = yaen.predict_single(cropped)
                if len(yaw_predict) >= 1:
                    yaw = yaw_predict[0]

            det = [cx, cy, w, h, yaw, class_index, conf]

            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 3)
            # Label shows the detection confidence (det[6]) at the box centre.
            cv2.putText(annotated, f'{det[6]:.2f}', (int(det[0]), int(det[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

            frame_detections.append(det)
        out.write(annotated)
        frames_detections.append(frame_detections)

    # Add to frames detections
    with open(f'output/{video_name}_yolov9.json', 'w', encoding='utf-8') as json_file:
        json.dump({ "final_frames_detections": frames_detections }, json_file, ensure_ascii=False, indent=4)

except Exception:
    import traceback
    print(traceback.format_exc())
finally:
    # Release resources
    cap.release()
    out.release()

Detecting using GPU...

0: 384x640 8 cars, 126.3ms
Speed: 2.7ms preprocess, 126.3ms inference, 305.0ms postprocess per image at shape (1, 3, 384, 640)


  return F.conv2d(input, weight, bias, self.stride,



0: 384x640 9 cars, 102.0ms
Speed: 3.0ms preprocess, 102.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 cars, 101.9ms
Speed: 2.0ms preprocess, 101.9ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 102.0ms
Speed: 2.0ms preprocess, 102.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 103.0ms
Speed: 2.0ms preprocess, 103.0ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 103.1ms
Speed: 2.0ms preprocess, 103.1ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 cars, 103.0ms
Speed: 2.0ms preprocess, 103.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 cars, 102.0ms
Speed: 3.0ms preprocess, 102.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 102.1ms
Speed: 2.0ms preprocess, 102.1ms inference, 3.0ms postprocess per image at shape (1, 3,

In [14]:
# Sanity-check YAEN on a single still image.
image_path = './images/input7.png'
img = cv2.imread(image_path)
# cv2.imread does not raise on a missing/unreadable file; it returns None,
# which would otherwise surface as an obscure error inside predict_single.
if img is None:
    raise FileNotFoundError(f'Could not read image: {image_path}')
# Left as the final bare expression so the notebook displays the prediction.
yaen.predict_single(img)

[200.38710021972656, 293.33160400390625]

In [7]:
%%capture
!ffmpeg -i video/video1.mp4 -f mp3 -ab 192000 -vn output/video1_audio.mp3 -y
!ffmpeg -i output/video1_yolov9.mp4 -i output/video1_audio.mp3 -c:v libx264 -c:a copy -map 0:v:0 -map 1:a:0 output/video1_yolov9_audio.mp4 -y

In [6]:
import cv2
import json

# Re-render the annotated video from the saved detections JSON, without
# re-running the detector. NOTE: this overwrites output/{video_name}_yolov9.mp4.
yolov9_file_name = f'output/{video_name}_yolov9.mp4'

# Load the detections first so a missing/corrupt JSON file fails before any
# video handle is opened. The file is written as UTF-8, so read it the same way.
with open(f'output/{video_name}_yolov9.json', encoding='utf-8') as f:
    detections = json.load(f)['final_frames_detections']

# Draw bounding box to video
cap = cv2.VideoCapture(input_file_name)

fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
# Keep the fractional frame rate (e.g. 29.97) so the output stays in sync
# with the source audio when it is muxed back in.
fps = cap.get(cv2.CAP_PROP_FPS)
width, height = (
    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
)
out = cv2.VideoWriter()
out.open(yolov9_file_name, fourcc, fps, (width, height), True)

# Number of frames written so far.
proc_frames = 0
try:
    # One list of detections per frame; stop at whichever runs out first,
    # the detections or the video.
    for frame_detections in detections:
        ret, frame = cap.read()
        if not ret:
            break

        im = frame
        # Each detection is [cx, cy, w, h, yaw, class_index, conf].
        # Loop through list (if empty this will be skipped) and overlay green bboxes
        for d in frame_detections:
            top_left = (int(d[0] - d[2]/2), int(d[1] - d[3]/2))
            bottom_right = (int(d[0] + d[2]/2), int(d[1] + d[3]/2))
            cv2.rectangle(im, top_left, bottom_right, (0, 255, 0), 3)
            # Label: yaw (d[4]) followed by confidence (d[6]), at the box centre.
            cv2.putText(im, f'{d[4]:.2f} {d[6]:.2f}', (int(d[0]), int(d[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # write the frame
        out.write(im)

        proc_frames += 1
except Exception as error:
    print(error)
finally:
    # Release resources exactly once, whether or not an error occurred.
    out.release()
    cap.release()