## Install and Import Libraries

**Use CUDA 12.4 with the torch versions pinned in the install cell below (torch 2.6.0, torchvision 0.21.0, torchaudio 2.6.0 — `cu124` wheels).**

- [Download CUDA 12.4](https://developer.nvidia.com/cuda-12-4-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_local)
- [Download cuDNN 8.9.7](https://developer.nvidia.com/rdp/cudnn-archive)

In [1]:
# --- Install Packages ---
#%pip install ultralytics
#%pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124

#%pip install opencv-python
#%pip install cvzone
#%pip install numpy

# --- Import Libraries ---
from ultralytics import YOLO
import torch
import time

import cv2
import cvzone
import numpy as np

## Preparation and Configuration

### Data Preparation

In [2]:
# --- Class Names and RGB Function ---

# Label table for the people/vehicles detector: the model's integer class id
# is used directly as an index into this list, so order matters.
# NOTE(review): ordering appears to follow the 80-class COCO layout with a few
# renamed entries (e.g. "pedestrian") — confirm against the model's own names.
CLASS_NAMES_BASE = [
    # ids 0-8: people and vehicles
    "pedestrian", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
    # ids 9-13: street objects
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    # ids 14-23: animals
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    # ids 24-28: carried items
    "backpack", "umbrella", "handbag", "tie", "suitcase",
    # ids 29-38: sports gear
    "frisbee", "skis", "snowboard", "sports ball", "kite",
    "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    # ids 39-45: tableware
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
    # ids 46-55: food
    "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    # ids 56-61: furniture
    "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet",
    # ids 62-67: electronics
    "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
    # ids 68-72: appliances
    "microwave", "oven", "toaster", "sink", "refrigerator",
    # ids 73-79: miscellaneous indoor objects
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]

# Label table for the traffic-light detector (class id -> name).
CLASS_NAMES_TRAFFIC = [
    "green-lights",
    "red-lights",
    "yellow-lights",
]

# Label table for the lane/road segmentation model (class id -> name).
CLASS_NAMES_LANES = [
    "lane",
    "road",
]

def to_RGB(r, g, b):
    """Take a colour given as red, green, blue and return it as a (b, g, r) tuple.

    Despite the name, the result is in blue-green-red order: callers write
    colours in the familiar RGB order and pass the returned tuple to the
    OpenCV / cvzone drawing calls in this notebook.
    """
    return (r, g, b)[::-1]

### Configuration and Visualization

In [5]:
# --- Configuration ---
# Everything a reader might tune lives in this cell: input source, model
# weights, drawing colours, thresholds and per-model on/off switches.

# --- Source Settings ---
# Exactly one VIDEO_SOURCE line should be active; the value is handed to
# cv2.VideoCapture (a file path, or 0 for the default webcam).
VIDEO_SOURCE = "data/test-videos/test-real-h.mp4"                  # Horizontal test video from iPhone camera
#VIDEO_SOURCE = "data/test-videos/test-real-v.mp4"                  # Vertical test video from iPhone camera

#VIDEO_SOURCE = "data/test-videos/test-cars.mp4"                    # Video for testing cars
#VIDEO_SOURCE = "data/test-videos/test-lights.mp4"                  # Video for testing traffic lights
#VIDEO_SOURCE = "data/test-videos/test-lines.mp4"                   # Video for testing lanes and roads
#VIDEO_SOURCE = "data/test-videos/test-universal.mp4"               # Video for testing everything

#VIDEO_SOURCE = 0                                                   # Webcam for testing real-time


# --- Model Settings ---
MODEL_PEOPLE_CARS_PATH = "trained-models/yolo11n.pt"                  # People and Vehicles detection model   [yolo11l / yolo11n]
MODEL_TRAFFIC_LIGHTS_PATH = "trained-models/v1/lights-nano.pt"     # Traffic lights detection model        [yoloTLDl / yoloTLDn]
MODEL_LANE_SEGMENTATION_PATH = "trained-models/v1/lane-nano.pt"    # Lane segmentation model               [yoloLSl / yoloLSn]

# --- Visualization Settings ---
# Colours are written as (R, G, B); to_RGB() reorders them to (B, G, R).
COLOR_PEOPLE_CARS = to_RGB(130, 0, 255)                           # People and Vehicles: Purple

GREEN_TRAFFIC_LIGHT = to_RGB(0, 255, 0)                           # Green traffic light: Green
RED_TRAFFIC_LIGHT = to_RGB(255, 0, 0)                             # Red traffic light: Red
YELLOW_TRAFFIC_LIGHT = to_RGB(255, 255, 0)                        # Yellow traffic light: Yellow

COLOR_LANES = to_RGB(255, 0, 0)                                   # Lanes: Red
COLOR_ROADS = to_RGB(255, 255, 255)                               # Roads: White

# --- Threshold Settings ---
CONFIDENCE_THRESHOLD_DETECTION = 0.5                                # Vehicles and Pedestrians detection confidence
CONFIDENCE_THRESHOLD_DETECTION_LIGHTS = 0.35                        # Lights detection confidence

CONFIDENCE_THRESHOLD_SEGMENTATION = 0.75                            # Lane and Road segmentation confidence (the main loop also reuses it as the mask binarization cutoff)
OVERLAY_ALPHA = 0.25                                                # Lane and Road mask transparency

# --- Detection Settings ---
ENABLE_PEOPLE_CARS_DETECTION = True                                # Enable/Disable detection of people and vehicles
ENABLE_TRAFFIC_LIGHTS_DETECTION = True                             # Enable/Disable detection of traffic lights
ENABLE_LANES = True                                                # Enable/Disable lane segmentation
ENABLE_ROADS = False                                                # Enable/Disable road segmentation

## Model and Source Loading

In [6]:
# --- Load Models ---
# Picks the compute device, then loads the three YOLO models onto it.

# --- Check Device ---
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Only ask for the GPU name when CUDA is actually in use: querying it on a
# CPU-only setup raises, which would defeat the fallback above.
device_label = torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'
print(f"Using device: {device} [{device_label}]")

# --- Load Models ---
# People / vehicles detector
model_people_cars = YOLO(MODEL_PEOPLE_CARS_PATH)
model_people_cars.to(device)

# Traffic-light detector
model_traffic_lights = YOLO(MODEL_TRAFFIC_LIGHTS_PATH)
model_traffic_lights.to(device)

# Lane / road segmentation model
model_lane_segmentation = YOLO(MODEL_LANE_SEGMENTATION_PATH)
model_lane_segmentation.to(device)
print("Models loaded successfully.")

Using device: cuda [NVIDIA GeForce GTX 1660 SUPER]
Models loaded successfully.


In [7]:
# --- Video Capture ---
# Opens VIDEO_SOURCE and records its resolution in frame_width / frame_height.

# Build the label outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
source_label = "Camera" if VIDEO_SOURCE == 0 else VIDEO_SOURCE
print(f"Opening video source: {source_label}")
cap = cv2.VideoCapture(VIDEO_SOURCE)

if not cap.isOpened():
    cap.release()
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down (discarding the loaded models), whereas an exception just stops
    # this cell and any "Run All" in progress.
    raise RuntimeError(f"Error: Could not open video source: {source_label}")

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"Video resolution: {frame_width}x{frame_height}")

Opening video source: data/test-videos/test-real-h.mp4
Video resolution: 944x544


## Main Loop

In [None]:
# Main processing loop: read frames from `cap`, run the enabled models,
# draw their results on a copy of the frame and show it until the stream
# ends or 'q' is pressed.

# Classes from the base detector that are actually drawn.
_TRACKED_CLASSES = {"pedestrian", "car", "truck", "bus", "motorbike", "train"}

# Box colour per traffic-light class name.
_LIGHT_COLORS = {
    CLASS_NAMES_TRAFFIC[0]: GREEN_TRAFFIC_LIGHT,
    CLASS_NAMES_TRAFFIC[1]: RED_TRAFFIC_LIGHT,
    CLASS_NAMES_TRAFFIC[2]: YELLOW_TRAFFIC_LIGHT,
}


def _draw_labeled_box(image, x1, y1, x2, y2, label, box_color, text_color):
    """Draw a corner-style rectangle and a text label just above it, in place."""
    cvzone.cornerRect(image, (x1, y1, x2 - x1, y2 - y1), l=9, t=2, rt=2, colorR=box_color, colorC=box_color)
    # Clamp the label position so it stays inside the top/left image border.
    cvzone.putTextRect(image, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=box_color, colorT=text_color)


def _draw_status_banner(image, text, background_color, text_color):
    """Draw `text` on a filled rectangle, horizontally centered at the bottom of `image`, in place."""
    img_h, img_w = image.shape[:2]
    (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 1.2, 2)

    padding = 10
    rect_x1 = (img_w - text_width) // 2 - padding
    rect_x2 = rect_x1 + text_width + (2 * padding)
    rect_y1 = img_h - text_height - baseline - (2 * padding)
    rect_y2 = img_h - padding
    cv2.rectangle(image, (rect_x1, rect_y1), (rect_x2, rect_y2), background_color, cv2.FILLED)

    text_x = (img_w - text_width) // 2
    text_y = img_h - baseline - padding - (padding // 2)
    cv2.putText(image, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1.2, text_color, 2, cv2.LINE_AA)


try:
    while True:

        # Read a frame from the video source
        ret, frame = cap.read()
        if not ret:
            print("End of video stream or error reading frame.")
            break

        processed_frame = frame.copy()
        # Take the size from the frame itself rather than from variables set in
        # an earlier cell, so the mask resize below always matches this frame.
        img_h, img_w = frame.shape[:2]

        # --- Model 1: Process Detections for Pedestrians and Vehicles ---
        # Inference only runs when the feature is enabled.
        if ENABLE_PEOPLE_CARS_DETECTION:
            results_people_cars = model_people_cars.predict(frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_DETECTION)

            if results_people_cars:
                for box in results_people_cars[0].boxes.cpu().numpy():
                    obj_class = CLASS_NAMES_BASE[int(box.cls[0])]
                    if obj_class not in _TRACKED_CLASSES:
                        continue

                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    label = f"{obj_class}: {box.conf[0]:.2f}"
                    _draw_labeled_box(processed_frame, x1, y1, x2, y2, label, COLOR_PEOPLE_CARS, (255, 255, 255))

        # --- Model 2: Process Detections for Traffic Lights ---
        if ENABLE_TRAFFIC_LIGHTS_DETECTION:
            results_traffic_lights = model_traffic_lights.predict(frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_DETECTION_LIGHTS)

            if results_traffic_lights:
                detected_green = False
                detected_red = False

                for box in results_traffic_lights[0].boxes.cpu().numpy():
                    obj_class = CLASS_NAMES_TRAFFIC[int(box.cls[0])]
                    detected_green = detected_green or obj_class == CLASS_NAMES_TRAFFIC[0]
                    detected_red = detected_red or obj_class == CLASS_NAMES_TRAFFIC[1]

                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    label = f"{obj_class}: {box.conf[0]:.2f}"
                    _draw_labeled_box(processed_frame, x1, y1, x2, y2, label, _LIGHT_COLORS[obj_class], (0, 0, 0))

                # Green takes priority when both green and red lights are visible.
                if detected_green:
                    _draw_status_banner(processed_frame, "GREEN LIGHT", to_RGB(0, 80, 0), to_RGB(0, 255, 0))
                elif detected_red:
                    _draw_status_banner(processed_frame, "RED LIGHT", to_RGB(80, 0, 0), to_RGB(255, 0, 0))

        # --- Model 3: Process Segmentation for Lanes ---
        # Skip segmentation inference entirely when neither class would be drawn.
        if ENABLE_LANES or ENABLE_ROADS:
            results_lanes = model_lane_segmentation.predict(frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_SEGMENTATION)
            masks = results_lanes[0].masks if results_lanes else None

            if masks is not None:
                masks_data = masks.data
                class_ids = results_lanes[0].boxes.cls.cpu().numpy()

                if masks_data.shape[0] > 0 and len(class_ids) == masks_data.shape[0]:
                    segmentation_overlay = np.zeros_like(processed_frame, dtype=np.uint8)
                    # Tracks which pixels received a colour, so only those are blended.
                    painted = np.zeros((img_h, img_w), dtype=bool)

                    for mask, cls_id in zip(masks_data, class_ids):
                        obj_class = CLASS_NAMES_LANES[int(cls_id)]

                        if ENABLE_LANES and obj_class == 'lane':
                            mask_color = COLOR_LANES
                        elif ENABLE_ROADS and obj_class == 'road':
                            mask_color = COLOR_ROADS
                        else:
                            # Class is disabled: no need to resize its mask.
                            continue

                        # Scale the model-resolution mask up to the frame size.
                        resized_mask = torch.nn.functional.interpolate(
                            mask[None, None],
                            size=(img_h, img_w),
                            mode='bilinear',
                            align_corners=False
                        )[0, 0]

                        # NOTE(review): the segmentation confidence threshold doubles as
                        # the mask cutoff here — confirm this is intended.
                        bool_mask_cpu = (resized_mask > CONFIDENCE_THRESHOLD_SEGMENTATION).cpu().numpy()

                        segmentation_overlay[bool_mask_cpu] = mask_color
                        painted |= bool_mask_cpu

                    # --- Blend the Overlay with the Frame ---
                    # Blend only where a mask was painted. Blending the whole frame
                    # against the black overlay would darken every pixel, and only on
                    # frames with masks, making the output brightness flicker.
                    if painted.any():
                        blended = cv2.addWeighted(segmentation_overlay, OVERLAY_ALPHA, processed_frame, 1 - OVERLAY_ALPHA, 0)
                        processed_frame[painted] = blended[painted]

        # --- Display ---
        cv2.imshow("AutoDrive Output", processed_frame)

        # --- Exit Condition ---
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

finally:
    # --- Cleanup ---
    # Runs on normal exit, on errors and on a kernel interrupt, so the capture
    # device and the display window are always released.
    cap.release()
    cv2.destroyAllWindows()