## Install and Import Libraries

**Use CUDA 12.4 with the torch versions pinned in the install cell below (torch 2.6.0, torchvision 0.21.0, torchaudio 2.6.0).**

- [Download CUDA 12.4](https://developer.nvidia.com/cuda-12-4-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_local)
- [Download cuDNN 8.9.7](https://developer.nvidia.com/rdp/cudnn-archive)

In [None]:
# Install required packages
# Every %pip line below is commented out; uncomment the ones you need when
# setting up a fresh environment, then comment them again.

#%pip install ultralytics
# torch / torchvision / torchaudio are pinned and pulled from the cu124 wheel index.
#%pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124

#%pip install opencv-python
#%pip install cvzone
#%pip install numpy

In [None]:
# Import Essential Libraries

from ultralytics import YOLO
import torch

import cv2
import cvzone
import numpy as np

## Configuration and Visualization

In [13]:
# --- Configuration ---

# Input handed to cv2.VideoCapture. 0 is reported as "Camera" when the source
# is opened; swap in the commented-out path to run on a recorded clip instead.
VIDEO_SOURCE = 0
# VIDEO_SOURCE = "data/test-videos/test_video.mp4"

# Weight files, one per YOLO model loaded later in the notebook.
MODEL_PEOPLE_CARS_PATH = "yolo-weights/yolo11l.pt"
MODEL_TRAFFIC_LIGHTS_PATH = "trained-models/yolo_large_trafficlights.pt"
MODEL_LANE_SEGMENTATION_PATH = "trained-models/yolo_large_lane.pt"

In [14]:
# --- Class Names ---

# Display names for the base detection model, indexed by class id (80 entries).
# The position in this list IS the class id, so the order must not change.
CLASS_NAMES_BASE = [
    # ids 0-8
    "pedestrian", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
    # ids 9-13
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    # ids 14-23
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    # ids 24-28
    "backpack", "umbrella", "handbag", "tie", "suitcase",
    # ids 29-38
    "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket",
    # ids 39-45
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
    # ids 46-55
    "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
    "cake",
    # ids 56-61
    "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet",
    # ids 62-67
    "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
    # ids 68-72
    "microwave", "oven", "toaster", "sink", "refrigerator",
    # ids 73-79
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]

# Display names for the traffic-light model, indexed by class id.
CLASS_NAMES_TRAFFIC = [
    "green-lights",
    "red-lights",
    "yellow-lights",
]

# Display names for the lane-segmentation model, indexed by class id.
CLASS_NAMES_LANES = [
    "lane",
    "road",
]

In [None]:
# --- Visualization Settings ---
# Colour tuples are in OpenCV's BGR channel order.
COLOR_PEOPLE_CARS = (255, 178, 50)     # light blue
COLOR_TRAFFIC_LIGHTS = (0, 255, 0)     # green
COLOR_LANES = (0, 0, 255)              # red

# Weight of the lane colour when it is blended onto the frame.
LANE_OVERLAY_ALPHA = 0.4

# Minimum confidence passed to predict() for the two detection models.
CONFIDENCE_THRESHOLD_DETECTION = 0.3
# Minimum confidence passed to predict() for the segmentation model; the main
# loop also reuses it as the cut-off when binarising the resized mask.
CONFIDENCE_THRESHOLD_SEGMENTATION = 0.4

In [16]:
# --- GPU Setup ---
# Prefer CUDA when torch can see a GPU, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Only query the GPU name when CUDA is actually in use:
# torch.cuda.get_device_name() raises on a machine without CUDA, which would
# defeat the CPU fallback chosen above.
device_label = torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'
print(f"Using device: {device} [{device_label}]")

Using device: cuda [NVIDIA GeForce GTX 1660 SUPER]


## Model and Source Loading

In [19]:
# --- Load Models ---
print("Loading models...")

# Build the three YOLO models from their configured weight files.
model_people_cars = YOLO(MODEL_PEOPLE_CARS_PATH)
model_traffic_lights = YOLO(MODEL_TRAFFIC_LIGHTS_PATH)
model_lane_segmentation = YOLO(MODEL_LANE_SEGMENTATION_PATH)

# Move each model onto the device selected in the GPU setup cell.
for _loaded_model in (model_people_cars, model_traffic_lights, model_lane_segmentation):
    _loaded_model.to(device)

print("Models loaded successfully.")

Loading models...
Models loaded successfully.


In [23]:
# --- Video Capture ---
# Build the label outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
source_label = "Camera" if VIDEO_SOURCE == 0 else VIDEO_SOURCE
print(f"Opening video source: {source_label}")
cap = cv2.VideoCapture(VIDEO_SOURCE)

if not cap.isOpened():
    # Release the handle and raise instead of calling exit(): in a notebook
    # exit() shuts the kernel down, whereas an exception stops execution at
    # this cell and keeps the session usable.
    cap.release()
    raise RuntimeError(f"Error: Could not open video source: {source_label}")

Opening video source: Camera


In [24]:
# --- Get Frame Dimensions ---
# Read the width and height reported by the capture and truncate them to int.
frame_width, frame_height = (
    int(cap.get(prop_id))
    for prop_id in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
)
print(f"Video resolution: {frame_width}x{frame_height}")

Video resolution: 640x480


## Main Loop

In [None]:
# Base-model classes that are drawn; detections of any other class are ignored.
DRAWN_BASE_CLASSES = ("pedestrian", "car", "truck", "bus", "motorbike", "train")


def _draw_detections(canvas, results, class_names, color, allowed_classes=None):
    """Draw a corner box and a "<class>: <conf>" label for each detection.

    Args:
        canvas: Image (as read from the capture) that is annotated in place.
        results: List returned by ``model.predict``; only the first entry is read.
        class_names: Sequence mapping a class id to its display name.
        color: Colour tuple used for both the box and the label background.
        allowed_classes: Optional container of class names to draw; ``None``
            draws every detection.
    """
    if not results:
        return

    for box in results[0].boxes.cpu().numpy():
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        obj_class = class_names[int(box.cls[0])]
        if allowed_classes is not None and obj_class not in allowed_classes:
            continue

        label = f"{obj_class}: {box.conf[0]:.2f}"
        cvzone.cornerRect(canvas, (x1, y1, x2 - x1, y2 - y1), l=9, t=2, rt=2, colorR=color, colorC=color)
        # Clamp the label position so it stays inside the image near the top/left edges.
        cvzone.putTextRect(canvas, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=color, colorT=(255, 255, 255))


def _overlay_lanes(canvas, results, color, alpha, threshold):
    """Tint the pixels covered by any segmentation mask, in place.

    Only masked pixels are blended. The rest of ``canvas`` is left untouched,
    so overall frame brightness no longer depends on whether a mask was found.

    Args:
        canvas: Image that is modified in place.
        results: List returned by ``model.predict``; only the first entry is read.
        color: Colour tuple blended onto masked pixels.
        alpha: Weight of ``color`` in the blend (0..1).
        threshold: Cut-off applied to the resized mask to make it binary.
    """
    if not results or results[0].masks is None:
        return

    masks_data = results[0].masks.data  # mask tensors [N, H, W]
    if masks_data.shape[0] == 0:
        return

    # Union of all instance masks.
    combined_mask = torch.max(masks_data, dim=0)[0].float()

    # Resize (on whatever device the mask lives on) to the size of the frame
    # that was actually read, rather than the size reported by the capture.
    # NOTE(review): this stretches the mask straight to the frame size; if the
    # model letterboxes its input, the padding is not removed here - confirm.
    target_h, target_w = canvas.shape[:2]
    resized_mask = torch.nn.functional.interpolate(
        combined_mask[None, None],
        size=(target_h, target_w),
        mode='bilinear',
        align_corners=False
    )[0, 0]

    lane_mask = (resized_mask > threshold).cpu().numpy()
    if not lane_mask.any():
        return

    # Blend a coloured overlay with the frame, then copy back only the masked
    # pixels so everything outside the mask keeps its original intensity.
    overlay = np.zeros_like(canvas, dtype=np.uint8)
    overlay[lane_mask] = color
    blended = cv2.addWeighted(overlay, alpha, canvas, 1 - alpha, 0)
    canvas[lane_mask] = blended[lane_mask]


try:
    while True:

        # Read a frame from the video source
        ret, frame = cap.read()
        if not ret:
            print("End of video stream or error reading frame.")
            break

        # --- Inference ---
        results_people_cars = model_people_cars.predict(frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_DETECTION)
        results_traffic_lights = model_traffic_lights.predict(frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_DETECTION)
        results_lanes = model_lane_segmentation.predict(frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_SEGMENTATION)

        processed_frame = frame.copy()

        # --- Model 1: Process Detections for Pedestrians and Vehicles ---
        _draw_detections(processed_frame, results_people_cars, CLASS_NAMES_BASE, COLOR_PEOPLE_CARS, DRAWN_BASE_CLASSES)

        # --- Model 2: Process Detections for Traffic Lights ---
        _draw_detections(processed_frame, results_traffic_lights, CLASS_NAMES_TRAFFIC, COLOR_TRAFFIC_LIGHTS)

        # --- Model 3: Process Segmentation for Lanes ---
        _overlay_lanes(processed_frame, results_lanes, COLOR_LANES, LANE_OVERLAY_ALPHA, CONFIDENCE_THRESHOLD_SEGMENTATION)

        # --- Display ---
        cv2.imshow("Combined YOLO Output", processed_frame)

        # --- Exit Condition ---
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # --- Cleanup ---
    # Runs even when the loop is interrupted or raises, so the capture device
    # and the display window are always released.
    cap.release()
    cv2.destroyAllWindows()