# AutoDrive - Setup

## Install and Import Libraries

**Use CUDA 12.4 with these torch versions!!!**

- [Download CUDA 12.4](https://developer.nvidia.com/cuda-12-4-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_local)
- [Download cuDNN 8.9.7](https://developer.nvidia.com/rdp/cudnn-archive)

In [1]:
# --- Install Packages ---
#%pip install ultralytics
#%pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124

#%pip install opencv-python
#%pip install cvzone
#%pip install numpy

# --- Import Libraries ---
from ultralytics import YOLO
import torch

import cv2
import cvzone
import numpy as np

## Preparation and Configuration

### Data Preparation

In [2]:
# --- Class Names and RGB Function ---
CLASS_NAMES = ['bicycle', 'bus', 'car', 'green-light', 'motorbike', 'pedestrian', 'red-light', 'truck', 'yellow-light']
CLASS_NAMES_LANES = ['lane', 'road']

def rgb(r, g, b):
    return (b, g, r)

### Configuration and Visualization

In [3]:
# --- Configuration---

# --- Source Settings ---
VIDEO_SOURCE = "data/test-videos/test-real-h.mp4"               # Horizontal test video from iPhone camera
#VIDEO_SOURCE = "data/test-videos/test-real-v.mp4"              # Vertical test video from iPhone camera

#VIDEO_SOURCE = "data/test-videos/test-cars.mp4"                # Video for testing cars
#VIDEO_SOURCE = "data/test-videos/test-lights.mp4"              # Video for testing traffic lights
#VIDEO_SOURCE = "data/test-videos/test-lines.mp4"               # Video for testing lanes and roads   
#VIDEO_SOURCE = "data/test-videos/test-universal.mp4"           # Video for testing everything

#VIDEO_SOURCE = 0                                               # Webcam for testing real-time


# --- Model Settings ---
DETECTION_MODEL_PATH = "trained-models/detection-large.pt"
SEGMENTATION_MODEL_PATH = "trained-models/segmentation-large.pt"

# --- Visualization Settings ---
COLOR_PEOPLE = rgb(130, 0, 255)
COLOR_CARS = rgb(0, 140, 255)
COLOR_BIG_VEHICLES = rgb(0, 255, 213)
COLOR_SMALL_VEHICLES = rgb(7, 48, 231)

TRAFFIC_LIGHT_GREEN = rgb(0, 255, 0)
TRAFFIC_LIGHT_RED = rgb(255, 0, 0)
TRAFFIC_LIGHT_YELLOW = rgb(255, 255, 0)

COLOR_LANES = rgb(255, 0, 0) 
COLOR_ROADS = rgb(255, 255, 255) 

# --- Threshold Settings ---
CONFIDENCE_THRESHOLD_DETECTION = 0.2
CONFIDENCE_THRESHOLD_SEGMENTATION = 0.7
OVERLAY_ALPHA = 0.25

# --- Detection Settings ---
ENABLE_PEOPLE_CARS = True
ENABLE_TRAFFIC_LIGHTS = True
SHOW_LIGHTS_BANNER = True
ENABLE_LANES = True
ENABLE_ROADS = False

### Model and Source Loading

In [4]:
# --- Load Models ---

# --- Check Device ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device} [{torch.cuda.get_device_name(0)}]")

# --- Load Models ---
detection_model = YOLO(DETECTION_MODEL_PATH)
detection_model.to(device)

segmentation_model = YOLO(SEGMENTATION_MODEL_PATH)
segmentation_model.to(device)

print("Model loaded successfully.")

Using device: cuda [NVIDIA GeForce GTX 1660 SUPER]
Model loaded successfully.


In [5]:
# --- Video Capture ---
print(f"Opening video source: {"Camera" if VIDEO_SOURCE == 0 else VIDEO_SOURCE}")
cap = cv2.VideoCapture(VIDEO_SOURCE)

if not cap.isOpened():
    print(f"Error: Could not open video source")
    exit()

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"Video resolution: {frame_width}x{frame_height}")

Opening video source: data/test-videos/test-real-h.mp4
Video resolution: 944x544


# Run Detection Model

In [6]:
while True:

    # Read a frame from the video sourceQ
    ret, det_frame = cap.read()
    if not ret:
        print("End of video stream or error reading frame.")
        break

    # --- Inference ---
    detection_results = detection_model.predict(det_frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_DETECTION)
    
    # --- Process Detections for Pedestrians, Vehicles and Traffic-lights ---
    
    if detection_results:
        d = detection_results[0]
        boxes = d.boxes.cpu().numpy()

        detected_green = False
        detected_red = False

        for box in boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            w, h = x2 - x1, y2 - y1
            conf = box.conf[0]
            cls_id = int(box.cls[0])

            if 0 <= cls_id < len(CLASS_NAMES):
                obj_class = CLASS_NAMES[cls_id]
                label = f"{obj_class}: {conf:.2f}"
            else:
                print(f"Warning: Unknown class ID {cls_id} detected. Skipping.")
                continue

            if ENABLE_PEOPLE_CARS:
                if obj_class == 'car':
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_CARS, colorC=COLOR_CARS)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_CARS, colorT=(255, 255, 255))
                elif obj_class == 'pedestrian':
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_PEOPLE, colorC=COLOR_PEOPLE)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_PEOPLE, colorT=(255, 255, 255))
                elif obj_class in ['bus', 'truck']:
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_BIG_VEHICLES, colorC=COLOR_BIG_VEHICLES)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_BIG_VEHICLES, colorT=(255, 255, 255))
                elif obj_class in ['bicycle', 'motorbike']:
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_SMALL_VEHICLES, colorC=COLOR_SMALL_VEHICLES)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_SMALL_VEHICLES, colorT=(255, 255, 255))

            if ENABLE_TRAFFIC_LIGHTS:
                if obj_class == 'green-light':
                    detected_green = True
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=TRAFFIC_LIGHT_GREEN, colorC=TRAFFIC_LIGHT_GREEN)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=TRAFFIC_LIGHT_GREEN, colorT=(255, 255, 255))
                elif obj_class == 'red-light':
                    detected_red = True
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=TRAFFIC_LIGHT_RED, colorC=TRAFFIC_LIGHT_RED)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=TRAFFIC_LIGHT_RED, colorT=(255, 255, 255))
                elif obj_class == 'yellow-light':
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=TRAFFIC_LIGHT_YELLOW, colorC=TRAFFIC_LIGHT_YELLOW)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=TRAFFIC_LIGHT_YELLOW, colorT=(255, 255, 255))

        if SHOW_LIGHTS_BANNER:
            frame_height, frame_width, _ = det_frame.shape

            if detected_green:
                (text_width, text_height), baseline = cv2.getTextSize("GREEN LIGHT", cv2.FONT_HERSHEY_SIMPLEX, 1.2, 2)
                padding = 10
                bg_rect_y1 = frame_height - text_height - baseline - (2 * padding)
                bg_rect_y2 = frame_height - padding
                bg_rect_x1 = (frame_width - text_width) // 2 - padding
                bg_rect_x2 = bg_rect_x1 + text_width + (2 * padding)

                cv2.rectangle(det_frame, (bg_rect_x1, bg_rect_y1), (bg_rect_x2, bg_rect_y2), (0, 80, 0), cv2.FILLED)

                text_x = (frame_width - text_width) // 2
                text_y = frame_height - baseline - padding - (padding//2)
                cv2.putText(det_frame, "GREEN LIGHT", (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2, cv2.LINE_AA)

            elif detected_red:
                (text_width, text_height), baseline = cv2.getTextSize("RED LIGHT", cv2.FONT_HERSHEY_SIMPLEX, 1.2, 2)
                padding = 10
                bg_rect_y1 = frame_height - text_height - baseline - (2 * padding)
                bg_rect_y2 = frame_height - padding
                bg_rect_x1 = (frame_width - text_width) // 2 - padding
                bg_rect_x2 = bg_rect_x1 + text_width + (2 * padding)

                cv2.rectangle(det_frame, (bg_rect_x1, bg_rect_y1), (bg_rect_x2, bg_rect_y2), rgb(80, 0, 0), cv2.FILLED)

                text_x = (frame_width - text_width) // 2
                text_y = frame_height - baseline - padding - (padding//2)
                cv2.putText(det_frame, "RED LIGHT", (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb(255, 0, 0), 2, cv2.LINE_AA)

    # --- Display ---
    cv2.imshow("AutoDrive Detection", det_frame)

    # --- Exit Condition ---
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# --- Cleanup ---
cap.release()
cv2.destroyAllWindows()

End of video stream or error reading frame.


# Run Segmentation Model

In [None]:
while True:

    # Read a frame from the video sourceQ
    ret, seg_frame = cap.read()
    if not ret:
        print("End of video stream or error reading frame.")
        break

    # --- Inference ---
    segmentation_results = segmentation_model.predict(seg_frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_SEGMENTATION)
    segmentation_overlay = np.zeros_like(seg_frame, dtype=np.uint8)
    
    # --- Process Detections for Pedestrians, Vehicles and Traffic-lights ---
    
    if segmentation_results:
        s = segmentation_results[0]

        if s.masks is not None:
            masks_data = s.masks.data
            class_ids = s.boxes.cls.cpu().numpy()

            if masks_data.shape[0] > 0 and len(class_ids) == masks_data.shape[0]:

                for i in range(masks_data.shape[0]):
                    mask = masks_data[i]
                    cls_id = int(class_ids[i])

                    if 0 <= cls_id < len(CLASS_NAMES_LANES):
                        obj_class = CLASS_NAMES_LANES[cls_id]
                    else:
                        print(f"Warning: Unknown class ID {cls_id} detected. Skipping.")
                        continue

                    resized_mask = torch.nn.functional.interpolate(
                        mask.unsqueeze(0).unsqueeze(0),
                        size=(frame_height, frame_width), 
                        mode='bilinear',
                        align_corners=False
                    ).squeeze()

                    binary_mask = (resized_mask > CONFIDENCE_THRESHOLD_SEGMENTATION).float()

                    # --- Create Colored Overlay ---
                    binary_mask_cpu = binary_mask.cpu().numpy().astype(np.uint8)
                    bool_mask_cpu = binary_mask_cpu.astype(bool)

                    if ENABLE_LANES and obj_class == 'lane':
                        segmentation_overlay[bool_mask_cpu] = COLOR_LANES
                    elif ENABLE_ROADS and obj_class == 'road':
                        segmentation_overlay[bool_mask_cpu] = COLOR_ROADS

                # --- Blend the Overlay with the Frame ---
                seg_frame = cv2.addWeighted(segmentation_overlay, OVERLAY_ALPHA, seg_frame, 1 - OVERLAY_ALPHA, 0)

    # --- Display ---
    cv2.imshow("AutoDrive Segmentation", seg_frame)

    # --- Exit Condition ---
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# --- Cleanup ---
cap.release()
cv2.destroyAllWindows()

# Run Both Models

In [None]:
while True:

    # Read a frame from the video sourceQ
    ret, det_frame = cap.read()
    ret2, seg_frame = cap.read()
    if not ret and not ret2:
        print("End of video stream or error reading frame.")
        break

    # --- Inference ---
    detection_results = detection_model.predict(det_frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_DETECTION)
    segmentation_results = segmentation_model.predict(seg_frame, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD_SEGMENTATION)
    segmentation_overlay = np.zeros_like(seg_frame, dtype=np.uint8)
    
    # --- Process Detections for Pedestrians, Vehicles and Traffic-lights ---
    
    if detection_results:
        d = detection_results[0]
        boxes = d.boxes.cpu().numpy()

        detected_green = False
        detected_red = False

        for box in boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            w, h = x2 - x1, y2 - y1
            conf = box.conf[0]
            cls_id = int(box.cls[0])

            if 0 <= cls_id < len(CLASS_NAMES):
                obj_class = CLASS_NAMES[cls_id]
                label = f"{obj_class}: {conf:.2f}"
            else:
                print(f"Warning: Unknown class ID {cls_id} detected. Skipping.")
                continue

            if ENABLE_PEOPLE_CARS:
                if obj_class == 'car':
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_CARS, colorC=COLOR_CARS)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_CARS, colorT=(255, 255, 255))
                elif obj_class == 'pedestrian':
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_PEOPLE, colorC=COLOR_PEOPLE)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_PEOPLE, colorT=(255, 255, 255))
                elif obj_class in ['bus', 'truck']:
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_BIG_VEHICLES, colorC=COLOR_BIG_VEHICLES)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_BIG_VEHICLES, colorT=(255, 255, 255))
                elif obj_class in ['bicycle', 'motorbike']:
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=COLOR_SMALL_VEHICLES, colorC=COLOR_SMALL_VEHICLES)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=COLOR_SMALL_VEHICLES, colorT=(255, 255, 255))

            if ENABLE_TRAFFIC_LIGHTS:
                if obj_class == 'green-light':
                    detected_green = True
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=TRAFFIC_LIGHT_GREEN, colorC=TRAFFIC_LIGHT_GREEN)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=TRAFFIC_LIGHT_GREEN, colorT=(255, 255, 255))
                elif obj_class == 'red-light':
                    detected_red = True
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=TRAFFIC_LIGHT_RED, colorC=TRAFFIC_LIGHT_RED)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=TRAFFIC_LIGHT_RED, colorT=(255, 255, 255))
                elif obj_class == 'yellow-light':
                    cvzone.cornerRect(det_frame, (x1, y1, w, h), l=9, t=2, rt=2, colorR=TRAFFIC_LIGHT_YELLOW, colorC=TRAFFIC_LIGHT_YELLOW)
                    cvzone.putTextRect(det_frame, label, (max(0, x1), max(35, y1 - 10)), scale=0.6, thickness=1, offset=3, colorR=TRAFFIC_LIGHT_YELLOW, colorT=(255, 255, 255))

        if SHOW_LIGHTS_BANNER:
            frame_height, frame_width, _ = det_frame.shape

            if detected_green:
                (text_width, text_height), baseline = cv2.getTextSize("GREEN LIGHT", cv2.FONT_HERSHEY_SIMPLEX, 1.2, 2)
                padding = 10
                bg_rect_y1 = frame_height - text_height - baseline - (2 * padding)
                bg_rect_y2 = frame_height - padding
                bg_rect_x1 = (frame_width - text_width) // 2 - padding
                bg_rect_x2 = bg_rect_x1 + text_width + (2 * padding)

                cv2.rectangle(det_frame, (bg_rect_x1, bg_rect_y1), (bg_rect_x2, bg_rect_y2), (0, 80, 0), cv2.FILLED)

                text_x = (frame_width - text_width) // 2
                text_y = frame_height - baseline - padding - (padding//2)
                cv2.putText(det_frame, "GREEN LIGHT", (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2, cv2.LINE_AA)

            elif detected_red:
                (text_width, text_height), baseline = cv2.getTextSize("RED LIGHT", cv2.FONT_HERSHEY_SIMPLEX, 1.2, 2)
                padding = 10
                bg_rect_y1 = frame_height - text_height - baseline - (2 * padding)
                bg_rect_y2 = frame_height - padding
                bg_rect_x1 = (frame_width - text_width) // 2 - padding
                bg_rect_x2 = bg_rect_x1 + text_width + (2 * padding)

                cv2.rectangle(det_frame, (bg_rect_x1, bg_rect_y1), (bg_rect_x2, bg_rect_y2), rgb(80, 0, 0), cv2.FILLED)

                text_x = (frame_width - text_width) // 2
                text_y = frame_height - baseline - padding - (padding//2)
                cv2.putText(det_frame, "RED LIGHT", (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb(255, 0, 0), 2, cv2.LINE_AA)
        
    if segmentation_results:
        s = segmentation_results[0]

        if s.masks is not None:
            masks_data = s.masks.data
            class_ids = s.boxes.cls.cpu().numpy()

            if masks_data.shape[0] > 0 and len(class_ids) == masks_data.shape[0]:

                for i in range(masks_data.shape[0]):
                    mask = masks_data[i]
                    cls_id = int(class_ids[i])
                    obj_class = CLASS_NAMES_LANES[cls_id]

                    resized_mask = torch.nn.functional.interpolate(
                        mask.unsqueeze(0).unsqueeze(0),
                        size=(frame_height, frame_width), 
                        mode='bilinear',
                        align_corners=False
                    ).squeeze()

                    binary_mask = (resized_mask > CONFIDENCE_THRESHOLD_SEGMENTATION).float()

                    # --- Create Colored Overlay ---
                    binary_mask_cpu = binary_mask.cpu().numpy().astype(np.uint8)
                    bool_mask_cpu = binary_mask_cpu.astype(bool)

                    if ENABLE_LANES and obj_class == 'lane':
                        segmentation_overlay[bool_mask_cpu] = COLOR_LANES
                    elif ENABLE_ROADS and obj_class == 'road':
                        segmentation_overlay[bool_mask_cpu] = COLOR_ROADS

                # --- Blend the Overlay with the Frame ---
                seg_frame = cv2.addWeighted(segmentation_overlay, OVERLAY_ALPHA, seg_frame, 1 - OVERLAY_ALPHA, 0)

    # --- Display ---
    cv2.imshow("AutoDrive Detection", det_frame)
    cv2.imshow("AutoDrive Segmentation", seg_frame)

    # --- Exit Condition ---
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# --- Cleanup ---
cap.release()
cv2.destroyAllWindows()



IndexError: too many indices for array: array is 0-dimensional, but 2 were indexed

: 