Model update for voice output

Model testing on videos

In [None]:
import cv2
import pyttsx3
import threading
import queue
from ultralytics import YOLO
 
# Load the trained detection weights from the working directory.
model = YOLO("best.pt")

# Phrases waiting to be spoken; filled by speak(), drained by speech_worker().
speech_queue = queue.Queue()

# Text-to-speech engine. Calling init() without a driver name lets pyttsx3 pick
# the platform default (sapi5 on Windows, nsss on macOS, espeak elsewhere), so
# the script no longer fails outside Windows.
engine = pyttsx3.init()
engine.setProperty("rate", 155)  # speaking rate passed to the engine


def speech_worker():
    """Speak queued phrases until a ``None`` sentinel is received.

    Blocks on ``speech_queue`` and passes each item to the module-level
    ``engine``. Every dequeued item, including the sentinel and items whose
    playback failed, is acknowledged with ``task_done()`` so that
    ``speech_queue.join()`` cannot hang. A failure while speaking one phrase
    is reported and the worker keeps running, so later warnings are still
    spoken.
    """
    while True:
        text = speech_queue.get()
        try:
            if text is None:
                # Shutdown sentinel: leave the loop (finally still runs).
                break
            engine.say(text)
            engine.runAndWait()
        except Exception as exc:
            # Thread top-level boundary: report and continue rather than
            # letting one bad phrase terminate the speech thread.
            print("Speech failed:", exc)
        finally:
            speech_queue.task_done()


# Run the speech consumer on a background daemon thread so the video loop
# never waits on playback.
_speech_thread = threading.Thread(target=speech_worker, daemon=True)
_speech_thread.start()


def speak(text):
    """Queue *text* for the background speech worker and return immediately."""
    # speech_queue is created without a size limit, so this never blocks.
    speech_queue.put_nowait(text)


# Decision thresholds, expressed in pixels of the processed frame.
# Half-width of the band around the frame's horizontal centre; a detection
# whose centre falls inside it produces no left/right instruction.
FRAME_CENTER_ZONE = 80
# Bounding-box area (width * height) above which an obstacle triggers "Stop".
CLOSE_AREA = 55_000

# Input clip and its capture handle.
video_path = "custom/vid_4.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Without this check a wrong path makes the frame loop exit silently on
    # its first read, which looks like "nothing happened".
    raise OSError(f"Could not open video source: {video_path!r}")

# Most recent phrase handed to speak(); used to avoid repeating it each frame.
last_decision = None

# Spoken warning for each detector class; the position in the tuple is the
# integer class id used as the dictionary key.
_WARNING_PHRASES = (
    "Bike ahead",                      # 0
    "Car ahead",                       # 1
    "Cycle ahead",                     # 2
    "Pole ahead",                      # 3
    "Object ahead",                    # 4
    "Pothole ahead, step carefully",   # 5
    "Scooty ahead",                    # 6
    "Stairs ahead, step carefully",    # 7
    "Tree ahead",                      # 8
)
class_warnings = dict(enumerate(_WARNING_PHRASES))

def _choose_guidance(detections, frame_width):
    """Pick the single instruction to announce for one frame.

    Args:
        detections: iterable of ``(cls, x1, y1, x2, y2)`` integer tuples, one
            per detected box, in frame pixel coordinates.
        frame_width: width of the frame in pixels.

    Returns:
        ``(decision, warning)``. ``decision`` is "Path Clear", "Move Right",
        "Move Left" or "Stop, obstacle very close". ``warning`` is the
        ``class_warnings`` phrase of the box that produced the decision, or
        ``None`` when the path is clear or the class has no phrase.

    A box larger than ``CLOSE_AREA`` ("Stop") always outranks a direction
    hint, and among boxes of equal rank the largest one wins. This keeps a
    close obstacle from being overridden by a later, smaller detection and
    keeps the spoken class name tied to the object that caused the decision.
    """
    frame_center_x = frame_width // 2
    decision, warning = "Path Clear", None
    best_key = (0, -1)  # (priority rank, box area) of the current winner

    for cls, x1, y1, x2, y2 in detections:
        area = (x2 - x1) * (y2 - y1)
        obj_center_x = (x1 + x2) // 2

        if area > CLOSE_AREA:
            rank, candidate = 2, "Stop, obstacle very close"
        elif obj_center_x < frame_center_x - FRAME_CENTER_ZONE:
            # Obstacle on the left side of the frame.
            rank, candidate = 1, "Move Right"
        elif obj_center_x > frame_center_x + FRAME_CENTER_ZONE:
            # Obstacle on the right side of the frame.
            rank, candidate = 1, "Move Left"
        else:
            # Centred but not close: no instruction for this box.
            continue

        if (rank, area) > best_key:
            best_key = (rank, area)
            decision = candidate
            warning = class_warnings.get(cls)

    return decision, warning


# Frame loop: detect, decide, announce on change, and display until the clip
# ends or the user presses 'q'.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame returned: end of the clip or a read failure.
            break

        results = model(frame)[0]

        # Reduce each box to plain integers so the decision logic is
        # independent of the detector's tensor types.
        detections = [
            (int(box.cls[0]), *map(int, box.xyxy[0]))
            for box in results.boxes
        ]
        decision, warning = _choose_guidance(detections, frame.shape[1])

        # Final message: class warning (if any) + instruction.
        final_message = f"{warning}, {decision}" if warning else decision

        # Speak only when the message changes, not on every frame.
        if final_message != last_decision:
            print("Speaking:", final_message)
            speak(final_message)
            last_decision = final_message

        annotated_frame = results.plot()
        cv2.putText(annotated_frame, final_message, (30, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 3)

        cv2.imshow("Assistive Vision System", annotated_frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture handle and close the preview window, including
    # when the loop is interrupted by an exception.
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 3 bikes, 3 cycles, 1 scooty, 391.9ms
Speed: 12.8ms preprocess, 391.9ms inference, 26.0ms postprocess per image at shape (1, 3, 384, 640)
Speaking: Bike ahead, Move Right

0: 384x640 3 bikes, 3 cycles, 1 scooty, 241.6ms
Speed: 3.8ms preprocess, 241.6ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 bikes, 3 cycles, 1 scooty, 231.2ms
Speed: 2.6ms preprocess, 231.2ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 bikes, 3 cycles, 1 scooty, 232.5ms
Speed: 2.9ms preprocess, 232.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 bikes, 3 cycles, 1 scooty, 211.9ms
Speed: 2.7ms preprocess, 211.9ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 bikes, 3 cycles, 1 scooty, 179.1ms
Speed: 3.4ms preprocess, 179.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 bikes, 3 cycles, 1 scooty, 175.7ms
Speed: 1.9ms preprocess, 175.7ms i

: 