In [1]:
from ultralytics import YOLO
import cv2
from collections import defaultdict
import numpy as np

## Detection without tracking

In [2]:
# Video source for the detection run.
# To read from a webcam instead, pass a device index: cv2.VideoCapture(0)
input_video_path = "footage/testing/phone/VID_20241111_121149.mp4"
cap = cv2.VideoCapture(input_video_path)

# Fail fast: an unopened capture would otherwise make the processing loop
# stop on its first read without any indication of what went wrong.
if not cap.isOpened():
    raise IOError(f"Could not open video source: {input_video_path}")

In [3]:
# Build the YOLO model from the weights checkpoint stored on disk.
weights_path = "runs/train/train11/weights/best.pt"
model = YOLO(weights_path)

In [4]:
# Human-readable label for each class index predicted by the model.
# The position in this list is the class index, so order matters.
# NOTE(review): order is assumed to match the training dataset's class order — confirm.
class_names = [
    "Other",
    "Plastic",
    "Straw",
    "Paper",
    "Tissue",
    "Bottle",
    # These two were previously fused into "Tetra PackCigarette Pack" by a
    # missing comma (implicit string concatenation), which shifted every
    # following index by one and broke the colour lookup.
    "Tetra Pack",
    "Cigarette Pack",
    "Carton",
    "Food Container",
]

# Drawing colour per class label (adjust as needed).
# The tuples are handed unchanged to OpenCV drawing calls on frames read by
# cv2.VideoCapture, so they are presumably (B, G, R) — TODO confirm intended hues.
class_colors = dict(
    (
        ("Other", (255, 0, 0)),
        ("Plastic", (255, 0, 128)),
        ("Paper", (179, 0, 255)),
        ("Straw", (255, 0, 255)),
        ("Tissue", (0, 255, 0)),
        ("Bottle", (0, 255, 255)),
        ("Tetra Pack", (0, 128, 255)),
        ("Cigarette Pack", (0, 0, 255)),
        ("Carton", (255, 255, 0)),
        ("Food Container", (255, 128, 0)),
    )
)

In [5]:
# Run the detector on every frame of `cap`, draw the labelled boxes and show
# the annotated frame until the video ends or the user presses 'q'.
display_size = (1366, 768)  # (width, height) the frame is resized to for display
fallback_color = (255, 255, 255)  # used when a label has no entry in class_colors

try:
    while True:
        # Read a frame from the video; stop when no frame could be read
        success, img = cap.read()
        if not success:
            break

        # Perform object detection (stream=True yields results lazily)
        results = model(img, stream=True)

        for r in results:
            for box in r.boxes:
                # Box corners as integer pixel coordinates
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                conf = f"{box.conf[0] * 100:.2f}"
                cls = int(box.cls[0])

                # Skip detections whose class index has no label
                if cls >= len(class_names):
                    print(f"Warning: Class index {cls} is out of range for class names")
                    continue

                current_class = class_names[cls]
                # .get() keeps the loop alive if a label is missing a colour
                # instead of aborting the whole run with a KeyError.
                color = class_colors.get(current_class, fallback_color)

                cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
                text = f"{current_class} {conf}%"
                cv2.putText(
                    img,
                    text,
                    # Clamp the anchor so the label stays inside the frame
                    # when the box touches the top or left edge.
                    (max(0, x1), max(35, y1)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    color,
                    2,
                )

        # Resize the annotated frame for display (adjust display_size as needed)
        resized_img = cv2.resize(img, display_size)

        # Display the image
        cv2.imshow("Image", resized_img)

        # Exit loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always release the capture and close the preview window, even if an
    # exception interrupts the loop.
    cap.release()
    cv2.destroyAllWindows()




  return F.conv2d(input, weight, bias, self.stride,


0: 640x384 (no detections), 140.1ms
Speed: 3.5ms preprocess, 140.1ms inference, 9.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 8.1ms
Speed: 2.4ms preprocess, 8.1ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 9.4ms
Speed: 2.1ms preprocess, 9.4ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 8.1ms
Speed: 2.6ms preprocess, 8.1ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 6.9ms
Speed: 2.8ms preprocess, 6.9ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 9.7ms
Speed: 3.0ms preprocess, 9.7ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 6.2ms
Speed: 2.0ms preprocess, 6.2ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 7.4ms
Speed: 2.3ms preprocess, 7.4ms inference, 1.4

## Detection with tracking

In [26]:
# Re-open the video: the capture used by the detection loop has been released.
cap = cv2.VideoCapture(input_video_path)

# Fail fast: the tracking loop is guarded by cap.isOpened() and would
# otherwise be skipped without any message.
if not cap.isOpened():
    raise IOError(f"Could not open video source: {input_video_path}")

In [27]:
# Track objects across frames and draw the recent path of each tracked object
# on top of the annotated frame, until the video ends or 'q' is pressed.
max_track_points = 90  # number of most recent points kept per track
track_history = defaultdict(list)  # track id -> list of (x, y) points

try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # persist=True keeps tracker state between successive calls
        results = model.track(frame, persist=True)
        result = results[0]
        img = result.plot()

        # Check if there are any detected boxes
        if result.boxes is not None:
            # Track IDs may be missing for a frame; only then skip the trails
            if result.boxes.id is not None:
                boxes = result.boxes.xywh.cpu()
                track_ids = result.boxes.id.int().cpu().tolist()

                for box, track_id in zip(boxes, track_ids):
                    # Only the first two xywh values (x, y) are needed here
                    x, y = float(box[0]), float(box[1])
                    track = track_history[track_id]
                    track.append((x, y))
                    if len(track) > max_track_points:
                        track.pop(0)

                    # cv2.polylines expects int32 points shaped (N, 1, 2)
                    points = np.asarray(track).astype(np.int32).reshape((-1, 1, 2))
                    cv2.polylines(
                        img,
                        [points],
                        isClosed=False,
                        color=(230, 230, 230),
                        thickness=6,
                    )
            else:
                print("No track IDs available")

        resized_img = cv2.resize(img, (1280, 720))

        # Display the image
        cv2.imshow("Image", resized_img)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always release the capture and close the preview window, even if an
    # exception interrupts the loop.
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 2 Otherss, 14.0ms
Speed: 3.4ms preprocess, 14.0ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Otherss, 11.1ms
Speed: 2.7ms preprocess, 11.1ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Otherss, 12.3ms
Speed: 2.9ms preprocess, 12.3ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Otherss, 9.9ms
Speed: 2.1ms preprocess, 9.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Otherss, 10.6ms
Speed: 2.8ms preprocess, 10.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Otherss, 10.4ms
Speed: 2.0ms preprocess, 10.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Otherss, 10.0ms
Speed: 2.0ms preprocess, 10.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Otherss, 16.7ms
Speed: 2.8ms preprocess, 16.7ms inference, 2.5ms postprocess per image at shape (1,