In [1]:
from ultralytics import YOLO
import cv2
import math
from collections import defaultdict
import numpy as np

## Without Track

In [20]:
# # For Webcam
# cap = cv2.VideoCapture(0)

# For Video
input_video_path = "footage/phone/VID_20241014_104735.mp4"
cap = cv2.VideoCapture(input_video_path)

In [21]:
# Initialize the YOLO model
model = YOLO("runs/train/train5/weights/best.pt")

In [22]:
# Define class names
classNames = ["Others", "Plastic", "Straw", "Paper", "Tissue", "Bottle", "Beverage Carton Box", "Cigarette Pack"]

# Define colors for each class (adjust as needed)
class_colors = {
    "Others": (255, 0, 0),
    "Plastic": (255, 0, 128),
    "Straw": (255, 0, 255),
    "Paper": (179, 0, 255),
    "Tissue": (0, 255, 0),
    "Bottle": (0, 255, 255),
    "Beverage Carton Box": (0, 128, 255),
    "Cigarette Pack": (0, 0, 255),
}

In [23]:
while True:
    # Read a frame from the video
    success, img = cap.read()
    if not success:
        break

    # Perform object detection
    results = model(img, stream=True)

    # Process each result
    for r in results:
        boxes = r.boxes

        for box in boxes:
            # Extract box coordinates and confidence
            x1, y1, x2, y2 = box.xyxy[0]
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
            conf = math.ceil((box.conf[0] * 100)) / 100
            cls = int(box.cls[0])

            # Ensure the class index is within the range of classNames
            if cls < len(classNames):
                currentClass = classNames[cls]

                color = class_colors[currentClass]
                cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
                text = f"{currentClass} {conf}"
                cv2.putText(
                    img,
                    text,
                    (max(0, x1), max(35, y1)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    color,
                    2,
                )

            else:
                print(f"Warning: Class index {cls} is out of range for classNames")

    # Resize the image to 1280x720 for display (adjust as needed)
    resized_img = cv2.resize(img, (720, 1280))

    # Display the image
    cv2.imshow("Image", resized_img)

    # Exit loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

# Release video capture and video writer
cap.release()

# Close all windows
cv2.destroyAllWindows()


0: 640x384 3 Plastics, 1 Straw, 20.1ms
Speed: 3.1ms preprocess, 20.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 Plastics, 1 Straw, 17.5ms
Speed: 2.0ms preprocess, 17.5ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 Plastics, 1 Straw, 17.4ms
Speed: 3.0ms preprocess, 17.4ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 Plastics, 2 Straws, 17.7ms
Speed: 4.0ms preprocess, 17.7ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 Plastics, 1 Straw, 17.0ms
Speed: 2.7ms preprocess, 17.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 Plastics, 1 Straw, 17.8ms
Speed: 2.5ms preprocess, 17.8ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 Plastics, 1 Straw, 17.0ms
Speed: 2.0ms preprocess, 17.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 Plastics, 1 Straw, 17.0ms


## With Track

In [7]:
cap = cv2.VideoCapture(input_video_path)

In [8]:
track_history = defaultdict(lambda: [])

while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    results = model.track(frame, persist=True)
    img = results[0].plot()

    # Check if there are any detected boxes
    if results[0].boxes is not None:
        boxes = results[0].boxes.xywh.cpu()

        # Check if track IDs are available
        if results[0].boxes.id is not None:
            track_ids = results[0].boxes.id.int().cpu().tolist()

            # Process each detected object
            for box, track_id in zip(boxes, track_ids):
                x, y, w, h = box
                track = track_history[track_id]
                track.append((float(x), float(y)))
                if len(track) > 90:
                    track.pop(0)
                points = np.hstack(track).astype(np.int32).reshape((-1, 1, 2))

                cv2.polylines(
                    img,
                    [points],
                    isClosed=False,
                    color=(230, 230, 230),
                    thickness=6,
                )

        else:
            print("No track IDs available")

    resized_img = cv2.resize(img, (1280, 720))

    # Display the image
    cv2.imshow("Image", resized_img)

    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()


0: 384x640 (no detections), 13.4ms
Speed: 6.7ms preprocess, 13.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
No track IDs available

0: 384x640 (no detections), 14.1ms
Speed: 3.2ms preprocess, 14.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
No track IDs available

0: 384x640 (no detections), 15.5ms
Speed: 4.3ms preprocess, 15.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
No track IDs available

0: 384x640 (no detections), 13.6ms
Speed: 3.8ms preprocess, 13.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)
No track IDs available

0: 384x640 (no detections), 14.5ms
Speed: 3.6ms preprocess, 14.5ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)
No track IDs available

0: 384x640 (no detections), 14.1ms
Speed: 2.5ms preprocess, 14.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
No track IDs available

0: 384x640 (no detections), 14.3ms
Speed: 3.0ms preprocess, 14.