In [2]:
import cv2
import numpy as np
from IPython.display import display as disp, Image as IPyImage, clear_output
import time
from ultralytics import YOLO
import os
from PIL import Image
import shutil
import matplotlib.pyplot as plt
import sys
sys.path.append("../sort/")
# from Sort import Sort


## count FPS

In [11]:
import cv2

# Path to the video file whose frame rate is being inspected
video_path = "output/output10FPS.mp4"

# Open the file; the handle is released below whether or not it opened
cap = cv2.VideoCapture(video_path)
try:
    if not cap.isOpened():
        print("Error: Could not open video.")
    else:
        # Keep the reported rate as a float: truncating with int() would print
        # 29 for a 29.97 fps source. ``:g`` still prints whole rates as "5".
        fps = cap.get(cv2.CAP_PROP_FPS)
        print(f"The FPS of the video is: {fps:g}")
finally:
    cap.release()


The FPS of the video is: 5


## ROI

In [4]:
def display_video_with_rectangles(video_path, rect1_coords, rect2_coords, frame_delay=0.04):
    """Play a video inline in the notebook with two rectangles drawn on every frame.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        rect1_coords: ``((x1, y1), (x2, y2))`` corners of the first rectangle.
        rect2_coords: Same layout, for the second rectangle.
        frame_delay: Seconds to pause after each displayed frame (default 0.04).

    Returns:
        None. Prints an error and returns early if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        cap.release()
        return

    roi_color = (0, 255, 0)  # green
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Stream ended.")
                break

            # Overlay both regions of interest
            for rect in (rect1_coords, rect2_coords):
                cv2.rectangle(frame, rect[0], rect[1], roi_color, 2)

            # Encode to JPEG so IPython can render the frame; skip frames that
            # fail to encode instead of handing an invalid buffer to the display
            encoded_ok, buffer = cv2.imencode('.jpg', frame)
            if not encoded_ok:
                continue

            disp(IPyImage(data=buffer.tobytes()))
            # wait=True defers the clear until the next frame arrives, avoiding flicker
            clear_output(wait=True)

            time.sleep(frame_delay)
    finally:
        # Runs on normal end and on KeyboardInterrupt; no OpenCV window is
        # opened by this function, so only the capture needs releasing
        cap.release()


In [7]:
# Preview the two regions of interest on the sample video.
# Each rectangle is ((x1, y1), (x2, y2)): top-left corner, then bottom-right corner.
video_path = "sample/testVideo.mp4"
rect1_coords = ((240, 60), (450, 190))  # first rectangle (the same corners are used as drop_coords in later cells)
rect2_coords = ((5, 10), (120, 140)) # second rectangle (the same corners are used as pickup_coords in later cells)

# Plays until the stream ends or the cell is interrupted
display_video_with_rectangles(video_path, rect1_coords, rect2_coords)

KeyboardInterrupt: 

## Detection

In [4]:
import cv2

# Load the hand-detection YOLO weights; `model` is read as a global by the loop below
model = YOLO('handDetection.pt')

# Input video to annotate and the file the annotated frames are written to
video_path = "./sample/testVideo2.mp4"
output_path = "./output/detection.mp4"

def draw_detections_on_frame(frame, box_data, names):
    """Draw one detection (box plus a "<class> <confidence>" tag) onto ``frame``.

    Args:
        frame: Image array to annotate; the cv2 drawing calls write into it.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup indexable by integer class id, giving the class name.

    Returns:
        The ``frame`` object that was passed in.
    """
    left, top, right, bottom, score, class_idx = box_data
    caption = f'{names[int(class_idx)]} {score:.2f}'
    box_color = [255, 0, 0]
    text_color = [225, 255, 255]

    # Stroke width scales with the mean of the frame's two dimensions
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    font_thickness = max(line_width - 1, 1)
    font_scale = font_thickness / 3

    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, box_color, thickness=line_width)

    # Filled tag sitting directly above the box's top-left corner
    text_size = cv2.getTextSize(caption, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, tag_corner, box_color, -1)

    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_scale, text_color,
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

# Open the video
cap = cv2.VideoCapture(video_path)
out = None

try:
    if not cap.isOpened():
        # Without this check a bad path silently produces an empty output file
        print("Error: Could not open video.")
    else:
        # Get the video width, height, and frames per second
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep fractional rates (e.g. 29.97) so the output plays at the source speed
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Define the codec and create a VideoWriter object to save the output video
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Run inference on the frame
            results = model(frame)

            # Draw every detection whose confidence is at least 10%
            for box_data in results[0].boxes.data.cpu().numpy():
                conf = box_data[4]  # 5th element (index 4) is the confidence score
                if conf >= 0.1:
                    frame = draw_detections_on_frame(frame, box_data, results[0].names)

            # Save the frame with detections to the output video
            out.write(frame)
finally:
    # Release the handles even if inference or drawing raises
    cap.release()
    if out is not None:
        out.release()



0: 384x640 1 Human hand, 5.5ms
Speed: 1.5ms preprocess, 5.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.9ms
Speed: 1.2ms preprocess, 5.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 6.2ms
Speed: 1.1ms preprocess, 6.2ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.3ms
Speed: 1.2ms preprocess, 5.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.9ms
Speed: 1.1ms preprocess, 5.9ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.2ms
Speed: 1.0ms preprocess, 6.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.3ms
Speed: 1.2ms preprocess, 5.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.9ms
Speed: 1.1ms preprocess, 5.9ms inference, 0.7ms postprocess 

## Confidence

In [None]:
import cv2

# Load the hand-detection YOLO weights; `model` is read as a global by the loop below
model = YOLO('handDetection.pt')

# Input video to annotate and the file the annotated frames are written to
video_path = "sample/testVideo2.mp4"
output_path = "output/detection.mp4"

def draw_detections_on_frame(frame, box_data, names):
    """Annotate ``frame`` with one detection box and its "<class> <confidence>" tag.

    Args:
        frame: Image array to annotate; the cv2 drawing calls write into it.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup indexable by integer class id, giving the class name.

    Returns:
        The ``frame`` object that was passed in.
    """
    left, top, right, bottom, score, class_idx = box_data
    caption = f'{names[int(class_idx)]} {score:.2f}'
    box_color = [255, 0, 0]
    text_color = [225, 255, 255]

    # Stroke width scales with the mean of the frame's two dimensions
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    font_thickness = max(line_width - 1, 1)
    font_scale = font_thickness / 3

    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, box_color, thickness=line_width)

    # Filled tag sitting directly above the box's top-left corner
    text_size = cv2.getTextSize(caption, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, tag_corner, box_color, -1)

    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_scale, text_color,
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

# Open the video
cap = cv2.VideoCapture(video_path)
out = None

try:
    if not cap.isOpened():
        # Without this check a bad path silently produces an empty output file
        print("Error: Could not open video.")
    else:
        # Get the video width, height, and frames per second
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep fractional rates (e.g. 29.97) so the output plays at the source speed
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Define the codec and create a VideoWriter object to save the output video
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Run inference on the frame
            results = model(frame)

            # Draw only low-confidence detections (between 10% and 30% inclusive)
            for box_data in results[0].boxes.data.cpu().numpy():
                conf = box_data[4]  # 5th element (index 4) is the confidence score
                if 0.1 <= conf <= 0.3:
                    frame = draw_detections_on_frame(frame, box_data, results[0].names)

            # Save the frame with detections to the output video
            out.write(frame)
finally:
    # Release the handles even if inference or drawing raises
    cap.release()
    if out is not None:
        out.release()


## SORT

In [11]:
# Load the hand-detection weights and create the tracker; both are read as
# globals by the function defined below.
# NOTE(review): the only visible import of Sort (first cell) is commented out, so
# Sort() raises NameError on a fresh kernel unless it is imported elsewhere — confirm.
model = YOLO('handDetection.pt')
tracker = Sort()

class State:
    """Integer phase markers for the pickup/drop counting loop."""

    WAIT_FOR_PICKUP, WAIT_FOR_DROP = 1, 2

def is_inside(box, rect):
    """Report whether the centre point of ``box`` falls within ``rect`` (edges inclusive).

    ``box`` is any iterable of ``x1, y1, x2, y2``; ``rect`` is
    ``((left, top), (right, bottom))``.
    """
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    corner_min, corner_max = rect[0], rect[1]
    within_x = corner_min[0] <= mid_x <= corner_max[0]
    return within_x and corner_min[1] <= mid_y <= corner_max[1]

def display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path):
    """Track hands with SORT, step the pickup/drop cycle counter and save the annotated video.

    Uses the module-level ``model`` (detector) and ``tracker`` (SORT instance).
    Every input frame is written to the output, including frames without
    detections, so the output has the same length as the input.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        pickup_coords: ``((x1, y1), (x2, y2))`` corners of the pickup region.
        drop_coords: Same layout, for the drop region.
        output_path: File the annotated frames are written to (mp4v codec).

    Returns:
        None. Prints an error and returns early if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        cap.release()
        return

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep fractional rates (e.g. 29.97) so the output plays at the source speed
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    state = State.WAIT_FOR_PICKUP
    count = 0
    hand_was_in_drop = False

    try:
        while True:
            ret, frame = cap.read()

            if not ret:
                print("Stream ended.")
                break

            results = model(frame)

            # SORT expects rows of [x1, y1, x2, y2, score]
            dets = [row[:5] for row in results[0].boxes.data.cpu().numpy()]

            # The tracker must be stepped on every frame, including frames with
            # no detections, so that unmatched tracks age out; an empty (0, 5)
            # array is the "nothing detected" input.
            # NOTE(review): assumes the local Sort follows the reference SORT
            # update() contract — confirm.
            trackers = tracker.update(np.array(dets) if dets else np.empty((0, 5)))

            for d in trackers:
                frame = draw_detections_on_frame(frame, d, results[0].names)

            hand_detected_in_pickup = any(is_inside(d[:4], pickup_coords) for d in trackers)
            hand_detected_in_drop = any(is_inside(d[:4], drop_coords) for d in trackers)

            # First sighting in the pickup region arms the cycle
            if state == State.WAIT_FOR_PICKUP and hand_detected_in_pickup:
                state = State.WAIT_FOR_DROP

            if state == State.WAIT_FOR_DROP and hand_detected_in_drop:
                hand_was_in_drop = True

            # Back in the pickup region after visiting the drop region: one full cycle
            if hand_was_in_drop and hand_detected_in_pickup:
                count += 1
                state = State.WAIT_FOR_DROP
                hand_was_in_drop = False

            # TODO: draw the ROI rectangles and the count/state text on the frame;
            # `count` is tracked above but not yet rendered or returned.

            # Written unconditionally: previously frames without detections were
            # skipped, which shortened the output video
            out.write(frame)
    finally:
        cap.release()
        out.release()

def draw_detections_on_frame(frame, box_data, names):
    """Draw one tracked box and its "Hand <track id>" tag onto ``frame``.

    Args:
        frame: Image array to annotate; the cv2 drawing calls write into it.
        box_data: Five values ``x1, y1, x2, y2, track_id``.
        names: Accepted for signature compatibility; not used here.

    Returns:
        The ``frame`` object that was passed in.
    """
    left, top, right, bottom, track_id = box_data
    caption = f'Hand {track_id:.0f}'
    box_color = [255, 0, 0]
    text_color = [225, 255, 255]

    # Stroke width scales with the mean of the frame's two dimensions
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    font_thickness = max(line_width - 1, 1)
    font_scale = font_thickness / 3

    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, box_color, thickness=line_width)

    # Filled tag sitting directly above the box's top-left corner
    text_size = cv2.getTextSize(caption, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, tag_corner, box_color, -1)

    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_scale, text_color,
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

# Run the SORT-based pipeline on the second sample video.
video_path = "sample/testVideo2.mp4"
output_path = "output/output2SORT.mp4"
# Disabled alternative ROI, kept for reference: ((1050, 1550), (1830, 2500))
# Regions are ((x1, y1), (x2, y2)): top-left corner, then bottom-right corner.
pickup_coords = ((5, 10), (120, 140))
drop_coords = ((240, 60), (450, 190))

display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path)



0: 384x640 1 Human hand, 6.3ms
Speed: 1.4ms preprocess, 6.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.6ms
Speed: 3.7ms preprocess, 5.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.8ms
Speed: 1.3ms preprocess, 5.8ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.3ms
Speed: 1.2ms preprocess, 5.3ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.4ms
Speed: 1.3ms preprocess, 6.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.3ms
Speed: 1.0ms preprocess, 5.3ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.3ms
Speed: 1.4ms preprocess, 5.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 6.2ms
Speed: 1.4ms preprocess, 6.2ms inference, 1.1ms postprocess 

Stream ended.


## 10 FPS

In [2]:
# Load the hand-detection YOLO weights; `model` is read as a global by the function below
model = YOLO('handDetection.pt')

class State:
    """Integer phase markers for the pickup/drop counting loop."""

    WAIT_FOR_PICKUP, WAIT_FOR_DROP = 1, 2

def is_inside(box, rect):
    """Report whether the centre point of ``box`` falls within ``rect`` (edges inclusive).

    ``box`` is any iterable of ``x1, y1, x2, y2``; ``rect`` is
    ``((left, top), (right, bottom))``.
    """
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    corner_min, corner_max = rect[0], rect[1]
    within_x = corner_min[0] <= mid_x <= corner_max[0]
    return within_x and corner_min[1] <= mid_y <= corner_max[1]

def is_on_side(box, rect, side):
    """Tell on which half of ``rect`` the centre of ``box`` lies.

    The rectangle is split by its horizontal and vertical mid-lines. A centre
    exactly on a mid-line counts as being on both adjoining sides.

    Args:
        box: Iterable of ``x1, y1, x2, y2``.
        rect: ``((left, top), (right, bottom))``.
        side: One of ``"bottom"``, ``"top"``, ``"right"``, ``"left"``.

    Returns:
        True when the box centre is on the requested half; False otherwise,
        including for an unrecognised ``side``.
    """
    x1, y1, x2, y2 = box
    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    mid_x = (rect[1][0] + rect[0][0]) / 2
    mid_y = (rect[1][1] + rect[0][1]) / 2

    if side == "bottom":
        return center_y >= mid_y
    if side == "top":
        return center_y <= mid_y
    if side == "right":
        return center_x >= mid_x
    if side == "left":
        return center_x <= mid_x
    return False


def display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path, pickup_sides, drop_sides):
    """Count pickup -> drop -> pickup hand cycles on a down-sampled video and save the annotated frames.

    Only every N-th source frame is processed, where N is the source rate
    divided by the 5 fps target; processed frames are annotated and written
    to ``output_path``. Uses the module-level ``model``.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        pickup_coords: ``((x1, y1), (x2, y2))`` corners of the pickup region.
        drop_coords: Same layout, for the drop region.
        output_path: File the annotated frames are written to (mp4v codec).
        pickup_sides: Side names passed to ``is_on_side`` for the pickup region.
        drop_sides: Side names passed to ``is_on_side`` for the drop region.

    Returns:
        None. Prints an error and returns early if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        cap.release()
        return

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    original_fps = int(cap.get(cv2.CAP_PROP_FPS))
    print(original_fps)

    # Target rate of the down-sampled output video
    fps = 5
    # Process every N-th frame. Clamped to 1 so that a source slower than the
    # target (or one reporting 0 fps) does not lead to a modulo-by-zero.
    frame_stride = max(1, original_fps // fps)

    frame_count = 0

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    state = State.WAIT_FOR_PICKUP
    count = 0
    hand_was_in_drop = False

    try:
        while True:
            ret, frame = cap.read()

            if not ret:
                print("Stream ended.")
                break

            if frame_count % frame_stride == 0:
                results = model(frame, conf=0.05, iou=0.5)
                # Convert once per frame instead of once per check
                detections = results[0].boxes.data.cpu().numpy()
                boxes = [tuple(int(v) for v in det[:4]) for det in detections]

                hand_detected_in_pickup = any(is_inside(b, pickup_coords) for b in boxes)
                hand_detected_in_drop = any(is_inside(b, drop_coords) for b in boxes)

                # True when any box is on any of the listed sides (not
                # necessarily the same box that is inside the region)
                detected_on_pickup_side = any(is_on_side(b, pickup_coords, side) for side in pickup_sides for b in boxes)
                detected_on_drop_side = any(is_on_side(b, drop_coords, side) for side in drop_sides for b in boxes)

                # First sighting in the pickup region arms the cycle
                if state == State.WAIT_FOR_PICKUP and hand_detected_in_pickup and detected_on_pickup_side:
                    state = State.WAIT_FOR_DROP

                if state == State.WAIT_FOR_DROP and hand_detected_in_drop and detected_on_drop_side:
                    hand_was_in_drop = True

                # Back in the pickup region after visiting the drop region: one full cycle
                if hand_was_in_drop and hand_detected_in_pickup and detected_on_pickup_side:
                    count += 1
                    state = State.WAIT_FOR_DROP
                    hand_was_in_drop = False

                for box_data in detections:
                    frame = draw_detections_on_frame(frame, box_data, results[0].names)

                # Status overlay in the top-right corner
                cv2.putText(frame, f"Count: {count}", (width - 150, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                status_lines = (
                    (f"Pickup: {hand_detected_in_pickup}", 60),
                    (f"Drop: {hand_detected_in_drop}", 90),
                    (f"Was in Drop: {hand_was_in_drop}", 120),
                    (f"State: {state}", 150),
                )
                for text, y in status_lines:
                    cv2.putText(frame, text, (width - 150, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
                out.write(frame)

            frame_count += 1

    finally:
        cap.release()
        out.release()

def draw_detections_on_frame(frame, box_data, names):
    """Draw one detection (box plus a "<class> <confidence>" tag) onto ``frame``.

    Args:
        frame: Image array to annotate; the cv2 drawing calls write into it.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup indexable by integer class id, giving the class name.

    Returns:
        The ``frame`` object that was passed in.
    """
    left, top, right, bottom, score, class_idx = box_data
    caption = f'{names[int(class_idx)]} {score:.2f}'
    box_color = [255, 0, 0]
    text_color = [225, 255, 255]

    # Stroke width scales with the mean of the frame's two dimensions
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    font_thickness = max(line_width - 1, 1)
    font_scale = font_thickness / 3

    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, box_color, thickness=line_width)

    # Filled tag sitting directly above the box's top-left corner
    text_size = cv2.getTextSize(caption, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, tag_corner, box_color, -1)

    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_scale, text_color,
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

# Run the side-aware counter on the second sample video.
video_path = "sample/testVideo2.mp4"
output_path = "output/output10FPS.mp4"
# Regions are ((x1, y1), (x2, y2)): top-left corner, then bottom-right corner.
pickup_coords = ((5, 10), (120, 140))
drop_coords = ((240, 60), (450, 190))
# Side names forwarded to is_on_side for each region
pickup_sides = ["bottom", "right"]
drop_sides = ["bottom"]

display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path, pickup_sides, drop_sides)


25



0: 384x640 1 Human hand, 23.1ms
Speed: 10.7ms preprocess, 23.1ms inference, 16.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Human hands, 5.6ms
Speed: 2.6ms preprocess, 5.6ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.9ms
Speed: 1.4ms preprocess, 5.9ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 6.3ms
Speed: 1.4ms preprocess, 6.3ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.9ms
Speed: 1.3ms preprocess, 5.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.8ms
Speed: 1.4ms preprocess, 5.8ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 6.4ms
Speed: 1.2ms preprocess, 6.4ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.9ms
Speed: 1.2ms preprocess, 5.9ms inference, 1.5ms postprocess p

Stream ended.


In [4]:
# Load the hand-detection YOLO weights; `model` is read as a global by the function below
model = YOLO('handDetection.pt')

class State:
    """Integer phase markers for the pickup/drop counting loop."""

    WAIT_FOR_PICKUP, WAIT_FOR_DROP = 1, 2

def is_inside(box, rect):
    """Report whether the centre point of ``box`` falls within ``rect`` (edges inclusive).

    ``box`` is any iterable of ``x1, y1, x2, y2``; ``rect`` is
    ``((left, top), (right, bottom))``.
    """
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    corner_min, corner_max = rect[0], rect[1]
    within_x = corner_min[0] <= mid_x <= corner_max[0]
    return within_x and corner_min[1] <= mid_y <= corner_max[1]

def display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path):
    """Count pickup -> drop -> pickup hand cycles on a down-sampled video and save the annotated frames.

    Only every N-th source frame is processed, where N is the source rate
    divided by the 5 fps target; processed frames are annotated and written
    to ``output_path``. Uses the module-level ``model``.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        pickup_coords: ``((x1, y1), (x2, y2))`` corners of the pickup region.
        drop_coords: Same layout, for the drop region.
        output_path: File the annotated frames are written to (mp4v codec).

    Returns:
        None. Prints an error and returns early if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        cap.release()
        return

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    original_fps = int(cap.get(cv2.CAP_PROP_FPS))
    print(original_fps)

    # Target rate of the down-sampled output video
    fps = 5
    # Process every N-th frame. Clamped to 1 so that a source slower than the
    # target (or one reporting 0 fps) does not lead to a modulo-by-zero.
    frame_stride = max(1, original_fps // fps)

    frame_count = 0

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    state = State.WAIT_FOR_PICKUP
    count = 0
    hand_was_in_drop = False

    try:
        while True:
            ret, frame = cap.read()

            if not ret:
                print("Stream ended.")
                break

            if frame_count % frame_stride == 0:
                results = model(frame, conf=0.05, iou=0.5)
                # Convert once per frame instead of once per check
                detections = results[0].boxes.data.cpu().numpy()
                boxes = [tuple(int(v) for v in det[:4]) for det in detections]

                hand_detected_in_pickup = any(is_inside(b, pickup_coords) for b in boxes)
                hand_detected_in_drop = any(is_inside(b, drop_coords) for b in boxes)

                # First sighting in the pickup region arms the cycle
                if state == State.WAIT_FOR_PICKUP and hand_detected_in_pickup:
                    state = State.WAIT_FOR_DROP

                if state == State.WAIT_FOR_DROP and hand_detected_in_drop:
                    hand_was_in_drop = True

                # Back in the pickup region after visiting the drop region: one full cycle
                if hand_was_in_drop and hand_detected_in_pickup:
                    count += 1
                    state = State.WAIT_FOR_DROP
                    hand_was_in_drop = False

                for box_data in detections:
                    frame = draw_detections_on_frame(frame, box_data, results[0].names)

                # Status overlay in the top-right corner
                cv2.putText(frame, f"Count: {count}", (width - 150, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                status_lines = (
                    (f"Pickup: {hand_detected_in_pickup}", 60),
                    (f"Drop: {hand_detected_in_drop}", 90),
                    (f"Was in Drop: {hand_was_in_drop}", 120),
                    (f"State: {state}", 150),
                )
                for text, y in status_lines:
                    cv2.putText(frame, text, (width - 150, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
                out.write(frame)

            frame_count += 1

    finally:
        cap.release()
        out.release()

def draw_detections_on_frame(frame, box_data, names):
    """Annotate ``frame`` with one detection box and its "<class> <confidence>" tag.

    Args:
        frame: Image array to annotate; the cv2 drawing calls write into it.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup indexable by integer class id, giving the class name.

    Returns:
        The ``frame`` object that was passed in.
    """
    left, top, right, bottom, score, class_idx = box_data
    caption = f'{names[int(class_idx)]} {score:.2f}'
    box_color = [255, 0, 0]
    text_color = [225, 255, 255]

    # Stroke width scales with the mean of the frame's two dimensions
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    font_thickness = max(line_width - 1, 1)
    font_scale = font_thickness / 3

    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, box_color, thickness=line_width)

    # Filled tag sitting directly above the box's top-left corner
    text_size = cv2.getTextSize(caption, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, tag_corner, box_color, -1)

    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_scale, text_color,
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

# Run the down-sampled counter (no side checks) on the second sample video.
video_path = "sample/testVideo2.mp4"
# NOTE: same output file as the side-aware run in the previous cell, so this run overwrites it
output_path = "output/output10FPS.mp4"
# Regions are ((x1, y1), (x2, y2)): top-left corner, then bottom-right corner.
pickup_coords = ((5, 10), (120, 140))
drop_coords = ((240, 60), (450, 190))

display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path)



0: 384x640 1 Human hand, 6.7ms
Speed: 1.9ms preprocess, 6.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Human hands, 6.7ms
Speed: 1.8ms preprocess, 6.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 6.0ms
Speed: 1.5ms preprocess, 6.0ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.3ms
Speed: 2.1ms preprocess, 5.3ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.5ms
Speed: 1.2ms preprocess, 6.5ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.3ms
Speed: 1.4ms preprocess, 6.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.6ms
Speed: 1.1ms preprocess, 5.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.3ms
Speed: 1.3ms preprocess, 5.3ms inference, 0.7ms postprocess per i

25


0: 384x640 2 Human hands, 5.3ms
Speed: 1.2ms preprocess, 5.3ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Human hands, 5.3ms
Speed: 1.2ms preprocess, 5.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.3ms
Speed: 1.2ms preprocess, 5.3ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 5.3ms
Speed: 1.4ms preprocess, 5.3ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 6.2ms
Speed: 1.2ms preprocess, 6.2ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 6.2ms
Speed: 1.1ms preprocess, 6.2ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Human hands, 5.6ms
Speed: 1.0ms preprocess, 5.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Human hands, 5.6ms
Speed: 1.0ms preprocess, 5.6ms inference, 0.7ms postprocess per image

Stream ended.


## Tracking

In [None]:
from ultralytics import YOLO

# Load the hand-detection weights and run tracking over the whole sample video
# with the "bytetrack.yaml" tracker configuration, showing frames (show=True)
# and saving the result (save=True).
# NOTE(review): the captured log below recommends stream=True for large sources
# or long-running videos — consider it for this video.
model = YOLO('handDetection.pt')
results = model.track(source="sample/testVideo.mp4", conf=0.1, iou=0.1, show=True, tracker="bytetrack.yaml", save=True)



errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (1/6849) /home/arpit/Testing/texRND/sample/testVideo.mp4: 384x640 1 Human hand, 24.5ms
video 1/1 (2/6849) /home/arpit/Testing/texRND/sample/testVideo.mp4: 384x640 1 Human hand, 5.3ms
video 1/1 (3/6849) /home/arpit/Testing/texRND/sample/testVideo.mp4: 384x640 1 Human hand, 5.6ms
video 1/1 (4/6849) /home/arpit/Testing/texRND/sample/testVideo.mp4: 384x640 (no detections), 6.4ms
video 1/1 (5/6849) /home/arpit/Testing/texRND/sample/testVideo.mp4: 384x640 (no detections), 6.3ms
video 1/1 (6/6849) /home/arpit/Testing/texRND/sample/testVideo.mp4: 384x640 1 Human h

In [None]:
# Load the hand-detection weights and run tracking over the whole sample video.
# `model` is read as a global by the function below. The `results` bound here is
# not read again in this cell: the function rebinds its own local `results` per frame.
model = YOLO('handDetection.pt')
results = model.track(source="sample/testVideo.mp4", conf=0.3, iou=0.5, show=True)

class State:
    """Integer phase markers for the pickup/drop counting loop."""

    WAIT_FOR_PICKUP, WAIT_FOR_DROP = 1, 2

def is_inside(box, rect):
    """Report whether the centre point of ``box`` falls within ``rect`` (edges inclusive).

    ``box`` is any iterable of ``x1, y1, x2, y2``; ``rect`` is
    ``((left, top), (right, bottom))``.
    """
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    corner_min, corner_max = rect[0], rect[1]
    within_x = corner_min[0] <= mid_x <= corner_max[0]
    return within_x and corner_min[1] <= mid_y <= corner_max[1]

def display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path):
    """Count pickup -> drop -> pickup hand cycles on every frame and save the annotated video.

    Each frame is run through the module-level ``model``, annotated with the
    detections, a status overlay and both regions, then written to ``output_path``.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        pickup_coords: ``((x1, y1), (x2, y2))`` corners of the pickup region.
        drop_coords: Same layout, for the drop region.
        output_path: File the annotated frames are written to (mp4v codec).

    Returns:
        None. Prints an error and returns early if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        cap.release()
        return

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep fractional rates (e.g. 29.97) so the output plays at the source speed
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    state = State.WAIT_FOR_PICKUP
    count = 0
    hand_was_in_drop = False

    try:
        while True:
            ret, frame = cap.read()

            if not ret:
                print("Stream ended.")
                break

            results = model(frame)
            # Convert once per frame instead of once per check
            detections = results[0].boxes.data.cpu().numpy()
            boxes = [tuple(int(v) for v in det[:4]) for det in detections]

            hand_detected_in_pickup = any(is_inside(b, pickup_coords) for b in boxes)
            hand_detected_in_drop = any(is_inside(b, drop_coords) for b in boxes)

            # First sighting in the pickup region arms the cycle
            if state == State.WAIT_FOR_PICKUP and hand_detected_in_pickup:
                state = State.WAIT_FOR_DROP

            if state == State.WAIT_FOR_DROP and hand_detected_in_drop:
                hand_was_in_drop = True

            # Back in the pickup region after visiting the drop region: one full cycle
            if hand_was_in_drop and hand_detected_in_pickup:
                count += 1
                state = State.WAIT_FOR_DROP
                hand_was_in_drop = False

            for box_data in detections:
                frame = draw_detections_on_frame(frame, box_data, results[0].names)

            # Status overlay in the top-right corner
            cv2.putText(frame, f"Count: {count}", (width - 150, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            status_lines = (
                (f"Pickup: {hand_detected_in_pickup}", 60),
                (f"Drop: {hand_detected_in_drop}", 90),
                (f"Was in Drop: {hand_was_in_drop}", 120),
                (f"State: {state}", 150),
            )
            for text, y in status_lines:
                cv2.putText(frame, text, (width - 150, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            # Outline both regions of interest
            cv2.rectangle(frame, pickup_coords[0], pickup_coords[1], (0, 255, 0), 2)
            cv2.rectangle(frame, drop_coords[0], drop_coords[1], (0, 255, 0), 2)
            out.write(frame)
    finally:
        cap.release()
        out.release()


def draw_detections_on_frame(frame, box_data, names):
    """Draw one detection (box plus a "<class> <confidence>" tag) onto ``frame``.

    Args:
        frame: Image array to annotate; the cv2 drawing calls write into it.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup indexable by integer class id, giving the class name.

    Returns:
        The ``frame`` object that was passed in.
    """
    left, top, right, bottom, score, class_idx = box_data
    caption = f'{names[int(class_idx)]} {score:.2f}'
    box_color = [255, 0, 0]
    text_color = [225, 255, 255]

    # Stroke width scales with the mean of the frame's two dimensions
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    font_thickness = max(line_width - 1, 1)
    font_scale = font_thickness / 3

    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, box_color, thickness=line_width)

    # Filled tag sitting directly above the box's top-left corner
    text_size = cv2.getTextSize(caption, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, tag_corner, box_color, -1)

    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_scale, text_color,
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

# Run the full-rate counter on the first sample video.
video_path = "sample/testVideo.mp4"
output_path = "output/output.mp4"
# Disabled alternative ROI, kept for reference: ((1050, 1550), (1830, 2500))
# Regions are ((x1, y1), (x2, y2)): top-left corner, then bottom-right corner.
pickup_coords = ((5, 10), (120, 140))
drop_coords = ((240, 60), (450, 190))

display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path)


## countCycle

In [25]:
import cv2
from ultralytics import YOLO

# Load the hand-detection YOLO weights; `model` is read as a global by process_video below
model = YOLO('handDetection.pt')  # path is relative to the notebook's working directory; adjust if the weights live elsewhere

class State:
    """Integer phase markers for the pickup/drop counting loop."""

    WAIT_FOR_PICKUP, WAIT_FOR_DROP = 1, 2

def is_inside(box, rect):
    """Report whether the centre point of ``box`` falls within ``rect`` (edges inclusive).

    ``box`` is any iterable of ``x1, y1, x2, y2``; ``rect`` is
    ``((left, top), (right, bottom))``.
    """
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    corner_min, corner_max = rect[0], rect[1]
    within_x = corner_min[0] <= mid_x <= corner_max[0]
    return within_x and corner_min[1] <= mid_y <= corner_max[1]

def detect_entry_side(box, rect):
    """Name the side of ``rect`` that the centre of ``box`` lies beyond.

    Box coordinates are truncated to integers first. Horizontal checks take
    precedence over vertical ones: the result is 'left' or 'right' when the
    centre is outside horizontally, otherwise 'top' or 'bottom' when outside
    vertically, and ``None`` when the centre is within the rectangle.
    """
    left, top, right, bottom = (int(v) for v in box)
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    min_x, min_y = rect[0][0], rect[0][1]
    max_x, max_y = rect[1][0], rect[1][1]

    # Ordered checks; the first one that holds decides the answer
    ordered_checks = (
        (mid_x < min_x, 'left'),
        (mid_x > max_x, 'right'),
        (mid_y < min_y, 'top'),
        (mid_y > max_y, 'bottom'),
    )
    for is_beyond, side_name in ordered_checks:
        if is_beyond:
            return side_name
    return None

def draw_detections_on_frame(frame, box_data, names, entry_side, specified_sides):
    """Draw one detection onto ``frame``, coloured by whether its entry side is an accepted one.

    Args:
        frame: Image array to annotate; the cv2 drawing calls write into it.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup indexable by integer class id, giving the class name.
        entry_side: Side label for this detection (may be ``None``).
        specified_sides: Container of accepted side labels.

    Returns:
        The ``frame`` object that was passed in.
    """
    left, top, right, bottom, score, class_idx = box_data
    caption = f'{names[int(class_idx)]} {score:.2f}'

    # [255, 0, 0] when the side is one of the accepted ones, [0, 255, 0] otherwise
    if entry_side in specified_sides:
        box_color = [255, 0, 0]
    else:
        box_color = [0, 255, 0]
    text_color = [225, 255, 255]

    # Stroke width scales with the mean of the frame's two dimensions
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    font_thickness = max(line_width - 1, 1)
    font_scale = font_thickness / 3

    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, box_color, thickness=line_width)

    # Filled tag sitting directly above the box's top-left corner
    text_size = cv2.getTextSize(caption, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, tag_corner, box_color, -1)

    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_scale, text_color,
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

def process_video(video_url, pickup_coords, drop_coords, pickup_sides, drop_sides, video_index, output_video_path):
    """Count pickup -> drop cycles in a video and write an annotated copy.

    Detection runs on every ``max(1, fps // 5)``-th frame. A hand counts as
    being in a zone only when its box centre is inside the zone AND the last
    position seen outside that zone was on one of the allowed sides. A cycle
    is counted when, after a valid pickup, a hand has validly been in the
    drop zone and is then no longer detected there.

    Args:
        video_url: Path or URL handed to ``cv2.VideoCapture``.
        pickup_coords: Pickup zone as ``((x1, y1), (x2, y2))``.
        drop_coords: Drop zone, same layout.
        pickup_sides: Allowed approach sides for the pickup zone, using the
            labels returned by ``detect_entry_side``.
        drop_sides: Allowed approach sides for the drop zone.
        video_index: Not used; kept so existing calls keep working.
        output_video_path: Destination of the annotated ``mp4v`` video.

    Raises:
        ValueError: If the input video cannot be opened.

    Note:
        The approach side is remembered per zone, not per hand, so the check
        is only reliable when a single hand is visible.
    """
    cap = cv2.VideoCapture(video_url)
    if not cap.isOpened():
        raise ValueError(f"Couldn't open video stream from URL: {video_url}")

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    original_fps = int(cap.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'mp4v')
    # NOTE(review): every source frame is written below while the writer is
    # declared at 5 fps, so playback slows down when the source is faster --
    # confirm whether that is intended.
    out = cv2.VideoWriter(output_video_path, codec, 5, (width, height))

    # Sample about 5 detections per second of video. The max() guard keeps
    # the modulo below valid when the reported FPS is under 5 (or 0).
    detection_stride = max(1, original_fps // 5)

    frame_count = 0
    state = State.WAIT_FOR_PICKUP
    count = 0
    hand_was_in_drop = False
    # Side of each zone on which a hand centre was most recently seen while
    # outside that zone. detect_entry_side() returns None for a centre that
    # is inside, so the side must be carried over from earlier frames.
    approach_side_pickup = None
    approach_side_drop = None

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % detection_stride == 0:
                # Perform detection
                results = model(frame, conf=0.05, iou=0.2)

                hand_detected_in_pickup = False
                hand_detected_in_drop = False
                # Updates collected here take effect after the loop, so every
                # box in this frame is judged against the previous frames.
                next_side_pickup = approach_side_pickup
                next_side_drop = approach_side_drop

                for box_data in results[0].boxes.data.cpu().numpy():
                    box = box_data[:4]
                    in_pickup = is_inside(map(int, box), pickup_coords)
                    in_drop = is_inside(map(int, box), drop_coords)

                    if in_pickup:
                        if approach_side_pickup in pickup_sides:
                            hand_detected_in_pickup = True
                    else:
                        side = detect_entry_side(box, pickup_coords)
                        if side is not None:
                            next_side_pickup = side

                    if in_drop:
                        if approach_side_drop in drop_sides:
                            hand_detected_in_drop = True
                    else:
                        side = detect_entry_side(box, drop_coords)
                        if side is not None:
                            next_side_drop = side

                    # Highlight a box only when it sits in a zone that was
                    # entered from an allowed side.
                    if in_pickup:
                        shown_side, allowed_sides = approach_side_pickup, pickup_sides
                    elif in_drop:
                        shown_side, allowed_sides = approach_side_drop, drop_sides
                    else:
                        shown_side, allowed_sides = None, ()
                    frame = draw_detections_on_frame(frame, box_data, results[0].names, shown_side, allowed_sides)

                approach_side_pickup = next_side_pickup
                approach_side_drop = next_side_drop

                if state == State.WAIT_FOR_PICKUP and hand_detected_in_pickup:
                    state = State.WAIT_FOR_DROP
                    hand_was_in_drop = False

                if state == State.WAIT_FOR_DROP and hand_detected_in_drop:
                    hand_was_in_drop = True

                # The cycle completes once the hand has left the drop zone.
                if state == State.WAIT_FOR_DROP and hand_was_in_drop and not hand_detected_in_drop:
                    state = State.WAIT_FOR_PICKUP
                    count += 1

            frame = cv2.putText(frame, f'Count: {count}', (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            out.write(frame)
            frame_count += 1

    finally:
        cap.release()
        out.release()
        print("Released video resources.")

# Sample usage: count cycles in the test clip with side-restricted zone entry.
process_video(
    'videos/testVideo2.mp4', 
    # Zones are ((x1, y1), (x2, y2)) corner pairs.
    pickup_coords=((5, 10), (120, 140)), 
    drop_coords=((240, 60), (450, 190)), 
    # Side labels are matched against the strings returned by detect_entry_side.
    pickup_sides = ["bottom", "left"],
    drop_sides = ["bottom"],
    # video_index is accepted by process_video but not read inside it.
    video_index=1, 
    output_video_path='output/output2.mp4'
)


In [28]:
import cv2
from time import time
from ultralytics import YOLO

# Load the detection weights once; process_video in this cell reads `model` as a global.
model = YOLO('handDetection.pt')  # Relative path -- make sure it points at your model weights

class State:
    """Integer tags for the two phases of a pickup -> drop cycle."""

    # 1: waiting for a hand in the pickup zone; 2: waiting for a hand in the drop zone.
    WAIT_FOR_PICKUP, WAIT_FOR_DROP = 1, 2

def is_inside(box, rect):
    """Tell whether the centre of ``box`` falls within ``rect`` (borders included).

    ``box`` is any iterable yielding four numbers ``x1, y1, x2, y2``;
    ``rect`` is indexed as ``((x_min, y_min), (x_max, y_max))``.
    """
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    within_x = rect[0][0] <= mid_x <= rect[1][0]
    within_y = rect[0][1] <= mid_y <= rect[1][1]
    return within_x and within_y

def draw_detections_on_frame(frame, box_data, names):
    """Draw one detection box and its ``"<class name> <confidence>"`` label on ``frame``.

    Args:
        frame: Image array; drawn on in place and also returned.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup from integer class id to class name.

    Returns:
        The same ``frame`` object.
    """
    left, top, right, bottom, conf, cls = box_data
    caption = f'{names[int(cls)]} {conf:.2f}'
    color = [255, 0, 0]

    # Line width grows with the mean of the frame's two leading dimensions.
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, color, thickness=line_width)

    font_thickness = max(line_width - 1, 1)
    text_size = cv2.getTextSize(caption, 0, fontScale=font_thickness / 3, thickness=font_thickness)[0]
    # Filled label background sitting directly above the box's top-left corner.
    label_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, label_corner, color, -1)
    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_thickness / 3, [225, 255, 255], thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

def process_video(video_url, pickup_coords, drop_coords, video_index, output_video_path):
    """Count pickup/drop cycles in a video and write the sampled, annotated frames.

    Detection runs on every ``max(1, fps // 5)``-th frame, and only those
    frames are written to the 5-fps output. A cycle is counted when a hand is
    detected in the pickup zone after one was seen in the drop zone while
    waiting for a drop; that same detection also starts the next cycle.

    Args:
        video_url: Path or URL handed to ``cv2.VideoCapture``.
        pickup_coords: Pickup zone as ``((x1, y1), (x2, y2))``.
        drop_coords: Drop zone, same layout.
        video_index: Not used; kept so existing calls keep working.
        output_video_path: Destination of the annotated ``mp4v`` video.

    Raises:
        ValueError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(video_url)
    if not cap.isOpened():
        raise ValueError(f"Couldn't open video stream from URL: {video_url}")

    target_fps = 5
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    original_fps = int(cap.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, codec, target_fps, (width, height))

    # The max() guard keeps the modulo below valid when the reported FPS is
    # lower than the target rate (or 0).
    detection_stride = max(1, original_fps // target_fps)

    frame_count = 0
    state = State.WAIT_FOR_PICKUP
    count = 0
    hand_was_in_drop = False

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % detection_stride == 0:
                # Perform detection and pull the box array out of the result once.
                results = model(frame, conf=0.05, iou=0.2)
                boxes = results[0].boxes.data.cpu().numpy()
                hand_detected_in_pickup = any(is_inside(map(int, box_data[:4]), pickup_coords) for box_data in boxes)
                hand_detected_in_drop = any(is_inside(map(int, box_data[:4]), drop_coords) for box_data in boxes)

                if state == State.WAIT_FOR_PICKUP and hand_detected_in_pickup:
                    state = State.WAIT_FOR_DROP

                if state == State.WAIT_FOR_DROP and hand_detected_in_drop:
                    hand_was_in_drop = True

                # Returning to the pickup zone after a drop closes one cycle
                # and immediately opens the next, so the state stays at
                # WAIT_FOR_DROP.
                if hand_was_in_drop and hand_detected_in_pickup:
                    count += 1
                    state = State.WAIT_FOR_DROP
                    hand_was_in_drop = False

                # Process detections
                for box_data in boxes:
                    frame = draw_detections_on_frame(frame, box_data, results[0].names)

                # Draw text and additional info on frame
                cv2.putText(frame, f"Count: {count}", (width - 150, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.putText(frame, f"Pickup: {hand_detected_in_pickup}", (width - 150, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
                cv2.putText(frame, f"Drop: {hand_detected_in_drop}", (width - 150, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
                cv2.putText(frame, f"Was in Drop: {hand_was_in_drop}", (width - 150, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
                cv2.putText(frame, f"State: {state}", (width - 150, 150), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
                cv2.rectangle(frame, pickup_coords[0], pickup_coords[1], (0, 255, 0), 2)
                cv2.rectangle(frame, drop_coords[0], drop_coords[1], (0, 255, 0), 2)

                out.write(frame)

            frame_count += 1

    finally:
        cap.release()
        out.release()
        print("Released video resources.")

# Sample usage: count cycles in the test clip and save the annotated frames.
# Zones are ((x1, y1), (x2, y2)) corner pairs; video_index is accepted by
# process_video but not read inside it.
process_video('videos/testVideo2.mp4', 
              pickup_coords=((5, 10), (120, 140)), 
              drop_coords=((240, 60), (450, 190)), 
              video_index=1, 
              output_video_path='output/output2.mp4')




0: 384x640 1 Human hand, 93.4ms
Speed: 4.0ms preprocess, 93.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 89.5ms
Speed: 4.0ms preprocess, 89.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 91.5ms
Speed: 5.0ms preprocess, 91.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 110.2ms
Speed: 7.0ms preprocess, 110.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 103.5ms
Speed: 3.0ms preprocess, 103.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 96.6ms
Speed: 4.0ms preprocess, 96.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 105.5ms
Speed: 3.1ms preprocess, 105.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 101.2ms
Speed: 4.0ms preprocess, 101.2ms inference, 

Released video resources.


## ROI and detection

In [4]:
# Load the YOLO weights once; display_video_with_detections_and_rectangles
# below calls `model` as a global.
model = YOLO('handDetection.pt')

class State:
    """Integer tags for the two phases of a pickup -> drop cycle."""

    # 1: waiting for a hand in the pickup zone; 2: waiting for a hand in the drop zone.
    WAIT_FOR_PICKUP, WAIT_FOR_DROP = 1, 2


def is_inside(box, rect):
    """Tell whether the centre of ``box`` falls within ``rect`` (borders included).

    ``box`` is any iterable yielding four numbers ``x1, y1, x2, y2``;
    ``rect`` is indexed as ``((x_min, y_min), (x_max, y_max))``.
    """
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    within_x = rect[0][0] <= mid_x <= rect[1][0]
    within_y = rect[0][1] <= mid_y <= rect[1][1]
    return within_x and within_y

def is_on_edge(box, rect, sides, tolerance=5):
    """Check whether the centre of ``box`` lies on one of the named edges of ``rect``.

    A centre is "on" an edge when it is within ``tolerance`` of the edge's
    coordinate AND within the rectangle's extent (widened by ``tolerance``)
    along the edge. Without the second condition any point on the infinite
    line through an edge would match, however far from the rectangle.

    Args:
        box: Iterable of four numbers ``x1, y1, x2, y2``.
        rect: ``((x_min, y_min), (x_max, y_max))``.
        sides: Container of edge names to test: 'left', 'right', 'top', 'bottom'.
        tolerance: Half-width of the band around each edge, in the same
            units as the coordinates. Defaults to 5.

    Returns:
        True if the centre lies on at least one requested edge, else False.
    """
    x1, y1, x2, y2 = box
    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    left, top = rect[0][0], rect[0][1]
    right, bottom = rect[1][0], rect[1][1]

    def near(value, target):
        return target - tolerance <= value <= target + tolerance

    # Position along the edge: must fall within the rectangle's span.
    within_width = left - tolerance <= center_x <= right + tolerance
    within_height = top - tolerance <= center_y <= bottom + tolerance

    if 'left' in sides and within_height and near(center_x, left):
        return True
    if 'right' in sides and within_height and near(center_x, right):
        return True
    if 'top' in sides and within_width and near(center_y, top):
        return True
    if 'bottom' in sides and within_width and near(center_y, bottom):
        return True
    return False

def display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path):
    """Annotate a video with detections, zones and a cycle count, and save it.

    Every frame is passed through the global ``model``. The state machine
    advances once per frame from that frame's detections: a hand in the
    pickup zone moves WAIT_FOR_PICKUP to WAIT_FOR_DROP; a hand in the drop
    zone moves it back and increments the count. Frames are written as fast
    as they are processed; nothing is displayed, so no delay is inserted.

    Args:
        video_path: Input video handed to ``cv2.VideoCapture``.
        pickup_coords: Pickup zone as ``((x1, y1), (x2, y2))``.
        drop_coords: Drop zone, same layout.
        output_path: Destination of the annotated ``mp4v`` video, written at
            the FPS reported for the input.

    Returns:
        None. Prints an error and returns early if the input cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    state = State.WAIT_FOR_PICKUP
    count = 0
    # Never updated in this variant; kept because the overlay prints it.
    hand_was_in_drop = False

    try:
        while True:
            ret, frame = cap.read()

            if not ret:
                print("Stream ended.")
                break

            results = model(frame)
            # Extract the box array once and reuse it for both zone tests and drawing.
            boxes = results[0].boxes.data.cpu().numpy()
            hand_detected_in_pickup = any(is_inside(map(int, box_data[:4]), pickup_coords) for box_data in boxes)
            hand_detected_in_drop = any(is_inside(map(int, box_data[:4]), drop_coords) for box_data in boxes)

            # Update state
            if state == State.WAIT_FOR_PICKUP and hand_detected_in_pickup:
                state = State.WAIT_FOR_DROP
            elif state == State.WAIT_FOR_DROP and hand_detected_in_drop:
                state = State.WAIT_FOR_PICKUP
                count += 1

            # Drawing detections
            # NOTE(review): the one-entry name list assumes every detection
            # has class id 0 -- confirm the model has a single class.
            for box_data in boxes:
                frame = draw_detections_on_frame(frame, box_data, ["Hand"])  # Assuming "Hand" is the class name

            cv2.putText(frame, f"Count: {count}", (width - 150, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(frame, f"Pickup: {hand_detected_in_pickup}", (width - 150, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
            cv2.putText(frame, f"Drop: {hand_detected_in_drop}", (width - 150, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
            cv2.putText(frame, f"Was in Drop: {hand_was_in_drop}", (width - 150, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
            cv2.putText(frame, f"State: {state}", (width - 150, 150), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
            cv2.rectangle(frame, pickup_coords[0], pickup_coords[1], (0, 255, 0), 2)
            cv2.rectangle(frame, drop_coords[0], drop_coords[1], (0, 255, 0), 2)
            out.write(frame)
    finally:
        cap.release()
        out.release()
        cv2.destroyAllWindows()


# Function to draw the detections onto a frame
def draw_detections_on_frame(frame, box_data, names):
    """Draw one detection box and its ``"<class name> <confidence>"`` label on ``frame``.

    Args:
        frame: Image array; drawn on in place and also returned.
        box_data: Six values ``x1, y1, x2, y2, conf, cls``.
        names: Lookup from integer class id to class name.

    Returns:
        The same ``frame`` object.
    """
    left, top, right, bottom, conf, cls = box_data
    caption = f'{names[int(cls)]} {conf:.2f}'
    color = [255, 0, 0]

    # Line width grows with the mean of the frame's two leading dimensions.
    line_width = round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    top_left = (int(left), int(top))
    bottom_right = (int(right), int(bottom))
    cv2.rectangle(frame, top_left, bottom_right, color, thickness=line_width)

    font_thickness = max(line_width - 1, 1)
    text_size = cv2.getTextSize(caption, 0, fontScale=font_thickness / 3, thickness=font_thickness)[0]
    # Filled label background sitting directly above the box's top-left corner.
    label_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(frame, top_left, label_corner, color, -1)
    cv2.putText(frame, caption, (top_left[0], top_left[1] - 2), 0, font_thickness / 3, [225, 255, 255], thickness=font_thickness, lineType=cv2.LINE_AA)
    return frame

# Usage: input clip and destination file for the annotated output video.
video_path = "videos/testVideo2.mp4"
output_path = "output/output.mp4"
# Disabled alternative coordinates:
# rect1_coords = ((1050, 1550), (1830, 2500))  # Top-left and bottom-right corners for the first rectangle
# Each zone is a ((x1, y1), (x2, y2)) pair: top-left and bottom-right corners.
pickup_coords = ((5, 10), (120, 140))
drop_coords = ((240, 60), (450, 190))
# Side lists; not passed to the call below.
pickup_side = ['bottom', 'right']
drop_side = ['bottom']


display_video_with_detections_and_rectangles(video_path, pickup_coords, drop_coords, output_path)



0: 384x640 1 Human hand, 111.6ms
Speed: 3.0ms preprocess, 111.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 94.6ms
Speed: 4.2ms preprocess, 94.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 111.1ms
Speed: 2.6ms preprocess, 111.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 131.0ms
Speed: 3.0ms preprocess, 131.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 120.4ms
Speed: 4.3ms preprocess, 120.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 140.1ms
Speed: 4.3ms preprocess, 140.1ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 114.2ms
Speed: 4.0ms preprocess, 114.2ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Human hand, 110.6ms
Speed: 3.0ms preprocess, 110.6ms

Stream ended.
