### Install necessary packages
<br>!pip install opencv-contrib-python
<br>!pip install numpy
<br>!pip install torch
<br>!pip install ultralytics

# Results example
<video controls  src="resultdemo.mp4"/>

## Imports 
This project requires several key libraries for video processing, object detection, and tracking.

In [2]:
import cv2
import numpy as np
from ultralytics import YOLO
import torch
from collections import defaultdict

### Select the compute device (CUDA if available, otherwise CPU)

In [None]:
# Run on the GPU when PyTorch can see a CUDA device; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### check if device is cuda or not

In [3]:
# Show which device was selected above (prints "cuda" or "cpu").
print(device)

cuda


### Initialize the model

In [4]:
# Load the 'yolov8n-pose.pt' checkpoint, then move the model onto the selected device.
model = YOLO('yolov8n-pose.pt')
model = model.to(device)

In [6]:
# --- Module-level state shared with the frame-processing loop ---

# Object tracker and its bookkeeping.
tracker = cv2.TrackerKCF_create()
tracker_initialized = False  # set to True once the tracker has been given a box
tracker_bbox = None          # latest (x, y, w, h) box of the tracked object

# Hand bookkeeping.
current_hand = None    # hand label stored when the tracker is initialised
selected_hand = None   # hand label returned by draw_contours
selected_pt = None     # hand keypoint returned by draw_contours
selected_pt_1 = None   # hand keypoint used for the throw check while tracking

thrown = False  # result of the most recent throw check
kernel = None   # structuring element handed to cv2.erode / cv2.dilate

This function initializes video capture from a specified file and attempts to set the frame width and height to 1280x720. It verifies if these settings were applied, printing a warning if they were not. Finally, it returns the video capture object for further processing.

In [5]:
def initialize_video(video_path, width=1280, height=720):
    """Open a video and request a frame size for it.

    Args:
        video_path: Path (or other source) handed to ``cv2.VideoCapture``.
        width: Requested frame width in pixels.
        height: Requested frame height in pixels.

    Returns:
        The opened ``cv2.VideoCapture``, or ``None`` when the source could
        not be opened (an error message is printed in that case).
    """
    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        print("Error: Could not open video.")
        return None

    # Ask for the desired size first, then read back what is actually in effect.
    requested = (
        (cv2.CAP_PROP_FRAME_WIDTH, width),
        (cv2.CAP_PROP_FRAME_HEIGHT, height),
    )
    for prop, value in requested:
        capture.set(prop, value)
    actual_width, actual_height = (int(capture.get(prop)) for prop, _ in requested)

    # Only warn on a mismatch; the capture is still returned to the caller.
    if (actual_width, actual_height) != (width, height):
        print(f"Warning: Desired dimensions ({width}x{height}) not applied. Actual dimensions: {actual_width}x{actual_height}")

    return capture

This function checks if any point in a given contour is within a specified distance (threshold) from a keypoint. It calculates the Euclidean distance between each contour point and the keypoint. If any distance is less than the threshold, it returns True; otherwise, it returns False.

In [9]:
def is_contour_connected_to_keypoint(contour, keypoint, threshold=10):
    """Tell whether any contour point lies closer than ``threshold`` to ``keypoint``.

    Args:
        contour: Iterable of contour entries; the coordinates of each entry
            are read from its first element (``entry[0]``).
        keypoint: Coordinates to measure against.
        threshold: Exclusive upper bound on the Euclidean distance.

    Returns:
        ``True`` as soon as one point is strictly within ``threshold`` of the
        keypoint, ``False`` otherwise (including for an empty contour).
    """
    return any(
        np.linalg.norm(entry[0] - keypoint) < threshold
        for entry in contour
    )

This function determines if an object, represented by its bounding box, is far enough from a given keypoint to be considered a "throwing action." It calculates the distance between the object's center and the keypoint. If this distance is greater than or equal to the specified threshold, it returns True; otherwise, it returns False.

In [10]:
def is_throwing_action(bbox, keypoint, threshold=50):
    """Decide whether an object is far enough from a hand keypoint to count as thrown.

    Args:
        bbox: Object box as ``(x, y, width, height)``.
        keypoint: ``(x, y)`` coordinates of the hand keypoint.
        threshold: Minimum centre-to-keypoint distance that counts as a throw.

    Returns:
        ``True`` when the distance between the box centre and the keypoint is
        at least ``threshold``; ``False`` otherwise, and always ``False`` for
        the all-zero keypoint.
    """
    x_contour, y_contour, width, height = bbox
    obj_center = np.array([x_contour + width / 2, y_contour + height / 2])

    # Only the exact (0, 0) point is skipped. The previous check
    # (`x != 0 and y != 0`) also discarded genuine keypoints lying on the
    # left or top edge of the frame.
    # NOTE(review): presumably the pose model reports missing keypoints as
    # (0, 0) - confirm against the model's output.
    if keypoint[0] == 0 and keypoint[1] == 0:
        return False

    distance = np.linalg.norm(obj_center - keypoint)
    print(f"Distance to keypoint  [{keypoint}] : {distance:.2f}")
    return bool(distance >= threshold)

This function converts the input frame to grayscale and calculates the absolute difference with a background frame. Then, it applies Otsu's thresholding to create a binary mask, followed by erosion and dilation to reduce noise and enhance the foreground mask, which it then returns.

<img src="result images/frame_3.jpg" width="400" height="200"><img src="result images/frame_0.jpg" width="400" height="200">

In [11]:
def generate_foreground(frame):
    """Build a binary foreground mask for ``frame`` by background subtraction.

    The frame is converted to grayscale, differenced against the module-level
    ``backframe``, binarised with Otsu's threshold, and then eroded and dilated
    once each with the module-level ``kernel``.

    Args:
        frame: Colour frame to segment.

    Returns:
        The cleaned-up binary mask.
    """
    # NOTE(review): COLOR_RGB2GRAY is kept because the background frame is
    # converted with the same flag; if frames are actually BGR, both
    # conversions should be switched together.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    difference = cv2.absdiff(gray, backframe)
    binary = cv2.threshold(difference, 0, 255, cv2.THRESH_OTSU)[1]
    eroded = cv2.erode(binary, kernel, iterations=1)
    return cv2.dilate(eroded, kernel, iterations=1)

This function processes a video frame with a YOLO model to track human poses. It extracts bounding boxes, keypoints, and IDs of detected individuals, and then returns these detections.
<img src="result images/frame_3.jpg" width="400" height="200"><img src="result images/frame_1.jpg" width="400" height="200"> 

In [12]:
def detect_human_pose(frame):
    """Run the pose model with tracking on one frame.

    Args:
        frame: Image passed to ``model.track``.

    Returns:
        A tuple ``(boundingboxes, keypoints_list, ids)`` of NumPy arrays taken
        from the first result: the boxes in ``xywh`` form, the keypoint ``xy``
        coordinates, and the track IDs. ``ids`` is an empty array when the
        result carries no track IDs, so it is an ndarray in every case
        (previously an empty dict was returned in that situation).
    """
    results = model.track(frame, persist=True, tracker='bytetrack.yaml')
    detection = results[0]
    boundingboxes = detection.boxes.xywh.cpu().numpy()
    keypoints_list = detection.keypoints.xy.cpu().numpy()

    track_ids = detection.boxes.id
    if track_ids is None:
        # No track IDs on this result: hand back an empty array so that
        # len() and zip() behave the same as with real IDs.
        ids = np.empty(0)
    else:
        ids = track_ids.cpu().numpy()
    return boundingboxes, keypoints_list, ids

The draw_contours function takes a frame and a list of candidate contours. It picks the candidate whose bounding box has the largest area (width × height) and draws a rectangle around it on the frame. It also labels the rectangle with the track ID, the hand (Left or Right), and the text "Hand obj". Finally, it returns the hand, the selected hand keypoint, and the bounding box (x, y, width, height); if the list is empty, it returns None for all three.


<img src="result images/frame_3.jpg" width="400" height="200"><img src="result images/frame_4.jpg" width="400" height="200"> 


In [13]:
def draw_contours(frame, contours_info, track_id=None):
    """Draw and label the candidate with the largest bounding box.

    Args:
        frame: Image to draw on (modified in place).
        contours_info: Sequence of tuples
            ``(hand, keypoint, x, y, width, height, contour)``.
        track_id: Optional identifier shown at the start of the label. When
            omitted, a module-level variable named ``id`` is used if one
            exists (callers written before this parameter was added relied
            on that); otherwise the label carries no identifier.

    Returns:
        ``(hand, keypoint, (x, y, width, height))`` for the chosen candidate,
        or ``(None, None, None)`` when ``contours_info`` is empty.
    """
    if not contours_info:
        return None, None, None

    # max() keeps the first of equally sized candidates, exactly like the
    # stable descending sort it replaces, but leaves the caller's list alone.
    best = max(contours_info, key=lambda info: info[4] * info[5])
    selected_hand, selected_pt, x_contour, y_contour, width, height, _ = best

    if track_id is None:
        # Compatibility fallback; the builtin id() is never in module
        # globals, so it can no longer end up in the label by accident.
        track_id = globals().get("id")
    if track_id is None:
        label = f"{selected_hand} Hand obj"
    else:
        label = f"{track_id} {selected_hand} Hand obj"

    cv2.rectangle(frame, (x_contour, y_contour), (x_contour + width, y_contour + height), (0, 0, 255), 2)
    cv2.putText(frame, label, (x_contour, y_contour - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2, cv2.LINE_AA)
    return selected_hand, selected_pt, (x_contour, y_contour, width, height)

This Python function, handle_tracking, updates the tracker with the current frame and tracks the object specified by the bounding box. It then calculates the coordinates of the tracked bounding box and compares them with the detected keypoints of the hand. If the hand is being tracked successfully, it checks for a throwing action and returns the result along with the updated bounding box. If tracking fails, it returns False along with None for the bounding box.

<img src="result images/frame_0 (5).jpg" width="400" height="200"><img src="result images/frame_0 (11).jpg" width="400" height="200"> 

In [14]:
def handle_tracking(tracking_frame, frame, tracker, tracker_bbox, keypoints_list, selected_hand):
    """Advance the tracker by one frame and check for a throw.

    Args:
        tracking_frame: Image passed to ``tracker.update``.
        frame: Image that the tracking box and label are drawn on.
        tracker: Object whose ``update(frame)`` returns ``(success, bbox)``.
        tracker_bbox: Ignored on input; it is replaced by the tracker's result.
        keypoints_list: Per-person keypoint arrays; only the first person is
            used.
        selected_hand: ``"Left"`` or any other value (treated as right); also
            shown in the on-frame label.

    Returns:
        ``(thrown, bbox)`` when tracking succeeds, where ``thrown`` comes from
        ``is_throwing_action`` (``False`` when no person is available), or
        ``(False, None)`` when the tracker reports failure.
    """
    success, tracker_bbox = tracker.update(tracking_frame)
    if not success:
        return False, None

    x, y, w, h = (int(value) for value in tracker_bbox)
    p1 = (x, y)
    p2 = (int(tracker_bbox[0] + tracker_bbox[2]), int(tracker_bbox[1] + tracker_bbox[3]))
    cv2.rectangle(frame, p1, p2, (255, 0, 0), 2, 1)
    # Label with the hand this call was asked about rather than a global.
    cv2.putText(frame, f"Tracking {selected_hand} Hand", (p1[0], p1[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    # Without a detected person there is no hand to measure against;
    # indexing keypoints_list[0] would raise IndexError.
    if len(keypoints_list) == 0:
        return False, tracker_bbox

    # Keypoints 9 and 10 of the first person are the two hand points
    # (presumably the wrists of the pose model - confirm). They are ordered
    # by x so that pt1 is the one further left in the image.
    pt1 = keypoints_list[0][9].astype(int)
    pt2 = keypoints_list[0][10].astype(int)
    if pt1[0] > pt2[0]:
        pt1, pt2 = pt2, pt1
    hand_point = pt1 if selected_hand == "Left" else pt2

    thrown = is_throwing_action((x, y, w, h), hand_point)
    return thrown, tracker_bbox

This code initializes video capture from a specified file and reads the first frame. It creates a copy of this frame for tracking purposes (tracking_frame). Additionally, it converts the first frame to grayscale (backframe) for background subtraction and further processing.

In [15]:
# Open the input video and take its first frame as the static background.
video_path = "videos/WhatsApp Video 2024-05-20 at 10.00.32.mp4"
cap = initialize_video(video_path)
if cap is None:
    # initialize_video returns None when the file cannot be opened; fail with
    # a clear message instead of an AttributeError on cap.read().
    raise RuntimeError(f"Could not open video: {video_path}")

ret, backframe = cap.read()
if not ret:
    # No frame was decoded, so there is nothing to use as the background.
    cap.release()
    raise RuntimeError(f"Could not read the first frame of: {video_path}")

# Keep a colour copy for the tracker; the grayscale version is the reference
# image that generate_foreground subtracts from every later frame.
tracking_frame = backframe.copy()
backframe = cv2.cvtColor(backframe, cv2.COLOR_RGB2GRAY)



In [16]:
# Stop here when the capture is not open; nothing below can run without it.
if not cap.isOpened():
    print("Error: Unable to open video file")
    exit()

## This code snippet appears to be part of a larger program for real-time object tracking and gesture recognition, likely designed for applications such as computer vision-based interaction or surveillance.

- The while loop keeps reading frames from the video capture (cap) for as long as the capture is open (cap.isOpened()).

- It checks if the frame retrieval was successful (ret) and if not, breaks out of the loop, printing an error message.

- It initializes a copy of the current frame for object tracking and generates a foreground mask using a function called generate_foreground.

- Pose detection is performed on the current frame with the YOLO (You Only Look Once) pose model, which yields the bounding box, keypoints, and track ID of each detected person.

- If object tracking has been initialized (tracker_initialized), the program attempts to update the tracker with the current frame. If successful, it calculates the bounding box of the tracked object and performs additional processing to determine the hand's position and whether a throwing action is occurring.

- If the tracker fails to update, it indicates that the object might have been lost or occluded, and the tracker is reset for the next iteration.

- If object tracking has not been initialized, the program proceeds to perform object detection using YOLO. It extracts bounding boxes and keypoints of detected objects, particularly focusing on hands.

- For each detected hand, it creates a mask to isolate the hand region in the foreground mask and finds contours within that region. If contours are found and meet certain criteria, they are considered for tracking.

- The largest contour associated with each hand is selected, and its bounding box is drawn on the frame. If a suitable contour is found, object tracking is initialized using that bounding box, and the current hand type is determined for subsequent iterations.

- If a throwing action is detected based on the hand's motion, a message is displayed on the frame.

- The frame is displayed using OpenCV's imshow function, and the loop continues until the user presses 'q' to quit.

- Finally, the video capture is released, and all OpenCV windows are destroyed.

This code forms part of a larger system for real-time hand tracking and gesture recognition

<img src="result images/frame_0 (18).jpg" width="800" height="400">


<img src="result images/frame_43.jpg" width="800" height="400">

In [17]:
def _largest_contour_near(fg_mask, corner_a, corner_b, keypoint):
    """Find the largest foreground contour inside a rectangle, if it touches ``keypoint``.

    Args:
        fg_mask: Binary foreground mask of the whole frame.
        corner_a: One corner ``(x, y)`` of the search rectangle.
        corner_b: The opposite corner ``(x, y)`` of the search rectangle.
        keypoint: Hand keypoint the contour has to be connected to.

    Returns:
        ``((x, y, width, height), contour)`` for the largest contour when
        ``is_contour_connected_to_keypoint`` accepts it, otherwise ``None``.
    """
    region_mask = np.zeros(fg_mask.shape[:2], dtype="uint8")
    cv2.rectangle(region_mask, corner_a, corner_b, 255, -1)
    region_fg = cv2.bitwise_and(fg_mask, fg_mask, mask=region_mask)
    contours, _ = cv2.findContours(region_fg, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    if len(contours) == 0:
        return None
    largest = max(contours, key=cv2.contourArea)
    if not is_contour_connected_to_keypoint(largest, keypoint):
        return None
    return cv2.boundingRect(largest), largest


# Main loop: per frame, either follow the already tracked object and check for
# a throw, or look for an object next to a detected hand and start tracking it.
# The try/finally releases the capture and the windows even if a frame fails.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Unable to read frame")
            break
        tracking_frame = frame.copy()
        fgMask = generate_foreground(frame)
        # detect_human_pose is the pose/tracking helper defined in this file.
        boundingboxes, keypoints_list, ids = detect_human_pose(frame)
        frame_h, frame_w = frame.shape[:2]

        if tracker_initialized:
            success, tracker_bbox = tracker.update(tracking_frame)
            if success:
                p1 = (int(tracker_bbox[0]), int(tracker_bbox[1]))
                p2 = (int(tracker_bbox[0] + tracker_bbox[2]), int(tracker_bbox[1] + tracker_bbox[3]))
                temp_tracker_box = (int(tracker_bbox[0]), int(tracker_bbox[1]), int(tracker_bbox[2]), int(tracker_bbox[3]))

                # Only evaluate the throw when a person was detected in this
                # frame; otherwise keypoints_list[0] would raise IndexError.
                if len(keypoints_list) > 0:
                    pt1 = keypoints_list[0][9].astype(int)
                    pt2 = keypoints_list[0][10].astype(int)
                    # Order the two hand points so pt1 is further left in the image.
                    if pt1[0] > pt2[0]:
                        pt1, pt2 = pt2, pt1
                    # current_hand is the hand stored when the tracker was
                    # initialised; selected_hand may refer to another person.
                    if current_hand == "Left":
                        print("left")
                        selected_pt_1 = pt1
                    else:
                        print("Right")
                        selected_pt_1 = pt2
                    thrown = is_throwing_action(temp_tracker_box, selected_pt_1)

                cv2.rectangle(frame, p1, p2, (255, 0, 0), 2, 1)
                cv2.putText(frame, f"Tracking {current_hand} Hand", (p1[0], p1[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
            else:
                tracker_initialized = False  # Tracker failed, reinitialize in the next iteration
        else:
            # No active tracker: search next to each detected person's hands.
            if len(ids) > 0:
                # The loop variable keeps the name `id` because draw_contours
                # reads a module-level `id` for the label it draws.
                for bbox, keypoints, id in zip(boundingboxes, keypoints_list, ids):
                    _, _, w, h = bbox.astype(int)
                    pt1 = keypoints[9].astype(int)
                    pt2 = keypoints[10].astype(int)
                    # Skip people whose hand points fall outside the frame.
                    if not (0 <= pt1[0] < frame_w and 0 <= pt1[1] < frame_h):
                        continue
                    if not (0 <= pt2[0] < frame_w and 0 <= pt2[1] < frame_h):
                        continue

                    height_hand = h // 2
                    width_hand = w
                    if pt1[0] > pt2[0]:
                        pt1, pt2 = pt2, pt1

                    # Search rectangles beside each hand, clamped to the frame.
                    # The left-hand rectangle used to be "clamped" with
                    # max(..., 0) on its right and bottom edges, which never
                    # took effect, so that region was skipped whenever it
                    # reached past the frame; it is now clamped like the
                    # right-hand one.
                    left_a = (int(min(pt1[0] + (width_hand // 5), frame_w - 1)), int(pt1[1]))
                    left_b = (int(max(pt1[0] - width_hand, 0)), int(min(pt1[1] + height_hand, frame_h)))
                    right_a = (int(max(pt2[0] - (width_hand // 5), 0)), int(pt2[1]))
                    right_b = (int(min(pt2[0] + width_hand, frame_w)), int(min(pt2[1] + height_hand, frame_h)))

                    contours_info = []
                    candidates = (("Left", pt1, left_a, left_b), ("Right", pt2, right_a, right_b))
                    for hand_label, hand_pt, corner_a, corner_b in candidates:
                        found = _largest_contour_near(fgMask, corner_a, corner_b, hand_pt)
                        if found is None:
                            continue
                        (x_contour, y_contour, width, height), contour = found
                        contours_info.append((hand_label, hand_pt, x_contour, y_contour, width, height, contour))

                    hand, hand_point, object_bbox = draw_contours(frame, contours_info)
                    if object_bbox is None:
                        # Nothing found for this person; keep the state set
                        # by an earlier person instead of overwriting it.
                        continue

                    selected_hand, selected_pt, tracker_bbox = hand, hand_point, object_bbox
                    tracker = cv2.TrackerKCF_create()
                    tracker.init(tracking_frame, tracker_bbox)
                    tracker_initialized = True
                    current_hand = selected_hand
                    thrown = is_throwing_action(tracker_bbox, selected_pt)

        if thrown:
            print("thrown")
            cv2.putText(frame, f"Throwing action detected", (100, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.imshow('Frame', frame)
        # Press 'q' in the window to stop early.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

    


0: 384x640 1 person, 70.5ms
Speed: 4.2ms preprocess, 70.5ms inference, 1385.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 11.5ms
Speed: 2.0ms preprocess, 11.5ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 3.9ms
Speed: 2.0ms preprocess, 3.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 10.4ms
Speed: 0.9ms preprocess, 10.4ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 14.8ms
Speed: 0.0ms preprocess, 14.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 11.5ms
Speed: 0.0ms preprocess, 11.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)
Distance to keypoint  [[520  64]] : 10.05

0: 384x640 1 person, 11.6ms
Speed: 0.0ms preprocess, 11.6ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)
Right
Distance to keypoint  [[520  70]] : 4.12

0: 384x640 1 person, 13.7ms