In [1]:
from ultralytics import YOLO


In [2]:
# Load the "yolo11n.pt" checkpoint into `model`.
# NOTE(review): the tracking cell further down rebinds `model` to "yolo11s.pt",
# so this instance is not the one used by the tracking loop.
model = YOLO("yolo11n.pt")


In [6]:
import mediapipe as mp
import cv2
import numpy as np

def mediapipeDetect(image, pose=None):
    """Run MediaPipe Pose on an image, draw the skeleton on it and display it.

    Args:
        image: 3-channel image array in OpenCV's BGR channel order (the caller
            in this notebook passes a crop of a ``cv2.VideoCapture`` frame).
            The landmarks and connections are drawn onto this array in place.
        pose: Optional, already-constructed ``mp.solutions.pose.Pose`` instance
            to reuse across calls. When omitted, a temporary instance is
            created for this call and closed again before returning.

    Returns:
        None. The annotated image is shown in the
        "MediaPipe Pose Detection" window; the caller is responsible for
        pumping the GUI event loop with ``cv2.waitKey``.
    """
    # An empty crop (zero width or height) cannot be processed or displayed.
    if image is None or image.size == 0:
        return

    mp_pose = mp.solutions.pose

    # MediaPipe expects RGB input, whereas OpenCV images are BGR.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    if pose is None:
        # Own the estimator for this call only, and release it deterministically.
        with mp_pose.Pose() as owned_pose:
            results = owned_pose.process(rgb_image)
    else:
        results = pose.process(rgb_image)

    if results.pose_landmarks:
        height, width = image.shape[:2]

        # Landmarks are normalised to [0, 1]; convert them to pixel coordinates once.
        landmark_points = [
            (int(lm.x * width), int(lm.y * height))
            for lm in results.pose_landmarks.landmark
        ]

        # Hollow circle (1 px edge) on every landmark.
        for point in landmark_points:
            cv2.circle(image, point, 5, (0, 255, 0), 1)

        # Connect the landmarks to form the skeleton.
        for start_idx, end_idx in mp_pose.POSE_CONNECTIONS:
            cv2.line(
                image,
                landmark_points[start_idx],
                landmark_points[end_idx],
                (0, 255, 0),
                2,
            )

    cv2.imshow("MediaPipe Pose Detection", image)

In [7]:
import cv2
import torch
from ultralytics import YOLO

# Load the YOLO11 model
model = YOLO("yolo11s.pt")

# Class id kept by the filter below (person) and the minimum
# confidence passed to the tracker.
PERSON_CLASS_ID = 0
CONF_THRESHOLD = 0.55

# Open the video file
video_path = "etc/drone.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Make the failure visible instead of silently skipping the loop.
    print(f"Could not open video: {video_path}")

try:
    # Loop through the video frames
    while cap.isOpened():
        # Read a frame from the video; stop at the end of the stream
        success, frame = cap.read()
        if not success:
            break

        # Run YOLO11 tracking on the frame, persisting tracks between frames
        results = model.track(frame, persist=True, conf=CONF_THRESHOLD)
        filtered_results = results[0]

        # Keep only the boxes whose class is PERSON_CLASS_ID
        person_indexes = torch.where(filtered_results.boxes.cls == PERSON_CLASS_ID)[0]

        if person_indexes.numel() > 0:
            filtered_boxes = filtered_results.boxes[person_indexes]

            # Visualize only the person boxes on the frame
            filtered_results.boxes = filtered_boxes
            annotated_frame = filtered_results.plot()

            # Crop the first person box out of the frame
            x1, y1, x2, y2 = map(int, filtered_boxes.xyxy[0])
            bounding_box_region = frame[y1:y2, x1:x2]

            # Only run pose estimation on a crop from *this* frame, and only
            # when the crop actually contains pixels.
            if bounding_box_region.size > 0:
                mediapipeDetect(bounding_box_region)
                #cv2.imshow("YOLO11 Tracking", bounding_box_region)

        # Keep the GUI responsive on every frame; break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the video capture object and close the display window even when
    # the loop is interrupted (e.g. KeyboardInterrupt from the notebook).
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 1 person, 1 tv, 1 microwave, 130.2ms
Speed: 2.2ms preprocess, 130.2ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 microwave, 91.7ms
Speed: 1.5ms preprocess, 91.7ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 microwave, 88.3ms
Speed: 1.4ms preprocess, 88.3ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 microwave, 95.9ms
Speed: 1.6ms preprocess, 95.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 microwave, 101.4ms
Speed: 1.7ms preprocess, 101.4ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 microwave, 89.6ms
Speed: 1.4ms preprocess, 89.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 microwave, 102.5ms
Speed: 1.5ms preprocess, 102.5ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 p

KeyboardInterrupt: 

In [14]:
# Inspect the most recent annotated frame left behind by the tracking loop.
# `annotated_frame` is only assigned inside that loop, and only on frames where
# a class-0 box was found, so this cell depends on that cell having run first.
annotated_frame

array([[[ 45,  90, 124],
        [ 44,  89, 123],
        [ 46,  89, 123],
        ...,
        [ 51, 114, 161],
        [ 51, 114, 161],
        [ 51, 114, 161]],

       [[ 47,  92, 126],
        [ 46,  91, 125],
        [ 46,  89, 123],
        ...,
        [ 51, 114, 161],
        [ 51, 114, 161],
        [ 51, 114, 161]],

       [[ 49,  97, 130],
        [ 47,  95, 128],
        [ 46,  91, 125],
        ...,
        [ 51, 114, 161],
        [ 51, 114, 161],
        [ 51, 114, 161]],

       ...,

       [[ 43,  43,  57],
        [ 45,  45,  59],
        [ 86,  42,  45],
        ...,
        [ 50, 112, 157],
        [ 50, 112, 157],
        [ 50, 112, 157]],

       [[ 43,  43,  57],
        [ 45,  45,  59],
        [117,  42,  38],
        ...,
        [ 50, 112, 157],
        [ 50, 112, 157],
        [ 50, 112, 157]],

       [[ 43,  43,  57],
        [ 45,  45,  59],
        [ 46,  42,  56],
        ...,
        [ 50, 112, 157],
        [ 50, 112, 157],
        [ 50, 112, 157]]

In [None]:
# from ultralytics import YOLO
# # Load the YOLO11 model
# model = YOLO("yolo11n-pose.pt")

# ## From youtube video
# video_link = "https://youtu.be/OO7XT24AmTY?si=w-_UTTuJr5Fj1Syl"
# results = model.track(video_link, imgsz=32*40, conf=0.55,
#                       save=True, show=True, project='./result')  # Tracking with default tracker



1/1: https://youtu.be/OO7XT24AmTY?si=w-_UTTuJr5Fj1Syl... Success  (4593 frames of shape 1920x1080 at 29.97 FPS)


errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

0: 736x1280 (no detections), 269.4ms
0: 736x1280 (no detections), 189.3ms
0: 736x1280 (no detections), 201.5ms
0: 736x1280 (no detections), 199.2ms
0: 736x1280 (no detections), 209.6ms
0: 736x1280 1 person, 197.1ms
0: 736x1280 1 person, 197.9ms
0: 736x1280 1 person, 192.0ms
0: 736x1280 1 person, 196.9ms
0: 736x1280 (no detections), 193.3ms
0: 736x1280 (no detections), 193.8ms
0: 736x1280 (no detections), 210.2ms
0: 736x1280 (no detections), 196.7ms
0: 736