# YOLOv8 inference on static Yoga pose image dataset

In [1]:
import cv2
from ultralytics import YOLO
import matplotlib.pyplot as plt


# Load a pretrained YOLO model (recommended for training)
model = YOLO('yolov8n-pose.pt')

In [2]:
yolo_img_path = 'Yoga poses.v4i.yolov8/test/images/test1.jpg'

# Perform object detection on an image using the model
results = model(yolo_img_path)


image 1/1 /home/rtu/Documents/Metrics_mp/Yoga poses.v4i.yolov8/test/images/test1.jpg: 640x640 1 person, 7.3ms
Speed: 19.9ms preprocess, 7.3ms inference, 6598.9ms postprocess per image at shape (1, 3, 640, 640)


## Define metrics

In [13]:
import numpy as np
import torch

def keypoint_similarity(gt_kpts, pred_kpts, areas):
    """
    Params:
        gt_kpts: Ground-truth keypoints, Shape: [M, #kpts, 2],
                 where, M is the # of ground truth instances,
                        2 in the last dimension denotes coordinates: x,y
                         
        pred_kpts: Prediction keypoints, Shape: [N, #kpts, 2]
                   where  N is the # of predicted instances,

        areas: Represent ground truth areas of shape: [M,]

    Returns:
        oks: The Object Keypoint Similarity (OKS) score tensor of shape: [M, N]
    """
    sigmas = np.array([0.26, 0.25, 0.25, 0.35, 0.35, 0.79, 0.79, 0.72, 0.72, 0.62, 0.62, 0.79, 0.79, 0.72, 0.72, 0.62, 0.62])

    # epsilon to take care of div by 0 exception.
    EPSILON = torch.finfo(torch.float32).eps
    
    # Euclidean distance squared:
    # d^2 = (x1 - x2)^2 + (y1 - y2)^2
    # Shape: (M, N, #kpts)
    dist_sq = (gt_kpts[:, None, :, 0] - pred_kpts[..., 0])**2 + (gt_kpts[:, None, :, 1] - pred_kpts[..., 1])**2

    # COCO assigns k = 2σ.
    k = 2 * sigmas

    # Denominator in the exponent term. Shape: [M, 1, #kpts]
    denom = 2 * (k**2) * (areas[:, None, None] + EPSILON)

    # Exponent term. Shape: [M, N, #kpts]
    exp_term = dist_sq / denom

    # Object Keypoint Similarity. Shape: (M, N)
    oks = torch.exp(-exp_term).mean(-1)

    return oks


In [36]:
# Define the indices for the 13 keypoints we need
# Using MediaPipe indices: 
# 0: Nose, 5: Left Shoulder, 6: Right Shoulder, 11: Left Hip, 12: Right Hip, 
# 13: Left Knee, 14: Right Knee, 15: Left Ankle, 16: Right Ankle, 7: Left Elbow, 8: Right Elbow,
# 9: Left Wrist, 10: Right Wrist
selected_indices = [0, 5, 6, 11, 12, 13, 14, 15, 16, 7, 8, 9, 10]


# Predicted keypoints----------------------------------
denormalized_kps = []

# Process results
for r in results:
    keypoints = r.keypoints.xyn.cpu().numpy()  # Normalized keypoints (x, y, conf)
    
    for kp in keypoints[0]:
            x, y = int(kp[0] * 640), int(kp[1] * 640) # denormalize , if needed
            denormalized_kps.append((x,y))

selected_keypoints = []

# Filter for the 13 specific keypoints
for i in selected_indices:
    selected_keypoints.append(denormalized_kps[i])
#----------------------------------------------------


#Ground truth keypoints+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# File path to your YOLO txt file
file_path = 'Yoga poses.v4i.yolov8/test/labels/1_123_jpg.rf.aec8214c1ba57eef43d571faa5775f8a.txt'

# Read the file
with open(file_path, 'r') as file:
    data = file.read()

normal_gt_kpts = [float(value) for i, value in enumerate(data.split()) if 0 < float(value) <= 1]


# Reshape keypoints into pairs of (x, y)
normal_gt_kpts = np.array(normal_gt_kpts).reshape(-1, 2)

# Denormalize keypoints
gt_kpts = np.zeros_like(normal_gt_kpts)
gt_kpts[:, 0] = normal_gt_kpts[:, 0] * 640   # Denormalize x by image width
gt_kpts[:, 1] = normal_gt_kpts[:, 1] * 640  # Denormalize y by image height


print(gt_kpts)
print(selected_keypoints)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

gt_kpts = torch.tensor(gt_kpts)
selected_keypoints = torch.tensor(selected_keypoints)


areas = torch.tensor([640*640*0.53])

print(keypoint_similarity(gt_kpts.unsqueeze(0), selected_keypoints.unsqueeze(0), areas))

"""
for r in results:    
    # Load and display the image
    image = cv2.imread(yolo_img_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Plot the image and the selected keypoints
    plt.imshow(image)

    for kp in selected_keypoints:
        cv2.circle(image, (int(x), int(y)), 5, (0, 255, 0), -1)

    plt.axis('off')
    plt.show()



# Extract the pose landmarks from the results
for result in results:
    if result.keypoints is not None:
        # Access keypoints tensor directly
        keypoints = result.keypoints[0]  # Assume the first detection (or iterate over multiple detections)

        # Filter for the 13 specific keypoints
        selected_keypoints = keypoints[selected_indices]

        # Load and display the image
        image = cv2.imread(yolo_img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Plot the image and the selected keypoints
        plt.imshow(image)

        for kp in selected_keypoints:
            x, y, confidence = kp[:3]  # Assumes that each kp has three values: x, y, confidence
            if confidence > 0.5:  # Filter by confidence if needed
                

        plt.axis('off')
        plt.show()

"""

[[        344         327]
 [      546.5       564.5]
 [     285.45      226.36]
 [     256.36      275.45]
 [     311.82      280.91]
 [        320      380.91]
 [     275.45         390]
 [     151.82      391.82]
 [     397.27      458.18]
 [     163.64      494.55]
 [     530.91      490.91]
 [     298.18      184.55]
 [     330.91      114.55]
 [     277.27      188.18]
 [     325.45      110.91]]
[(267, 206), (316, 245), (303, 241), (306, 379), (266, 376), (397, 426), (172, 413), (542, 500), (174, 485), (308, 231), (312, 228), (305, 152), (284, 151)]


RuntimeError: The size of tensor a (15) must match the size of tensor b (13) at non-singleton dimension 2