# Comparative study of YOLO vs MediaPipe vs MoveNet

In [1]:
import cv2
from ultralytics import YOLO
import matplotlib.pyplot as plt

import mediapipe as mp

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow_docs.vis import embed


import numpy as np

KEYPOINT_DICT = {
    'nose': 0,
    'left_shoulder': 5,
    'right_shoulder': 6,
    'left_elbow': 7,
    'right_elbow': 8,
    'left_wrist': 9,
    'right_wrist': 10,
    'left_hip': 11,
    'right_hip': 12,
    'left_knee': 13,
    'right_knee': 14,
    'left_ankle': 15,
    'right_ankle': 16
}

# Load the input image.
image_path = 'Yoga poses.v5i.yolov8/test/images/1_123_jpg.rf.38c81030db0d99d8c5a2c090b3028403.jpg'
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB for display with matplotlib

# File path to your YOLO txt file
file_path = 'Yoga poses.v5i.yolov8/test/labels/1_123_jpg.rf.38c81030db0d99d8c5a2c090b3028403.txt'

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read()
except UnicodeDecodeError:
    print("UTF-8 decoding failed, trying ISO-8859-1 encoding...")
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        data = file.read()

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/home/rtu/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


## Define Metrics

In [2]:
def calculate_oks(gt_keypoints, pred_keypoints, bbox_area, indices):
    # Object Keypoint Similarity (OKS) is a metric used to evaluate the accuracy of keypoint predictions

    sigmas = np.array([0.26, 0.25, 0.25, 0.35, 0.35, 0.79, 0.79, 0.72, 0.72, 0.62, 0.62, 0.79, 0.79, 0.72, 0.72, 0.62, 0.62])
    #sigmas = np.array([0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33])

    #Filter keypoints
    selected_sigmas = []

    for j in indices:
        selected_sigmas.append(sigmas[j])

    #print("gt_kpts: ", gt_keypoints)

    # Ensure both keypoints lists have exactly 13 keypoints
    gt_keypoints = gt_keypoints[:13]
    pred_keypoints = pred_keypoints[:13]


    y_true = np.array(gt_keypoints).reshape(-1, 2)
    y_pred = np.array(pred_keypoints).reshape(-1, 2)

    #print("ytrue: ", y_true)
    #print("ypred: ", y_pred)

    # Handle cases where there might be fewer keypoints
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(f"Shape mismatch between ground truth and predicted keypoints: {y_true.shape} vs {y_pred.shape}")


    # Calculate Euclidean distance between keypoints
    d2 = (y_true - y_pred)**2 
    # d^2 = (x1 - x2)^2 + (y1 - y2)^2
    d2_sum = d2.sum(axis=1)

    #print(d2_sum)
    

    # Adjust sigmas shape if necessary
    if sigmas.shape[0] != d2.shape[1]:
        sigmas = sigmas[:d2.shape[1]]

    # COCO assigns k = 2σ.
    for i in range(len(selected_sigmas)):
        selected_sigmas[i] = 2 * selected_sigmas[i]

    #print(selected_sigmas)

    denom=[]
    # Denominator in the exponent term. Shape: [M, 1, #kpts]
    for k in range(len(selected_sigmas)):
        denom.append( 2 * (selected_sigmas[k]**2) * bbox_area )
    
    #print( -d2_sum/denom )

    # Calculate OKS
    oks = np.exp(-d2_sum / denom )

    print(oks)

    return oks.mean()


def calculate_mppe(gt_keypoints, pred_keypoints):
    # Mean Per Part Error (MPPE) is the average error between predicted and ground truth parts detected (pairs of keypoints) 

    # Ensure both keypoints lists have exactly 13 keypoints
    gt_keypoints = gt_keypoints[:13]
    pred_keypoints = pred_keypoints[:13]

    # Handle missing keypoints by replacing them with a placeholder
    def handle_missing_keypoints(keypoints):
        return [(0, 0) if k == (0, 0) else k for k in keypoints]

    gt_keypoints = handle_missing_keypoints(gt_keypoints)
    pred_keypoints = handle_missing_keypoints(pred_keypoints)

    correct_parts = 0
    total_parts = 0
    
    # COCO limb pairs: COCO has pairs like [5,7] for left upper arm, [11,13] for left upper leg, etc.
    # coco_limb_pairs = [(5, 7), (7, 9), (6, 8), (8, 10), (11, 13), (13, 15), (12, 14), (14, 16)]


    # Converted limb pairs: pairs are enumerated from 0 to 13, based on kpts array size, check coco_indices for conversion
    limb_pairs = [(1, 9), (9, 11), (2, 10), (10, 12), (3, 5), (5, 7), (4, 6), (6, 8)]

    error_array = []

    for (i, j) in limb_pairs:
        #Skip the part if either keypoint in the pair is missing
        if gt_keypoints[i] == (0, 0) or gt_keypoints[j] == (0, 0):
            continue
        
        gt_dist = np.linalg.norm(np.array(gt_keypoints[i]) - np.array(gt_keypoints[j]))
        pred_dist = np.linalg.norm(np.array(pred_keypoints[i]) - np.array(pred_keypoints[j]))
        
        # Check if the predicted distance is within a threshold (example: 50% of ground truth distance)
        error_array.append( abs(gt_dist - pred_dist) / gt_dist )
    
    print(error_array)
    return np.mean(error_array)

## Deploying models and extracting predicted keypoints

### MoveNet

In [7]:
module = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")
input_size = 192


def movenet(input_image):
    """Runs detection on an input image.

    Args:
      input_image: A [1, height, width, 3] tensor represents the input image
        pixels. Note that the height/width should already be resized and match the
        expected input resolution of the model before passing into this function.

    Returns:
      A [1, 1, 17, 3] float numpy array representing the predicted keypoint
      coordinates and scores.
    """
    model = module.signatures['serving_default']

    # SavedModel format expects tensor type of int32.
    input_image = tf.cast(input_image, dtype=tf.int32)
    # Run model inference.
    outputs = model(input_image)
    # Output is a [1, 1, 17, 3] tensor.
    keypoints_with_scores = outputs['output_0'].numpy()
    return keypoints_with_scores


# Resize and pad the image to keep the aspect ratio and fit the expected size.
input_image = tf.expand_dims(image, axis=0)
input_image = tf.image.resize_with_pad(input_image, input_size, input_size)

# Run model inference.
keypoints_with_scores = movenet(input_image)


#---------------------------------Predicted Keypoints---------------------------------

predicted_kpts = []   #normalized

selected_indices = [0,5,6,7,8,9,10,11,12,13,14,15,16]
#Filter the keypoints to only include the ones we want
for i in selected_indices:
    predicted_kpts.append(keypoints_with_scores[0][0][i])

# Convert normalized coordinates to image pixel coordinates
pred_kpts = []
for kp in predicted_kpts:
    x = kp[0] * 640
    y = kp[1] * 640
    pred_kpts.append([y, x])

print("Predicted keypoints: ", pred_kpts)

I0000 00:00:1728399473.496170    8463 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


Predicted keypoints:  [[269.8226737976074, 210.44572830200195], [314.4546318054199, 267.2721862792969], [296.9321060180664, 262.52140045166016], [309.8683738708496, 184.03955459594727], [304.48888778686523, 183.32752227783203], [319.0231704711914, 128.33684921264648], [322.3508834838867, 129.77527618408203], [308.92187118530273, 387.1814727783203], [286.1391258239746, 387.32479095458984], [415.4742431640625, 455.6941223144531], [153.52152824401855, 394.9186325073242], [535.9650421142578, 492.1772003173828], [164.03810501098633, 499.5343017578125]]


### Mediapipe

In [4]:
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True)

def process_image(image_path):
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pose.process(image_rgb)
    return results, image_rgb

results, image_rgb = process_image(image_path)

# Get image dimensions
image_height, image_width, _ = image.shape


# Assume KEYPOINT_DICT is a dictionary that maps YOLO keypoints to Mediapipe keypoints
KEYPOINT_DICT = {
    0: "nose",
    5: "left_shoulder",
    6: "right_shoulder",
    7: "left_elbow",
    8: "right_elbow",
    9: "left_wrist",
    10: "right_wrist",
    11: "left_hip",
    12: "right_hip",
    13: "left_knee",
    14: "right_knee",
    15: "left_ankle",
    16: "right_ankle"
}


# ----------------------------------Predicted keypoints----------------------------------------------

# Extract the 13 keypoints from MediaPipe according to KEYPOINT_DICT
if results.pose_landmarks:
    keypoints = results.pose_landmarks.landmark
    pred_kpts = []
    
    # Loop over selected keypoints in KEYPOINT_DICT
    for i in KEYPOINT_DICT:
        kp = keypoints[i]
        # Normalize keypoints to YOLO format (x and y between 0 and 1)
        x = kp.x * image_width
        y = kp.y * image_height
        pred_kpts.append([x, y])
    
    # Convert keypoints to numpy array for easier manipulation
    pred_kpts = np.array(pred_kpts)
    print(pred_kpts)

[[     264.34      234.88]
 [      273.8      222.16]
 [     275.47      221.58]
 [     293.67      230.27]
 [     291.19      227.21]
 [     270.55       242.7]
 [     268.84      240.35]
 [     308.02      267.01]
 [     295.48      260.62]
 [     308.78      198.14]
 [      297.5      195.92]
 [     325.39      125.96]
 [      318.9      132.46]]


I0000 00:00:1728399400.405935    8463 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1728399400.428274    8580 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 550.107.02), renderer: NVIDIA GeForce RTX 3070/PCIe/SSE2
W0000 00:00:1728399400.487214    8577 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1728399400.516666    8573 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


### YOLOv8

In [6]:
# Load a pretrained YOLO model (recommended for training)
model = YOLO('yolov8n-pose.pt')

# Perform object detection on an image using the model
results = model(image_path)

# Define the indices for the 13 keypoints we need
# Using MediaPipe indices: 
# 0: Nose, 5: Left Shoulder, 6: Right Shoulder, 11: Left Hip, 12: Right Hip, 
# 13: Left Knee, 14: Right Knee, 15: Left Ankle, 16: Right Ankle, 7: Left Elbow, 8: Right Elbow,
# 9: Left Wrist, 10: Right Wrist
selected_indices = [0, 5, 6, 11, 12, 13, 14, 15, 16, 7, 8, 9, 10]


# ----------------------------------Predicted keypoints----------------------------------------------
denormalized_kps = []

# Process results
for r in results:
    keypoints = r.keypoints.xyn.cpu().numpy()  # Normalized keypoints (x, y, conf)
    
    for kp in keypoints[0]:
            x, y = int(kp[0] * 640), int(kp[1] * 640) # denormalize , if needed
            denormalized_kps.append((x,y))

pred_kpts = []

# Filter for the 13 specific keypoints
for i in selected_indices:
    pred_kpts.append(denormalized_kps[i])

print("Predicted keypoints: ", pred_kpts)

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-pose.pt to 'yolov8n-pose.pt'...


100%|██████████| 6.52M/6.52M [00:00<00:00, 11.1MB/s]



image 1/1 /home/rtu/Documents/HPE_models/Yoga poses.v5i.yolov8/test/images/1_123_jpg.rf.38c81030db0d99d8c5a2c090b3028403.jpg: 640x640 1 person, 4.4ms
Speed: 2.2ms preprocess, 4.4ms inference, 100.9ms postprocess per image at shape (1, 3, 640, 640)
Predicted keypoints:  [(267, 206), (316, 245), (303, 241), (306, 379), (266, 376), (397, 426), (172, 413), (542, 500), (174, 485), (308, 231), (312, 227), (305, 152), (284, 151)]


## Extracting ground truth keypoints from dataset

In [8]:
#++++++++++++++++++++++++++++++++++++++Ground Truth Keypoints++++++++++++++++++++++++++++++++++++++

normal_gt_kpts = [float(value) for i, value in enumerate(data.split()) if 0 < float(value) <= 1 ]


# Reshape keypoints into pairs of (x, y)
normal_gt_kpts = np.array(normal_gt_kpts).reshape(-1, 2)

# Denormalize keypoints
gt_kpts = np.zeros_like(normal_gt_kpts)
gt_kpts[:, 0] = normal_gt_kpts[:, 0] * 640   # Denormalize x by image width
gt_kpts[:, 1] = normal_gt_kpts[:, 1] * 640  # Denormalize y by image height


#print(gt_kpts[2:])
# print(selected_keypoints) visualization and debugging

gt_kpts = gt_kpts[2:]   # remove the first two keypoints as they are part of bbox *VERY IMPORTANT*

print("Ground truth keypoints: ", gt_kpts)

Ground truth keypoints:  [[     271.43      210.71]
 [     303.57         280]
 [     311.82      280.91]
 [     307.86      385.71]
 [        295      385.71]
 [     151.82      391.82]
 [     397.27      458.18]
 [     163.64      494.55]
 [     530.91      490.91]
 [     307.14      186.43]
 [     327.86      129.29]
 [     297.14      181.43]
 [     323.57      129.29]]
