In [1]:
!pip install torch torchvision
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html



Collecting torch
  Downloading torch-2.2.2-cp310-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting torchvision
  Downloading torchvision-0.17.2-cp310-cp310-macosx_10_13_x86_64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy (from torch)
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.1-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.2.2-cp310-none-macosx_10_9_x86_64.whl (150.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.8/150.8 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading torchvision-0.17.2-cp310-cp310-macosx_10_13_x86_64.whl (1.7 MB)
[2K   [90m━

In [9]:
import mediapipe as mp
import cv2

mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, model_complexity=2, enable_segmentation=False, min_detection_confidence=0.5)

def mediapipe_pose_estimation(image):
    if image is None:
        raise ValueError("The image is not loaded properly. Check the image path or file integrity.")

    image = cv2.imread(image)
    if image is None:
        raise ValueError("The image is not loaded properly. Check the image path or file integrity.")
    # Convert the BGR image to RGB before processing.

    frame_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    results = pose.process(frame_rgb)
    
    # Extract keypoints from the results
    keypoints = []
    if results.pose_landmarks:
        for landmark in results.pose_landmarks.landmark:
            keypoints.append((landmark.x, landmark.y))
    return keypoints

# Example usage
# image = cv2.imread('path_to_image.jpg')
# mediapipe_keypoints = mediapipe_pose_estimation(image)


In [None]:
import torch
import detectron2
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # Set threshold for this model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml")
predictor = DefaultPredictor(cfg)

def detectron2_pose_estimation(image):
    outputs = predictor(image)
    keypoints = outputs["instances"].pred_keypoints.cpu().numpy()
    return keypoints

# Example usage
image = cv2.imread('path_to_image.jpg')
detectron2_keypoints = detectron2_pose_estimation(image)


In [13]:
import time
import numpy as np
import cv2
import json
import os
import mediapipe as mp

# Initialize MediaPipe Pose Estimation
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, model_complexity=2, enable_segmentation=False, min_detection_confidence=0.5)

# Mapping COCO keypoints to MediaPipe keypoints
COCO_TO_MEDIAPIPE = [0, 11, 12, 23, 24, 25, 26, 27, 28, 5, 6, 7, 8, 9, 10, 15, 16]

def mediapipe_pose_estimation(image):
    if image is None:
        raise ValueError("The image is not loaded properly. Check the image path or file integrity.")
    
    # Convert the BGR image to RGB before processing.
    frame_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pose.process(frame_rgb)
    
    keypoints = []
    if results.pose_landmarks:
        h, w, _ = image.shape  # Get image dimensions for scaling
        for idx in COCO_TO_MEDIAPIPE:  # Use only relevant keypoints from MediaPipe that correspond to COCO
            landmark = results.pose_landmarks.landmark[idx]
            x = int(landmark.x * w)  # Scale normalized coordinates to image dimensions
            y = int(landmark.y * h)
            keypoints.append((x, y))
    
    # Print the keypoints for debugging
    print(f"MediaPipe keypoints for image: {keypoints}")
    
    return keypoints

def evaluate_models(models, images_paths, ground_truths, batch_size=10):
    results = []

    for model_name, model_function in models.items():
        pck_scores = []
        total_inference_time = 0

        # Process in batches to avoid overloading memory
        for i in range(0, len(images_paths), batch_size):
            batch_images_paths = images_paths[i : i + batch_size]
            batch_ground_truths = ground_truths[i : i + batch_size]

            for img_path, gt in zip(batch_images_paths, batch_ground_truths):
                # Load the image
                img = cv2.imread(img_path)
                if img is None:
                    print(f"Warning: Unable to load image {img_path}")
                    continue

                start_time = time.time()
                keypoints = model_function(img)
                inference_time = time.time() - start_time

                # Ensure keypoints and ground truth have the same length
                if keypoints and len(keypoints) == len(gt):
                    # Calculate PCK (Percentage of Correct Keypoints)
                    correct_keypoints = np.sum(
                        np.linalg.norm(np.array(keypoints) - np.array(gt), axis=1)
                        < 0.05
                    )
                    pck = correct_keypoints / len(gt)
                else:
                    # Print more detailed information for debugging
                    print(f"Warning: Mismatch or missing keypoints for model {model_name}.")
                    print(f"Number of keypoints detected: {len(keypoints)}, Expected: {len(gt)}")
                    pck = 0

                pck_scores.append(pck)
                total_inference_time += inference_time

        avg_pck = np.mean(pck_scores)
        avg_inference_time = total_inference_time / len(images_paths)
        results.append(
            {
                "model": model_name,
                "avg_pck": avg_pck,
                "avg_inference_time": avg_inference_time,
            }
        )

    return results

# List of models and their corresponding functions
models = {
    "MediaPipe": mediapipe_pose_estimation,
}

def load_coco_annotations(annotation_file):
    with open(annotation_file, "r") as f:
        coco_data = json.load(f)
    annotations = {}
    for ann in coco_data["annotations"]:
        image_id = ann["image_id"]
        keypoints = ann["keypoints"]
        keypoints = [
            (keypoints[i], keypoints[i + 1]) for i in range(0, len(keypoints), 3)
        ]  # (x, y) only
        annotations[image_id] = keypoints
    return annotations

# Load COCO image paths
def load_coco_images(image_dir):
    image_paths = [
        os.path.join(image_dir, img_name) for img_name in os.listdir(image_dir) if img_name.endswith('.jpg')
    ]
    return image_paths

# Example evaluation
image_dir = "/Users/brennanlee/Desktop/opencv-healthcare/cocoapi/PythonAPI/coco/train2017"
annotation_file = "/Users/brennanlee/Desktop/opencv-healthcare/cocoapi/PythonAPI/coco/annotations/person_keypoints_train2017.json"

image_paths = load_coco_images(image_dir)
annotations = load_coco_annotations(annotation_file)

image_paths = image_paths[:100]
image_ids = [int(os.path.basename(img_path).split('.')[0]) for img_path in image_paths]

# Filter ground truth annotations for the selected images
ground_truths = [annotations[image_id] for image_id in image_ids if image_id in annotations]

# Run evaluation
results = evaluate_models(models, image_paths, ground_truths)
print(results)


MediaPipe keypoints for image: []
Number of keypoints detected: 0, Expected: 51
MediaPipe keypoints for image: []
Number of keypoints detected: 0, Expected: 51
MediaPipe keypoints for image: [(429, 193), (402, 195), (416, 175), (343, 171), (352, 156), (345, 177), (363, 136), (325, 161), (337, 120), (434, 193), (435, 192), (430, 200), (434, 190), (424, 192), (425, 189), (330, 165), (387, 164)]
Number of keypoints detected: 17, Expected: 51
MediaPipe keypoints for image: [(250, 158), (200, 196), (245, 199), (203, 309), (232, 308), (212, 385), (261, 390), (202, 456), (230, 451), (248, 151), (248, 151), (233, 153), (243, 154), (244, 165), (247, 165), (216, 271), (270, 287)]
Number of keypoints detected: 17, Expected: 51
MediaPipe keypoints for image: [(205, 89), (277, 141), (132, 138), (243, 290), (162, 292), (308, 333), (114, 331), (334, 467), (111, 478), (188, 73), (181, 73), (240, 76), (171, 79), (219, 104), (196, 103), (256, 190), (180, 243)]
Number of keypoints detected: 17, Expected:

In [4]:
from PIL import Image

def remove_black_background(image_path, output_path):
    img = Image.open(image_path).convert("RGBA")
    datas = img.getdata()

    new_data = []
    for item in datas:
        # Change all black (also shades of black)
        if item[0] == 0 and item[1] == 0 and item[2] == 0:  # Detect black color
            # Replace with a transparent pixel
            new_data.append((255, 255, 255, 0))
        else:
            new_data.append(item)

    img.putdata(new_data)
    img.save(output_path, "PNG")

# get all png in current directory
import os
images = [file for file in os.listdir() if file.endswith(".png")]
print(images)

# Example usage:

for image in images:
    remove_black_background(image, f"edited_{image}")

['gswt_start_line.png']
