# 更改的部分

* pose estimation 姿態辨識改為使用paddlehub提供的 `human_pose_estimation_resnet50_mpii`

# Objective:

As input to the system, take the live feed from the webcam and use pose estimation to map out a small dance tutorial.

# Approach:
- We will take a pretrained **openpose estimation model** to predict the **18 keypoints** on a human body.
- We take openpose model for tensorflow by Ildoo Kim
  - GitHub Repo Link: https://github.com/ildoonet/tf-pose-estimation
<br>**[!] Note**: Somehow I ran into issues getting this repo to work with TensorFlow 2.0, so I followed a modified version of it by Gunjan Seth.<br>
GitHub Repo Link: https://github.com/gsethi2409/tf-pose-estimation
<br>Medium Blog by Gunjan Seth: https://medium.com/@gsethi2409/pose-estimation-with-tensorflow-2-0-a51162c095ba
- The keypoints of the dancer are obtained and stored in an array list.
- These keypoints are **normalized**.
- The user feed is taken and the keypoints are detected.
- The keypoints are normalized and the **cosine similarity** is found between the user keypoints and the array of dancer keypoints.
- The minimum similarity score is **compared with the threshold**, and the result shows whether the user's steps are correct for the given dancer moves.

# Constraints To Look For:
1. The model should be fast for prediction. Latency should be avoided.
2. Predictions should be accurate and the steps should be close enough to those of the dancer.


## Import the Necessary Libraries

### Python = 3.8

In [None]:
!pip install --upgrade paddlepaddle
!pip install --upgrade paddlehub
!pip install moviepy
!pip install pygame

In [1]:
# test
import os
import cv2
import paddlehub as hub
from moviepy.editor import *
from matplotlib import pyplot as plt
import numpy as np  
import matplotlib.pyplot as plt 
import matplotlib.image as mpimg 
from PIL import ImageFont, ImageDraw, Image
import requests
import pandas as pd
import time

  from .autonotebook import tqdm as notebook_tqdm
shm_open() failed: No such file or directory
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5703:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2666:(snd_pcm_open_noupdate) Unknown PCM default
shm_open() failed: No such file or directory
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc

In [None]:
!hub install human_pose_estimation_resnet50_mpii

# Take position from the trainer (dancer):
- We made two functions to get all the keypoints from the trainer and store them in a dataframe and in a list.
-  The function **"dance_video_processing"** is used to predict all the keypoints from the video and return all the keypoints for the video.
- The function **"get_position"** is used to take all the keypoints that are returned from the above function, preprocess them and return the dataframe and the list of keypoints.  

`human_pose_estimation_resnet50_mpii` 模型可以檢測 16 個關鍵點。這些關鍵點分別是：

1. 頭頂 (head_top)
2. 頸部 (neck)
3. 右肩 (right_shoulder)
4. 右肘 (right_elbow)
5. 右腕 (right_wrist)
6. 左肩 (left_shoulder)
7. 左肘 (left_elbow)
8. 左腕 (left_wrist)
9. 右髋 (right_hip)
10. 右膝 (right_knee)
11. 右踝 (right_ankle)
12. 左髋 (left_hip)
13. 左膝 (left_knee)
14. 左踝 (left_ankle)
15. 胸部 (thorax)
16. 骶骨 (pelvis)


In [100]:
def get_video_dimensions(video_path):
    """Return the frame size of a video file.

    Args:
        video_path (str): Path of the video to inspect.

    Returns:
        tuple | None: ``(width, height)`` as integers, or ``None`` (after
        printing an error message) when the video cannot be opened.
    """
    capture = cv2.VideoCapture(video_path)
    if capture.isOpened():
        # Query width first, then height, and truncate both to integers.
        size = tuple(
            int(capture.get(prop))
            for prop in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
        )
        capture.release()
        return size

    print("Error opening video stream or file")
    return None

In [101]:
# Example usage: look up the frame size of the dancer video and print it.
# get_video_dimensions returns None when the video cannot be opened, so the
# result is checked before it is indexed.
video_path = r'dance_video/dancer.mp4'
dimensions = get_video_dimensions(video_path)
if dimensions:
    print(f"影片尺寸: 寬度={dimensions[0]}, 高度={dimensions[1]}")

影片尺寸: 寬度=406, 高度=718


In [93]:
# Load the human pose estimation module from PaddleHub by name. The functions
# in this notebook use this module-level object for keypoint detection.
pose_resnet50_mpii = hub.Module(name="human_pose_estimation_resnet50_mpii")



def dance_video_processing(video_path, showBG=True, dim=(640, 480)):
    """Run pose estimation on every frame of a video and preview the result.

    Each frame is resized to ``dim``, passed to
    ``pose_resnet50_mpii.keypoint_detection`` and displayed in a window named
    'Dancer' with the detected keypoints and an FPS counter drawn on top.
    Pressing ``q`` in the window stops the processing early.

    Args:
        video_path (str): Path of the video file to process.
        showBG (bool, optional): When False, the keypoints are drawn on a
            black canvas instead of on the video frame. Defaults to True.
        dim (tuple, optional): ``(width, height)`` every frame is resized to
            before detection. Defaults to (640, 480).

    Returns:
        list: One entry per processed frame: the ``'data'`` mapping of the
        first detection result (keypoint name -> ``(x, y)``). An empty list
        is returned when the video cannot be opened, so callers can always
        iterate over the result.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video stream or file")
        return []

    fps_time = 0
    keypoints_list = []

    # try/finally guarantees the capture and the preview window are released
    # even if detection or drawing raises part-way through the video.
    try:
        while True:
            ret_val, image = cap.read()
            if not ret_val:
                break

            image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
            results = pose_resnet50_mpii.keypoint_detection(images=[image], use_gpu=False)

            if not showBG:
                # Keep the frame's dtype so the canvas is a regular image
                # rather than a float64 array.
                image = np.zeros_like(image)

            if results:
                keypoints = results[0]['data']
                keypoints_list.append(keypoints)
                for key, (x, y) in keypoints.items():
                    # Only points with positive coordinates are drawn.
                    if x > 0 and y > 0:
                        center = (int(x), int(y))
                        cv2.circle(image, center, 5, (0, 255, 0), -1)
                        cv2.putText(image, f'{key}', center, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)

            # Guard against a zero interval between two very fast iterations.
            elapsed = time.time() - fps_time
            fps = 1.0 / elapsed if elapsed > 0 else 0.0
            cv2.putText(image, "FPS: %f" % fps, (10, 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.imshow('Dancer', image)
            fps_time = time.time()
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

    return keypoints_list



def get_position(video_path, showBG=True, dim=(640, 480)):
    """Extract per-frame keypoints from a video as a flat feature table.

    The video is processed with ``dance_video_processing``; every frame's
    keypoint mapping is flattened into one row in which columns ``2*i`` and
    ``2*i + 1`` hold the x and y coordinate of the i-th body part listed in
    ``part_order``. Parts missing from a frame keep the value 0.

    Args:
        video_path (str): Path of the video file to process.
        showBG (bool, optional): Forwarded to ``dance_video_processing``.
            Defaults to True.
        dim (tuple, optional): ``(width, height)`` forwarded to
            ``dance_video_processing``. Defaults to (640, 480).

    Returns:
        tuple: ``(data, keyp_list)`` where ``keyp_list`` is a list with one
        flat feature list per frame and ``data`` is a DataFrame built from it
        with string column names "0", "1", ... Both are empty when no frame
        could be processed.
    """
    part_order = (
        'left_ankle', 'left_knee', 'left_hip', 'right_hip', 'right_knee',
        'right_ankle', 'pelvis', 'thorax', 'upper_neck', 'head_top',
        'right_wrist', 'right_elbow', 'right_shoulder', 'left_shoulder',
        'left_elbow', 'left_wrist',
    )
    n_features = 2 * len(part_order)

    # dance_video_processing may return None (e.g. when the video cannot be
    # opened); treat that as "no frames" instead of failing on iteration.
    keypoints_list = dance_video_processing(video_path, showBG, dim) or []

    keyp_list = []
    for keypoints in keypoints_list:
        features = [0] * n_features
        for idx, part in enumerate(part_order):
            if part in keypoints:
                x, y = keypoints[part]
                features[idx * 2] = x
                features[idx * 2 + 1] = y
        keyp_list.append(features)

    column_names = [str(i) for i in range(n_features)]
    data = pd.DataFrame(keyp_list, columns=column_names)

    return data, keyp_list


In [122]:
def draw_skeleton(image, keypoints, color=(0, 255, 0), thickness=2):
    """Draw the skeleton connecting detected keypoints onto an image.

    A bone is drawn only when both of its joints are present in ``keypoints``
    and all four of their coordinates are greater than zero.

    Args:
        image (numpy.ndarray): Image to draw on; it is modified in place.
        keypoints (dict): Mapping of keypoint name to its ``(x, y)``
            coordinates.
        color (tuple, optional): Line colour passed to ``cv2.line``.
            Defaults to (0, 255, 0).
        thickness (int, optional): Line thickness. Defaults to 2.

    Returns:
        numpy.ndarray: The same ``image`` object that was passed in.
    """
    skeleton = (
        ('head_top', 'upper_neck'),
        ('upper_neck', 'thorax'),
        ('upper_neck', 'left_shoulder'),
        ('upper_neck', 'right_shoulder'),
        ('left_shoulder', 'left_elbow'),
        ('left_elbow', 'left_wrist'),
        ('right_shoulder', 'right_elbow'),
        ('right_elbow', 'right_wrist'),
        ('thorax', 'left_hip'),
        ('left_hip', 'left_knee'),
        ('left_knee', 'left_ankle'),
        ('thorax', 'right_hip'),
        ('right_hip', 'right_knee'),
        ('right_knee', 'right_ankle'),
    )

    for joint1, joint2 in skeleton:
        if joint1 not in keypoints or joint2 not in keypoints:
            continue
        x1, y1 = keypoints[joint1][0], keypoints[joint1][1]
        x2, y2 = keypoints[joint2][0], keypoints[joint2][1]
        if x1 > 0 and y1 > 0 and x2 > 0 and y2 > 0:
            # Cast to int tuples, as the keypoint drawing code in this file
            # does, so float or list coordinates are accepted as well.
            cv2.line(image, (int(x1), int(y1)), (int(x2), int(y2)), color, thickness)
    return image



**Observation:** 
- We can see how the keypoints data looks from the above example.
- Since there are 16 keypoints and each keypoint has an **x-coordinate** and a **y-coordinate**, we have **32 columns** (16 x 2).

# 相似度計算

## Cosine Similarity:
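Cosine-similarity helper used to compare two keypoint vectors. The function below returns 1 − cosine similarity, so a lower value means the poses are more similar.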

In [123]:

def findCosineSimilarity_1(source_representation, test_representation):
    """Return the cosine distance (1 - cosine similarity) of two vectors.

    A lower value means the vectors point in a more similar direction:
    0 for the same direction, 1 for orthogonal vectors and 2 for opposite
    directions.

    Args:
        source_representation (array-like): Source vector.
        test_representation (array-like): Test vector of the same length.

    Returns:
        float: Cosine distance in the range [0, 2]. When either vector has
        zero norm the cosine is undefined and 1.0 is returned instead of NaN.
    """
    source = np.asarray(source_representation, dtype=float)
    test = np.asarray(test_representation, dtype=float)

    # Inner product of the two vectors.
    dot = np.matmul(np.transpose(source), test)
    # Product of the two Euclidean norms.
    norm_product = np.sqrt(np.sum(source * source)) * np.sqrt(np.sum(test * test))
    if norm_product == 0:
        # An all-zero vector (e.g. no keypoints filled in) has no direction;
        # report "no similarity" rather than dividing by zero.
        return 1.0
    return 1 - (dot / norm_product)

# Comparing:
Comparing the user images with keypoints of the dancer. 

In [124]:
def compare_positions(trainer_video, user_video, keyp_list, dim=(640, 480)):
    """Play a trainer and a user video side by side and score the user's pose.

    For every pair of frames the pose of both persons is detected and drawn
    as a skeleton. The user's keypoints are flattened into a feature vector
    and compared, by cosine distance, against every row of ``keyp_list``; the
    smallest distance is shown on the user frame together with
    "CORRECT STEPS" when it is below 0.2 and "NOT CORRECT STEPS" otherwise.
    Pressing ``q`` in either window stops the comparison.

    Args:
        trainer_video (str): Path of the trainer (dancer) video.
        user_video (str | int): Path of the user video; the value is passed
            straight to ``cv2.VideoCapture``.
        keyp_list (list): Trainer feature rows as produced by
            ``get_position`` (x, y pairs in the ``part_order`` used below).
        dim (tuple, optional): ``(width, height)`` both frames are resized
            to. Defaults to (640, 480).
    """
    # Same keypoint order that get_position uses for the rows of keyp_list,
    # so that position i refers to the same coordinate in both vectors.
    part_order = (
        'left_ankle', 'left_knee', 'left_hip', 'right_hip', 'right_knee',
        'right_ankle', 'pelvis', 'thorax', 'upper_neck', 'head_top',
        'right_wrist', 'right_elbow', 'right_shoulder', 'left_shoulder',
        'left_elbow', 'left_wrist',
    )
    n_features = 2 * len(part_order)

    def l2_normalize(rows):
        """Scale every row to unit Euclidean length; zero rows stay zero."""
        rows = np.asarray(rows, dtype=float).reshape(-1, n_features)
        norms = np.linalg.norm(rows, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return rows / norms

    # Normalise the trainer rows once, up front, instead of on every frame.
    reference = l2_normalize(keyp_list)

    cap = cv2.VideoCapture(trainer_video)
    cam = cv2.VideoCapture(user_video)

    # try/finally releases both captures on every exit path, including the
    # early returns below and any exception raised while processing frames.
    try:
        if not cap.isOpened():
            print("Error: Unable to open trainer video.")
            return
        if not cam.isOpened():
            print("Error: Unable to open user video.")
            return

        cam.set(cv2.CAP_PROP_FRAME_WIDTH, dim[0])
        cam.set(cv2.CAP_PROP_FRAME_HEIGHT, dim[1])
        fps_time = 0
        # Anchor the verdict text near the bottom of the frame so it stays
        # visible for any frame height.
        verdict_y = dim[1] - 20

        while True:
            ret_val, image_1 = cam.read()
            ret_val_1, image_2 = cap.read()

            if not ret_val or not ret_val_1:
                print("Error: Unable to read frame from video.")
                break

            image_2 = cv2.resize(image_2, dim, interpolation=cv2.INTER_AREA)
            image_1 = cv2.resize(image_1, dim, interpolation=cv2.INTER_AREA)

            # Trainer frame: detection is only needed to draw the skeleton,
            # the reference features come from keyp_list.
            trainer_results = pose_resnet50_mpii.keypoint_detection(images=[image_2], use_gpu=False)
            if not trainer_results:
                print("Error: No keypoints detected for the trainer video.")
                break
            image_2 = draw_skeleton(image_2, trainer_results[0]['data'])

            # User frame: build the feature vector in the reference order.
            user_results = pose_resnet50_mpii.keypoint_detection(images=[image_1], use_gpu=False)
            if not user_results:
                print("Error: No keypoints detected for the user video.")
                break
            user_keypoints = user_results[0]['data']
            user_features = [0] * n_features
            for idx, part in enumerate(part_order):
                if part in user_keypoints:
                    x, y = user_keypoints[part]
                    user_features[idx * 2] = x
                    user_features[idx * 2 + 1] = y
            user_vector = l2_normalize([user_features])[0]

            # Smallest cosine distance to any trainer row (lower = closer).
            min_similarity = 100
            for kp in reference:
                sim_score = findCosineSimilarity_1(kp, user_vector)
                if min_similarity > sim_score:
                    min_similarity = sim_score

            cv2.putText(image_1, str(min_similarity), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

            if min_similarity < 0.2:
                cv2.putText(image_1, "CORRECT STEPS", (120, verdict_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            else:
                cv2.putText(image_1, "NOT CORRECT STEPS", (80, verdict_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            image_1 = draw_skeleton(image_1, user_keypoints)

            # Guard against a zero interval between two very fast iterations.
            elapsed = time.time() - fps_time
            fps_text = "FPS: %f" % (1.0 / elapsed if elapsed > 0 else 0.0)
            cv2.putText(image_2, fps_text, (10, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.putText(image_1, fps_text, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            cv2.imshow('Dancer Window', image_2)
            cv2.imshow('User Window', image_1)

            fps_time = time.time()
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cam.release()
        cap.release()
        cv2.destroyAllWindows()

In [129]:
# Example call: extract the trainer keypoints from the dancer video at its
# native resolution.
# NOTE(review): get_video_dimensions returns None when the video cannot be
# opened, in which case indexing `dimensions` below raises a TypeError.
trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
data, keyp_list = get_position(trainer_video, showBG=True, dim = (dimensions[0], dimensions[1]))


In [130]:
# Example call: extract keypoints from the breaking video at its native
# resolution.
# NOTE(review): this rebinds `data` and `keyp_list`, replacing the values
# computed from any previously processed video.
trainer_video = 'dance_video/breaking.mp4'
dimensions = get_video_dimensions(trainer_video)
data, keyp_list = get_position(trainer_video, showBG=True, dim = (dimensions[0], dimensions[1]))


##### Note:
Since I can't dance, I'll be using a video for this :P.<br> We can replace the **user_video** argument with **0 or 1** to turn on the live camera, depending on the type of camera we have.
### For a wrong positions:

In [126]:
# Compare the dancer video against a different dance (locking.mp4).
# NOTE(review): `keyp_list` is whatever the most recently run get_position
# cell produced; confirm it was built from this trainer video.
trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
compare_positions(trainer_video, 'dance_video/locking.mp4', keyp_list, dim=(dimensions[0], dimensions[1]))

### For a correct positions:

In [128]:
# Compare the dancer video against a user video of the same dance.
# NOTE(review): `keyp_list` is whatever the most recently run get_position
# cell produced; confirm it was built from this trainer video.
trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
compare_positions('dance_video/dancer.mp4','dance_video/right_dance.mp4',keyp_list, dim=(dimensions[0], dimensions[1])) 

### For the same video

In [132]:
# Compare the dancer video against itself.
# NOTE(review): `keyp_list` is whatever the most recently run get_position
# cell produced; confirm it was built from this trainer video.
trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
compare_positions('dance_video/dancer.mp4','dance_video/dancer.mp4',keyp_list, dim = (dimensions[0], dimensions[1]))

# Conclusion:

- We have developed a pose estimation similarity pipeline to compare similarity between two poses from the given feed of videos or live cam.<br>
**Flaws:**
- This approach fails when the trainer is far from the camera and the user is near it, or vice versa. This happens because there is a **scale variation** between the keypoints of the two images.<br>
**Solution:**
- We can eliminate this problem by **cropping out the image of the person** using a CNN architecture like YOLO or anything else that can detect the bounding box of a person.
- This image then can be fed to the openpose model to estimate keypoints for both the sources.<br>
**Scope of improvement:**
- The accuracy of the model for keypoint prediction can be increased by using a much more powerful pretrained model architecture than mobilenet.

# Improve

# Test

## 加入 YOLO

In [133]:

yolo_names_path = "coco.names"
# Load the YOLO network from its weights and configuration files.
net = cv2.dnn.readNet("yolov4.weights", "yolov4.cfg")

layer_names = net.getLayerNames()

# Names of the unconnected output layers. Asking the network for the names
# directly avoids indexing layer_names with the result of
# getUnconnectedOutLayers(), whose array shape differs between OpenCV versions.
output_layers = list(net.getUnconnectedOutLayersNames())

# Load the class names, one per line; the line index is the class id.
with open(yolo_names_path, "r") as f:
    classes = [line.strip() for line in f]

def detect_person(image):
    """Locate a person in an image with the module-level YOLO network.

    The image is converted to a 416x416 blob, run through ``net`` and every
    detection whose best-scoring class is "person" with a score above 0.5 is
    collected. Non-maximum suppression is applied and the box of the first
    surviving index is returned.

    Args:
        image (numpy.ndarray): Image whose shape unpacks into
            ``(height, width, channels)``.

    Returns:
        tuple | None: ``(x, y, w, h)`` with ``x, y`` the top-left corner in
        pixels, or ``None`` when no person box survives. The box is not
        clipped to the image, so ``x`` and ``y`` can be negative.
    """
    height, width, _ = image.shape
    blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    layer_outputs = net.forward(output_layers)

    boxes = []
    confidences = []

    for layer_output in layer_outputs:
        for detection in layer_output:
            # Entries from index 5 onwards are the per-class scores.
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if not (confidence > 0.5 and classes[class_id] == "person"):
                continue

            # The first four entries are the box centre and size relative to
            # the image; convert them to pixels.
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            box_w = int(detection[2] * width)
            box_h = int(detection[3] * height)

            left = int(center_x - box_w / 2)
            top = int(center_y - box_h / 2)

            boxes.append([left, top, box_w, box_h])
            confidences.append(float(confidence))

    kept = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
    if not isinstance(kept, (np.ndarray, list)) or len(kept) == 0:
        return None

    # Depending on how NMSBoxes shapes its result, the first entry is either
    # a plain index or a one-element sequence holding it.
    first = kept[0]
    if isinstance(first, (np.ndarray, list)):
        first = first[0]
    x, y, w, h = boxes[first]
    return x, y, w, h

def dance_video_processing(video_path= r'dance_video/dancer.mp4', showBG=True, dim=None):
    """Run pose estimation on the person crop of every frame of a video.

    For each frame, ``detect_person`` is asked for a bounding box; when one is
    returned the frame is cropped to it (clipped to the frame borders) before
    ``pose_resnet50_mpii.keypoint_detection`` is called. The result is shown
    in a window named 'Dancer' with keypoints and an FPS counter drawn on
    top. Pressing ``q`` in the window stops the processing early.

    Args:
        video_path (str, optional): Path of the video file to process.
            Defaults to 'dance_video/dancer.mp4'.
        showBG (bool, optional): When False, the keypoints are drawn on a
            black canvas instead of on the image. Defaults to True.
        dim (tuple, optional): ``(width, height)`` each frame is resized to
            before person detection. ``None`` (the default) keeps the frame
            size. The parameter lets callers that pass ``dim`` positionally,
            such as ``get_position``, keep working with this definition.

    Returns:
        list: One entry per processed frame: the ``'data'`` mapping of the
        first detection result (keypoint name -> ``(x, y)``), with
        coordinates relative to the image passed to the detector (the crop
        when a person was found). An empty list is returned when the video
        cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video stream or file")
        return []

    fps_time = 0
    keypoints_list = []

    # try/finally guarantees the capture and the preview window are released
    # even if detection or drawing raises part-way through the video.
    try:
        while cap.isOpened():
            ret_val, image = cap.read()
            if not ret_val:
                break

            if dim is not None:
                image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)

            person_box = detect_person(image)
            if person_box:
                x, y, w, h = person_box
                # The detector's box can extend past the frame; a negative
                # start index would make the slice wrap around or come out
                # empty, so clip the box to the frame first.
                frame_h, frame_w = image.shape[:2]
                x0, y0 = max(x, 0), max(y, 0)
                x1, y1 = min(x + w, frame_w), min(y + h, frame_h)
                if x1 > x0 and y1 > y0:
                    image = image[y0:y1, x0:x1]

            results = pose_resnet50_mpii.keypoint_detection(images=[image], use_gpu=False)

            if not showBG:
                # Keep the image dtype so the canvas is a regular image
                # rather than a float64 array.
                image = np.zeros_like(image)

            if results:
                keypoints = results[0]['data']
                keypoints_list.append(keypoints)
                for key, (x, y) in keypoints.items():
                    # Only points with positive coordinates are drawn.
                    if x > 0 and y > 0:
                        center = (int(x), int(y))
                        cv2.circle(image, center, 5, (0, 255, 0), -1)
                        cv2.putText(image, f'{key}', center, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)

            # Guard against a zero interval between two very fast iterations.
            elapsed = time.time() - fps_time
            fps = 1.0 / elapsed if elapsed > 0 else 0.0
            cv2.putText(image, "FPS: %f" % fps, (10, 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.imshow('Dancer', image)
            fps_time = time.time()
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

    return keypoints_list


# Example usage: run the person-cropping variant of dance_video_processing
# on the breaking video and keep the per-frame keypoints.
video_path = r'dance_video/breaking.mp4'
keypoints = dance_video_processing(video_path)
