# Import the Necessary Libraries

## 取得 Dance Video 的 影片
直接執行 `python download_video.py`

## 環境
- Python = 3.8.x  
- Cuda = 12.1
- Pytorch = 2.3.0
- OpenCV = 4.6.0
- Protobuf > 3.20.3

In [None]:
!pip install --upgrade paddlepaddle
!pip install --upgrade paddlehub
!pip install moviepy
!pip install pygame

In [1]:
import os
import cv2
import paddlehub as hub
from moviepy.editor import *
from matplotlib import pyplot as plt
import numpy as np  
import matplotlib.pyplot as plt 
import matplotlib.image as mpimg 
from PIL import ImageFont, ImageDraw, Image
import requests
import pandas as pd
import time

  from .autonotebook import tqdm as notebook_tqdm
shm_open() failed: No such file or directory
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5703:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2666:(snd_pcm_open_noupdate) Unknown PCM default
shm_open() failed: No such file or directory
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5180:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc

In [2]:
!hub install human_pose_estimation_resnet50_mpii

[32m[2024-05-25 22:49:57,985] [    INFO][0m - Module human_pose_estimation_resnet50_mpii already installed in /home/brian/.paddlehub/modules/human_pose_estimation_resnet50_mpii[0m


# 函式定義

`human_pose_estimation_resnet50_mpii` 模型可以檢測 16 個關鍵點。這些關鍵點分別是：

1. 頭頂 (head_top)
2. 頸部 (neck)
3. 右肩 (right_shoulder)
4. 右肘 (right_elbow)
5. 右腕 (right_wrist)
6. 左肩 (left_shoulder)
7. 左肘 (left_elbow)
8. 左腕 (left_wrist)
9. 右髋 (right_hip)
10. 右膝 (right_knee)
11. 右踝 (right_ankle)
12. 左髋 (left_hip)
13. 左膝 (left_knee)
14. 左踝 (left_ankle)
15. 胸部 (thorax)
16. 骶骨 (pelvis)


## 取得影片的解析度

In [2]:
def get_video_dimensions(video_path):
    """取得影片的解析度

    Args:
        video_path (string) : 影片路徑

    Returns:
        tuple (width, height) : 影片的解析度
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video stream or file")
        return None

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    cap.release()
    return (width, height)

## 對輸入的視頻進行處理，提取每一幀的關鍵點信息。

In [55]:
# 加載新的人體姿勢估計模型
pose_resnet50_mpii = hub.Module(name="human_pose_estimation_resnet50_mpii")



def dance_video_processing(video_path, showBG=True, dim = (640, 480)):
    """
    對輸入的視頻進行處理，提取每一幀的關鍵點信息。

    Args:
        video_path (str): 視頻文件的路徑。
        showBG (bool, optional): 是否顯示背景。默認為 True。
        dim (tuple, optional): 調整圖像的尺寸。默認為 (640, 480)。

    Returns:
        list: 包含每幀圖像中提取的16個關鍵點的(x,y)坐標信息。
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video stream or file")
        return

    fps_time = 0
    keypoints_list = []

    while True:
        ret_val, image = cap.read()
        if not ret_val:
            break

        # resize image
        image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
        results = pose_resnet50_mpii.keypoint_detection(images=[image], use_gpu=False)

        if not showBG:
            image = np.zeros(image.shape)

        if results and len(results) > 0:
            keypoints = results[0]['data']
            keypoints_list.append(keypoints)
            for key, (x, y) in keypoints.items():
                if x > 0 and y > 0:
                    cv2.circle(image, (int(x), int(y)), 5, (0, 255, 0), -1)
                    cv2.putText(image, f'{key}', (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)

        # To display fps
        cv2.putText(image, "FPS: %f" % (1.0 / (time.time() - fps_time)), (10, 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        # To display image
        cv2.imshow('Dancer', image)
        fps_time = time.time()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()
    return keypoints_list



def get_position(video_path, showBG=True, dim = (640, 480)):
    """
    提取視頻中的關鍵點，將關鍵點座標(x,y)分開儲存在dataframe中。

    Args:
        video_path (str): 視頻文件的路徑。
        showBG (bool, optional): 是否顯示背景。默認為 True。
        dim (tuple, optional): 調整圖像的尺寸。默認為 (640, 480)。

    Returns:
        DataFrame: 包含每幀圖像中提取的關鍵點信息的數據框。
        list: 包含每幀圖像中提取的關鍵點信息的列表。
    """
    keypoints_list = dance_video_processing(video_path, showBG, dim)

    # 初始化特徵列表
    features = [0] * 32
    keyp_list = []

    # 預處理關鍵點數據
    for keypoints in keypoints_list:
        features = [0] * 32
        for idx, part in enumerate(['left_ankle', 'left_knee', 'left_hip', 'right_hip', 'right_knee', 'right_ankle', 'pelvis', 'thorax', 'upper_neck', 'head_top', 'right_wrist', 'right_elbow', 'right_shoulder', 'left_shoulder', 'left_elbow', 'left_wrist']):
            if part in keypoints:
                x, y = keypoints[part]
                features[idx * 2] = x
                features[idx * 2 + 1] = y
        keyp_list.append(features)

    # 構建數據框的列名
    column_names = [str(i) for i in range(32)]
    data = pd.DataFrame(keyp_list, columns=column_names)

    return data, keyp_list


In [4]:
def draw_skeleton(image, keypoints, color=(0, 255, 0), thickness=2):
    """
    在圖像上畫出骨架。

    Args:
        image (numpy.ndarray): 圖像。
        keypoints (dict): 關鍵點字典，包含每個關鍵點的座標。
        color (tuple, optional): 骨架的顏色。默認為綠色。
        thickness (int, optional): 骨架的線條粗細。默認為2。
    """
    skeleton = [
        ('head_top', 'upper_neck'),
        ('upper_neck', 'thorax'),
        ('upper_neck', 'left_shoulder'),
        ('upper_neck', 'right_shoulder'),
        ('left_shoulder', 'left_elbow'),
        ('left_elbow', 'left_wrist'),
        ('right_shoulder', 'right_elbow'),
        ('right_elbow', 'right_wrist'),
        ('thorax', 'left_hip'),
        ('left_hip', 'left_knee'),
        ('left_knee', 'left_ankle'),
        ('thorax', 'right_hip'),
        ('right_hip', 'right_knee'),
        ('right_knee', 'right_ankle')
    ]
    
    for joint1, joint2 in skeleton:
        if joint1 in keypoints and joint2 in keypoints:
            point1 = keypoints[joint1]
            point2 = keypoints[joint2]
            if point1[0] > 0 and point1[1] > 0 and point2[0] > 0 and point2[1] > 0:
                cv2.line(image, (point1[0], point1[1]), (point2[0], point2[1]), color, thickness)
    return image


# 原本系統的相似度計算

## Cosine Similarity:
Cosine Similarity function for our model to find the keypoints.

In [5]:

def findCosineSimilarity_1(source_representation, test_representation):
    """
    計算兩個向量之間的餘弦相似度。
    餘弦相似度越低，表示兩個向量越相似。

    Args:
        source_representation (numpy.ndarray): 來源向量。
        test_representation (numpy.ndarray): 測試向量。

    Returns:
        float: 餘弦相似度，範圍在 [0, 1] 之間。值越低表示相似度越高。
    """
    # 計算內積
    a = np.matmul(np.transpose(source_representation), test_representation)
    # 計算每個向量的范數
    b = np.sum(np.multiply(source_representation, source_representation))
    c = np.sum(np.multiply(test_representation, test_representation))
    # 返回餘弦相似度
    return 1 - (a / (np.sqrt(b) * np.sqrt(c)))

## Comparing:
Comparing the user images with keypoints of the dancer. 

In [6]:
def compare_positions(trainer_video, user_video, keyp_list, dim=(640, 480)):
    """
    比較兩個視頻中的姿勢，並在視頻中畫出骨架。

    Args:
        trainer_video (str): 舞者視頻文件的路徑。
        user_video (str): 用戶視頻文件的路徑。
        keyp_list (list): 舞者關鍵點列表。
        dim (tuple, optional): 調整圖像的尺寸。默認為 (640, 480)。
    """
    # 打開視頻文件
    cap = cv2.VideoCapture(trainer_video)
    cam = cv2.VideoCapture(user_video)
    
    # 檢查視頻文件是否成功打開
    if not cap.isOpened():
        print("Error: Unable to open trainer video.")
        return
    if not cam.isOpened():
        print("Error: Unable to open user video.")
        return
    
    # 設置用戶視頻的尺寸
    cam.set(3, dim[0])
    cam.set(4, dim[1])
    fps_time = 0

    while True:
        # 讀取視頻幀
        ret_val, image_1 = cam.read()
        ret_val_1, image_2 = cap.read()
        
        if not ret_val or not ret_val_1:
            print("Error: Unable to read frame from video.")
            break
        
        # 調整圖像尺寸
        image_2 = cv2.resize(image_2, dim, interpolation=cv2.INTER_AREA)
        image_1 = cv2.resize(image_1, dim, interpolation=cv2.INTER_AREA)

        # 獲取舞者的關鍵點
        trainer_results = pose_resnet50_mpii.keypoint_detection(images=[image_2], use_gpu=False)
        if trainer_results and len(trainer_results) > 0:
            trainer_keypoints = trainer_results[0]['data']
            trainer_features = [0] * 32
            # 將關鍵點轉換為特徵向量
            for idx, key in enumerate(['head_top', 'neck', 'right_shoulder', 'right_elbow', 'right_wrist', 'left_shoulder', 'left_elbow', 'left_wrist', 'right_hip', 'right_knee', 'right_ankle', 'left_hip', 'left_knee', 'left_ankle', 'thorax', 'pelvis']):
                if key in trainer_keypoints:
                    x, y = trainer_keypoints[key]
                    trainer_features[idx * 2] = x
                    trainer_features[idx * 2 + 1] = y
            # 標準化特徵向量
            transformer = Normalizer().fit([trainer_features])
            keyp_list = transformer.transform(keyp_list)
            # 畫出舞者骨架
            image_2 = draw_skeleton(image_2, trainer_keypoints)
        else:
            print("Error: No keypoints detected for the trainer video.")
            break

        # 獲取用戶的關鍵點
        user_results = pose_resnet50_mpii.keypoint_detection(images=[image_1], use_gpu=False)
        if user_results and len(user_results) > 0:
            user_keypoints = user_results[0]['data']
            user_features = [0] * 32
            # 將關鍵點轉換為特徵向量
            for idx, key in enumerate(['head_top', 'neck', 'right_shoulder', 'right_elbow', 'right_wrist', 'left_shoulder', 'left_elbow', 'left_wrist', 'right_hip', 'right_knee', 'right_ankle', 'left_hip', 'left_knee', 'left_ankle', 'thorax', 'pelvis']):
                if key in user_keypoints:
                    x, y = user_keypoints[key]
                    user_features[idx * 2] = x
                    user_features[idx * 2 + 1] = y
            # 標準化用戶特徵向量
            user_features = transformer.transform([user_features])

            # 計算用戶特徵向量與舞者特徵向量之間的餘弦相似度
            min_similarity = 100
            for kp in keyp_list:
                sim_score = findCosineSimilarity_1(kp, user_features[0])
                if min_similarity > sim_score:
                    min_similarity = sim_score

            # 在用戶視頻上顯示餘弦相似度
            cv2.putText(image_1, str(min_similarity), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

            # 根據餘弦相似度判斷動作是否正確
            if min_similarity < 0.2:
                cv2.putText(image_1, "CORRECT STEPS", (120, 700), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            else:
                cv2.putText(image_1, "NOT CORRECT STEPS", (80, 700), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            
            # 畫出用戶骨架
            image_1 = draw_skeleton(image_1, user_keypoints)
        else:
            print("Error: No keypoints detected for the user video.")
            break

        # 在舞者視頻上顯示 FPS
        cv2.putText(image_2, "FPS: %f" % (1.0 / (time.time() - fps_time)), (10, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        # 在用戶視頻上顯示 FPS
        cv2.putText(image_1, "FPS: %f" % (1.0 / (time.time() - fps_time)), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # 顯示視頻幀
        cv2.imshow('Dancer Window', image_2)
        cv2.imshow('User Window', image_1)

        fps_time = time.time()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    # 釋放視頻對象
    cam.release()
    cap.release()
    cv2.destroyAllWindows()

## 示例调用

In [153]:

trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
data, keyp_list = get_position(trainer_video, showBG=True, dim = (dimensions[0], dimensions[1]))


In [158]:
# 示例调用
trainer_video = 'dance_video/breaking.mp4'
# trainer_video = 'dance_video/basic/gBR_sBM_c03_d04_mBR1_ch09.mp4'
dimensions = get_video_dimensions(trainer_video)
data, keyp_list = get_position(trainer_video, showBG=True, dim = (dimensions[0], dimensions[1]))


##### Note:
Since I cant dance, I'll be using a video for this :P.<br> We can replce the **user_video** attribute to **0 or 1** to turn on live camera depending on the type of camera we have.
### For a wrong positions:

In [155]:
trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
compare_positions(trainer_video, 'dance_video/locking.mp4', keyp_list, dim=(dimensions[0], dimensions[1]))

### For a correct positions:

In [152]:
trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
compare_positions('dance_video/dancer.mp4','dance_video/right_dance.mp4',keyp_list, dim=(dimensions[0], dimensions[1])) 

### For the same video

In [132]:
trainer_video = 'dance_video/dancer.mp4'
dimensions = get_video_dimensions(trainer_video)
compare_positions('dance_video/dancer.mp4','dance_video/dancer.mp4',keyp_list, dim = (dimensions[0], dimensions[1]))

# 原本的Conclusion:

- We have developed a pose estimation similarity pipeline to compare similarity between two poses from the given feed of videos or live cam.<br>
**Flaws:**
- This approach fails when the trainer is far or the user is near to the camera or vise-versa. This happens because there is a **scale variation** between the keypoints of the image.<br>
**Solution:**
- We can eleminate this problem by **croping out the image of a peron** using a CNN architecture like Yolo or anything that could detect the bounding boxes of a person.
- This image then can be fed to the openpose model to estimate keypoints for both the sources.<br>
**Scope of improvement:**
- The accuracy of the model for keypoint prediction can be increased by taking a much powerful pretrained model architecture than mobilenet.

# Improve

1. pose estimation 姿態辨識改為使用paddlehub提供的 `human_pose_estimation_resnet50_mpii`

2. 加入 **基於人體比例的標準化**

3. 使用 **LSTM**

## 標準化


### 基於人體比例的標準化 (Proportional Normalization)
這種方法考慮到了個體身高和體型的差異，將關鍵點相對於人體的一部分（例如肩膀之間的距離）進行比例縮放。這樣可以保證不同體型的人在標準化後的關鍵點能夠具有可比性。

In [7]:
def proportional_normalize(keypoints_list):
    normalized_keypoints_list = []
    for keypoints in keypoints_list:
        if 'left_shoulder' in keypoints and 'right_shoulder' in keypoints:
            shoulder_distance = np.linalg.norm(np.array(keypoints['left_shoulder']) - np.array(keypoints['right_shoulder']))
            normalized_keypoints = {key: np.array(val) / shoulder_distance for key, val in keypoints.items()}
            normalized_keypoints_list.append(normalized_keypoints)
    return normalized_keypoints_list


In [77]:
# 示例调用
trainer_video = 'dance_video/freeze_edit.mp4'
dimensions = get_video_dimensions(trainer_video)
keypoints_list = dance_video_processing(trainer_video, showBG=True, dim = (dimensions[0], dimensions[1]))

In [78]:
normalized_keypoints = proportional_normalize(keypoints_list)
print(normalized_keypoints)

[{'left_ankle': array([29.62962963, 10.        ]), 'left_knee': array([27.62962963, 18.03703704]), 'left_hip': array([26.66666667, 15.25925926]), 'right_hip': array([26.14814815, 14.7037037 ]), 'right_knee': array([28.14814815, 18.33333333]), 'right_ankle': array([29.62962963, 10.25925926]), 'pelvis': array([26.66666667, 15.        ]), 'thorax': array([21.7037037 , 18.33333333]), 'upper_neck': array([21.7037037 , 19.44444444]), 'head_top': array([19.74074074, 22.22222222]), 'right_wrist': array([24.66666667, 21.11111111]), 'right_elbow': array([24.18518519, 18.03703704]), 'right_shoulder': array([22.22222222, 17.77777778]), 'left_shoulder': array([21.22222222, 17.77777778]), 'left_elbow': array([24.18518519, 18.33333333]), 'left_wrist': array([25.18518519, 21.66666667])}, {'left_ankle': array([20.02074424,  6.64895565]), 'left_knee': array([19.70060934,  7.01834208]), 'left_hip': array([17.73054841,  9.60404705]), 'right_hip': array([17.06565284,  9.60404705]), 'right_knee': array([19.

## LSTM
### 1. 使用的數據

在這次項目中，我們使用了120個影片作為訓練數據，每個影片包含120幀的數據，每幀包含16個關鍵點的(x, y)座標。這些影片中，12個影片標記為流暢的動作，108個影片標記為不流暢的動作。這些關鍵點數據是通過人體姿勢估計模型 `human_pose_estimation_resnet50_mpii` 從每幀圖像中提取出來的。

### 2. 數據處理

對提取出的關鍵點數據進行了標準化處理，以減少因個體差異和拍攝角度造成的影響。標準化方法是使用肩膀之間的距離作為比例因子，將所有關鍵點的座標值歸一化。

### 3. 訓練LSTM模型

我們使用了LSTM（長短期記憶）模型來訓練這些數據。LSTM模型的架構如下：

- **輸入層**：32個節點（16個關鍵點的(x, y)座標）。
- **隱藏層**：2層，每層有64個單元。
- **輸出層**：1個節點，用於輸出流暢性得分。

超參數設置如下：

- **學習率**：0.001
- **訓練輪數**：50
- **損失函數**：二元交叉熵損失（BCEWithLogitsLoss）
- **優化器**：Adam

訓練過程中，每個批次的數據都被送入LSTM模型，計算損失後通過反向傳播來更新模型參數。

### 4. 評估LSTM模型

在評估階段，我們使用一個新的影片，通過以下步驟來計算其動作的流暢性得分：

1. **提取關鍵點**：對影片的每幀圖像提取16個關鍵點的(x, y)座標。
2. **標準化**：使用肩膀之間的距離進行比例標準化。
3. **生成序列**：將連續的幀數據分成長度為120的序列。
   - 如果影片少於120幀，用0補齊。
   - 如果影片等於120幀，直接使用。
   - 如果影片超過120幀，分段處理，不足120的部分用0補齊。
4. **預測流暢性得分**：將生成的序列送入LSTM模型，計算每段的流暢性得分，最終取所有分段的得分平均值作為該影片的流暢性得分。
   得分代表了LSTM模型在分析影片中每一段時間序列時所預測的流暢性分數。分數範圍從0到1，其中越接近1表示模型認為該段時間序列的動作越流暢，越接近0則表示模型認為該段時間序列的動作不流暢。  
   將評估影片依每120偵分成一段一段的時間序列。


### 結論
通過上述步驟，我們訓練並評估了LSTM模型，用於自動化評估舞蹈動作的流暢性。這不僅提升了評估效率，還能提供更準確的一致性判斷。

### 未來展望
目前只訓練一個動作的流暢度模型，希望未來可以判別動作以增加實用程度

### 生成 label.csv

In [69]:

import os
import pandas as pd

def generate_labels(video_dir, output_csv):
    # 獲取影片文件名列表
    video_files = [f for f in os.listdir(video_dir) if f.endswith('.mp4')]
    
    # 創建標籤列表
    labels = []

    # 遍歷影片文件，根據文件名模式生成標籤
    for video in video_files:
        if 'c01' in video:  # 只處理包含 'c01' 的影片
            if 'ch01' in video:  
                label = 1  # 標記為正確
            else:
                label = 0  # 標記為錯誤
            labels.append((video, label))

    # 創建 DataFrame 並保存為 CSV 文件
    labels_df = pd.DataFrame(labels, columns=['filename', 'label'])
    labels_df.to_csv(output_csv, index=False)
    print(f'Labels saved to {output_csv}')

# 設置影片目錄和輸出 CSV 文件路徑
video_dir = 'dance_video/basic'
output_csv = 'labels.csv'

# 生成標籤文件
generate_labels(video_dir, output_csv)

# 讀取標籤文件
labels_df = pd.read_csv(output_csv)

# 查看標籤文件
print(labels_df.head())

# 根據標籤加載數據
correct_videos = []
incorrect_videos = []

for index, row in labels_df.iterrows():
    filename = row['filename']
    label = row['label']
    
    if label == 1:
        correct_videos.append(os.path.join(video_dir, filename))
    else:
        incorrect_videos.append(os.path.join(video_dir, filename))

# 打印正確和錯誤的影片列表
print("Correct Videos:", correct_videos)
print("Incorrect Videos:", incorrect_videos)


Labels saved to labels.csv
                        filename  label
0  gBR_sBM_c01_d06_mBR3_ch07.mp4      0
1  gBR_sBM_c01_d04_mBR3_ch10.mp4      0
2  gBR_sBM_c01_d05_mBR5_ch07.mp4      0
3  gBR_sBM_c01_d04_mBR1_ch09.mp4      0
4  gBR_sBM_c01_d04_mBR1_ch04.mp4      0
Correct Videos: ['dance_video/basic/gBR_sBM_c01_d05_mBR0_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR5_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR3_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR1_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR1_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR3_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR2_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR2_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR0_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR4_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR5_ch01.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR4_ch01.mp4']
Incorrect Videos: ['dance_video/basic/gBR_sBM_c01_d06_mBR3_ch07.mp4', 'dance_video/basic/gBR_sBM_c01_d04

### 加載數據集
這些步驟的目的是將準備好的數據集加載到 PyTorch 中，並將其轉換為可供模型訓練和評估的格式。下面是詳細的解釋：

#### `KeypointsDataset` 類
這個類繼承自 `torch.utils.data.Dataset`，它的作用是將數據和標籤封裝成 PyTorch 的數據集對象。該類主要有三個方法：

1. **`__init__`**：初始化數據集對象，接收數據和標籤。
2. **`__len__`**：返回數據集的大小，即數據的樣本數。
3. **`__getitem__`**：根據索引返回數據和對應的標籤，數據和標籤都會轉換為 PyTorch 的張量格式。

```python
class KeypointsDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)
```

#### 創建數據集和數據加載器
這一步的目的是將數據和標籤轉換為 PyTorch 的數據集對象，並通過數據加載器來管理數據的讀取和批次處理。

1. **創建數據集**：
   使用 `KeypointsDataset` 類來封裝數據和標籤，創建數據集對象 `dataset`。

```python
dataset = KeypointsDataset(data, targets)
```

2. **創建數據加載器**：
   使用 `DataLoader` 類來管理數據的讀取和批次處理。`DataLoader` 能夠根據指定的批次大小（`batch_size`）來將數據分批讀取，並在每個 epoch 開始時打亂數據（`shuffle=True`），以提高模型的泛化能力。

```python
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
```

#### 目的

1. **批次處理數據**：
   由於模型訓練通常是分批進行的，`DataLoader` 將數據分成若干小批次，以便在訓練過程中逐批加載數據。

2. **打亂數據**：
   通過在每個 epoch 開始時打亂數據，`DataLoader` 有助於提高模型的泛化能力，避免模型記住數據的順序。

3. **提高數據讀取效率**：
   `DataLoader` 提供了一種高效的數據讀取方式，可以在訓練過程中自動加載數據，並在訓練過程中以迭代器的形式提供數據，這有助於提升訓練效率。

#### 整體流程
1. **數據和標籤準備**：將數據和標籤準備好，並保存在 `data` 和 `targets` 變量中。
2. **數據集對象創建**：使用 `KeypointsDataset` 將數據和標籤封裝成數據集對象 `dataset`。
3. **數據加載器創建**：使用 `DataLoader` 創建數據加載器 `dataloader`，以便在模型訓練和評估過程中高效地讀取和管理數據。

這樣，我們就能夠在後續的訓練和評估過程中方便地加載和處理數據，從而提高模型訓練的效率和效果。

In [18]:
import pickle
from torch.utils.data import Dataset, DataLoader

class KeypointsDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)
    
# 加載數據
with open('data.pkl', 'rb') as f:
    data, targets = pickle.load(f)

# 打印數據形狀以確認加載正確
print(f"Data shape: {data.shape}")
print(f"Targets shape: {targets.shape}")

# data 中有 inf 的值
# 將 inf 替換為 nan 再替換為 0
data = np.where(np.isinf(data), np.nan, data)
data = np.nan_to_num(data, nan=0.0)


dataset = KeypointsDataset(data, targets)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)



Data shape: (120, 120, 32)
Targets shape: (120,)


### LSTM 模型設置

In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary

class DanceLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout = 0.5):
        # 初始化隐藏层大小和层数
        super(DanceLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # 定義LSTM层，在LSTM層之間加入0.5的Dropout層防止過擬合，提高模型的泛化能力
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        # 定義全連接層，将LSTM的输出映射到流畅性得分
        self.fc = nn.Linear(hidden_size, 1)   # 輸出一個值，表示流暢性得分


    def forward(self, x):
        # 初始化隐藏状态和细胞状态
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        # 前向传播LSTM层，得到输出和（隐藏状态，细胞状态）
        out, _ = self.lstm(x, (h0, c0))
        # 取LSTM最后一个时间步的输出，通过全连接层得到最终的流畅性得分
        out = self.fc(out[:, -1, :])
        return out



# LSTM 模型設置
input_size = 32  # 16 keypoints * 2 (x, y)
hidden_size = 128  # 增加隱藏層數量
num_layers = 3  # 增加LSTM層數
model = DanceLSTM(input_size, hidden_size, num_layers, dropout=0.5).to('cuda')



# 训练超参数设置
learning_rate = 0.0001  # 調整學習率
num_epochs = 100  # 增加訓練輪數
# 使用二元交叉熵损失，因为是二分类任务（流畅/不流畅）
criterion = nn.BCEWithLogitsLoss()
# 使用Adam优化器
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

### LSTM 模型訓練

In [33]:
# 训练模型
model.train()  # 切换到训练模式
for epoch in range(num_epochs):
    for data, targets in dataloader:
        data = data.to('cuda')  # 将数据转移到GPU
        targets = targets.to('cuda')  # 将标签转移到GPU
        
        outputs = model(data)  # 前向传播
        loss = criterion(outputs, targets.unsqueeze(1))  # 计算损失
        
        optimizer.zero_grad()  # 清零梯度
        loss.backward()  # 反向传播
        optimizer.step()  # 更新权重
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  # 打印每个epoch的损失

# 评估模型
model.eval()  # 切换到评估模式
with torch.no_grad():  # 评估时不计算梯度
    test_loss = 0
    for data, targets in dataloader:
        data = data.to('cuda')  # 将数据转移到GPU
        targets = targets.to('cuda')  # 将标签转移到GPU
        
        outputs = model(data)  # 前向传播
        loss = criterion(outputs, targets.unsqueeze(1))  # 计算损失
        test_loss += loss.item()  # 累加损失
    
    print(f'Test Loss: {test_loss/len(dataloader):.4f}')  # 打印测试集的平均损失

Epoch [1/100], Loss: 0.2401
Epoch [2/100], Loss: 0.1476
Epoch [3/100], Loss: 0.3440
Epoch [4/100], Loss: 0.0917
Epoch [5/100], Loss: 0.2955
Epoch [6/100], Loss: 0.0582
Epoch [7/100], Loss: 0.0883
Epoch [8/100], Loss: 0.0462
Epoch [9/100], Loss: 0.0684
Epoch [10/100], Loss: 0.2939
Epoch [11/100], Loss: 0.1420
Epoch [12/100], Loss: 0.1589
Epoch [13/100], Loss: 0.2035
Epoch [14/100], Loss: 0.0019
Epoch [15/100], Loss: 0.0258
Epoch [16/100], Loss: 0.0061
Epoch [17/100], Loss: 0.1304
Epoch [18/100], Loss: 0.0865
Epoch [19/100], Loss: 0.0771
Epoch [20/100], Loss: 0.0113
Epoch [21/100], Loss: 0.0037
Epoch [22/100], Loss: 0.0321
Epoch [23/100], Loss: 0.0647
Epoch [24/100], Loss: 0.0579
Epoch [25/100], Loss: 0.1175
Epoch [26/100], Loss: 0.3417
Epoch [27/100], Loss: 0.4474
Epoch [28/100], Loss: 0.5003
Epoch [29/100], Loss: 0.0199
Epoch [30/100], Loss: 0.4214
Epoch [31/100], Loss: 0.1671
Epoch [32/100], Loss: 0.2830
Epoch [33/100], Loss: 0.0724
Epoch [34/100], Loss: 0.2841
Epoch [35/100], Loss: 0

### 評估
得分代表了LSTM模型在分析影片中每一段時間序列時所預測的流暢性分數。分數範圍從0到1，其中越接近1表示模型認為該段時間序列的動作越流暢，越接近0則表示模型認為該段時間序列的動作不流暢。  
將評估影片依每120偵分成一段一段的時間序列。

In [64]:
def evaluate_video(video_path, model, seq_length=120):
    dimensions = get_video_dimensions(video_path)
    keypoints_list = dance_video_processing(video_path, showBG=True, dim=(dimensions[0], dimensions[1]))
    normalized_keypoints_list = proportional_normalize(keypoints_list)
    
    features = []
    for keypoints in normalized_keypoints_list:
        feature = []
        for part in ['left_ankle', 'left_knee', 'left_hip', 'right_hip', 'right_knee', 'right_ankle', 'pelvis', 'thorax', 'upper_neck', 'head_top', 'right_wrist', 'right_elbow', 'right_shoulder', 'left_shoulder', 'left_elbow', 'left_wrist']:
            if part in keypoints:
                x, y = keypoints[part]
                feature.extend([x, y])
            else:
                feature.extend([0, 0])
        features.append(feature)
    
    sequences = []
    if len(features) < seq_length:
        # 如果影片少於seq_length，用0補齊
        features.extend([[0] * 32] * (seq_length - len(features)))
        sequences.append(features)
    elif len(features) == seq_length:
        # 如果影片等於seq_length，不做事
        sequences.append(features)
    else:
        # 如果影片大於seq_length，將其分段，不足seq_length的部分用0補齊
        num_segments = len(features) // seq_length
        for i in range(num_segments):
            segment = features[i * seq_length:(i + 1) * seq_length]
            sequences.append(segment)
        
        remainder = len(features) % seq_length
        if remainder > 0:
            segment = features[-remainder:]
            segment.extend([[0] * 32] * (seq_length - remainder))
            sequences.append(segment)
    
    sequences = torch.tensor(np.array(sequences).astype(np.float32)).to('cuda')
    
        # 替換 NaN 和 Inf 值
    sequences[torch.isnan(sequences)] = 0
    sequences[torch.isinf(sequences)] = 0
    
    model.eval()
    all_scores = []
    with torch.no_grad():
        for seq in sequences:
            seq = seq.unsqueeze(0)  # 增加一個batch維度
            output = model(seq)
            score = torch.sigmoid(output).cpu().numpy()
            all_scores.append(score)
    
    return np.array(all_scores).mean()

In [61]:
# 評估一段新的影片
scores = evaluate_video('dance_video/basic/gBR_sBM_c01_d05_mBR4_ch09.mp4', model)
print(scores)

[[7.3233794e-04]
 [7.1958307e-04]
 [7.1179622e-04]
 [7.0683181e-04]
 [7.0428045e-04]
 [7.0148287e-04]
 [6.9493800e-04]
 [6.9181045e-04]
 [6.9143676e-04]
 [6.9692172e-04]
 [6.9530465e-04]
 [7.1480515e-04]
 [7.0777553e-04]
 [7.0283352e-04]
 [7.0025318e-04]
 [6.9921924e-04]
 [6.9629040e-04]
 [6.9399987e-04]
 [6.9315999e-04]
 [6.9150195e-04]
 [6.8896124e-04]
 [6.8722706e-04]
 [6.8697950e-04]
 [6.8738754e-04]
 [6.8750972e-04]
 [6.8791542e-04]
 [6.8899372e-04]
 [6.8972138e-04]
 [6.9082086e-04]
 [6.9194165e-04]
 [6.9260341e-04]
 [6.9163239e-04]
 [6.9369067e-04]
 [6.9458375e-04]
 [6.9640519e-04]
 [6.9605216e-04]
 [6.9756759e-04]
 [6.9920294e-04]
 [7.0127897e-04]
 [7.0231967e-04]
 [7.0403347e-04]
 [7.0452114e-04]
 [7.0470443e-04]
 [7.0733583e-04]
 [7.0884277e-04]
 [7.1015581e-04]
 [7.1143877e-04]
 [7.1030000e-04]
 [7.0678670e-04]
 [7.0425327e-04]
 [7.0193160e-04]
 [7.0291694e-04]
 [7.0256728e-04]
 [6.9425948e-04]
 [6.9549453e-04]
 [6.9822173e-04]
 [6.9933053e-04]
 [7.0164795e-04]
 [7.0383056e-0

In [65]:
# 評估一段新的影片
scores = evaluate_video('dance_video/freeze_edit.mp4', model, seq_length=120)
print(scores)

0.000700828


In [66]:
# 評估一段新的影片
scores = evaluate_video('dance_video/breaking_edit.mp4', model)
print(scores)

0.8047017


# Download Video

In [45]:
# Download video
from pytube import YouTube

def download_video(url, output_path='.'):
    yt = YouTube(url)
    ys = yt.streams.get_highest_resolution()
    ys.download(output_path)
    return ys.default_filename

# Example usage
url = 'https://www.youtube.com/watch?v=1GCEGF-rIJ0&ab_channel=BimalRana'
output_path = './dance_video/'
filename = download_video(url, output_path)
print(f'Downloaded video: {filename}')


Downloaded video: Baby freeze to Freezes transition bboy tutorial by bimal rana.mp4


In [92]:
from moviepy.editor import VideoFileClip

def trim_video(input_path, output_path, start_time, end_time):
    clip = VideoFileClip(input_path).subclip(start_time, end_time)
    clip.write_videofile(output_path, codec='libx264')

# Example usage
input_path = 'dance_video/basic/gBR_sBM_c01_d05_mBR0_ch01.mp4'
output_path = 'dance_video/breaking_edit2.mp4'
start_time = 0  # start time in seconds
end_time = 2    # end time in seconds
trim_video(input_path, output_path, start_time, end_time)
print(f'Trimmed video saved as: {output_path}')


Moviepy - Building video dance_video/breaking_edit2.mp4.
MoviePy - Writing audio in breaking_edit2TEMP_MPY_wvf_snd.mp3


                                                       

MoviePy - Done.
Moviepy - Writing video dance_video/breaking_edit2.mp4



                                                              

Moviepy - Done !
Moviepy - video ready dance_video/breaking_edit2.mp4
Trimmed video saved as: dance_video/breaking_edit2.mp4


# Process Data

In [20]:
import os
import cv2
import numpy as np
import paddlehub as hub
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import pickle
import time

# 加載新的人體姿勢估計模型
pose_resnet50_mpii = hub.Module(name="human_pose_estimation_resnet50_mpii")

def get_video_dimensions(video_path):
    """取得影片的解析度

    Args:
        video_path (string) : 影片路徑

    Returns:
        tuple (width, height) : 影片的解析度
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video stream or file")
        return None

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    cap.release()
    return (width, height)

def dance_video_processing(video_path, showBG=True, dim=(640, 480)):
    """
    對輸入的視頻進行處理，提取每一幀的關鍵點信息。

    Args:
        video_path (str): 視頻文件的路徑。
        showBG (bool, optional): 是否顯示背景。默認為 True。
        dim (tuple, optional): 調整圖像的尺寸。默認為 (640, 480)。
        max_duration (int, optional): 影片處理的最長秒數。默認為 2 秒。

    Returns:
        list: 包含每幀圖像中提取的16個關鍵點的(x,y)坐標信息。
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video stream or file")
        return []

    fps = cap.get(cv2.CAP_PROP_FPS)
    max_frames = 120
    frame_count = 0
    keypoints_list = []

    while frame_count < 120:
        ret_val, image = cap.read()
        if not ret_val:
            break

        # resize image
        image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
        results = pose_resnet50_mpii.keypoint_detection(images=[image], use_gpu=False)

        if not showBG:
            image = np.zeros(image.shape)

        if results and len(results) > 0:
            keypoints = results[0]['data']
            keypoints_list.append(keypoints)

        frame_count += 1

    cap.release()
    return keypoints_list

def proportional_normalize(keypoints_list):
    normalized_keypoints_list = []
    for keypoints in keypoints_list:
        if 'left_shoulder' in keypoints and 'right_shoulder' in keypoints:
            shoulder_distance = np.linalg.norm(np.array(keypoints['left_shoulder']) - np.array(keypoints['right_shoulder']))
            normalized_keypoints = {key: np.array(val) / shoulder_distance for key, val in keypoints.items()}
            normalized_keypoints_list.append(normalized_keypoints)
    return normalized_keypoints_list


def prepare_data(video_paths, labels):
    data = []
    targets = []
    
    min_frames = 120

    for video_path, label in zip(video_paths, labels):
        dimensions = get_video_dimensions(video_path)
        keypoints_list = dance_video_processing(video_path, showBG=True, dim=(dimensions[0], dimensions[1]))
        normalized_keypoints_list = proportional_normalize(keypoints_list)
        
        features = []
        for keypoints in normalized_keypoints_list:
            feature = []
            for part in ['left_ankle', 'left_knee', 'left_hip', 'right_hip', 'right_knee', 'right_ankle', 'pelvis', 'thorax', 'upper_neck', 'head_top', 'right_wrist', 'right_elbow', 'right_shoulder', 'left_shoulder', 'left_elbow', 'left_wrist']:
                if part in keypoints:
                    x, y = keypoints[part]
                    feature.extend([x, y])
                else:
                    feature.extend([0, 0])
            features.append(feature)
        
        if len(features) < min_frames:
            features.extend([[0]*32] * (min_frames - len(features)))

        data.append(features[:min_frames])
        targets.append(label)
    
    return np.array(data).astype(np.float32), np.array(targets).astype(np.float32)

class KeypointsDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)

if __name__ == '__main__':
    # 讀取標籤文件
    labels_csv_path = 'labels.csv'
    labels_df = pd.read_csv(labels_csv_path)

    # 設置影片目錄
    video_dir = 'dance_video/basic/'

    # 根據標籤過濾影片路徑
    video_paths = [os.path.join(video_dir, filename) for filename in labels_df['filename']]
    labels = labels_df['label'].tolist()

    # 打印過濾後的影片路徑和標籤
    print("Video Paths:", video_paths)
    print("Labels:", labels)
    
    # video_paths=['dance_video/basic/gBR_sBM_c01_d06_mBR3_ch07.mp4']
    # labels = [0]

    data, targets = prepare_data(video_paths, labels)

    # 保存數據到文件
    with open('data.pkl', 'wb') as f:
        pickle.dump((data, targets), f)

    print("Data and targets have been saved to 'data.pkl'.")


Video Paths: ['dance_video/basic/gBR_sBM_c01_d06_mBR3_ch07.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR3_ch10.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR5_ch07.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR1_ch09.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR1_ch04.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR2_ch07.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR0_ch03.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR2_ch08.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR5_ch08.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR1_ch04.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR0_ch02.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR3_ch05.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR1_ch02.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR4_ch05.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR0_ch09.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR5_ch06.mp4', 'dance_video/basic/gBR_sBM_c01_d05_mBR0_ch06.mp4', 'dance_video/basic/gBR_sBM_c01_d04_mBR0_ch02.mp4', 'dance_video/basic/gBR_sBM_c01_d06_mBR2_ch10.mp4', 'dance_video/basi

In [21]:
import pickle
import torch
from torch.utils.data import Dataset, DataLoader

# 加載數據
with open('data.pkl', 'rb') as f:
    data, targets = pickle.load(f)

# 打印數據形狀以確認加載正確
print(f"Data shape: {data.shape}")
print(f"Targets shape: {targets.shape}")

# # 裁剪數據，只保留前120幀
# max_frames = 120
# if data.shape[1] > max_frames:
#     data = data[:, :max_frames, :]
# elif data.shape[1] < max_frames:
#     padding = np.zeros((data.shape[0], max_frames - data.shape[1], data.shape[2]))
#     data = np.concatenate((data, padding), axis=1)

# # 打印裁剪後的數據形狀
# print(f"Reshaped Data shape: {data.shape}")
# print(f"Reshaped Targets shape: {targets.shape}")

# data 中有 inf 的值
# 將 inf 替換為 nan 再替換為 0
data = np.where(np.isinf(data), np.nan, data)
data = np.nan_to_num(data, nan=0.0)

Data shape: (120, 120, 32)
Targets shape: (120,)
