## I. INSTALL & IMPORT NECESSARY PACKAGES

In [1]:
pip install -q mediapipe

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests

url = 'https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_heavy/float16/1/pose_landmarker_heavy.task'
filename = 'pose_landmarker.task'

response = requests.get(url)
with open(filename, 'wb') as file:
    file.write(response.content)
print(f"Downloaded {filename}")


Downloaded pose_landmarker.task


In [3]:
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
import cv2
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision


# II. FUNCTIONS

In [4]:
profLandT = []

II.a. draw_landmarks_on_image

In [5]:
#These will draw the landmarks on a detect person, as well as the expected connections between those markers.


def draw_landmarks_on_image(rgb_image, detection_result):
  pose_landmarks_list = detection_result.pose_landmarks
  annotated_image = np.copy(rgb_image)

  # Loop through the detected poses to visualize.
  for idx in range(len(pose_landmarks_list)):
    pose_landmarks = pose_landmarks_list[idx]

    # Draw the pose landmarks.
    pose_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    pose_landmarks_proto.landmark.extend([
      landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in pose_landmarks
    ])
    solutions.drawing_utils.draw_landmarks(
      annotated_image,
      pose_landmarks_proto,
      solutions.pose.POSE_CONNECTIONS,
      solutions.drawing_styles.get_default_pose_landmarks_style())
  return annotated_image

II.b. save_landmarks_and_timestamps

In [6]:
def save_landmarks_and_timestamps(detection_result, timestamp_ms):
    global profLandT
    # with open(filename, 'a') as file:
    for pose_landmarks in detection_result.pose_landmarks:
        landmarks_data = []
        for landmark in pose_landmarks:
            landmarks_data.append((landmark.x, landmark.y, landmark.z))
        profLandT.append([timestamp_ms, landmarks_data])
            # file.write(f"{timestamp_ms} {landmarks_data}\n")

    return profLandT



In [7]:
def process_video(input_video_path, output_video_path):
    # STEP 1: Import the necessary modules.
    mp_pose = mp.solutions.pose

    # STEP 2: Create an PoseLandmarker object.
    base_options = python.BaseOptions(model_asset_path='pose_landmarker.task')
    options = vision.PoseLandmarkerOptions(
        running_mode=mp.tasks.vision.RunningMode.VIDEO,
        base_options=base_options,
        output_segmentation_masks=True)
    detector = vision.PoseLandmarker.create_from_options(options)

    # Load the input Video
    cap = cv2.VideoCapture(input_video_path)

    # Load the frame rate of the video using OpenCV’s CV_CAP_PROP_FPS
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    frame_count = 0

    # Loop through each frame in the video using VideoCapture#read()
    while cap.isOpened():
        # Read a single frame from the video
        ret, frame = cap.read()
        
        # Check if frame reading was successful
        if not ret:
            break
        
        # Convert the frame received from OpenCV to a MediaPipe’s Image object.
        numpy_frame_from_opencv = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=numpy_frame_from_opencv)

        # Detect pose landmarks from the input image.
        timestamp_ms = int((frame_count / fps) * 1000)  # Calculate the timestamp in milliseconds
        detection_result = detector.detect_for_video(mp_image, timestamp_ms)
        
        # Save the detected landmarks and timestamps
        save_landmarks_and_timestamps(detection_result, timestamp_ms)
        
        # Process the detection result. In this case, visualize it.
        annotated_image = draw_landmarks_on_image(numpy_frame_from_opencv, detection_result)
        
        # Convert the annotated image back to BGR for OpenCV
        output_frame = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        
        # Write the frame to the output video
        out.write(output_frame)
        
        # Display the resulting frame
        cv2.imshow('Pose Detection', output_frame)
        
        # Press 'q' on the keyboard to exit the loop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        frame_count += 1

    # When everything done, release the video capture and writer objects
    cap.release()
    out.release()
    cv2.destroyAllWindows()


In [42]:
def compute_score_comparison(current_landmarks, reference_landmarks):
    if not current_landmarks or not reference_landmarks:
        return None

    score = 0
    total_distance = 0
    num_points = len(current_landmarks)

    for i in range(num_points):

        x_diff = current_landmarks[0][i].x - reference_landmarks[i][0]
        y_diff = current_landmarks[0][i].y - reference_landmarks[i][1]
        z_diff = current_landmarks[0][i].z - reference_landmarks[i][2] 
        distance = (x_diff ** 2 + y_diff ** 2 + z_diff ** 2) ** 0.5
        total_distance += distance
    
    # Normalize the score (e.g., assuming a max possible distance)
    max_distance = num_points * (3**0.5)  # example normalization factor
    score = max(0, 1 - (total_distance / max_distance))  # ensuring score doesn't go below 0
    return score


Score Video

In [9]:
def score_video(input_video_path, output_video_path):
    # Load the saved landmarks and timestamps
    # landmarks_data = load_landmarks_and_timestamps(landmarks_file_path)
    landmarks_data = profLandT

    base_options = python.BaseOptions(model_asset_path='pose_landmarker.task')
    options = vision.PoseLandmarkerOptions(
        running_mode=mp.tasks.vision.RunningMode.VIDEO,
        base_options=base_options,
        output_segmentation_masks=True)
    detector = vision.PoseLandmarker.create_from_options(options)

    # Load the input Video
    cap = cv2.VideoCapture(input_video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    frame_count = 0

    total_score = 0
    num_frames = 0

    # Helper function to find the closest landmarks
    def find_closest_landmarks(timestamp):
        closest_landmarks = None
        min_diff = float('inf')
        for ts, landmarks in landmarks_data:
            diff = abs(ts - timestamp)
            if diff < min_diff:
                min_diff = diff
                closest_landmarks = landmarks
        return closest_landmarks if min_diff <= 50 else None  # Adjust threshold as needed

    # Loop through each frame in the video
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        numpy_frame_from_opencv = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=numpy_frame_from_opencv)

        current_frame_timestamp_ms = int((frame_count / fps) * 1000)
        detection_result = detector.detect_for_video(mp_image, current_frame_timestamp_ms)

        # Extract the detected landmarks
        current_landmarks = detection_result.pose_landmarks


        score = None
        closest_landmarks = find_closest_landmarks(current_frame_timestamp_ms)
        if closest_landmarks is not None:
            score = compute_score_comparison(current_landmarks, closest_landmarks)
            if score is not None:
                total_score += score  # Accumulate the score
                num_frames += 1

        if score is not None:
            score_text = f"Score: {score * 100:.2f}%"
            cv2.putText(frame, score_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 4, cv2.LINE_AA)
        else:
            print(f"No score computed for frame {frame_count} at timestamp {current_frame_timestamp_ms} ms")

        # Write the frame to the output video
        out.write(frame)

        frame_count += 1

    if num_frames > 0:
        general_score = total_score / num_frames
    else:
        general_score = 0

    print(f"General Score for the entire video: {general_score * 100:.2f}%")

    # Release video capture and writer objects
    cap.release()
    out.release()
    cv2.destroyAllWindows()


# III. LOAD PARAMETERS

Process Instructor Video 

In [45]:
process_video("prof/guy-prof.mp4", "prof/output_prof-guy.mp4")

Score Student Video

In [46]:
score_video("student/student-male01.mp4", "student/scored/male01-scored.mp4")

General Score for the entire video: 67.11%


In [38]:
#FALSE VIDEO
score_video("false-vids/Maria.mp4", "false-vids/scored_Maria.mp4")

No score computed for frame 1665 at timestamp 55500 ms
No score computed for frame 2286 at timestamp 76200 ms
No score computed for frame 2289 at timestamp 76300 ms
General Score for the entire video: 65.64%


In [39]:
# FALSE VIDEO
score_video("false-vids/HipHop.mp4", "false-vids/scored_HipHop.mp4")

No score computed for frame 2138 at timestamp 71266 ms
General Score for the entire video: 58.75%


In [47]:
score_video("student/student-male02.mp4", "student/scored/male02-scored.mp4")

No score computed for frame 1236 at timestamp 41200 ms
General Score for the entire video: 80.93%


FEMALE TIKLOS VID (female reference)

In [43]:
process_video("prof/girl-prof.mp4", "prof/marked/girl-prof.mp4")

In [44]:
score_video("student/student-female.mp4", "student/scored/female01.mp4")

No score computed for frame 2430 at timestamp 81000 ms
General Score for the entire video: 60.45%
