### Analyze how the detection and classification do on pre-labeled videos

* We curated the data manually
* We set out to test on videos after reasonable optimization, having tried out the real-time (live stream) mode for ourselves

In [1]:
import sys
sys.path.insert(0, '/Users/alejandraduran/Documents/Pton_courses/COS429/COS429_final_project/training_pipeline')

import cv2
import mediapipe as mp
import pickle
from extract_features import FeaturesMP
import numpy as np
import time
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
from IPython.display import Video, display

In [2]:
# Setup cell: load the pose landmarker, the label encoder, the trained classifier and the
# Sanskrit -> English lookup, then define the overlay text settings used when annotating frames.

# Root of the project checkout. Every artifact below is resolved relative to it, so
# relocating the project only requires editing this one line.
PROJECT_ROOT = "/Users/alejandraduran/Documents/Pton_courses/COS429/COS429_final_project"

# Retrieve pre-trained model
mp_model_path = f"{PROJECT_ROOT}/pretrained_models/pose_landmarker_full.task"
# Initialize FeaturesMP object
# NOTE(review): image_size is presumably (height, width) for portrait 1080x1920 video - confirm
# against FeaturesMP.
features_mp = FeaturesMP(mp_model_path, image_size=(1080, 1920))

# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing. Only load
# pickles produced by this project's own training pipeline, never files from untrusted sources.

# load the label encoder
with open(f"{PROJECT_ROOT}/training_pipeline/label_encoder.pkl", 'rb') as f:
    label_encoder = pickle.load(f)

# load the trained classifier
with open(f"{PROJECT_ROOT}/trained_classifiers/padded_nn_7.pkl", 'rb') as f:
    classifier = pickle.load(f)

# load the sanskrit to english dictionary
with open(f"{PROJECT_ROOT}/sanskrit_english_dict.pkl", 'rb') as f:
    sanskrit_english_dict = pickle.load(f)

# introduce delay in position predictions: holds the last four predicted labels
# (-1 is a placeholder meaning "no prediction yet")
buffer = [-1, -1, -1, -1]

# Define text properties
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 3
font_thickness = 4
# (0, 0, 0) is black. The original comment claimed white; the value is kept unchanged.
# NOTE(review): switch to (255, 255, 255) if white text was actually intended.
text_color = (0, 0, 0)
bg_color = (0, 0, 0)  # Black color for background rectangle (not used by the cells visible here)
bg_opacity = 0.6  # Background opacity (not used by the cells visible here)
coords = (50, 100)  # Coordinates to display the text
text = "Starting pose detection..."

In [3]:
# Run pose detection + classification over one pre-labeled video, display the annotated
# frames, and collect the predicted label for every frame with a detected pose.
# The rest of the testing logic lives in test_video.py.
#
# Outputs consumed by later cells:
#   label_across_frames - English label predicted on each frame with a detected pose
#   label_with_delays   - English label recorded only when the last four predictions agree

# Load a video
video_path = "/Users/alejandraduran/Documents/Pton_courses/COS429/video_data/Ardha Matsyendrasana/4.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly here instead of silently skipping the loop and producing empty label lists.
    raise IOError(f"Could not open video: {video_path}")

frame_rate = cap.get(cv2.CAP_PROP_FPS)
if not frame_rate or frame_rate <= 0:
    # Some containers report 0 FPS, which would divide by zero in the timestamp computation.
    # The timestamps only need to increase frame by frame, so fall back to a nominal rate.
    frame_rate = 30.0
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
output_path = 'king_pigeon.mp4'
n = -1

# Set to True to also write the annotated frames to output_path (off by default, matching
# the original notebook where the writer code was commented out).
save_output = False
out = None
if save_output:
    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, frame_rate, (frame_width, frame_height))

label_across_frames = []
label_with_delays = []

# Reset the smoothing state so that re-running this cell (e.g. on another video) does not
# start from the previous run's buffer contents or overlay text.
buffer = [-1, -1, -1, -1]
text = "Starting pose detection..."

# Initialize detector
features_mp.init_detector(video=True, min_pose_detection_confidence=0.7)

try:
    # Read the video frame by frame until it ends or the user presses 'q'
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            print("Error: Unable to fetch the frame or finished the video.")
            break

        # new frame
        n += 1
        # get timestamp (ms) from frame index and frame rate
        timestamp = int(n * 1000 / frame_rate)

        # Run inference on the image
        # "double" variant kept for reference:
        # landmarks_draw, landmarks = features_mp.detect(frame, live_stream=True, frame_timestamp_ms=timestamp, double=True)
        # (with it, the drawn landmarks would come from landmarks_draw.pose_landmarks[0] instead)
        landmarks = features_mp.detect(frame, video=True, frame_timestamp_ms=timestamp)

        # Classify and draw only when at least one pose was detected
        if landmarks is not None and len(landmarks.pose_landmarks) != 0:
            # get only normalized coordinates of the first detected pose - improves latency
            pose_landmarks = landmarks.pose_landmarks[0]

            pose_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
            to_classify = np.zeros((features_mp.n_landmarks, 4))
            to_extend = []

            # store normalized landmarks both for drawing and for classification
            for k, landmark in enumerate(pose_landmarks):
                to_extend.append(landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z))
                # one row per landmark: x, y, z, visibility
                to_classify[k] = [landmark.x, landmark.y, landmark.z, landmark.visibility]

            # normalize and rotate to_classify, then flatten to a single-row feature vector
            to_classify = features_mp.make_rot_invariant_partial(to_classify, init_norm=True)
            to_classify = to_classify.reshape(1, features_mp.n_landmarks * 4)

            # draw real-time landmarks onto the frame
            pose_landmarks_proto.landmark.extend(to_extend)
            solutions.drawing_utils.draw_landmarks(
                frame,
                pose_landmarks_proto,
                solutions.pose.POSE_CONNECTIONS,
                solutions.drawing_styles.get_default_pose_landmarks_style())

            # Run inference
            predicted_class = classifier.predict(to_classify)
            # Pull out the single prediction explicitly: calling int() on an array with
            # ndim > 0 is deprecated in recent NumPy releases.
            # NOTE(review): the -1 presumably maps 1-based classifier outputs onto the
            # encoder's 0-based classes - confirm against the training pipeline.
            class_index = int(np.asarray(predicted_class).ravel()[0] - 1)
            # Get the string label
            predicted_name = label_encoder.inverse_transform([class_index])
            english = sanskrit_english_dict[predicted_name[0]]
            label_across_frames.append(english)

            # Slide the window of the last four predictions
            buffer.pop(0)
            buffer.append(predicted_name[0])
            # if all elements now in the buffer are the same, then we can display the pose
            if all(entry == buffer[0] for entry in buffer):
                text = english
                label_with_delays.append(english)

            cv2.putText(frame, text, coords, font, font_scale, text_color, font_thickness, cv2.LINE_AA)

        # Write the frame to the output video (only when save_output is enabled)
        if out is not None:
            out.write(frame)

        # Display the output
        cv2.imshow('Video Testing', frame)

        # Add a delay to allow OpenCV to render the frame; 'q' stops early
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release resources even if detection/classification raised part-way through
    cap.release()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

# Display the written video in the notebook
if save_output:
    display(Video(output_path, embed=True))


I0000 00:00:1734013578.863662 11626249 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 86), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


Error: Unable to fetch the frame or finished the video.


In [4]:
# Summarize the run: most common label over all classified frames, and over the frames
# where the last four predictions agreed.
# NOTE(review): the original comment here said "this video was Eka Pada Rajakapotasana -
# example of a good, robust classification", but video_path in the cell above points at an
# "Ardha Matsyendrasana" clip - confirm which video this summary refers to.

from collections import Counter


def _most_common_label(labels):
    """Return the most frequent entry of `labels`, or None when `labels` is empty."""
    ranked = Counter(labels).most_common(1)
    # most_common(1) returns [] for an empty input; indexing it would raise IndexError
    # (e.g. when no four consecutive predictions ever agreed).
    return ranked[0][0] if ranked else None


# get the most common label in label_across_frames / label_with_delays
mode_all = _most_common_label(label_across_frames)
mode_delay = _most_common_label(label_with_delays)
print(f'Mode all: {mode_all}, Mode delay: {mode_delay}')

Mode all: Extended Hand to Toe, Mode delay: Extended Hand to Toe


: 