In [2]:
import cv2
import mediapipe as mp
import numpy as np

In [None]:
mp_holistic = mp.solutions.holistic
# Shared Holistic instance, reused for every processed frame.
# It is built through the mp_holistic alias because the Holistic class lives
# in mp.solutions.holistic; the mediapipe package has no top-level `holistic`
# attribute, so `mp.holistic.Holistic()` raises AttributeError.
holistic = mp_holistic.Holistic()

def extract_landmarks(frame):
    """
    Run MediaPipe Holistic on one video frame and collect face and hand landmarks.
    Args:
        frame: Input video frame; it is converted from BGR to RGB before
               being handed to Holistic.
    Returns:
        dict: Keys "face", "right_hand" and "left_hand", each holding the
              matching landmark attribute of the Holistic result.
    """
    # Holistic is fed the RGB version of the frame
    detection = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Gather the three landmark groups under fixed keys
    return {
        "face": detection.face_landmarks,
        "right_hand": detection.right_hand_landmarks,
        "left_hand": detection.left_hand_landmarks,
    }

In [None]:
# Example: hand-labelled syllable spans for one sentence.
# Each entry names a syllable and the first/last frame index it covers.
syllable_annotations = [
    dict(syllable="ba", start_frame=10, end_frame=30),
    dict(syllable="ku", start_frame=31, end_frame=50),
    # Append further syllables here as needed
]

In [None]:
def build_syllable_mappings(video_path, syllable_annotations):
    """
    Build syllable-to-hand-coordinate mappings by averaging the right-hand
    landmarks detected during each annotated frame range.
    Args:
        video_path: Path to the video file.
        syllable_annotations: List of syllable annotations; each is a dict
                            with keys "syllable", "start_frame" and
                            "end_frame" (the end frame is included).
    Returns:
        dict: Maps each syllable to a NumPy array holding one (x, y, z) row
              per right-hand landmark, averaged over the frames in which a
              right hand was detected. Syllables with no detection in their
              frame range are left out. If a syllable appears more than
              once, the last annotation wins.
    """

    cap = cv2.VideoCapture(video_path)
    syllable_mappings = {}

    # try/finally guarantees the capture is released even if frame
    # processing raises part-way through.
    try:
        for annotation in syllable_annotations:
            syllable = annotation["syllable"]
            start_frame = annotation["start_frame"]
            end_frame = annotation["end_frame"]

            # Set the video to the start frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            # Extract hand coordinates for the syllable
            hand_coordinates = []
            for _ in range(start_frame, end_frame + 1):
                ret, frame = cap.read()
                if not ret:
                    # Ran out of readable frames before the annotated end
                    break

                right_hand = extract_landmarks(frame)["right_hand"]
                if right_hand is None:
                    continue

                # The landmark result is a protobuf message, not numeric
                # data; flatten it to plain (x, y, z) rows so the frames
                # can be averaged numerically.
                hand_coordinates.append(
                    [[point.x, point.y, point.z] for point in right_hand.landmark]
                )

            # Map the syllable to the per-landmark mean over its frames
            if hand_coordinates:
                syllable_mappings[syllable] = np.mean(hand_coordinates, axis=0)
    finally:
        cap.release()

    return syllable_mappings

def render_gestures(video)