In [14]:
import cv2
import numpy as np
from skimage.transform import resize
import os

def extract_lip_roi(frame, prev_points, prev_gray):
    """
    Extract lip ROI using KLT feature tracker.
    :param frame: Current video frame.
    :param prev_points: Previous tracked points (for KLT).
    :param prev_gray: Previous frame in grayscale.
    :return: Lip ROI and updated tracked points.
    """
    # Parameters for KLT
    lk_params = dict(winSize=(15, 15), maxLevel=2, criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
    
    # Convert frame to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    
    # Calculate optical flow (KLT)
    if prev_points is not None and prev_gray is not None:
        # Ensure prev_points are within the bounds of the current frame
        h, w = gray.shape
        prev_points = np.clip(prev_points, [0, 0], [w - 1, h - 1]).astype(np.float32)
        
        new_points, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, gray, prev_points, None, **lk_params)
        
        # Reshape status to match the dimensions of new_points
        status = status.reshape(-1)
        
        # Filter points based on status
        good_new = new_points[status == 1]
        good_old = prev_points[status == 1]
        
        # Compute bounding box around tracked points
        if len(good_new) > 0:
            x, y, w, h = cv2.boundingRect(good_new)
            # Expand the bounding box by a larger margin (e.g., 100 pixels)
            margin = 100  # Increased margin for larger lip ROI
            x = max(0, x - margin)
            y = max(0, y - margin)
            w = min(frame.shape[1] - x, w + 2 * margin)
            h = min(frame.shape[0] - y, h + 2 * margin)
            lip_roi = frame[y:y+h, x:x+w]
            return lip_roi, good_new, gray
    
    # If no previous points or tracking fails, return None
    return None, None, gray

def extract_hand_roi(frame, fgbg):
    """
    Extract hand ROI using Adaptive Background Mixture Models.
    :param frame: Current video frame.
    :param fgbg: Background subtractor object (cv2.createBackgroundSubtractorMOG2).
    :return: Hand ROI.
    """
    # Apply background subtraction
    fgmask = fgbg.apply(frame)
    
    # Remove noise using morphological operations
    fgmask = cv2.morphologyEx(fgmask, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8))
    fgmask = cv2.morphologyEx(fgmask, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
    
    # Find contours in the foreground mask
    contours, _ = cv2.findContours(fgmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # Filter contours by area to ignore small contours
    min_contour_area = 5000  # Minimum area to consider as a hand
    large_contours = [c for c in contours if cv2.contourArea(c) > min_contour_area]
    
    # Find the largest contour among the filtered contours
    if large_contours:
        largest_contour = max(large_contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest_contour)
        # Expand the bounding box by a much larger margin (e.g., 200 pixels)
        margin = 200  # Increased margin for larger hand ROI
        x = max(0, x - margin)
        y = max(0, y - margin)
        w = min(frame.shape[1] - x, w + 2 * margin)
        h = min(frame.shape[0] - y, h + 2 * margin)
        hand_roi = frame[y:y+h, x:x+w]
        return hand_roi
    return None

def preprocess_roi(roi, target_size=(64, 64)):
    """
    Convert ROI to grayscale and resize to target size.
    :param roi: Input ROI.
    :param target_size: Target size (default: 64x64).
    :return: Preprocessed ROI.
    """
    if roi is None:
        return None
    
    # Convert to grayscale
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    
    # Resize using cubic interpolation
    resized = resize(gray, target_size, order=3, mode='reflect', anti_aliasing=True)
    return (resized * 255).astype(np.uint8)  # Scale back to 0-255 range

def preprocess_cued_speech_video(video_path, output_dir):
    """
    Preprocess a cued speech video to extract lip and hand ROIs and save them.
    :param video_path: Path to the input video.
    :param output_dir: Directory to save the ROIs.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    prev_points = None
    prev_gray = None
    
    # Create background subtractor object
    fgbg = cv2.createBackgroundSubtractorMOG2(history=500, varThreshold=16, detectShadows=True)
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Convert frame to grayscale
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        
        # Initialize KLT points in the first frame
        if frame_count == 0:
            # Define a region of interest (ROI) for the lips in the first frame
            h, w = frame.shape[:2]
            lip_region = frame[h//2 - 50:h//2 + 50, w//2 - 50:w//2 + 50]  # Adjust this region as needed
            lip_gray = cv2.cvtColor(lip_region, cv2.COLOR_BGR2GRAY)
            prev_points = cv2.goodFeaturesToTrack(lip_gray, maxCorners=100, qualityLevel=0.01, minDistance=10)
            if prev_points is not None:
                prev_points = prev_points.reshape(-1, 1, 2)
                # Adjust prev_points to the full frame coordinates
                prev_points[:, :, 0] += w//2 - 50
                prev_points[:, :, 1] += h//2 - 50
        
        # Extract lip ROI
        lip_roi, prev_points, prev_gray = extract_lip_roi(frame, prev_points, gray)
        
        # Extract hand ROI
        hand_roi = extract_hand_roi(frame, fgbg)
        
        # Preprocess ROIs
        lip_roi_processed = preprocess_roi(lip_roi)
        hand_roi_processed = preprocess_roi(hand_roi)
        
        # Save ROIs if they exist
        if lip_roi_processed is not None:
            lip_path = os.path.join(output_dir, f"lip_frame_{frame_count:04d}.png")
            cv2.imwrite(lip_path, lip_roi_processed)
        
        if hand_roi_processed is not None:
            hand_path = os.path.join(output_dir, f"hand_frame_{frame_count:04d}.png")
            cv2.imwrite(hand_path, hand_roi_processed)
        
        frame_count += 1
    
    cap.release()
    print(f"Saved {frame_count} frames to {output_dir}")

# Example usage
video_path = "/scratch2/bsow/Documents/ACSR/data/training_videos/videos/sent_01.mp4"
output_dir = "/scratch2/bsow/Documents/ACSR/output/rois"
preprocess_cued_speech_video(video_path, output_dir)

Saved 249 frames to /scratch2/bsow/Documents/ACSR/output/rois


In [3]:
import os
import cv2
import whisper
import torch
import numpy as np
from moviepy.editor import VideoFileClip
from epitran.backoff import Backoff
import re
import mediapipe as mp

In [18]:
# Initialize MediaPipe Holistic
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic()

# Step 1: Extract audio from the video
def extract_audio(video_path, audio_path):
    """
    Extracts audio from a video file and saves it as a WAV file.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    video_clip = VideoFileClip(video_path)
    video_clip.audio.write_audiofile(audio_path, codec="pcm_s16le")
    print(f"Audio extracted and saved to: {audio_path}")

# Step 2: Convert text to IPA
def text_to_ipa(text, language="fra-Latn"):
    """
    Convert a text sentence into its IPA representation.
    Args:
        text (str): Input text.
        language (str): Language code for IPA conversion (e.g., "fra-Latn" for French).
    Returns:
        str: IPA representation of the text.
    """
    backoff = Backoff([language])
    ipa_text = backoff.transliterate(text)
    return ipa_text

# Step 3: Syllabify IPA text
# Define Cued Speech consonants (hand shapes) and vowels (mouth shapes)
consonants = "ptkbdgmnlrsfvzʃʒɡʁjwŋtrɥʀ"
vowels = "aeɛioɔuyøœəɑ̃ɛ̃ɔ̃œ̃ɑ̃ɔ̃ɑ̃ɔ̃"

# Regex pattern for syllabification
syllable_pattern = re.compile(
    f"[{consonants}]?[{vowels}]|[{consonants}]", re.IGNORECASE
)

def syllabify_word(word):
    """
    Syllabify a single word based on the allowed patterns: CV, V, C.
    """
    syllables = syllable_pattern.findall(word)
    return " ".join(syllables)

def syllabify_sentence(sentence):
    """
    Syllabify an entire sentence.
    """
    sentence = sentence.lower()
    sentence = text_to_ipa(sentence)
    words = sentence.split()
    syllabified_sentence = []
    for word in words:
        syllabified_sentence.append(syllabify_word(word))
    return " ".join(syllabified_sentence)

# Step 4: Transcribe the entire audio using Whisper
def transcribe_audio(audio_path, device="cuda"):
    """
    Transcribes the entire audio file using OpenAI's Whisper model.
    Args:
        audio_path (str): Path to the audio file.
        device (str): Device to use for inference ("cuda" for GPU or "cpu" for CPU).
    Returns:
        list: A list of tuples containing (start_time, end_time, text, ipa_text, syllabified_text).
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    # Check if the specified device is available
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available. Falling back to CPU.")
        device = "cpu"

    # Load the Whisper model on the specified device
    model = whisper.load_model("medium", device=device)  # Use "medium" or "large" for better accuracy

    # Transcribe the entire audio file
    result = model.transcribe(audio_path, language="fr")
    print("Audio transcription completed.")

    # Extract segments from the result
    segments = []
    for segment in result["segments"]:
        text = segment["text"]
        ipa_text = text_to_ipa(text)  # Convert text to IPA
        syllabified_text = syllabify_sentence(ipa_text)  # Syllabify IPA text
        segments.append((segment["start"], segment["end"], text, ipa_text, syllabified_text))
    
    return segments



In [19]:
# Example: Manually annotate syllables for a sentence
syllable_annotations = [
    {"syllable": "y", "start_frame": 13, "end_frame": 20},
    {"syllable": "go", "start_frame": 24, "end_frame": 38},
    {"syllable": "vi", "start_frame": 42, "end_frame": 45},
    {"syllable": "vɛ", "start_frame": 47, "end_frame": 55},
    {"syllable": "dɑ̃", "start_frame": 60, "end_frame": 67},
    {"syllable": "y", "start_frame": 72, "end_frame": 80},
    {"syllable": "n", "start_frame": 82, "end_frame": 90}, 
    {"syllable": "ka", "start_frame": 93, "end_frame": 100},
    {"syllable": "ba", "start_frame": 101, "end_frame": 104},
    {"syllable": "n", "start_frame": 105, "end_frame": 124},
    {"syllable": "ki", "start_frame": 128, "end_frame": 134},
    {"syllable": "la", "start_frame": 138, "end_frame": 140},
    {"syllable": "vɛ", "start_frame": 146, "end_frame": 152},
    {"syllable": "lu", "start_frame": 154, "end_frame": 160},
    {"syllable": "mɛ", "start_frame": 164, "end_frame": 172},
    {"syllable": "m", "start_frame": 174, "end_frame": 182},
    {"syllable": "kɔ̃", "start_frame": 193, "end_frame": 202},
    {"syllable": "s", "start_frame": 207, "end_frame": 211},
    {"syllable": "t", "start_frame": 212, "end_frame": 217},
    {"syllable": "ru", "start_frame": 218, "end_frame": 222},
    {"syllable": "t", "start_frame": 224, "end_frame": 227}
]
text = " ".join([s["syllable"] for s in syllable_annotations])
print(text)

y go vi vɛ dɑ̃ y n ka ba n ki la vɛ lu mɛ m kɔ̃ s t ru t


In [23]:
# Step 5: Extract landmarks using MediaPipe
def extract_landmarks(frame):
    """
    Extract head and hand landmarks from a video frame using MediaPipe Holistic.
    Args:
        frame: Input video frame.
    Returns:
        dict: Landmarks for face, right hand, and left hand.
    """
    # Convert frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = holistic.process(frame_rgb)

    # Extract landmarks
    landmarks = {
        "face": results.face_landmarks,
        "right_hand": results.right_hand_landmarks,
        "left_hand": results.left_hand_landmarks,
    }
    return landmarks

# Step 6: Build syllable-to-gesture mappings
def build_syllable_mappings(video_path, segments):
    """
    Build syllable-to-gesture mappings by extracting hand coordinates during annotated frames.
    Args:
        video_path: Path to the video file.
        segments: List of tuples containing (start_time, end_time, text, ipa_text, syllabified_text).
    Returns:
        dict: Syllable-to-gesture mappings.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    fps = 33
    syllable_mappings = {}

    for segment in segments:
        start_time, end_time, text, ipa_text, syllabified_text = segment
        syllabified_text = text
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        # Set the video to the start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        # Extract hand coordinates for the syllable
        hand_coordinates = []
        for _ in range(start_frame, end_frame + 1):
            ret, frame = cap.read()
            if not ret:
                break

            landmarks = extract_landmarks(frame)
            if landmarks["right_hand"]:
                hand_coordinates.append(landmarks["right_hand"])

        # Map the syllable to the average hand coordinates
        if hand_coordinates:
            avg_hand_coordinates = np.mean(hand_coordinates, axis=0)
            syllable_mappings[syllabified_text] = avg_hand_coordinates

    cap.release()
    return syllable_mappings

# Step 7: Render gestures on the video
def render_gestures(video_path, syllable_mappings, output_video_path):
    """
    Render gestures on the video by overlaying hand positions on the head.
    Args:
        video_path: Path to the input video.
        syllable_mappings: Syllable-to-gesture mappings.
        output_video_path: Path to save the output video.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the output video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Extract landmarks
        landmarks = extract_landmarks(frame)

        # Overlay hand gestures based on syllable mappings
        for syllable, hand_coordinates in syllable_mappings.items():
            if landmarks["face"] and landmarks["right_hand"]:
                # Draw hand on the face
                cv2.circle(frame, (int(hand_coordinates.x * frame_width), int(hand_coordinates.y * frame_height)), 10, (0, 255, 0), -1)

        # Write the frame to the output video
        out.write(frame)

    cap.release()
    out.release()


In [31]:
# Main function
def main():
    # File paths
    video_path = "/scratch2/bsow/Documents/ACSR/data/training_videos/sent_01.mp4"  # Replace with your video file path
    audio_path = "/scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav"  # Temporary audio file
    output_video_path = "/scratch2/bsow/Documents/ACSR/data/transcriptions/output_video.mp4"  # Output video file

    # Step 1: Extract audio from the video
    extract_audio(video_path, audio_path)

    # Step 2: Transcribe the entire audio
    device = "cuda"  # Set to "cuda" for GPU or "cpu" for CPU
    segments = transcribe_audio(audio_path, device=device)

    # Step 3: Build syllable-to-gesture mappings
    syllable_mappings = build_syllable_mappings(video_path, segments)

    # Step 4: Render gestures on the video
    render_gestures(video_path, syllable_mappings, output_video_path)

    # Clean up temporary audio file
    os.remove(audio_path)
    print("Temporary audio file removed.")

if __name__ == "__main__":
    main()

MoviePy - Writing audio in /scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav


                                                                        

MoviePy - Done.
Audio extracted and saved to: /scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav




FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'

In [35]:
!module list

Currently Loaded Modulefiles:[m
 1) [100mglibc/2.34-dtaq[0m   2) [100mgcc-runtime/11.4.1-hyx3[0m   3) miniconda3/24.3.0-ui7c  [m
[m
Key:[m
[100mauto-loaded[0m  [m
[K[?1l>

In [7]:
import os
import cv2
import mediapipe as mp
import json

# Initialize MediaPipe FaceMesh and Hands
mp_face_mesh = mp.solutions.face_mesh
mp_hands = mp.solutions.hands
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5)
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)

# Define input and output directories
input_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/images"
output_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/coordinates"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Process each image and extract hand landmarks, nose coordinates, and additional face information
for i in range(1, 9):  # Loop through handshape_1 to handshape_8
    image_path = os.path.join(input_dir, f"handshape_{i}.jpg")
    output_path = os.path.join(output_dir, f"handshape_{i}.json")

    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Could not load image {image_path}")
        continue

    # Convert the image to RGB
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Process the image with MediaPipe FaceMesh to get nose coordinates and face bounding box
    face_results = face_mesh.process(rgb_image)
    nose_landmarks = None
    face_bbox = None
    eye_distance = None

    if face_results.multi_face_landmarks:
        for face_landmarks in face_results.multi_face_landmarks:
            # Extract nose landmark (landmark 1)
            nose_landmarks = [
                face_landmarks.landmark[1].x,
                face_landmarks.landmark[1].y,
                face_landmarks.landmark[1].z,
            ]

            # save landmarks 227 (left), 454 (right), 10 (top), 159 (-)
            # Extract all x and y coordinates of the face landmarks
            x_coords = [landmark.x for landmark in face_landmarks.landmark]
            y_coords = [landmark.y for landmark in face_landmarks.landmark]

            # Calculate the bounding box of the face
            face_bbox = {
                "x_min": min(x_coords),
                "x_max": max(x_coords),
                "y_min": min(y_coords),
                "y_max": max(y_coords),
            }

            # Calculate the distance between the eyes (landmarks 33 and 263)
            left_eye = face_landmarks.landmark[33]
            right_eye = face_landmarks.landmark[263]
            eye_distance = ((left_eye.x - right_eye.x) ** 2 + (left_eye.y - right_eye.y) ** 2) ** 0.5
            break

    # Process the image with MediaPipe Hands to get hand landmarks
    hand_results = hands.process(rgb_image)
    hand_landmarks = None

    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            # Extract 3D coordinates of the hand landmarks
            landmarks = []
            for landmark in hand_landmarks.landmark:
                landmarks.append([landmark.x, landmark.y, landmark.z])
            hand_landmarks = landmarks
            break

    # Save the nose, hand landmarks, face bounding box, and eye distance to a JSON file
    if nose_landmarks and hand_landmarks and face_bbox and eye_distance:
        data = {
            "nose_landmarks": nose_landmarks,
            "hand_landmarks": hand_landmarks,
            "face_bbox": face_bbox,
            "eye_distance": eye_distance,
        }
        with open(output_path, "w") as f:
            json.dump(data, f)
        print(f"Saved landmarks and additional information for handshape_{i} to {output_path}")
    else:
        print(f"No face or hand detected in {image_path}")

Saved landmarks and additional information for handshape_1 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_1.json
Saved landmarks and additional information for handshape_2 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_2.json
Saved landmarks and additional information for handshape_3 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_3.json
Saved landmarks and additional information for handshape_4 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_4.json
Saved landmarks and additional information for handshape_5 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_5.json
Saved landmarks and additional information for handshape_6 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_6.json
Saved landmarks and additional information for handshape_7 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_7.json
Saved landmarks and additional informatio

In [1]:
import os
import cv2
import json
import numpy as np

# Define input and output directories
input_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/images"
landmarks_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/coordinates"
output_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/rendered_handshapes"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Define connections between hand landmarks (based on MediaPipe hand connections)
HAND_CONNECTIONS = [
    (0, 1), (1, 2), (2, 3), (3, 4),  # Thumb
    (0, 5), (5, 6), (6, 7), (7, 8),  # Index finger
    (0, 9), (9, 10), (10, 11), (11, 12),  # Middle finger
    (0, 13), (13, 14), (14, 15), (15, 16),  # Ring finger
    (0, 17), (17, 18), (18, 19), (19, 20),  # Pinky
    (5, 9), (9, 13), (13, 17)  # Palm
]

# Process each image and draw landmarks on a black background
for i in range(1, 9):  # Loop through handshape_1 to handshape_8
    image_path = os.path.join(input_dir, f"handshape_{i}.jpg")
    landmarks_path = os.path.join(landmarks_dir, f"handshape_{i}.json")
    output_path = os.path.join(output_dir, f"handshape_{i}_rendered.jpg")

    # Load the image to get its dimensions
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Could not load image {image_path}")
        continue

    # Create a blank black image of the same size
    black_image = np.zeros_like(image)

    # Load the landmarks from the JSON file
    if not os.path.exists(landmarks_path):
        print(f"Error: Could not find landmarks file {landmarks_path}")
        continue

    with open(landmarks_path, "r") as f:
        landmarks = json.load(f)

    # Draw the landmarks on the black image
    for landmark in landmarks:
        x, y = int(landmark[0] * image.shape[1]), int(landmark[1] * image.shape[0])
        cv2.circle(black_image, (x, y), 5, (0, 255, 0), -1)  # Green circles for landmarks

    # Draw connections between landmarks
    for connection in HAND_CONNECTIONS:
        start_idx, end_idx = connection
        start_x, start_y = int(landmarks[start_idx][0] * image.shape[1]), int(landmarks[start_idx][1] * image.shape[0])
        end_x, end_y = int(landmarks[end_idx][0] * image.shape[1]), int(landmarks[end_idx][1] * image.shape[0])
        cv2.line(black_image, (start_x, start_y), (end_x, end_y), (255, 0, 0), 2)  # Blue lines for connections

    # Save the rendered image
    cv2.imwrite(output_path, black_image)
    print(f"Saved rendered handshape_{i} to {output_path}")

ValueError: invalid literal for int() with base 10: 'nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn

In [10]:
# Define the mapping of consonants to hand shapes
consonant_to_handshape = {
    "p": 1, "t": 5, "k": 2, "b": 4, "d": 1, "g": 7, "m": 5, "n": 4,
    "l": 6, "r": 3, "s": 3, "f": 5, "v": 2, "z": 2, "ʃ": 6, "ʒ": 1,
    "ɡ": 7, "ʁ": 3, "j": 8, "w": 6, "ŋ": 8, "ɥ": 4, "ʀ": 3, "y": 8, "c": 2
}

# Define vowel positions relative to the nose (right side of the face/body)
vowel_positions = {
    # Position 1: /a/, /o/, /œ/, /ə/
    "a": (0.15, 0.1, 0.0),   # Right side of the mouth
    "o": (0.15, 0.1, 0.0),   # Same as /a/
    "œ": (0.15, 0.1, 0.0),   # Same as /a/
    "ə": (0.15, 0.1, 0.0),   # Same as /a/

    # Position 2: /ɛ̃/, /ø/
    "ɛ̃": (0.2, 0.05, 0.0),   # Right cheek
    "ø": (0.2, 0.05, 0.0),   # Same as /ɛ̃/

    # Position 3: /i/, /ɔ̃/, /ɑ̃/
    "i": (0.1, 0.15, 0.0),   # Right corner of the mouth
    "ɔ̃": (0.1, 0.15, 0.0),   # Same as /i/
    "ɑ̃": (0.1, 0.15, 0.0),   # Same as /i/

    # Position 4: /u/, /ɛ/, /ɔ/
    "u": (0.0, 0.2, 0.0),    # Chin (below the mouth)
    "ɛ": (0.0, 0.2, 0.0),    # Same as /u/
    "ɔ": (0.0, 0.2, 0.0),    # Same as /u/

    # Position 5: /œ̃/, /y/, /e/
    "œ̃": (0.0, 0.3, 0.0),    # Throat (below the chin)
    "y": (0.0, 0.3, 0.0),    # Same as /œ̃/
    "e": (0.0, 0.3, 0.0),    # Same as /œ̃/
}

def map_syllable_to_cue(syllable):
    """
    Map a syllable to its corresponding hand shape and hand position.
    Args:
        syllable (str): Syllable in IPA format (e.g., "si", "ne", "ma").
    Returns:
        tuple: (hand_shape, hand_position)
    """
    # Define vowels and consonants
    vowels = set("aeɛioɔuøœəɑ̃ɛ̃ɔ̃œ̃y")
    consonants = set("ptkbdgmnlrsfvzʃʒɡʁjwŋtrɥgʀyc")

    # Check if the syllable is CV, C, or V
    if len(syllable) == 2:  # CV syllable
        consonant, vowel = syllable[0], syllable[1]
        if consonant in consonants and vowel in vowels:
            hand_shape = consonant_to_handshape.get(consonant, 1)  # Default to Hand Shape 1
            hand_position = vowel_positions.get(vowel, (0.15, 0.1, 0.0))  # Default to Position 1
            return hand_shape, hand_position

    elif len(syllable) == 1:  # Single letter (C or V)
        if syllable in consonants:  # Single consonant
            hand_shape = consonant_to_handshape.get(syllable, 1)  # Default to Hand Shape 1
            hand_position = vowel_positions["a"]  # Default to Position 1
            return hand_shape, hand_position
        elif syllable in vowels:  # Single vowel
            hand_shape = 5  # Default to Hand Shape 5
            hand_position = vowel_positions.get(syllable, (0.15, 0.1, 0.0))  # Default to Position 1
            return hand_shape, hand_position

    # Default fallback
    return 1, (0.15, 0.1, 0.0)  # Hand Shape 1, Position 1

# Example usage
syllables = ["si", "ne", "ma"]
for syllable in syllables:
    hand_shape, hand_position = map_syllable_to_cue(syllable)
    print(f"Syllable: {syllable} -> Hand Shape: {hand_shape}, Hand Position: {hand_position}")

Syllable: si -> Hand Shape: 3, Hand Position: (0.1, 0.15, 0.0)
Syllable: ne -> Hand Shape: 4, Hand Position: (0.0, 0.3, 0.0)
Syllable: ma -> Hand Shape: 5, Hand Position: (0.15, 0.1, 0.0)
