In [3]:
import os
import cv2
import whisper
import torch
import numpy as np
from moviepy.editor import VideoFileClip
from epitran.backoff import Backoff
import re
import mediapipe as mp

In [18]:
# Initialize MediaPipe Holistic
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic()

# Step 1: Extract audio from the video
def extract_audio(video_path, audio_path):
    """
    Extracts audio from a video file and saves it as a WAV file.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    video_clip = VideoFileClip(video_path)
    video_clip.audio.write_audiofile(audio_path, codec="pcm_s16le")
    print(f"Audio extracted and saved to: {audio_path}")

# Step 2: Convert text to IPA
def text_to_ipa(text, language="fra-Latn"):
    """
    Convert a text sentence into its IPA representation.
    Args:
        text (str): Input text.
        language (str): Language code for IPA conversion (e.g., "fra-Latn" for French).
    Returns:
        str: IPA representation of the text.
    """
    backoff = Backoff([language])
    ipa_text = backoff.transliterate(text)
    return ipa_text

# Step 3: Syllabify IPA text
# Define Cued Speech consonants (hand shapes) and vowels (mouth shapes)
consonants = "ptkbdgmnlrsfvzʃʒɡʁjwŋtrɥʀ"
vowels = "aeɛioɔuyøœəɑ̃ɛ̃ɔ̃œ̃ɑ̃ɔ̃ɑ̃ɔ̃"

# Regex pattern for syllabification
syllable_pattern = re.compile(
    f"[{consonants}]?[{vowels}]|[{consonants}]", re.IGNORECASE
)

def syllabify_word(word):
    """
    Syllabify a single word based on the allowed patterns: CV, V, C.
    """
    syllables = syllable_pattern.findall(word)
    return " ".join(syllables)

def syllabify_sentence(sentence):
    """
    Syllabify an entire sentence.
    """
    sentence = sentence.lower()
    sentence = text_to_ipa(sentence)
    words = sentence.split()
    syllabified_sentence = []
    for word in words:
        syllabified_sentence.append(syllabify_word(word))
    return " ".join(syllabified_sentence)

# Step 4: Transcribe the entire audio using Whisper
def transcribe_audio(audio_path, device="cuda"):
    """
    Transcribes the entire audio file using OpenAI's Whisper model.
    Args:
        audio_path (str): Path to the audio file.
        device (str): Device to use for inference ("cuda" for GPU or "cpu" for CPU).
    Returns:
        list: A list of tuples containing (start_time, end_time, text, ipa_text, syllabified_text).
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    # Check if the specified device is available
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available. Falling back to CPU.")
        device = "cpu"

    # Load the Whisper model on the specified device
    model = whisper.load_model("medium", device=device)  # Use "medium" or "large" for better accuracy

    # Transcribe the entire audio file
    result = model.transcribe(audio_path, language="fr")
    print("Audio transcription completed.")

    # Extract segments from the result
    segments = []
    for segment in result["segments"]:
        text = segment["text"]
        ipa_text = text_to_ipa(text)  # Convert text to IPA
        syllabified_text = syllabify_sentence(ipa_text)  # Syllabify IPA text
        segments.append((segment["start"], segment["end"], text, ipa_text, syllabified_text))
    
    return segments



In [19]:
# Example: Manually annotate syllables for a sentence
syllable_annotations = [
    {"syllable": "y", "start_frame": 13, "end_frame": 20},
    {"syllable": "go", "start_frame": 24, "end_frame": 38},
    {"syllable": "vi", "start_frame": 42, "end_frame": 45},
    {"syllable": "vɛ", "start_frame": 47, "end_frame": 55},
    {"syllable": "dɑ̃", "start_frame": 60, "end_frame": 67},
    {"syllable": "y", "start_frame": 72, "end_frame": 80},
    {"syllable": "n", "start_frame": 82, "end_frame": 90}, 
    {"syllable": "ka", "start_frame": 93, "end_frame": 100},
    {"syllable": "ba", "start_frame": 101, "end_frame": 104},
    {"syllable": "n", "start_frame": 105, "end_frame": 124},
    {"syllable": "ki", "start_frame": 128, "end_frame": 134},
    {"syllable": "la", "start_frame": 138, "end_frame": 140},
    {"syllable": "vɛ", "start_frame": 146, "end_frame": 152},
    {"syllable": "lu", "start_frame": 154, "end_frame": 160},
    {"syllable": "mɛ", "start_frame": 164, "end_frame": 172},
    {"syllable": "m", "start_frame": 174, "end_frame": 182},
    {"syllable": "kɔ̃", "start_frame": 193, "end_frame": 202},
    {"syllable": "s", "start_frame": 207, "end_frame": 211},
    {"syllable": "t", "start_frame": 212, "end_frame": 217},
    {"syllable": "ru", "start_frame": 218, "end_frame": 222},
    {"syllable": "t", "start_frame": 224, "end_frame": 227}
]
text = " ".join([s["syllable"] for s in syllable_annotations])
print(text)

y go vi vɛ dɑ̃ y n ka ba n ki la vɛ lu mɛ m kɔ̃ s t ru t


In [23]:
# Step 5: Extract landmarks using MediaPipe
def extract_landmarks(frame):
    """
    Extract head and hand landmarks from a video frame using MediaPipe Holistic.
    Args:
        frame: Input video frame.
    Returns:
        dict: Landmarks for face, right hand, and left hand.
    """
    # Convert frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = holistic.process(frame_rgb)

    # Extract landmarks
    landmarks = {
        "face": results.face_landmarks,
        "right_hand": results.right_hand_landmarks,
        "left_hand": results.left_hand_landmarks,
    }
    return landmarks

# Step 6: Build syllable-to-gesture mappings
def build_syllable_mappings(video_path, segments):
    """
    Build syllable-to-gesture mappings by extracting hand coordinates during annotated frames.
    Args:
        video_path: Path to the video file.
        segments: List of tuples containing (start_time, end_time, text, ipa_text, syllabified_text).
    Returns:
        dict: Syllable-to-gesture mappings.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    fps = 33
    syllable_mappings = {}

    for segment in segments:
        start_time, end_time, text, ipa_text, syllabified_text = segment
        syllabified_text = text
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        # Set the video to the start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        # Extract hand coordinates for the syllable
        hand_coordinates = []
        for _ in range(start_frame, end_frame + 1):
            ret, frame = cap.read()
            if not ret:
                break

            landmarks = extract_landmarks(frame)
            if landmarks["right_hand"]:
                hand_coordinates.append(landmarks["right_hand"])

        # Map the syllable to the average hand coordinates
        if hand_coordinates:
            avg_hand_coordinates = np.mean(hand_coordinates, axis=0)
            syllable_mappings[syllabified_text] = avg_hand_coordinates

    cap.release()
    return syllable_mappings

# Step 7: Render gestures on the video
def render_gestures(video_path, syllable_mappings, output_video_path):
    """
    Render gestures on the video by overlaying hand positions on the head.
    Args:
        video_path: Path to the input video.
        syllable_mappings: Syllable-to-gesture mappings.
        output_video_path: Path to save the output video.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the output video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Extract landmarks
        landmarks = extract_landmarks(frame)

        # Overlay hand gestures based on syllable mappings
        for syllable, hand_coordinates in syllable_mappings.items():
            if landmarks["face"] and landmarks["right_hand"]:
                # Draw hand on the face
                cv2.circle(frame, (int(hand_coordinates.x * frame_width), int(hand_coordinates.y * frame_height)), 10, (0, 255, 0), -1)

        # Write the frame to the output video
        out.write(frame)

    cap.release()
    out.release()


In [31]:
# Main function
def main():
    # File paths
    video_path = "/scratch2/bsow/Documents/ACSR/data/training_videos/sent_01.mp4"  # Replace with your video file path
    audio_path = "/scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav"  # Temporary audio file
    output_video_path = "/scratch2/bsow/Documents/ACSR/data/transcriptions/output_video.mp4"  # Output video file

    # Step 1: Extract audio from the video
    extract_audio(video_path, audio_path)

    # Step 2: Transcribe the entire audio
    device = "cuda"  # Set to "cuda" for GPU or "cpu" for CPU
    segments = transcribe_audio(audio_path, device=device)

    # Step 3: Build syllable-to-gesture mappings
    syllable_mappings = build_syllable_mappings(video_path, segments)

    # Step 4: Render gestures on the video
    render_gestures(video_path, syllable_mappings, output_video_path)

    # Clean up temporary audio file
    os.remove(audio_path)
    print("Temporary audio file removed.")

if __name__ == "__main__":
    main()

MoviePy - Writing audio in /scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav


                                                                        

MoviePy - Done.
Audio extracted and saved to: /scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav




FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'

In [35]:
!module list

Currently Loaded Modulefiles:[m
 1) [100mglibc/2.34-dtaq[0m   2) [100mgcc-runtime/11.4.1-hyx3[0m   3) miniconda3/24.3.0-ui7c  [m
[m
Key:[m
[100mauto-loaded[0m  [m
[K[?1l>

In [37]:
!which python

~/.cache/pypoetry/virtualenvs/acsr-jnt0UJEK-py3.11/bin/python
