In [31]:
import whisper
from noisereduce import reduce_noise
from pydub import AudioSegment
from moviepy.editor import VideoFileClip

# Extract audio from video
video_path = "/scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/mp4/csf002/csf002.mp4"
audio_path = "temp_audio.wav"

video = VideoFileClip(video_path)
video.audio.write_audiofile(audio_path, codec='pcm_s16le')

# Load and denoise audio
audio = AudioSegment.from_wav(audio_path)
audio_array = audio.get_array_of_samples()
rate = audio.frame_rate

# Reduce noise
denoised_audio = reduce_noise(y=audio_array, sr=rate, stationary=True)
denoised_audio = AudioSegment(
    denoised_audio.tobytes(), 
    frame_rate=rate,
    sample_width=audio.sample_width, 
    channels=audio.channels
)
denoised_audio.export("denoised_audio.wav", format="wav")

# Transcribe with Whisper (try larger models like 'large-v3')
model = whisper.load_model("large-v3")
result = model.transcribe("denoised_audio.wav", fp16=False)  # Disable FP16 if no GPU
print(result["text"])

MoviePy - Writing audio in temp_audio.wav


                                                        

MoviePy - Done.




 Il se garantira une fois avec ce bon capuchon.


In [9]:
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable
from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

def create_custom_styles():
    styles = getSampleStyleSheet()
    
    # Style pour les titres
    styles.add(ParagraphStyle(
        name='CustomTitle',
        parent=styles['Heading1'],
        fontSize=12,
        spaceAfter=12,
        spaceBefore=12,
        textColor=colors.HexColor('#2C3E50'),
        leading=16
    ))
    
    # Style pour le texte normal
    styles.add(ParagraphStyle(
        name='CustomBody',
        parent=styles['Normal'],
        fontSize=10,
        leading=14,
        alignment=TA_JUSTIFY,
        spaceAfter=8
    ))
    
    # Style pour les puces
    styles.add(ParagraphStyle(
        name='CustomBullet',
        parent=styles['Normal'],
        fontSize=10,
        leading=14,
        leftIndent=20,
        spaceAfter=8
    ))
    
    return styles

def format_content(text):
    # Remplacer les tabulations par des espaces
    text = text.replace('\t', '    ')
    # Gérer les emojis spéciaux
    text = text.replace('✅', '•')
    text = text.replace('📌', '•')
    return text

def create_pdf(output_filename):
    doc = SimpleDocTemplate(
        output_filename,
        pagesize=A4,
        rightMargin=50,
        leftMargin=50,
        topMargin=50,
        bottomMargin=50
    )
    
    styles = create_custom_styles()
    story = []
    
    # Titre principal
    story.append(Paragraph("Dossier de Présentation", styles['CustomTitle']))
    story.append(Spacer(1, 20))
    
    # Lecture et formatage du contenu
    content_lines = [
        "1 - Parlez-moi de vous",
        "2 - Projet professionnel",
        "3 - Quel est votre projet d'études?",
        "4 - Pourquoi voulez-vous venir étudier en France et pas dans un autre pays ?",
        "5 - Pourquoi cette formation et pas une autre ?",
        "6 - Avez-vous une idée des cours que vous suivrez à l'université avec cette formation ?",
        "7 - Quels sont les débouchés de cette formation ?",
        "8 - Comment vous prendriez-vous pour réussir cette formation ?",
        "9 - Que savez-vous de la France en général ?",
        "10 - Parlez-moi de ce que vous savez des villes de vos universités ?",
        "11 - Pourquoi cette formation et pas une autre ?",
        "12 - Débouchés de la formation",
        "13 - Qui financera vos études en France?",
        "14 - Avez-vous de la famille en France / Que font vos frères en France?",
        "15 - Quel sera le cout de la vie en France pour vous?"
    ]

    current_section = ""
    current_content = []

    with open('paste.txt', 'r', encoding='utf-8') as file:
        lines = file.readlines()
        
    for line in lines:
        line = format_content(line.strip())
        if line.startswith(tuple(str(i) + " -" for i in range(1, 16))):
            # Si nous avons déjà une section en cours, ajoutons-la au document
            if current_section:
                story.append(Paragraph(current_section, styles['CustomTitle']))
                for content in current_content:
                    if content.strip():
                        story.append(Paragraph(content, styles['CustomBody']))
                story.append(Spacer(1, 12))
            
            # Commencer une nouvelle section
            current_section = line
            current_content = []
        else:
            if line.strip():
                current_content.append(line)
    
    # Ajouter la dernière section
    if current_section:
        story.append(Paragraph(current_section, styles['CustomTitle']))
        for content in current_content:
            if content.strip():
                story.append(Paragraph(content, styles['CustomBody']))
    
    # Construire le PDF
    doc.build(story)

if __name__ == "__main__":
    output_filename = "dossier_presentation.pdf"
    create_pdf(output_filename)
    print(f"Le PDF a été généré avec succès : {output_filename}")

Le PDF a été généré avec succès : dossier_presentation.pdf


In [4]:
pip install reportlab==3.6.11

Collecting reportlab==3.6.11
  Using cached reportlab-3.6.11-cp311-cp311-linux_x86_64.whl
Installing collected packages: reportlab
  Attempting uninstall: reportlab
    Found existing installation: reportlab 4.3.1
    Uninstalling reportlab-4.3.1:
      Successfully uninstalled reportlab-4.3.1
Successfully installed reportlab-3.6.11
Note: you may need to restart the kernel to use updated packages.


In [28]:
import nemo.collections.asr as nemo_asr
from moviepy.editor import VideoFileClip

# Extract audio
video_path = "/scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/mp4/csf124/csf124.mp4"
audio_path = "temp_audio.wav"

video = VideoFileClip(video_path)
video.audio.write_audiofile(audio_path, codec='pcm_s16le')

# Load QuartzNet (noise-robust model)
model = nemo_asr.models.EncDecCTCModel.from_pretrained("stt_en_quartznet15x5")

# Transcribe
transcription = model.transcribe([audio_path])
print(transcription[0])

ModuleNotFoundError: No module named 'nemo'

In [26]:
pip install nemo_toolkit[all]

Collecting nemo_toolkit[all]
  Downloading nemo_toolkit-2.1.0-py3-none-any.whl.metadata (70 kB)
Collecting onnx>=1.7.0 (from nemo_toolkit[all])
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting ruamel.yaml (from nemo_toolkit[all])
  Downloading ruamel.yaml-0.18.10-py3-none-any.whl.metadata (23 kB)
Collecting tensorboard (from nemo_toolkit[all])
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting text-unidecode (from nemo_toolkit[all])
  Using cached text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Collecting wget (from nemo_toolkit[all])
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting wrapt (from nemo_toolkit[all])
  Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Collecting black~=24.3 (from nemo_toolkit[all])
  Using cached black-24.10.0-cp311-c

In [17]:
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load pre-trained model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")

# Load and resample audio file to 16000 Hz
audio_input, sample_rate = librosa.load("/scratch2/bsow/Documents/ACSR/src/acsr/temp_audio.wav", sr=16000)

# Process the audio input
input_values = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_values

# Perform inference
with torch.no_grad():
    logits = model(input_values).logits

# Decode the predicted IDs to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

print("Transcription:", transcription)

OSError: Can't load tokenizer for 'facebook/wav2vec2-large-xlsr-53'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'facebook/wav2vec2-large-xlsr-53' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.

In [3]:
import os
import cv2
import whisper
import torch
import numpy as np
from moviepy.editor import VideoFileClip
from epitran.backoff import Backoff
import re
import mediapipe as mp

In [18]:
# Initialize MediaPipe Holistic
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic()

# Step 1: Extract audio from the video
def extract_audio(video_path, audio_path):
    """
    Extracts audio from a video file and saves it as a WAV file.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    video_clip = VideoFileClip(video_path)
    video_clip.audio.write_audiofile(audio_path, codec="pcm_s16le")
    print(f"Audio extracted and saved to: {audio_path}")

# Step 2: Convert text to IPA
def text_to_ipa(text, language="fra-Latn"):
    """
    Convert a text sentence into its IPA representation.
    Args:
        text (str): Input text.
        language (str): Language code for IPA conversion (e.g., "fra-Latn" for French).
    Returns:
        str: IPA representation of the text.
    """
    backoff = Backoff([language])
    ipa_text = backoff.transliterate(text)
    return ipa_text

# Step 3: Syllabify IPA text
# Define Cued Speech consonants (hand shapes) and vowels (mouth shapes)
consonants = "ptkbdgmnlrsfvzʃʒɡʁjwŋtrɥʀ"
vowels = "aeɛioɔuyøœəɑ̃ɛ̃ɔ̃œ̃ɑ̃ɔ̃ɑ̃ɔ̃"

# Regex pattern for syllabification
syllable_pattern = re.compile(
    f"[{consonants}]?[{vowels}]|[{consonants}]", re.IGNORECASE
)

def syllabify_word(word):
    """
    Syllabify a single word based on the allowed patterns: CV, V, C.
    """
    syllables = syllable_pattern.findall(word)
    return " ".join(syllables)

def syllabify_sentence(sentence):
    """
    Syllabify an entire sentence.
    """
    sentence = sentence.lower()
    sentence = text_to_ipa(sentence)
    words = sentence.split()
    syllabified_sentence = []
    for word in words:
        syllabified_sentence.append(syllabify_word(word))
    return " ".join(syllabified_sentence)

# Step 4: Transcribe the entire audio using Whisper
def transcribe_audio(audio_path, device="cuda"):
    """
    Transcribes the entire audio file using OpenAI's Whisper model.
    Args:
        audio_path (str): Path to the audio file.
        device (str): Device to use for inference ("cuda" for GPU or "cpu" for CPU).
    Returns:
        list: A list of tuples containing (start_time, end_time, text, ipa_text, syllabified_text).
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    # Check if the specified device is available
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available. Falling back to CPU.")
        device = "cpu"

    # Load the Whisper model on the specified device
    model = whisper.load_model("medium", device=device)  # Use "medium" or "large" for better accuracy

    # Transcribe the entire audio file
    result = model.transcribe(audio_path, language="fr")
    print("Audio transcription completed.")

    # Extract segments from the result
    segments = []
    for segment in result["segments"]:
        text = segment["text"]
        ipa_text = text_to_ipa(text)  # Convert text to IPA
        syllabified_text = syllabify_sentence(ipa_text)  # Syllabify IPA text
        segments.append((segment["start"], segment["end"], text, ipa_text, syllabified_text))
    
    return segments



In [23]:
# Step 5: Extract landmarks using MediaPipe
def extract_landmarks(frame):
    """
    Extract head and hand landmarks from a video frame using MediaPipe Holistic.
    Args:
        frame: Input video frame.
    Returns:
        dict: Landmarks for face, right hand, and left hand.
    """
    # Convert frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = holistic.process(frame_rgb)

    # Extract landmarks
    landmarks = {
        "face": results.face_landmarks,
        "right_hand": results.right_hand_landmarks,
        "left_hand": results.left_hand_landmarks,
    }
    return landmarks

# Step 6: Build syllable-to-gesture mappings
def build_syllable_mappings(video_path, segments):
    """
    Build syllable-to-gesture mappings by extracting hand coordinates during annotated frames.
    Args:
        video_path: Path to the video file.
        segments: List of tuples containing (start_time, end_time, text, ipa_text, syllabified_text).
    Returns:
        dict: Syllable-to-gesture mappings.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    fps = 33
    syllable_mappings = {}

    for segment in segments:
        start_time, end_time, text, ipa_text, syllabified_text = segment
        syllabified_text = text
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        # Set the video to the start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        # Extract hand coordinates for the syllable
        hand_coordinates = []
        for _ in range(start_frame, end_frame + 1):
            ret, frame = cap.read()
            if not ret:
                break

            landmarks = extract_landmarks(frame)
            if landmarks["right_hand"]:
                hand_coordinates.append(landmarks["right_hand"])

        # Map the syllable to the average hand coordinates
        if hand_coordinates:
            avg_hand_coordinates = np.mean(hand_coordinates, axis=0)
            syllable_mappings[syllabified_text] = avg_hand_coordinates

    cap.release()
    return syllable_mappings

# Step 7: Render gestures on the video
def render_gestures(video_path, syllable_mappings, output_video_path):
    """
    Render gestures on the video by overlaying hand positions on the head.
    Args:
        video_path: Path to the input video.
        syllable_mappings: Syllable-to-gesture mappings.
        output_video_path: Path to save the output video.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the output video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Extract landmarks
        landmarks = extract_landmarks(frame)

        # Overlay hand gestures based on syllable mappings
        for syllable, hand_coordinates in syllable_mappings.items():
            if landmarks["face"] and landmarks["right_hand"]:
                # Draw hand on the face
                cv2.circle(frame, (int(hand_coordinates.x * frame_width), int(hand_coordinates.y * frame_height)), 10, (0, 255, 0), -1)

        # Write the frame to the output video
        out.write(frame)

    cap.release()
    out.release()


In [31]:
# Main function
def main():
    # File paths
    video_path = "/scratch2/bsow/Documents/ACSR/data/training_videos/sent_01.mp4"  # Replace with your video file path
    audio_path = "/scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav"  # Temporary audio file
    output_video_path = "/scratch2/bsow/Documents/ACSR/data/transcriptions/output_video.mp4"  # Output video file

    # Step 1: Extract audio from the video
    extract_audio(video_path, audio_path)

    # Step 2: Transcribe the entire audio
    device = "cuda"  # Set to "cuda" for GPU or "cpu" for CPU
    segments = transcribe_audio(audio_path, device=device)

    # Step 3: Build syllable-to-gesture mappings
    syllable_mappings = build_syllable_mappings(video_path, segments)

    # Step 4: Render gestures on the video
    render_gestures(video_path, syllable_mappings, output_video_path)

    # Clean up temporary audio file
    os.remove(audio_path)
    print("Temporary audio file removed.")

if __name__ == "__main__":
    main()

MoviePy - Writing audio in /scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav


                                                                        

MoviePy - Done.
Audio extracted and saved to: /scratch2/bsow/Documents/ACSR/data/transcriptions/output_audio.wav




FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'

In [35]:
!module list

Currently Loaded Modulefiles:[m
 1) [100mglibc/2.34-dtaq[0m   2) [100mgcc-runtime/11.4.1-hyx3[0m   3) miniconda3/24.3.0-ui7c  [m
[m
Key:[m
[100mauto-loaded[0m  [m
[K[?1l>

In [7]:
import os
import cv2
import mediapipe as mp
import json

# Initialize MediaPipe FaceMesh and Hands
mp_face_mesh = mp.solutions.face_mesh
mp_hands = mp.solutions.hands
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5)
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)

# Define input and output directories
input_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/images"
output_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/coordinates"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Process each image and extract hand landmarks, nose coordinates, and additional face information
for i in range(1, 9):  # Loop through handshape_1 to handshape_8
    image_path = os.path.join(input_dir, f"handshape_{i}.jpg")
    output_path = os.path.join(output_dir, f"handshape_{i}.json")

    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Could not load image {image_path}")
        continue

    # Convert the image to RGB
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Process the image with MediaPipe FaceMesh to get nose coordinates and face bounding box
    face_results = face_mesh.process(rgb_image)
    nose_landmarks = None
    face_bbox = None
    eye_distance = None

    if face_results.multi_face_landmarks:
        for face_landmarks in face_results.multi_face_landmarks:
            # Extract nose landmark (landmark 1)
            nose_landmarks = [
                face_landmarks.landmark[1].x,
                face_landmarks.landmark[1].y,
                face_landmarks.landmark[1].z,
            ]

            # save landmarks 227 (left), 454 (right), 10 (top), 159 (-)
            # Extract all x and y coordinates of the face landmarks
            x_coords = [landmark.x for landmark in face_landmarks.landmark]
            y_coords = [landmark.y for landmark in face_landmarks.landmark]

            # Calculate the bounding box of the face
            face_bbox = {
                "x_min": min(x_coords),
                "x_max": max(x_coords),
                "y_min": min(y_coords),
                "y_max": max(y_coords),
            }

            # Calculate the distance between the eyes (landmarks 33 and 263)
            left_eye = face_landmarks.landmark[33]
            right_eye = face_landmarks.landmark[263]
            eye_distance = ((left_eye.x - right_eye.x) ** 2 + (left_eye.y - right_eye.y) ** 2) ** 0.5
            break

    # Process the image with MediaPipe Hands to get hand landmarks
    hand_results = hands.process(rgb_image)
    hand_landmarks = None

    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            # Extract 3D coordinates of the hand landmarks
            landmarks = []
            for landmark in hand_landmarks.landmark:
                landmarks.append([landmark.x, landmark.y, landmark.z])
            hand_landmarks = landmarks
            break

    # Save the nose, hand landmarks, face bounding box, and eye distance to a JSON file
    if nose_landmarks and hand_landmarks and face_bbox and eye_distance:
        data = {
            "nose_landmarks": nose_landmarks,
            "hand_landmarks": hand_landmarks,
            "face_bbox": face_bbox,
            "eye_distance": eye_distance,
        }
        with open(output_path, "w") as f:
            json.dump(data, f)
        print(f"Saved landmarks and additional information for handshape_{i} to {output_path}")
    else:
        print(f"No face or hand detected in {image_path}")

Saved landmarks and additional information for handshape_1 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_1.json
Saved landmarks and additional information for handshape_2 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_2.json
Saved landmarks and additional information for handshape_3 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_3.json
Saved landmarks and additional information for handshape_4 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_4.json
Saved landmarks and additional information for handshape_5 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_5.json
Saved landmarks and additional information for handshape_6 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_6.json
Saved landmarks and additional information for handshape_7 to /scratch2/bsow/Documents/ACSR/data/handshapes/coordinates/handshape_7.json
Saved landmarks and additional informatio

In [10]:
import cv2
import mediapipe as mp
import numpy as np
import os

def process_all_videos(root_dir):
    # Path configuration
    mp4_base = os.path.join(root_dir, "mp4")
    output_base = os.path.join(root_dir, "lip_rois_mp4")
    
    # Create output directory if it doesn't exist
    os.makedirs(output_base, exist_ok=True)

    # Initialize MediaPipe FaceMesh
    mp_face_mesh = mp.solutions.face_mesh
    face_mesh = mp_face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=1,
        refine_landmarks=True,
        min_detection_confidence=0.5
    )

    # Define lip landmark indices
    LIP_LANDMARKS = [61, 78, 95, 88, 87, 14, 317, 402, 324, 308, 
                    0, 267, 269, 270, 409, 40, 37, 39, 40, 185, 
                    17, 314, 405, 321, 375, 291, 84, 181, 91, 146, 
                    80, 81, 82, 13, 312, 311, 319, 308]

    # Walk through directory structure
    for root, dirs, files in os.walk(mp4_base):
        for file in files:
            if file.endswith(".mp4"):
                # Construct paths
                input_path = os.path.join(root, file)
                relative_path = os.path.relpath(root, mp4_base)
                output_dir = os.path.join(output_base, relative_path, os.path.splitext(file)[0])
                
                # Create output directory
                os.makedirs(output_dir, exist_ok=True)
                
                # Process video
                print(f"Processing: {input_path}")
                process_video(input_path, output_dir, face_mesh, LIP_LANDMARKS)

    face_mesh.close()

def process_video(input_path, output_dir, face_mesh, lip_landmarks, padding=0.15):
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        print(f"Error opening video: {input_path}")
        return

    # Get video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_name = os.path.splitext(os.path.basename(input_path))[0]
    
    prev_bbox = None
    frame_number = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Process frame
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb_frame)
        
        if results.multi_face_landmarks:
            landmarks = results.multi_face_landmarks[0].landmark
            lip_points = np.array([(landmarks[i].x * width, landmarks[i].y * height) 
                                 for i in lip_landmarks], dtype=np.int32)

            # Calculate bounding box with padding
            min_x, min_y = np.min(lip_points, axis=0)
            max_x, max_y = np.max(lip_points, axis=0)
            
            # Add padding and clamp to image boundaries
            w = max_x - min_x
            h = max_y - min_y
            min_x = max(0, int(min_x - w * padding))
            min_y = max(0, int(min_y - h * padding))
            max_x = min(width, int(max_x + w * padding))
            max_y = min(height, int(max_y + h * padding))
            
            prev_bbox = (min_x, min_y, max_x, max_y)
        else:
            if prev_bbox is None:
                frame_number += 1
                continue
            min_x, min_y, max_x, max_y = prev_bbox

        # Crop and save image
        lip_roi = frame[min_y:max_y, min_x:max_x]
        if lip_roi.size > 0:
            output_filename = f"{video_name}_lips_{frame_number:04d}.png"
            output_path = os.path.join(output_dir, output_filename)
            cv2.imwrite(output_path, lip_roi)

        frame_number += 1

    cap.release()
    print(f"Saved {frame_number} frames to {output_dir}")

# Run the processing
root_directory = "/scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train"
process_all_videos(root_directory)

Processing: /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/mp4/csf001/csf001.mp4
Saved 293 frames to /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/lip_rois_mp4/csf001/csf001
Processing: /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/mp4/csf002/csf002.mp4
Saved 294 frames to /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/lip_rois_mp4/csf002/csf002
Processing: /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/mp4/csf007/csf007.mp4
Saved 296 frames to /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/lip_rois_mp4/csf007/csf007
Processing: /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/mp4/csf003/csf003.mp4
Saved 294 frames to /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/lip_rois_mp4/csf003/csf003
Processing: /scratch2/bsow/Documents/ACSR/data/training_videos/CSF22_train/mp4/csf004/csf004.mp4
Saved 296 frames to /scratch2/bsow/Documents/ACSR/data/training_videos/CSF2

In [1]:
import os
import cv2
import numpy as np
import pandas as pd

def get_yellow_pixel_coordinates(image_path):
    """
    Identify yellow pixels in the given image.
    Args:
        image_path (str): Path to the input image.
    Returns:
        np.ndarray: Array of yellow pixel coordinates.
    """
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    
    # Convert to HSV color space
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    
    # Define the range for yellow color in HSV
    lower_yellow = np.array([20, 100, 100])  # Lower bound for yellow
    upper_yellow = np.array([30, 255, 255])  # Upper bound for yellow
    
    # Create a mask for yellow pixels
    yellow_mask = cv2.inRange(hsv_image, lower_yellow, upper_yellow)
    
    # Find the coordinates of yellow pixels
    yellow_pixels = np.column_stack(np.where(yellow_mask > 0))
    
    return yellow_pixels

def save_yellow_pixels_to_csv(hand_images_dir, output_csv):
    """
    Process all hand images in the directory, identify one yellow pixel per image,
    and save the results to a CSV file.
    Args:
        hand_images_dir (str): Directory containing hand images.
        output_csv (str): Path to save the CSV file.
    """
    # List to store results
    results = []
    
    # Loop through all hand images in the directory
    for hand_image_name in os.listdir(hand_images_dir):
        if hand_image_name.endswith(".png"):  # Process only PNG files
            hand_image_path = os.path.join(hand_images_dir, hand_image_name)
            
            # Get yellow pixel coordinates
            yellow_pixels = get_yellow_pixel_coordinates(hand_image_path)
            
            if len(yellow_pixels) > 0:
                # Select the first yellow pixel as the representative
                representative_pixel = yellow_pixels[0]
                
                # Append the result (image name and pixel coordinates)
                results.append({
                    "image_name": hand_image_name,
                    "yellow_pixel_x": representative_pixel[1],  # Column index
                    "yellow_pixel_y": representative_pixel[0]   # Row index
                })
            else:
                print(f"No yellow pixels found in {hand_image_name}")
    
    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    
    # Save to CSV
    results_df.to_csv(output_csv, index=False)
    print(f"Yellow pixel coordinates saved to {output_csv}")

# Example usage
hand_images_dir = "/scratch2/bsow/Documents/ACSR/data/handshapes/hand_images"
output_csv = "/scratch2/bsow/Documents/ACSR/data/handshapes/yellow_pixels.csv"
save_yellow_pixels_to_csv(hand_images_dir, output_csv)

Yellow pixel coordinates saved to /scratch2/bsow/Documents/ACSR/data/handshapes/yellow_pixels.csv
