# In [None]:
import os
import csv
import re

def find_r2d2_wav_files(base_dir, output_csv):
    """Index the ``.wav`` files found under patient folders and write them to a CSV.

    Walks ``base_dir`` recursively. A file is indexed when its name ends with
    ``.wav`` (case-sensitive) and some component of its normalized path is
    exactly ``R2D2`` followed by five digits; the first such component is
    taken as the patient folder. Files with no such component are skipped.

    The CSV has a header and three fully quoted columns:

    * ``patientID`` -- the name of the patient folder,
    * ``filename``  -- the file's base name,
    * ``path``      -- the file's path relative to the patient folder.

    A one-line summary is printed once the CSV has been written.

    Args:
        base_dir: Directory to search. May be absolute or relative.
        output_csv: Path of the CSV file to create (overwritten if present).

    Returns:
        None.

    Raises:
        OSError: If ``output_csv`` cannot be opened for writing.
    """
    pattern = re.compile(r"R2D2\d{5}")
    rows = []

    for root, dirs, files in os.walk(base_dir):
        # os.walk yields entries in arbitrary order; sorting `dirs` in place
        # (honoured by the default top-down walk) and the file list keeps the
        # CSV row order stable from run to run.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".wav"):
                continue

            # Normalize and split the path into its components.
            parts = os.path.normpath(os.path.join(root, file)).split(os.sep)

            # Index of the first component that is a patient ID, if any.
            patient_index = next(
                (i for i, part in enumerate(parts) if pattern.fullmatch(part)),
                None,
            )
            if patient_index is None:
                continue  # No valid R2D2NNNNN folder in path

            # Build the relative path from the components after the patient
            # folder. Re-joining the leading components and calling relpath
            # is wrong for absolute paths: the empty first component (POSIX)
            # or bare drive letter (Windows) makes the join relative, so
            # relpath would resolve it against the current directory.
            # The file name itself can never match the pattern (it ends with
            # ".wav"), so this slice always has at least one element.
            rel_path = os.path.join(*parts[patient_index + 1:])

            rows.append({
                'patientID': parts[patient_index],
                'filename': file,
                'path': rel_path,
            })

    # newline='' lets the csv module control line endings; QUOTE_ALL keeps
    # paths containing commas or quotes unambiguous.
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['patientID', 'filename', 'path'], quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(rows)

    print(f"✅ CSV written to {output_csv} with {len(rows)} valid .wav files.")

# Example: index the training data set and write the result to a CSV in the
# current working directory.
find_r2d2_wav_files(
    base_dir="/Users/abelvillcaroque/data/Audium/UCSF_20250508/R2D2_Train_Data",
    output_csv="r2d2_audio_index.csv",
)