### RAVDESS — Train/Val/Test Split

In [None]:
import os
from pathlib import Path
import pandas as pd
from IPython.display import display

In [None]:
# Mount Google Drive at /content/drive; BASE_DIR below points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder that is scanned for Actor_XX sub-folders (passed to scan_ravdess).
BASE_DIR = Path("/content/drive/MyDrive/PainRecognitionProject/data/RAVDESS/")
# The split CSVs are written next to the data (passed to save_splits).
OUTPUT_DIR = BASE_DIR
print(f"Base dir: {BASE_DIR}")

In [None]:
# Two-digit emotion codes (as strings) mapped to human-readable names.
# NOTE(review): presumably these correspond to the third dash-separated field
# of a RAVDESS filename -- confirm. The `label` column built by scan_ravdess
# stores the integer code, not these names, and this mapping is not referenced
# by the other cells visible here.
EMOTION_MAP = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

In [None]:
def parse_filename(fname):
    """
    Parse a RAVDESS filename, e.g.:
    03-01-06-01-02-01-12.mp4

    The name (without its extension) must consist of seven dash-separated
    fields:
    modality - vocal - emotion - intensity - statement - repetition - actor

    Args:
        fname: File name or path. Any directory part and the final extension
            (".mp4", ".wav", ...) are ignored.

    Returns: (emotion_id, actor_id) as ints

    Raises:
        ValueError: If the name does not have exactly seven fields, or the
            emotion/actor fields are not integers.
    """
    # Path.stem drops only the trailing extension; str.replace(".mp4", "")
    # would also delete ".mp4" occurring elsewhere in the name and would leave
    # any other extension glued to the actor field.
    parts = Path(fname).stem.split("-")
    if len(parts) != 7:
        raise ValueError(f"Invalid filename format: {fname}")

    # modality - vocal - emotion - intensity - statement - repetition - actor
    _, _, emotion_id, _, _, _, actor = parts
    try:
        return int(emotion_id), int(actor)
    except ValueError as err:
        # Report the offending file instead of a bare int() conversion error.
        raise ValueError(f"Invalid filename format: {fname}") from err

In [None]:
def scan_ravdess(base_dir: Path):
    """
    Walks through Actor_XX folders and collects video paths and emotion labels.

    Args:
        base_dir: Directory containing the Actor_XX sub-folders.

    Returns a DataFrame with columns: video_path, label, actor
        - video_path: path of the .mp4 file relative to base_dir
          ("Actor_XX/<file>").
        - label: integer emotion id parsed from the filename.
        - actor: integer actor id taken from the folder name.
        The columns are present even when no recordings are found.

    Raises:
        FileNotFoundError: If base_dir does not exist.
        ValueError: If an Actor_ folder name has a non-numeric suffix or a
            .mp4 filename cannot be parsed.
    """
    base_dir = Path(base_dir)
    if not base_dir.exists():
        raise FileNotFoundError(f"Base directory not found: {base_dir}")

    records = []
    for actor_folder in sorted(os.listdir(base_dir)):
        actor_path = base_dir / actor_folder
        # Skip anything that is not an actor directory (e.g. a stray file or
        # archive whose name happens to start with "Actor_").
        if not actor_folder.startswith("Actor_") or not actor_path.is_dir():
            continue

        actor_id = int(actor_folder.split("_")[1])

        # Sort the listing so the row order (and the saved CSVs) does not
        # depend on filesystem enumeration order.
        for fname in sorted(os.listdir(actor_path)):
            if fname.endswith(".mp4"):
                # The actor id encoded in the filename is ignored; the folder
                # name is treated as the source of truth.
                emotion_id, _ = parse_filename(fname)
                full_path = os.path.join(actor_folder, fname)
                records.append({"video_path": full_path, "label": emotion_id, "actor": actor_id})

    # Explicit columns keep the schema stable when nothing was found, so
    # downstream column lookups fail with a meaningful error instead of KeyError.
    df = pd.DataFrame(records, columns=["video_path", "label", "actor"])
    print(f"Found {len(df)} recordings.")
    return df

In [None]:
def split_by_actor(df, train_n=16, val_n=4, test_n=4):
    """
    Split dataset by actor ids into train/val/test sets.
    Default split uses 16 train, 4 val, 4 test actors.

    Actors are sorted by id; the first `train_n` go to train, the next `val_n`
    to val and the remaining `test_n` to test, so no actor appears in more
    than one split.

    Args:
        df: DataFrame with at least the columns video_path, label, actor.
        train_n: Number of actors assigned to the training split.
        val_n: Number of actors assigned to the validation split.
        test_n: Number of actors assigned to the test split.

    Returns:
        (df_train, df_val, df_test), each restricted to the columns
        video_path and label.

    Raises:
        ValueError: If a split size is negative or the number of distinct
            actors differs from train_n + val_n + test_n.
    """
    if min(train_n, val_n, test_n) < 0:
        raise ValueError("Split sizes must be non-negative.")

    # .tolist() yields plain Python ints so the printed lists stay readable.
    actors = sorted(df["actor"].unique().tolist())
    print(f"Actors in dataset: {actors}")

    # Raised explicitly (not asserted) so the check survives `python -O`.
    requested = train_n + val_n + test_n
    if len(actors) != requested:
        raise ValueError(
            "Actor count does not match requested split! "
            f"Found {len(actors)} actors, requested {requested}."
        )

    train_actors = actors[:train_n]
    val_actors = actors[train_n:train_n + val_n]
    test_actors = actors[train_n + val_n:]

    df_train = df[df["actor"].isin(train_actors)][["video_path", "label"]]
    df_val = df[df["actor"].isin(val_actors)][["video_path", "label"]]
    df_test = df[df["actor"].isin(test_actors)][["video_path", "label"]]

    print("---------------------------------")
    print(f"Train: {len(df_train)} clips (actors: {train_actors})")
    print(f"Val:   {len(df_val)} clips (actors: {val_actors})")
    print(f"Test:  {len(df_test)} clips (actors: {test_actors})")
    print("---------------------------------")

    return df_train, df_val, df_test

In [None]:
def save_splits(df_train, df_val, df_test, output_dir: Path):
    """
    Write the three splits as train.csv, val.csv and test.csv in output_dir.

    The directory (including missing parents) is created when needed, the
    frames are written without their index column, and the resulting file
    paths are printed once all three files exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Pair every frame with its destination so writing and reporting share
    # one definition of the file names.
    targets = [
        (df_train, output_dir / "train.csv"),
        (df_val, output_dir / "val.csv"),
        (df_test, output_dir / "test.csv"),
    ]

    for frame, destination in targets:
        frame.to_csv(destination, index=False)

    print("Saved:")
    for _, destination in targets:
        print(" -", destination)

In [None]:
# 1) Index every .mp4 found under the Actor_XX folders of BASE_DIR.
df_all = scan_ravdess(BASE_DIR)
print(f"Total recordings: {len(df_all)}; unique actors: {df_all['actor'].nunique()}")
display(df_all.head())

# 2) Actor-disjoint split: 16 train / 4 val / 4 test actors (24 in total).
df_train, df_val, df_test = split_by_actor(df_all, train_n=16, val_n=4, test_n=4)

# 3) Write train.csv / val.csv / test.csv into OUTPUT_DIR.
save_splits(df_train, df_val, df_test, OUTPUT_DIR)