## BioVid HeatPain — Train/Val/Test Split

In [None]:
import pandas as pd
from pathlib import Path
import os
from IPython.display import display
from typing import Optional, Dict, Any


In [None]:
# Colab-only cell: mount Google Drive at /content/drive so the dataset paths
# used below (which live under /content/drive/MyDrive) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Name of the master sample list and the dataset folder that contains it.
SOURCE_CSV_FILENAME = 'samples.csv'
BASE_DIR = Path('/content/drive/MyDrive/PainRecognitionProject/data/BioVid_HeatPain/')

# Full path of the master sample list inside the dataset folder.
SOURCE_CSV_PATH = BASE_DIR.joinpath(SOURCE_CSV_FILENAME)

print(f"Source CSV: {SOURCE_CSV_PATH}")

### Data division - Training / Validation / Test

The subject-level split below follows the hold-out evaluation proposal published by the creators of the database:
https://www.nit.ovgu.de/nit_media/Bilder/Dokumente/BIOVID_Dokumente/BioVid_HoldOutEval_Proposal.pdf

In [None]:
# Subjects held out from training (26 IDs). The order matters:
# perform_subject_split picks validation/test members by position.
VAL_TEST_SUBJECTS_IDS = [
    # The five IDs that perform_subject_split labels as 'Low' expression.
    '100914_m_39', '101114_w_37', '082315_w_60', '083114_w_55', '083109_m_60',
    # Remaining 21 IDs (labelled 'Normal' expression by perform_subject_split).
    '072514_m_27', '080309_m_29', '112016_m_25', '112310_m_20',
    '092813_w_24', '112809_w_23', '112909_w_20',
    '071313_m_41', '101309_m_48', '101609_m_36',
    '091809_w_43', '102214_w_36', '102316_w_50', '112009_w_43',
    '101814_m_58', '101908_m_61', '102309_m_61', '112209_m_51',
    '112610_w_60', '112914_w_51', '120514_w_56',
]

In [None]:
def perform_subject_split(df_all_clips: pd.DataFrame, val_test_subjects_ids: list) -> tuple:
    """
    Split clips into train/validation/test sets based on subject IDs.

    The subjects in ``val_test_subjects_ids`` are held out and divided between
    validation and test; every clip whose ``subject_name`` is not assigned to
    validation or test goes to train. With the 26-subject list used in this
    file the result is 13 validation and 13 test subjects.

    Assignment rule (deterministic for a given input order):
      * 'Low' expression subjects (five hard-coded IDs): the first man and the
        first woman, in input order, go to validation; the second man and the
        second and third women go to test.
      * 'Normal' expression subjects (all other held-out IDs): shuffled with a
        fixed seed (42); the first 11 go to validation, the rest to test.

    Args:
        df_all_clips: One row per clip; must contain a 'subject_name' column.
        val_test_subjects_ids: Held-out subject IDs. The second
            '_'-separated field of each ID is read as the gender ('m' / 'w').

    Returns:
        ``(df_train, df_val, df_test)`` - independent copies of the matching
        rows of ``df_all_clips``.

    Raises:
        KeyError: if ``df_all_clips`` has no 'subject_name' column.
        IndexError: if a held-out ID contains no '_' separator.
    """
    # IDs labelled 'Low' expression; a set gives O(1) membership tests and is
    # built once instead of once per held-out subject.
    low_expression_ids = {
        '100914_m_39', '101114_w_37', '082315_w_60', '083114_w_55', '083109_m_60',
    }

    df_held_out = pd.DataFrame([
        {
            'subject_name': subject_id,
            'gender': subject_id.split('_')[1],
            'expression': 'Low' if subject_id in low_expression_ids else 'Normal',
        }
        for subject_id in val_test_subjects_ids
    ])

    # Low expression (5): Val: 1 M, 1 F; Test: 1 M, 2 F
    is_low = df_held_out['expression'] == 'Low'
    low_m = df_held_out[is_low & (df_held_out['gender'] == 'm')]
    low_w = df_held_out[is_low & (df_held_out['gender'] == 'w')]
    val_low_ids = pd.concat([low_m.iloc[0:1], low_w.iloc[0:1]])['subject_name'].tolist()
    test_low_ids = pd.concat([low_m.iloc[1:2], low_w.iloc[1:3]])['subject_name'].tolist()

    # Normal expression (21): Val: 11; Test: 10. The fixed random_state keeps
    # the shuffle, and therefore the split, reproducible across runs.
    df_normal = (
        df_held_out[df_held_out['expression'] == 'Normal']
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    val_normal_ids = df_normal.iloc[:11]['subject_name'].tolist()
    test_normal_ids = df_normal.iloc[11:]['subject_name'].tolist()

    # Final subject ID lists
    val_ids = val_low_ids + val_normal_ids
    test_ids = test_low_ids + test_normal_ids
    held_out_ids = val_ids + test_ids

    subject_column = df_all_clips['subject_name']
    df_val = df_all_clips[subject_column.isin(val_ids)].copy()
    df_test = df_all_clips[subject_column.isin(test_ids)].copy()
    df_train = df_all_clips[~subject_column.isin(held_out_ids)].copy()

    print("\n--- Subject Split Verification ---")
    print(f"Train (clips): {len(df_train)} | Subjects: {df_train['subject_name'].nunique()}")
    print(f"Validation (clips): {len(df_val)} | Subjects: {df_val['subject_name'].nunique()}")
    print(f"Test (clips): {len(df_test)} | Subjects: {df_test['subject_name'].nunique()}")
    print(f"Total subjects: {df_all_clips['subject_name'].nunique()}")

    # Held-out subjects with no clips silently shrink val/test; report them so
    # an ID mismatch between the list and the data is noticed.
    missing_ids = sorted(set(held_out_ids) - set(subject_column))
    if missing_ids:
        print(f"WARNING: {len(missing_ids)} held-out subject(s) have no clips in the data: {missing_ids}")

    return df_train, df_val, df_test

In [None]:
# Load the tab-separated master sample list; re-raise a missing file with the
# resolved path in the message, otherwise report its size and preview it.
try:
    df_master = pd.read_csv(SOURCE_CSV_PATH, sep='\t')
except FileNotFoundError:
    raise FileNotFoundError(f"Source CSV not found: {SOURCE_CSV_PATH}")
else:
    print(f"Loaded {df_master.shape[0]} rows; unique subjects: {df_master['subject_name'].nunique()}")
    display(df_master.head(5))

In [None]:
# Relative clip path is "<subject_name>/<sample_name>.mp4".
df_master['video_path'] = (
    df_master['subject_name']
    .add('/')
    .add(df_master['sample_name'])
    .add('.mp4')
)
# The training label is taken directly from the class_id column.
df_master['label'] = df_master['class_id']

display(df_master.loc[:, ['video_path', 'label']].head(5))
# One print call; sep='\n' keeps the heading and the counts on separate lines.
print("Label counts:", df_master['label'].value_counts(), sep='\n')

In [None]:
# Run the subject-level split on the full clip table.
df_train, df_val, df_test = perform_subject_split(
    df_all_clips=df_master,
    val_test_subjects_ids=VAL_TEST_SUBJECTS_IDS,
)

# Summaries in train / val / test order.
print("Clips:", *(len(split) for split in (df_train, df_val, df_test)))
print(
    "Unique subjects:",
    *(split['subject_name'].nunique() for split in (df_train, df_val, df_test)),
)

# Preview the first rows of each split.
display(df_train.head(5))
display(df_val.head(5))
display(df_test.head(5))

In [None]:
# Only the clip path and its label are written to the split files.
output_columns = ['video_path', 'label']
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Write the splits in train / val / test order, without the index column.
split_files = (('train.csv', df_train), ('val.csv', df_val), ('test.csv', df_test))
for csv_name, split_df in split_files:
    split_df[output_columns].to_csv(BASE_DIR / csv_name, index=False)

print("Saved files:")
for csv_name, _ in split_files:
    print(BASE_DIR / csv_name)