Section 1: File Path Collection

We collect all .wav file paths from the dataset folder (recursively) and infer their labels from filenames.
Then we export the list to data/audio_filepaths.csv for reproducible access in later steps.

In [None]:
import pandas as pd
from src.data_loader import get_audio_file_paths, export_file_list_to_csv

# Project-specific notebook setup; note that it runs only after the src import above.
# NOTE(review): if init_notebook() is what puts the project root on sys.path
# (or changes the working directory), the src import should come after it — confirm.
from utils_notebook_init import init_notebook
init_notebook()

In [None]:
# Root folder that is scanned for audio recordings.
data_dir = "data/raw"

# The helper returns two sequences, unpacked here as file paths and their labels.
files, labels = get_audio_file_paths(data_dir)

# Sanity preview: total count plus the first three entries of each sequence.
preview = slice(None, 3)
print(f"✅ Found {len(files)} audio files in {data_dir}")
print("Example file paths:", files[preview])
print("Example labels:", labels[preview])

In [None]:
# Destination of the exported file list; the verification cell below reads this path back.
output_csv = "data/audio_filepaths.csv"
# Hand the collected paths and labels to the project helper for export to output_csv.
export_file_list_to_csv(files, labels, output_csv)

In [None]:
# Read the export back from disk so the checks below reflect the file, not in-memory state.
df = pd.read_csv(output_csv)
print(f"✅ CSV successfully created at: {output_csv}")
display(df.head())

# Summarise the label column: number of distinct classes, then samples per class.
label_column = df["label"]
print(f"Unique labels found: {label_column.nunique()}")
print("Label distribution:", label_column.value_counts(), sep="\n")


Section 2: Data Splitting

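We’ll split the dataset into train, validation, and test sets using the file list from audio_filepaths.csv.
Each subset preserves the class proportions of the full dataset (stratified sampling). 20% of the samples are held out for testing, and 15% of the remainder is used for validation, giving roughly 68% train / 12% validation / 20% test.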
Output files:

data/train_split.csv
data/val_split.csv
data/test_split.csv

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Step 1: Read the exported file list back from disk
df = pd.read_csv("data/audio_filepaths.csv")

# Report the row count and show the first five rows as a quick sanity check.
n_samples = df.shape[0]
print(f"Total samples: {n_samples}")
print(df.head(5))

In [None]:
# Step 2: Hold out 20% of the samples as the test set.
# Stratifying on the label column keeps the class proportions of df in both parts;
# the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df, stratify=df["label"], test_size=0.2, random_state=42
)

In [None]:
# Step 3: Split train into train / val
# test_size=0.15 is taken from the 80% pool left after Step 2, so the final
# proportions are roughly 68% train / 12% val / 20% test.

# This cell overwrites train_df, so running it a second time would silently
# carve another 15% out of an already-reduced training set. Step 2 leaves
# train_df holding every row of df that is not in test_df; any other size
# means the pool has already been split.
if len(train_df) != len(df) - len(test_df):
    raise RuntimeError(
        "train_df no longer matches the Step 2 output; "
        "re-run Step 2 before running this cell again."
    )

# Stratify on the label column so train and val keep the pool's class proportions;
# the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.15,
    random_state=42,
    stratify=train_df["label"]
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

In [None]:
# Step 4: Write each split to its own CSV so later notebooks can reload them.
# The row index is left out of the files (index=False).
for split_path, split_frame in (
    ("data/train_split.csv", train_df),
    ("data/val_split.csv", val_df),
    ("data/test_split.csv", test_df),
):
    split_frame.to_csv(split_path, index=False)
print("✅ Saved data splits to CSV.")