In [9]:
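# Prerequisite: run this in a terminal on your own machine, from the notebook's working directory,
# so that the clone ends up in ./midi-classical-music:
#   git clone https://huggingface.co/datasets/drengskapur/midi-classical-music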


In [4]:
from collections import Counter
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Hugging Face Hub dataset id; passed to load_dataset() further down in this cell
repo_id = "drengskapur/midi-classical-music"

# For adding composer name to dataset samples
def get_composer(sample):
    """Derive the composer name from a sample's file name.

    File names are assumed to be formatted as ``"data/composer-title.mid"``:
    the composer is the text before the first ``-`` in the path component
    that follows the first ``/``. A file name without any ``/`` is treated as
    that component directly instead of raising ``IndexError``.

    Args:
        sample: Mapping with a ``"file_name"`` string entry.

    Returns:
        A dict ``{"composer": <name>}``; this notebook passes the function to
        ``ds.map`` to add a ``"composer"`` field to every sample.
    """
    filename = sample["file_name"]
    parts = filename.split("/")
    # Keep the original rule (second path component) whenever a directory
    # prefix exists; only fall back when there is no "/" at all.
    name_part = parts[1] if len(parts) > 1 else parts[0]
    composer = name_part.split("-")[0]
    return {"composer": composer}

# Load the dataset and attach a composer name to every sample in one go
ds = load_dataset(repo_id).map(get_composer)
files = ds["train"]["file_name"]
composers = ds["train"]["composer"]

# Composers with fewer than 3 works are kept out of the stratified split:
# partition the sample indices into "rare" and "common" in a single pass.
counts = Counter(composers)
rare_idx, common_idx = [], []
for position, composer_name in enumerate(composers):
    if counts[composer_name] < 3:
        rare_idx.append(position)
    else:
        common_idx.append(position)

common_files = [files[k] for k in common_idx]
common_composers = [composers[k] for k in common_idx]

rare_files = [files[k] for k in rare_idx]
rare_composers = [composers[k] for k in rare_idx]

# Hold out 15% of the common-composer files, stratified by composer;
# the remaining 85% forms the core of the training set.
train_files, temp_files, train_composers, temp_composers = train_test_split(
    common_files,
    common_composers,
    test_size=0.15,
    stratify=common_composers,
    random_state=2025,
)

# Cut the held-out part in half (non-stratified): one half for validation,
# the other half for testing.
val_files, test_files, val_composers, test_composers = train_test_split(
    temp_files,
    temp_composers,
    test_size=0.5,
    random_state=2025,
)

# Rare composers only ever appear in the training set
train_files.extend(rare_files)
train_composers.extend(rare_composers)

print(f"Number of training samples: {len(train_files)}")
print(f"Number of validation samples: {len(val_files)}")
print(f"Number of testing samples: {len(test_files)}")

  from .autonotebook import tqdm as notebook_tqdm


Number of training samples: 4090
Number of validation samples: 353
Number of testing samples: 353


In [8]:
import os, shutil

# Path to the cloned dataset repo
repo_path = "midi-classical-music"

# Base output folder (your own data directory)
base_out = "data"   # change this if you want a different name

splits = {
    "train": train_files,
    "val": val_files,
    "test": test_files
}

# Fail early, before creating any output folder, if the dataset has not been
# cloned next to this notebook.
if not os.path.isdir(repo_path):
    raise FileNotFoundError(
        f"Dataset repo not found at '{repo_path}'; clone it first "
        "(git clone https://huggingface.co/datasets/drengskapur/midi-classical-music)"
    )

# Every file is copied under its basename only, so two different sources that
# share a basename would silently overwrite each other. Check all splits
# before copying anything.
planned = {}
for split_name, file_list in splits.items():
    source_by_name = {}
    for fname in file_list:
        name = os.path.basename(fname)
        first_source = source_by_name.setdefault(name, fname)
        if first_source != fname:
            raise ValueError(
                f"Cannot flatten split '{split_name}': '{first_source}' and "
                f"'{fname}' would both be copied to '{name}'"
            )
    planned[split_name] = source_by_name

for split_name, file_list in splits.items():
    out_dir = os.path.join(base_out, split_name)
    os.makedirs(out_dir, exist_ok=True)

    # Entries left over from an earlier run with a different split would mix
    # into this split (and may duplicate files now assigned elsewhere).
    leftovers = sorted(set(os.listdir(out_dir)) - set(planned[split_name]))
    if leftovers:
        print(
            f"WARNING: {out_dir} already contains {len(leftovers)} entries that are "
            f"not part of this split (e.g. {leftovers[0]}); remove them to avoid "
            "leakage between splits."
        )

    print(f"Copying {len(file_list)} files into {out_dir}...")

    for fname in file_list:
        src = os.path.join(repo_path, fname)
        dst = os.path.join(out_dir, os.path.basename(fname))
        shutil.copy(src, dst)

print("All files copied into train/val/test subfolders")


Copying 4090 files into data\train...
Copying 353 files into data\val...
Copying 353 files into data\test...
All files copied into train/val/test subfolders
