In [6]:
!pip install natsort




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from pathlib import Path
from natsort import natsorted  # pip install natsort
import pandas as pd
import re

# Absolute, machine-specific dataset root. Edit this to point at your local
# copy of the dataset before running the notebook on another machine.
ROOT = Path(r"C:\Users\13523\Desktop\URochDataset_trimmed")

# Piece metadata is derived from the folder name alone (treated as authoritative).
# Pattern: "<digits><space/underscore/hyphen run><alphanumeric piece>_<instruments>",
# e.g. a name such as "01_Jupiter_vn_vc".
FOLDER_RE = re.compile(r"^(?P<idx>\d+)[ _-]+(?P<piece>[A-Za-z0-9]+)_(?P<instr>.+)$")


def parse_folder_meta(folder: Path):
    """Extract piece metadata from a piece folder's name.

    Args:
        folder: Path whose final component (``folder.name``) is parsed.

    Returns:
        dict with keys ``piece_index`` (index zero-filled to width 2),
        ``piece`` (title-cased piece name) and ``folder_instr`` (lower-cased
        instrument suffix). When the name does not match ``FOLDER_RE`` the
        raw folder name is returned as ``piece`` and the other two are None.
    """
    match = FOLDER_RE.match(folder.name)
    if match is None:
        return {"piece_index": None, "piece": folder.name, "folder_instr": None}

    index, piece, instruments = match.group("idx", "piece", "instr")
    return {
        "piece_index": index.zfill(2),
        "piece": piece.title(),
        "folder_instr": instruments.lower(),
    }

def list_clean_files(folder: Path):
    """Return the regular files directly inside ``folder`` in natural order.

    Entries whose name starts with "._" (macOS resource forks) are dropped.
    Ordering is natsort's natural order applied to the lower-cased file name.
    """
    kept = [
        entry
        for entry in folder.iterdir()
        if entry.is_file() and not entry.name.startswith("._")
    ]
    return natsorted(kept, key=lambda entry: entry.name.lower())

def block_indices(files):
    """Group positions in ``files`` by category using file-name prefixes only.

    Args:
        files: sequence of objects exposing a ``.name`` string (e.g. Paths).

    Returns:
        dict mapping each category ("AuMix", "AuSep", "F0s", "Notes",
        "Score", "Video") to the ascending list of indices into ``files``
        whose name starts with that category's prefix. Files matching no
        prefix are ignored; matching is case-sensitive.
    """
    # category -> file-name prefix (note Score/Video use abbreviated prefixes)
    prefixes = {
        "AuMix": "AuMix_",
        "AuSep": "AuSep_",
        "F0s": "F0s_",
        "Notes": "Notes_",
        "Score": "Sco_",
        "Video": "Vid_",
    }
    found = {category: [] for category in prefixes}
    for position, entry in enumerate(files):
        file_name = entry.name
        for category, prefix in prefixes.items():
            if file_name.startswith(prefix):
                found[category].append(position)
    return found

def index_piece_by_position(folder: Path):
    """Build manifest rows for one piece folder, assigning tracks by file order.

    Files are listed with ``list_clean_files`` and grouped with
    ``block_indices``. Per-track categories (AuSep, F0s, Notes) get a 1-based
    ``track`` number from their position within the category and no
    instrument; piece-level categories (AuMix, Score, Video) get
    ``track=None`` and the instrument string parsed from the folder name.

    Returns:
        list of dicts, emitted in the order AuMix, AuSep, F0s, Notes, Score,
        Video. Each dict holds the folder metadata plus ``category``,
        ``track``, ``instrument``, ``ext`` (lower-cased suffix), ``path``
        (resolved absolute path as str) and ``folder`` (folder name).

    Side effects:
        Prints a ``[WARN]`` line when AuSep files exist but the number of F0s
        or Notes files is neither 0 nor equal to the AuSep count.
    """
    meta = parse_folder_meta(folder)
    files = list_clean_files(folder)
    blocks = block_indices(files)

    def make_row(category, position, track, instrument):
        # One manifest record; key order matches the resulting CSV column order.
        entry = files[position]
        return {
            **meta,
            "category": category,
            "track": track,
            "instrument": instrument,
            "ext": entry.suffix.lower(),
            "path": str(entry.resolve()),
            "folder": folder.name,
        }

    # (category, per_track) in emission order.
    layout = (
        ("AuMix", False),
        ("AuSep", True),
        ("F0s", True),
        ("Notes", True),
        ("Score", False),
        ("Video", False),
    )

    rows = []
    for category, per_track in layout:
        for track, position in enumerate(blocks[category], start=1):
            if per_track:
                # Track number comes purely from index order; names are not parsed.
                rows.append(make_row(category, position, track, None))
            else:
                rows.append(make_row(category, position, None, meta["folder_instr"]))

    # Sanity check: the track count is inferred from the number of AuSep files.
    n_tracks = len(blocks["AuSep"])
    n_f0s = len(blocks["F0s"])
    n_notes = len(blocks["Notes"])
    counts_consistent = n_f0s in (0, n_tracks) and n_notes in (0, n_tracks)
    if n_tracks and not counts_consistent:
        print(f"[WARN] {folder.name}: tracks inferred={n_tracks} "
              f"but F0s={n_f0s}, Notes={n_notes}")

    return rows

def build_manifest(root: Path):
    """Index every piece folder directly under ``root`` into one DataFrame.

    Sub-directories are visited in natural order of their lower-cased names
    and each contributes the rows from ``index_piece_by_position``. A
    non-empty result is sorted by ``piece_index``, ``category`` and ``track``
    (missing values last) and given a fresh 0..n-1 index. An empty DataFrame
    is returned unchanged.
    """
    piece_dirs = natsorted(
        [entry for entry in root.iterdir() if entry.is_dir()],
        key=lambda entry: entry.name.lower(),
    )
    records = [
        row
        for piece_dir in piece_dirs
        for row in index_piece_by_position(piece_dir)
    ]
    manifest = pd.DataFrame(records)
    if manifest.empty:
        return manifest
    ordered = manifest.sort_values(["piece_index", "category", "track"], na_position="last")
    return ordered.reset_index(drop=True)

# Run
# Build the manifest for every piece folder under ROOT.
df = build_manifest(ROOT)
# The CSV is written into the dataset root itself. build_manifest only walks
# sub-directories of ROOT, so this file is not picked up on a re-run.
out_csv = ROOT / "_manifest_index_only.csv"
df.to_csv(out_csv, index=False)
print("[saved]", out_csv)
# Preview the first 20 manifest rows as plain text.
print(df.head(20))


[saved] C:\Users\13523\Desktop\URochDataset_trimmed\_manifest_index_only.csv
   piece_index    piece folder_instr category  track instrument   ext  \
0           01  Jupiter        vn_vc    AuMix    NaN      vn_vc  .wav   
1           01  Jupiter        vn_vc    AuSep    1.0       None  .wav   
2           01  Jupiter        vn_vc    AuSep    2.0       None  .wav   
3           01  Jupiter        vn_vc      F0s    1.0       None  .txt   
4           01  Jupiter        vn_vc      F0s    2.0       None  .txt   
5           01  Jupiter        vn_vc    Notes    1.0       None  .txt   
6           01  Jupiter        vn_vc    Notes    2.0       None  .txt   
7           01  Jupiter        vn_vc    Score    NaN      vn_vc  .mid   
8           01  Jupiter        vn_vc    Score    NaN      vn_vc  .pdf   
9           01  Jupiter        vn_vc    Video    NaN      vn_vc  .mp4   
10          02   Sonata        vn_vn    AuMix    NaN      vn_vn  .wav   
11          02   Sonata        vn_vn    AuSep  

In [None]:
#midi conversion
import subprocess
import tempfile

import numpy as np
import pretty_midi
import soundfile as sf

def render_midi_with_fluidsynth(midi_path, output_dir, soundfont_path, sr=32000, gain=0.5):
    """Render each instrument of a MIDI file with FluidSynth and write a mono mix.

    For every instrument a single-instrument MIDI is written to a temporary
    file, rendered to ``<midi stem>_inst<i>_<name>.wav`` in ``output_dir``,
    read back and down-mixed to mono. All stems are then summed (shorter ones
    zero-padded), peak-normalised and written to ``<midi stem>_mixture.wav``.

    Args:
        midi_path: path to the source MIDI file.
        output_dir: directory for the rendered WAVs (created if missing).
        soundfont_path: SoundFont file passed to the ``fluidsynth`` executable.
        sr: sample rate requested from FluidSynth and used for the mix file.
        gain: FluidSynth master gain (``-g``).

    Returns:
        ``(mix_path, stems)`` where ``stems`` maps stem name to a 1-D float32
        array. If two instruments produce the same name, the later one is
        stored as ``<name>_inst<i>`` so that no stem is dropped.

    Raises:
        ValueError: if the MIDI file contains no instruments.
        subprocess.CalledProcessError: if ``fluidsynth`` exits non-zero.
    """
    midi_path = Path(midi_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    midi = pretty_midi.PrettyMIDI(str(midi_path))
    if not midi.instruments:
        raise ValueError(f"{midi_path.name}: MIDI file contains no instruments")

    stems = {}
    for i, inst in enumerate(midi.instruments):
        single = pretty_midi.PrettyMIDI()
        single.instruments.append(inst)
        stem_name = (inst.name or f"program{inst.program}").replace(" ", "_").replace("/", "_")
        out_wav = output_dir / f"{midi_path.stem}_inst{i}_{stem_name}.wav"  # output name

        # Only the temp *path* is needed, so close the handle immediately and
        # make sure the file is removed even when rendering fails.
        tmp = tempfile.NamedTemporaryFile(suffix=".mid", delete=False)
        tmp.close()
        tmp_path = Path(tmp.name)
        try:
            single.write(str(tmp_path))
            # Options go before the positional soundfont/MIDI arguments, which
            # matches FluidSynth's documented usage and does not rely on the
            # option parser permuting arguments.
            subprocess.run([
                "fluidsynth", "-ni",
                "-F", str(out_wav), "-r", str(sr), "-g", str(gain),
                str(soundfont_path), str(tmp_path),
            ], check=True, capture_output=True)
        finally:
            tmp_path.unlink(missing_ok=True)

        audio, _ = sf.read(str(out_wav))
        audio = np.asarray(audio, dtype=np.float32)
        if audio.ndim > 1:
            # Rendered files may be multi-channel (frames, channels); the mix
            # below is 1-D, so average the channels.
            audio = audio.mean(axis=1)

        # Avoid silently overwriting a stem when two instruments share a name
        # (e.g. unnamed instruments with the same program number).
        key = stem_name if stem_name not in stems else f"{stem_name}_inst{i}"
        stems[key] = audio

    # mix
    max_len = max(len(x) for x in stems.values())
    mix = np.zeros(max_len, dtype=np.float32)
    for x in stems.values():
        mix[:len(x)] += x
    if mix.size:
        # Peak-normalise; the floor avoids dividing by zero on silent renders.
        mix = mix / max(1e-6, float(np.max(np.abs(mix))))
    mix_path = output_dir / f"{midi_path.stem}_mixture.wav"
    sf.write(str(mix_path), mix, sr)
    return mix_path, stems

def render_midi_with_pretty_midi(midi_path, output_dir, sr=32000):
    """Render each instrument of a MIDI file with pretty_midi's built-in synth.

    Each instrument is synthesised with ``Instrument.synthesize(fs=sr)`` and
    written to ``<midi stem>_inst<i>_<name>.wav`` in ``output_dir``. All stems
    are then summed (shorter ones zero-padded), peak-normalised and written
    to ``<midi stem>_mixture.wav``.

    Args:
        midi_path: path to the source MIDI file.
        output_dir: directory for the rendered WAVs (created if missing).
        sr: sample rate used for synthesis and for the written files.

    Returns:
        ``(mix_path, stems)`` where ``stems`` maps stem name to a float32
        array. If two instruments produce the same name, the later one is
        stored as ``<name>_inst<i>`` so that no stem is dropped.

    Raises:
        ValueError: if the MIDI file contains no instruments.
    """
    midi_path = Path(midi_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    midi = pretty_midi.PrettyMIDI(str(midi_path))
    if not midi.instruments:
        raise ValueError(f"{midi_path.name}: MIDI file contains no instruments")

    stems = {}
    longest = 0
    for i, inst in enumerate(midi.instruments):
        audio = inst.synthesize(fs=sr).astype(np.float32)
        # "/" is replaced as well so an instrument name can never introduce a
        # path separator into the output file name.
        name = (inst.name or f"program{inst.program}").replace(" ", "_").replace("/", "_")
        sf.write(str(output_dir / f"{midi_path.stem}_inst{i}_{name}.wav"), audio, sr)

        # Avoid silently overwriting a stem when two instruments share a name.
        key = name if name not in stems else f"{name}_inst{i}"
        stems[key] = audio
        longest = max(longest, len(audio))

    mix = np.zeros(longest, dtype=np.float32)
    for a in stems.values():
        mix[:len(a)] += a
    if mix.size:
        # Peak-normalise; the floor avoids dividing by zero on silent renders.
        mix = mix / max(1e-6, float(np.max(np.abs(mix))))
    mix_path = output_dir / f"{midi_path.stem}_mixture.wav"
    sf.write(str(mix_path), mix, sr)
    return mix_path, stems

def batch_render_midis(midi_dir, output_dir, method='fluidsynth', soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2', sr=32000):
    """Render every ``*.mid`` / ``*.midi`` file directly inside ``midi_dir``.

    Files are processed in sorted order. ``method == "fluidsynth"`` selects
    ``render_midi_with_fluidsynth``; any other value falls back to
    ``render_midi_with_pretty_midi``. This is a best-effort batch: a failure
    on one file is printed and the loop continues with the next file.
    Nothing is returned.
    """
    midi_dir = Path(midi_dir)
    midi_files = [*midi_dir.glob("*.mid"), *midi_dir.glob("*.midi")]
    print(f"Found {len(midi_files)} MIDI files")

    use_fluidsynth = method == "fluidsynth"
    for midi_file in sorted(midi_files):
        print(f"Rendering {midi_file.name}…")
        try:
            if use_fluidsynth:
                render_midi_with_fluidsynth(midi_file, output_dir, soundfont_path, sr=sr)
            else:
                render_midi_with_pretty_midi(midi_file, output_dir, sr=sr)
        except Exception as err:
            # Deliberately broad: one bad file must not abort the whole batch.
            print("  ✗ Failed:", err)

In [None]:
# audio processing
import librosa
import pyloudnorm as pyln
import torch, torchaudio
# Tiny positive constant available to this cell as a numerical-stability guard.
eps = 1e-10

def ensure_mono(wav: torch.Tensor) -> torch.Tensor:
    """Return ``wav`` with a single leading channel.

    A 1-D tensor gains a leading channel axis. A tensor whose first dimension
    is larger than 1 is averaged over that dimension (kept as size 1). Any
    other input is returned as-is (same object).
    """
    if wav.dim() == 1:
        return wav.unsqueeze(0)
    n_channels = wav.shape[0]
    return wav if n_channels <= 1 else wav.mean(dim=0, keepdim=True)

def loudness_normalize_lufs(wav: torch.Tensor, sr: int, target_lufs=-23.0) -> torch.Tensor:
    """Normalise a waveform to ``target_lufs`` integrated loudness and clip to [-1, 1].

    Args:
        wav: waveform tensor; assumed mono with shape (1, n), e.g. the output
            of ``ensure_mono`` — the leading axis is squeezed before metering.
        sr: sample rate in Hz, used to build the loudness meter.
        target_lufs: desired integrated loudness in LUFS.

    Returns:
        float32 tensor with a leading channel axis. The input samples are
        returned un-normalised (but still clipped) when loudness cannot be
        measured: pyloudnorm raised ``ValueError`` (e.g. clip too short for
        its gating block) or the measured loudness is not finite (e.g.
        digital silence), which would otherwise produce an infinite gain.
    """
    x = wav.squeeze(0).cpu().numpy().astype(np.float32)
    meter = pyln.Meter(sr)
    try:
        lufs = meter.integrated_loudness(x)
        if np.isfinite(lufs):
            y = pyln.normalize.loudness(x, lufs, target_lufs)
        else:
            y = x
    except ValueError:
        y = x
    # Hard-limit to the valid sample range and pin the dtype: the gain multiply
    # can promote to float64 depending on the NumPy version.
    y = np.clip(y, -1.0, 1.0).astype(np.float32)
    return torch.from_numpy(y).unsqueeze(0)

def preprocess_audio(audio_path, target_sr=32000, target_lufs=-23.0, trim_db=-40):
    """Load an audio file and apply the standard preprocessing chain.

    Steps, in order: load with torchaudio, resample to ``target_sr`` if the
    file's rate differs, down-mix to mono (``ensure_mono``), loudness-normalise
    to ``target_lufs`` (``loudness_normalize_lufs``) and trim leading/trailing
    silence below ``trim_db`` (``trim_silence``).

    Args:
        audio_path: path of the audio file to load.
        target_sr: output sample rate in Hz.
        target_lufs: loudness target passed to ``loudness_normalize_lufs``.
        trim_db: threshold passed to ``trim_silence`` as ``threshold_db``.

    Returns:
        ``(wav, sr)``: the processed waveform tensor and its sample rate.
    """
    wav, sr = torchaudio.load(str(audio_path))  # load audio + sampling rate
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)
        sr = target_sr
    wav = ensure_mono(wav)
    wav = loudness_normalize_lufs(wav, sr, target_lufs)
    wav = trim_silence(wav, sr, threshold_db=trim_db)
    return wav, sr

def trim_silence(wav: torch.Tensor, sr: int, threshold_db=-40, frame_length=2048, hop_length=512):
    """Trim leading and trailing low-energy regions from a waveform.

    Frame-wise RMS is converted to dB relative to the loudest frame (so the
    maximum is 0 dB). The span from the first to the last frame above
    ``threshold_db`` is kept, widened by a 5-frame margin on each side.

    Args:
        wav: waveform tensor; assumed mono with shape (1, n) — the leading
            axis is squeezed before analysis.
        sr: sample rate in Hz (not used by the computation; kept for a
            uniform signature).
        threshold_db: frames at or below this level (dB re. loudest frame)
            count as silence.
        frame_length: RMS analysis window in samples.
        hop_length: hop between RMS frames in samples.

    Returns:
        Trimmed float32 tensor with a leading channel axis, or ``wav``
        unchanged when it is empty or no frame exceeds the threshold.
    """
    x = wav.squeeze(0).cpu().numpy().astype(np.float32)
    if x.size == 0:  # nothing to analyse
        return wav
    rms = librosa.feature.rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
    # ref=np.max makes the scale relative to the loudest frame; librosa's own
    # `amin` floor already protects against log(0), so no epsilon is needed.
    rms_db = librosa.amplitude_to_db(rms, ref=np.max)
    voiced = np.flatnonzero(rms_db > threshold_db)
    if voiced.size == 0:  # return wav if it is all below the threshold
        return wav
    margin = 5  # frames of context kept on each side of the active region
    start = max(0, int(voiced[0]) - margin)
    end = min(len(rms), int(voiced[-1]) + margin)
    start_s = start * hop_length
    end_s = min(len(x), end * hop_length)
    y = x[start_s:end_s]
    return torch.from_numpy(y).unsqueeze(0)

def make_mel_transform(sr=32000, n_fft=2048, hop=512, n_mels=128, fmin=55, fmax=8000):
    """Build the mel-spectrogram and power-to-dB transforms.

    Args:
        sr: sample rate in Hz of the audio the transform will receive.
        n_fft: FFT size; also used as the window length.
        hop: hop length in samples.
        n_mels: number of mel bands.
        fmin: lowest filterbank frequency in Hz.
        fmax: highest filterbank frequency in Hz.

    Returns:
        ``(mel, to_db)``: a ``torchaudio.transforms.MelSpectrogram`` (power
        spectrogram, Hann window, HTK mel scale with Slaney normalisation)
        and a ``torchaudio.transforms.AmplitudeToDB`` for power input with an
        80 dB dynamic-range cap.
    """
    mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr, n_fft=n_fft, hop_length=hop, win_length=n_fft,
        n_mels=n_mels, f_min=fmin, f_max=fmax, window_fn=torch.hann_window,
        power=2.0, normalized=False, center=True, pad_mode="reflect",
        mel_scale="htk", norm="slaney"
    )
    to_db = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)
    return mel, to_db

def crop_or_pad_spec(spec: torch.Tensor, target_frames: int) -> torch.Tensor:
    """Center-crop or zero-pad a spectrogram along its last (time) axis.

    Args:
        spec: tensor whose last dimension is time, e.g. (freq, time); any
            number of leading dimensions is accepted.
        target_frames: desired size of the time axis.

    Returns:
        ``spec`` itself when it already has ``target_frames`` frames. A
        centred slice when it is longer. Otherwise a copy padded with zeros
        on both sides, with the extra frame going on the right when the
        padding is odd.
    """
    n_frames = spec.shape[-1]
    if n_frames == target_frames:
        return spec
    if n_frames > target_frames:
        start = (n_frames - target_frames) // 2
        return spec[..., start:start + target_frames]
    pad = target_frames - n_frames
    left = pad // 2
    right = pad - left
    # A 2-tuple pads only the last dimension; the default fill value is 0.
    return torch.nn.functional.pad(spec, (left, right))