
# Music Realism Scoring with WGAN-GP (Log-Mel, Codec Parity)

This notebook trains a **WGAN-GP on real music only** (using **log-mel spectrograms**) and then uses the **discriminator** as a **realism score** for new tracks (AI vs. real).  
It handles **codec parity** (WAV ↔ MP3 round-trip) to prevent shortcut learning, and fixes input shapes to **[1, 128, 256]** (channels, mels, frames).


In [1]:

# === Config (edit these paths & hyperparameters as you like) ===

REAL_WAV_DIR = "data/REAL_audio"   # folder with *.wav (real music)
AI_MP3_DIR   = "data/AI_audio"     # folder with *.mp3 (AI-generated music)

# Audio & feature params
SR            = 22050           # target sample rate
WIN_SECS      = 10.0            # window length in seconds (we'll resize time to FRAMES)
N_FFT         = 1024
HOP           = 512             # gives ~430 frames for 10s, then we interpolate to FRAMES below
N_MELS        = 128
FMIN, FMAX    = 20.0, 8000.0
FRAMES        = 256             # fixed time frames after interpolation (controls model size)
TARGET_LUFS   = -14.0           # fallback to peak norm if pyloudnorm isn't available
MP3_BITRATE   = "192k"          # codec parity target (requires ffmpeg, otherwise silently skipped)

# Training params
BATCH_SIZE    = 16
EPOCHS        = 5               # bump as needed
LR_G          = 2e-4
LR_D          = 2e-4
BETA1, BETA2  = 0.5, 0.9
Z_DIM         = 128             # latent dim
N_CRITIC      = 4               # WGAN-GP: D steps per G step
LAMBDA_GP     = 10.0
DEVICE        = "cuda"  # "cuda" if available, else "cpu" will be auto-detected below
SEED          = 1337

# Eval params
MAX_EVAL_FILES_PER_CLASS = 100  # limit for fast demos; set None for all


In [2]:

import os, sys, math, random, shutil, tempfile, subprocess, warnings, glob
from pathlib import Path
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchaudio
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

# Optional deps
try:
    import pyloudnorm as pyln
    HAVE_PYL = True
except Exception:
    HAVE_PYL = False

try:
    import soundfile as sf
    HAVE_SF = True
except Exception:
    HAVE_SF = False


# sklearn is optional for metrics
try:
    from sklearn.metrics import roc_auc_score, average_precision_score
    HAVE_SK = True
except Exception:
    HAVE_SK = False

# Check FFmpeg availability (for MP3 round-trip)
def _have_ffmpeg():
    try:
        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
        return True
    except Exception:
        return False

HAVE_FFMPEG = _have_ffmpeg()

# Device selection
if DEVICE == "cuda" and not torch.cuda.is_available():
    DEVICE = "cpu"
print(f"Device: {DEVICE} | pyloudnorm: {HAVE_PYL} | ffmpeg: {HAVE_FFMPEG} | sklearn: {HAVE_SK}")
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); 
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)


Device: cpu | pyloudnorm: True | ffmpeg: True | sklearn: True


In [15]:
import librosa

try:
    import soundfile as sf
    HAVE_SF = True
except Exception:
    HAVE_SF = False


In [16]:
from pathlib import Path

from pathlib import Path
import numpy as np
import torch

def load_audio(path, sr=SR):
    """
    Robust loader that avoids TorchCodec crashes:
    1) For WAV/AIFF/FLAC: try soundfile (libsndfile).
    2) For everything (incl. MP3/M4A): try librosa (audioread/ffmpeg).
    3) Last resort: torchaudio.load.
    Returns mono float tensor at target SR.
    """
    ext = Path(path).suffix.lower()

    # 1) Prefer soundfile for lossless containers
    if HAVE_SF and ext in [".wav", ".aiff", ".aif", ".flac", ".ogg"]:
        try:
            y, file_sr = sf.read(str(path), dtype="float32", always_2d=False)
            if y.ndim == 2:
                y = y.mean(axis=1)
            wav_t = torch.from_numpy(y)
            if file_sr != SR:
                wav_t = torchaudio.functional.resample(wav_t.unsqueeze(0), file_sr, SR).squeeze(0)
            return wav_t.contiguous()
        except Exception:
            pass

    # 2) Librosa for anything (mp3, m4a, wav, etc.)
    try:
        y, file_sr = librosa.load(str(path), sr=None, mono=True)
        y = y.astype(np.float32, copy=False)
        wav_t = torch.from_numpy(y)
        if file_sr != SR:
            wav_t = torchaudio.functional.resample(wav_t.unsqueeze(0), file_sr, SR).squeeze(0)
        return wav_t.contiguous()
    except Exception:
        pass

    # 3) Fallback to torchaudio (may require TorchCodec)
    wav, file_sr = torchaudio.load(str(path))  # [C, T]
    wav_t = wav.mean(dim=0)
    if file_sr != SR:
        wav_t = torchaudio.functional.resample(wav_t.unsqueeze(0), file_sr, SR).squeeze(0)
    return wav_t.contiguous()

def lufs_normalize(wav_t, sr=SR, target_lufs=TARGET_LUFS):
    if HAVE_PYL:
        y = wav_t.detach().cpu().numpy().astype(np.float32)
        meter = pyln.Meter(sr)
        try:
            loud = meter.integrated_loudness(y)
            gain_db = target_lufs - loud
            gain = 10 ** (gain_db / 20.0)
            y = np.clip(y * gain, -1.0, 1.0)
            return torch.from_numpy(y)
        except Exception:
            pass
    # Fallback: simple peak normalization
    peak = wav_t.abs().max().item()
    if peak > 0:
        wav_t = wav_t / peak
    return wav_t

def mp3_roundtrip(wav_t, sr=SR, bitrate=MP3_BITRATE):
    if not HAVE_FFMPEG:
        return wav_t
    try:
        with tempfile.TemporaryDirectory() as td:
            wav_path = os.path.join(td, "tmp.wav")
            mp3_path = os.path.join(td, "tmp.mp3")
            torchaudio.save(wav_path, wav_t.unsqueeze(0), sr)
            cmd = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
                   "-i", wav_path, "-b:a", bitrate, mp3_path]
            res = subprocess.run(cmd, check=False)
            if res.returncode != 0 or not os.path.exists(mp3_path):
                # fall back gracefully
                return wav_t
            wav2, sr2 = torchaudio.load(mp3_path)
            if sr2 != sr:
                wav2 = torchaudio.functional.resample(wav2, sr2, sr)
            return torch.mean(wav2, dim=0)
    except Exception:
        return wav_t

# Log-mel transforms
_mel = MelSpectrogram(
    sample_rate=SR, n_fft=N_FFT, hop_length=HOP,
    n_mels=N_MELS, f_min=FMIN, f_max=FMAX, center=True, power=2.0
)
_to_db = AmplitudeToDB(stype="power")

def to_logmel(wav_t):
    # wav_t: [T] float32/float64
    x = wav_t.unsqueeze(0)  # [1, T]
    mel = _mel(x)           # [1, mels, frames]
    mel_db = _to_db(mel)    # [1, mels, frames]
    # min-max scale to [-1, 1] per-sample
    m = mel_db.amin(dim=(1,2), keepdim=True)
    M = mel_db.amax(dim=(1,2), keepdim=True)
    mel_n = (mel_db - m) / (M - m + 1e-9)
    mel_n = mel_n * 2.0 - 1.0
    return mel_n   # [1, mels, frames] range ~ [-1,1]

def fix_frames(spec_1mT, frames=FRAMES):
    # spec_1mT: [1, mels, T]
    T = spec_1mT.shape[-1]
    if T == frames:
        return spec_1mT
    spec = F.interpolate(spec_1mT.unsqueeze(0), size=(N_MELS, frames), mode="bilinear", align_corners=False)
    return spec.squeeze(0)  # [1, mels, frames]

def preprocess_file(path, codec_parity=False):
    wav = load_audio(path, sr=SR)
    if codec_parity:
        wav = mp3_roundtrip(wav, sr=SR)
    wav = lufs_normalize(wav, sr=SR, target_lufs=TARGET_LUFS)
    # Extract a center window of ~WIN_SECS before mel (or pad if short)
    N_SAMPLES = int(SR * WIN_SECS)
    if wav.numel() < N_SAMPLES:
        wav = F.pad(wav, (0, N_SAMPLES - wav.numel()))
    else:
        start = (wav.numel() - N_SAMPLES) // 2
        wav = wav[start:start+N_SAMPLES]
    mel = to_logmel(wav)          # [1, 128, T']
    mel = fix_frames(mel, FRAMES) # [1, 128, 256]
    return mel


In [5]:

class RealMelDataset(Dataset):
    """Real-only dataset. Each item is a fresh random 10s window from a file.
    We do codec parity via MP3 round-trip so the discriminator can't cheat.
    """
    def __init__(self, files, codec_parity=True):
        self.files = files
        self.codec_parity = codec_parity

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        path = self.files[idx]
        # For variety, pick a random 10s segment BEFORE mel
        wav = load_audio(path, sr=SR)
        N_SAMPLES = int(SR * WIN_SECS)
        if wav.numel() < N_SAMPLES:
            wav = F.pad(wav, (0, N_SAMPLES - wav.numel()))
        else:
            max_start = wav.numel() - N_SAMPLES
            start = int(torch.randint(0, max_start + 1, (1,)).item())
            wav = wav[start:start+N_SAMPLES]
        if self.codec_parity:
            wav = mp3_roundtrip(wav, sr=SR)
        wav = lufs_normalize(wav, sr=SR, target_lufs=TARGET_LUFS)
        mel = to_logmel(wav)           # [1, 128, T']
        mel = fix_frames(mel, FRAMES)  # [1, 128, 256]
        return mel

def list_audio_files(root, exts):
    return sorted([p for ext in exts for p in Path(root).rglob(f"*{ext}")])

real_files = list_audio_files(REAL_WAV_DIR, [".wav", ".WAV"])
ai_files   = list_audio_files(AI_MP3_DIR,   [".mp3", ".MP3"])

print(f"Found real WAV files: {len(real_files)} | AI MP3 files: {len(ai_files)}")

def validate_real_files(files, sample_limit=None):
    ok, bad = [], []
    count = 0
    for p in files:
        try:
            # light check: load a few seconds + mel (no parity here to be fast)
            wav = load_audio(p, sr=SR)
            if wav.numel() < int(SR * 1.0):  # require at least 1s audio
                raise RuntimeError("too short")
            _ = to_logmel(wav[:int(SR * 2.0)])  # quick 2s mel test
            ok.append(p)
        except Exception as e:
            bad.append((p, str(e)))
        count += 1
        if sample_limit and count >= sample_limit:
            break
    if bad:
        print(f"Skipping {len(bad)} problematic files (showing first 10):")
        for p, msg in bad[:10]:
            print(" -", p, "->", msg)
    print(f"Validated: {len(ok)} usable files")
    return ok

# Rebuild file list after validation
real_files = validate_real_files(real_files)


# DataLoaders
# DataLoaders (SAFE SETTINGS for Windows/Jupyter)
if len(real_files) > 0:
    train_ds = RealMelDataset(real_files, codec_parity=True)
    train_loader = DataLoader(
        train_ds,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0,            # <= important
        pin_memory=False,         # ok to False on CPU / notebook
        persistent_workers=False, # <= important
        drop_last=True,           # avoid last tiny batch oddities
    )
else:
    train_ds, train_loader = None, None
    warnings.warn("No real WAV files found. Please populate REAL_WAV_DIR to train.")


Found real WAV files: 2000 | AI MP3 files: 512
Skipping 2 problematic files (showing first 10):
 - data\REAL_audio\jazz.00054.wav -> Error opening 'data\\REAL_audio\\jazz.00054.wav': Format not recognised.
 - data\REAL_audio\jazz.00054.wav -> Error opening 'data\\REAL_audio\\jazz.00054.wav': Format not recognised.
Validated: 1998 usable files


In [6]:
batch = next(iter(train_loader))
print(batch.shape, batch.dtype, batch.min().item(), batch.max().item())
# Expect ~ torch.Size([BATCH_SIZE, 1, 128, 256]) and values in [-1, 1]  

torch.Size([16, 1, 128, 256]) torch.float32 -0.9930947422981262 0.9921972751617432


In [7]:

def spectral_conv2d(in_ch, out_ch, k, s, p):
    return nn.utils.spectral_norm(nn.Conv2d(in_ch, out_ch, k, s, p))

class Discriminator(nn.Module):
    """PatchGAN-style: accepts [B,1,128,256] and outputs [B,1] scalar scores via global pooling."""
    def __init__(self, use_spectral_norm=True):
        super().__init__()
        Conv = spectral_conv2d if use_spectral_norm else nn.Conv2d
        chs = [1, 32, 64, 128, 256, 256]
        self.net = nn.Sequential(
            Conv(chs[0], chs[1],  (3,3), (2,2), (1,1)), nn.LeakyReLU(0.2, inplace=True),
            Conv(chs[1], chs[2],  (3,3), (2,2), (1,1)), nn.LeakyReLU(0.2, inplace=True),
            Conv(chs[2], chs[3],  (3,3), (2,2), (1,1)), nn.LeakyReLU(0.2, inplace=True),
            Conv(chs[3], chs[4],  (3,3), (2,2), (1,1)), nn.LeakyReLU(0.2, inplace=True),
            Conv(chs[4], chs[5],  (3,3), (2,2), (1,1)), nn.LeakyReLU(0.2, inplace=True),
        )
        self.head = nn.Linear(chs[5], 1)

    def forward(self, x):
        # x: [B,1,128,256]
        feat = self.net(x)                     # [B,C,h,w]
        feat = feat.mean(dim=(2,3))            # global average pool -> [B,C]
        out = self.head(feat)                  # [B,1]
        return out

class Generator(nn.Module):
    """DCGAN-ish generator to [1,128,256]."""
    def __init__(self, z_dim=Z_DIM, base=256):
        super().__init__()
        # We'll map z -> [C0, 8, 16] then upsample to [1,128,256]
        C0 = base
        self.fc = nn.Linear(z_dim, C0*8*16)
        def block(in_ch, out_ch):
            return nn.Sequential(
                nn.ConvTranspose2d(in_ch, out_ch, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(True),
            )
        self.up = nn.Sequential(
            block(C0, 128),
            block(128, 64),
            block(64, 32),
            block(32, 16),
            nn.ConvTranspose2d(16, 1, kernel_size=3, stride=1, padding=1),
            nn.Tanh()
        )

    def forward(self, z):
        x = self.fc(z).view(z.size(0), 256, 8, 16)  # [B,256,8,16]
        x = self.up(x)                              # [B,1,128,256] (approximately after upsamples)
        # If off by a few pixels due to stride math, interpolate to exact size
        x = F.interpolate(x, size=(N_MELS, FRAMES), mode="bilinear", align_corners=False)
        return x


In [8]:

def grad_penalty(D, real, fake, device):
    B = real.size(0)
    eps = torch.rand(B, 1, 1, 1, device=device)
    x_hat = eps * real + (1 - eps) * fake
    x_hat.requires_grad_(True)
    d_hat = D(x_hat)
    ones = torch.ones_like(d_hat, device=device)
    grads = torch.autograd.grad(
        outputs=d_hat, inputs=x_hat, grad_outputs=ones,
        create_graph=True, retain_graph=True, only_inputs=True
    )[0]
    gp = ((grads.view(B, -1).norm(2, dim=1) - 1.0) ** 2).mean()
    return gp

from tqdm.auto import tqdm

def train_wgan_gp(train_loader, epochs=EPOCHS, z_dim=Z_DIM, device=DEVICE):
    if train_loader is None:
        print("No training data. Populate REAL_WAV_DIR with .wav files and rerun.")
        return None, None

    D = Discriminator(use_spectral_norm=True).to(device)
    G = Generator(z_dim=z_dim).to(device)

    optD = torch.optim.Adam(D.parameters(), lr=LR_D, betas=(BETA1, BETA2))
    optG = torch.optim.Adam(G.parameters(), lr=LR_G, betas=(BETA1, BETA2))

    global_step = 0
    for epoch in range(1, epochs + 1):
        pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}", leave=True)
        for mel in pbar:
            mel = mel.to(device)
            B = mel.size(0)

            # ===== Train D (N_CRITIC steps) =====
            loss_D_val = 0.0
            for _ in range(N_CRITIC):
                z = torch.randn(B, z_dim, device=device)
                fake = G(z).detach()
                d_real = D(mel).mean()
                d_fake = D(fake).mean()
                gp = grad_penalty(D, mel, fake, device)
                loss_D = (d_fake - d_real) + LAMBDA_GP * gp

                optD.zero_grad(set_to_none=True)
                loss_D.backward()
                optD.step()
                loss_D_val = loss_D.item()

            # ===== Train G (1 step) =====
            z = torch.randn(B, z_dim, device=device)
            fake = G(z)
            loss_G = -D(fake).mean()
            optG.zero_grad(set_to_none=True)
            loss_G.backward()
            optG.step()

            # tqdm stats
            pbar.set_postfix({
                "D": f"{loss_D_val:.3f}",
                "G": f"{loss_G.item():.3f}",
                "d_real": f"{d_real.item():.3f}",
                "d_fake": f"{d_fake.item():.3f}"
            })
            global_step += 1

    return G, D
# Quick dry-run (skips if no data)
# G, D = train_wgan_gp(train_loader, epochs=1)
# torch.save(D.state_dict(), "D.pth"); torch.save(G.state_dict(), "G.pth")


  from .autonotebook import tqdm as notebook_tqdm


In [17]:

@torch.no_grad()
def score_windows_with_D(D, mels, device=DEVICE):
    D.eval()
    scores = []
    for i in range(0, len(mels), 16):
        batch = torch.stack(mels[i:i+16], dim=0).to(device)  # [B,1,128,256]
        s = D(batch).squeeze(1).detach().cpu().numpy()
        scores.extend(s.tolist())
    return float(np.mean(scores)), scores

def slice_track_to_mels(path, codec_parity_for_real=False, step_secs=10.0):
    """Return list of [1,128,256] mel windows for an entire track.
    - For real WAVs, pass codec_parity_for_real=True to round-trip MP3.
    - For AI MP3s, leave False (already MP3).
    """
    wav = load_audio(path, sr=SR)
    if codec_parity_for_real:
        wav = mp3_roundtrip(wav, sr=SR)
    wav = lufs_normalize(wav, sr=SR, target_lufs=TARGET_LUFS)

    N = int(SR * WIN_SECS)
    step = int(SR * step_secs)
    if wav.numel() < N:
        wav = F.pad(wav, (0, N - wav.numel()))

    mels = []
    for start in range(0, max(1, wav.numel() - N + 1), step):
        seg = wav[start:start+N]
        mel = to_logmel(seg)
        mel = fix_frames(mel, FRAMES)
        mels.append(mel)
    return mels  # list of [1,128,256]

def score_folder(D, folder, exts, real_folder=False, max_files=None):
    files = sorted([p for ext in exts for p in Path(folder).rglob(f"*{ext}")])
    if max_files is not None:
        files = files[:max_files]

    kept_files, kept_scores = [], []
    for p in files:
        try:
            codec_parity = real_folder  # True for real WAVs, False for AI MP3s
            mels = slice_track_to_mels(p, codec_parity_for_real=codec_parity, step_secs=WIN_SECS)
            if not mels:
                continue
            mean_s, _ = score_windows_with_D(D, mels)
            kept_files.append(p)
            kept_scores.append(mean_s)
        except Exception as e:
            # Skip problematic file; keep evaluation running
            print(f"[skip] {p} -> {e}")
            continue

    return kept_files, np.array(kept_scores, dtype=np.float32)


In [18]:

def evaluate_discriminator(D, real_dir=REAL_WAV_DIR, ai_dir=AI_MP3_DIR, max_files=MAX_EVAL_FILES_PER_CLASS):
    if not HAVE_SK:
        print("scikit-learn not found — skipping ROC-AUC/PR-AUC. You can 'pip install scikit-learn' and rerun.")
        return None

    real_files, real_scores = score_folder(D, real_dir, [".wav", ".WAV"], real_folder=True, max_files=max_files)
    ai_files,   ai_scores   = score_folder(D, ai_dir,   [".mp3", ".MP3"], real_folder=False, max_files=max_files)

    y_true = np.array([1]*len(real_scores) + [0]*len(ai_scores), dtype=np.int32)
    y_pred = np.concatenate([real_scores, ai_scores], axis=0)

    roc = roc_auc_score(y_true, y_pred)
    pr  = average_precision_score(y_true, y_pred)
    print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr:.4f}")
    return {
        "roc_auc": float(roc),
        "pr_auc": float(pr),
        "real_scores": real_scores,
        "ai_scores": ai_scores,
        "real_files": [str(p) for p in real_files],
        "ai_files": [str(p) for p in ai_files],
    }



## Quickstart

1. Put your data here (or change the config):
   - `data/real_wav/**.wav`
   - `data/ai_mp3/**.mp3`

2. Run training:


In [11]:

# Train the GAN on REAL only (WGAN-GP). Increase EPOCHS later.
G, D = train_wgan_gp(train_loader, epochs=EPOCHS)

# Save checkpoints
if D is not None and G is not None:
    torch.save(D.state_dict(), "D.pth")
    torch.save(G.state_dict(), "G.pth")
    print("Saved D.pth and G.pth")


Epoch 1/5: 100%|██████████| 124/124 [05:49<00:00,  2.82s/it, D=-10.361, G=81.013, d_real=-73.901, d_fake=-84.466]
Epoch 2/5: 100%|██████████| 124/124 [03:43<00:00,  1.80s/it, D=-9.126, G=10.613, d_real=-1.455, d_fake=-10.982]    
Epoch 3/5: 100%|██████████| 124/124 [03:16<00:00,  1.58s/it, D=-7.015, G=16.547, d_real=-2.427, d_fake=-9.511]     
Epoch 4/5: 100%|██████████| 124/124 [03:09<00:00,  1.53s/it, D=-11.593, G=-3.206, d_real=8.712, d_fake=-3.721]      
Epoch 5/5: 100%|██████████| 124/124 [03:25<00:00,  1.66s/it, D=-11.495, G=18.409, d_real=-11.392, d_fake=-22.922]  

Saved D.pth and G.pth






3. Score folders and compute metrics:


In [19]:

if 'D' in globals() and D is not None:
    _ = evaluate_discriminator(D, real_dir=REAL_WAV_DIR, ai_dir=AI_MP3_DIR, max_files=MAX_EVAL_FILES_PER_CLASS)
else:
    print("Train (or load) a Discriminator first.")


  from pkg_resources import resource_filename


ROC-AUC: 0.1468 | PR-AUC: 0.3437



### Load Discriminator later & score single files


In [20]:

def load_discriminator(path="D.pth", device=DEVICE):
    D = Discriminator(use_spectral_norm=True).to(device)
    sd = torch.load(path, map_location=device)
    D.load_state_dict(sd)
    D.eval()
    return D

# Example single-file scoring (edit the paths):
D = load_discriminator("D.pth")
mels = slice_track_to_mels("data/AI_audio/-0Gj8-vB1q4_1.mp3", codec_parity_for_real=False, step_secs=WIN_SECS)
mean_score, window_scores = score_windows_with_D(D, mels)
print("Mean realism score:", mean_score)


Mean realism score: -14.58976697921753



## Notes & Tips

- **Codec parity matters**: we round-trip real WAVs through MP3 (192 kbps) during training and when *scoring* real files.  
  If `ffmpeg` isn't available, the notebook will silently skip parity (you can install ffmpeg to enable it).

- **Loudness normalization**: LUFS if `pyloudnorm` is installed, else peak-normalization fallback.

- **Input shape fixed**: mel specs resized to `[1, 128, 256]`. You can increase `FRAMES` (e.g., 512) if your GPU has room.

- **Stability**: We use **WGAN-GP** and **spectral norm** on the discriminator.

- **Sanity check**: Before full training, try a few batches to ensure losses move and `d_real > d_fake` early on.

- **Evaluation**: The **mean discriminator score** per track is your realism score. With scikit-learn installed, we report **ROC-AUC** and **PR-AUC**.

- **Next steps**: try **PCEN** instead of dB, add light augmentations, or move to longer windows, then compare metrics.
