In [None]:
import os, random, math, time, json
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchaudio

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Sample rates (Hz) to run; normalised to unique, ascending integers.
SR_LIST = [16000]
SR_LIST = sorted({int(rate) for rate in SR_LIST})
# The largest configured rate is used as the default sample rate.
SR = SR_LIST[-1]

# Keep a value set by an earlier cell; otherwise default to verbose output.
VERBOSE_CONFIG = globals().get("VERBOSE_CONFIG", True)

NOISE_COLOR = "white"   # which noise file to prefer ("white" or "pink")
N_MFCC = 25             # number of MFCC coefficients
N_MELS = 40             # number of mel filterbanks feeding the MFCC
ARCH = "transformer"    # architecture tag used in logs and file names

def mfcc_params_for_sr(sr: int):
    """Return STFT framing parameters for a given sample rate.

    16 kHz and 8 kHz use fixed presets. Any other rate uses a 25 ms window
    and a 10 ms hop (both truncated to whole samples), with ``n_fft`` set to
    the smallest power of two that is >= the window length.

    Args:
        sr: Sample rate in Hz (anything ``int()`` accepts).

    Returns:
        A new dict with keys ``"n_fft"``, ``"win_length"`` and ``"hop_length"``.
    """
    rate = int(sr)
    presets = {
        16000: (512, 400, 160),
        8000: (256, 200, 80),
    }
    if rate in presets:
        n_fft, win_length, hop_length = presets[rate]
    else:
        win_length = int(rate * 0.025)
        hop_length = int(rate * 0.010)
        # Smallest power of two >= win_length (1 for windows of length <= 1).
        n_fft = 1 if win_length <= 1 else 1 << (win_length - 1).bit_length()
    return {"n_fft": n_fft, "win_length": win_length, "hop_length": hop_length}

# Keep a value set by an earlier cell; otherwise default to interactive mode.
COMMIT_MODE = globals().get("COMMIT_MODE", False)

def set_seed(seed: int = 42, deterministic: bool = True):
    """Seed Python, NumPy and PyTorch RNGs and set the cuDNN flags.

    Args:
        seed: Value passed to every seeding function.
        deterministic: When true, cuDNN is set to deterministic mode and
            benchmarking is turned off; when false, the opposite.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    want_deterministic = bool(deterministic)
    torch.backends.cudnn.deterministic = want_deterministic
    torch.backends.cudnn.benchmark = not want_deterministic

# Pick the compute device; pinned memory and non-blocking copies are only
# enabled when CUDA is available.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")
PIN_MEMORY = bool(USE_CUDA)
NON_BLOCK = bool(USE_CUDA)

if VERBOSE_CONFIG and not COMMIT_MODE:
    print("Device:", device)

# File names looked up inside the background-noise directory.
WHITE_NOISE_FILE = "white_noise.wav"
PINK_NOISE_FILE = "pink_noise.wav"

def resolve_root_dir():
    """Locate the dataset root directory.

    Candidates are tried in order: the ``SPEECH_COMMANDS_ROOT`` environment
    variable, the Kaggle input path, ``./data/speech-commands`` and
    ``./speech-commands``.

    Returns:
        The first candidate that is an existing directory, or ``None``.
    """
    cwd = os.getcwd()
    search_order = (
        os.environ.get("SPEECH_COMMANDS_ROOT"),
        "/kaggle/input/speech-commands",
        os.path.join(cwd, "data", "speech-commands"),
        os.path.join(cwd, "speech-commands"),
    )
    # Unset/empty entries are skipped before touching the filesystem.
    return next((path for path in search_order if path and os.path.isdir(path)), None)

# Dataset root resolved once at import time; None when no candidate exists.
ROOT_DIR = resolve_root_dir()

def resolve_noise_dir(root_dir):
    """Locate the ``_background_noise_`` directory.

    Args:
        root_dir: Dataset root (may be ``None`` or a non-existent path).

    Returns:
        ``<root_dir>/_background_noise_`` when it exists; otherwise the
        Kaggle noise directory when that exists; otherwise ``None``.
    """
    if root_dir and os.path.isdir(root_dir):
        local_noise = os.path.join(root_dir, "_background_noise_")
        if os.path.isdir(local_noise):
            return local_noise

    kaggle_noise = "/kaggle/input/speech-commands/_background_noise_"
    if os.path.isdir(kaggle_noise):
        return kaggle_noise
    return None

# Background-noise directory derived from the dataset root (None if absent).
NOISE_DIR = resolve_noise_dir(ROOT_DIR)

if VERBOSE_CONFIG and not COMMIT_MODE:
    print("Data root:", ROOT_DIR or "<not found>")
    print("Noise dir:", NOISE_DIR or "<not found>")

# Keyword classes used in this experiment and their integer labels
# (label = position in SELECTED_CLASSES).
SELECTED_CLASSES = ["down", "left", "right", "up"]
CLASS_TO_IDX = dict(zip(SELECTED_CLASSES, range(len(SELECTED_CLASSES))))

# Clip length in samples; equal to SR, i.e. one second of audio.
MAX_LENGTH = SR

# Training / cross-validation hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 10
N_SPLITS = 5
SEEDS = [36, 38, 42]

# Noise augmentation: probability per training sample and SNR range in dB.
NOISE_PROB = 0.30
SNR_RANGE = (5, 20)

# Half of the available CPUs, but never fewer than two workers.
NUM_WORKERS = max(2, (os.cpu_count() or 2) // 2)

In [None]:
def resolve_noise_path(noise_dir: str, filename: str):
    """Join ``noise_dir`` and ``filename`` and return the path if it exists.

    Returns:
        The joined path, or ``None`` when either argument is empty/``None``
        or the resulting path does not exist.
    """
    if noise_dir and filename:
        candidate = os.path.join(noise_dir, filename)
        if os.path.exists(candidate):
            return candidate
    return None
    
# Resolve both noise files up front; either may be None.
WHITE_PATH = resolve_noise_path(NOISE_DIR, WHITE_NOISE_FILE)
PINK_PATH = resolve_noise_path(NOISE_DIR, PINK_NOISE_FILE)

# Normalise the requested colour; anything that cannot be stringified
# falls back to white.
try:
    _color = str(NOISE_COLOR).lower()
except Exception:
    _color = 'white'

# Pink is used only when requested AND its file was found; otherwise white.
if _color == 'pink' and PINK_PATH:
    cand_path, cand_name = PINK_PATH, 'pink'
else:
    cand_path, cand_name = WHITE_PATH, 'white'

# Publish the active noise file, flagging the name when the file is missing.
if cand_path and os.path.exists(cand_path):
    ACTIVE_NOISE_PATH, ACTIVE_NOISE_NAME = cand_path, cand_name
else:
    ACTIVE_NOISE_PATH, ACTIVE_NOISE_NAME = None, f"{cand_name} (missing)"

# Cache of prepared noise tensors keyed by (path, sample_rate).
_NOISE_CACHE = {}

def _load_and_prepare_noise(noise_path: str, target_sr: int):
    """Load a noise file and prepare it for mixing.

    The audio is given a channel dimension if it lacks one, averaged down to
    one channel, resampled to ``target_sr`` when the file's rate differs,
    cast to float32 on the CPU, and shifted to zero mean.

    Returns:
        The prepared noise tensor.
    """
    wav, native_sr = torchaudio.load(noise_path)

    if wav.dim() == 1:
        wav = wav.unsqueeze(0)
    if wav.shape[0] > 1:
        # Down-mix multi-channel audio by averaging the channels.
        wav = wav.mean(dim=0, keepdim=True)
    if native_sr != target_sr:
        wav = torchaudio.functional.resample(wav, native_sr, target_sr)

    wav = wav.to(dtype=torch.float32, device='cpu')
    # Remove the DC offset.
    return wav - wav.mean()

def _get_noise_cached(noise_path: str, target_sr: int):
    """Return the prepared noise tensor for ``noise_path``, loading it once.

    Results are memoised in ``_NOISE_CACHE`` under ``(noise_path, int(target_sr))``.

    Returns:
        The cached tensor, or ``None`` when ``noise_path`` is ``None`` or
        does not exist on disk.
    """
    if noise_path is None or not os.path.exists(noise_path):
        return None

    cache_key = (noise_path, int(target_sr))
    try:
        return _NOISE_CACHE[cache_key]
    except KeyError:
        prepared = _load_and_prepare_noise(noise_path, target_sr)
        _NOISE_CACHE[cache_key] = prepared
        return prepared

def _take_random_segment(noise_cpu: torch.Tensor, T: int):
    """Return a length-``T`` slice of ``noise_cpu`` along dimension 1.

    * Noise longer than ``T``: a slice starting at a random offset drawn
      with ``torch.randint``.
    * Noise exactly ``T`` long: the slice starting at offset 0.
    * Noise shorter than ``T``: the noise is tiled and truncated to ``T``.
    """
    available = noise_cpu.shape[1]

    if available < T:
        # Repeat the clip enough times to cover T samples, then trim.
        copies = math.ceil(T / available)
        return noise_cpu.repeat(1, copies)[:, :T]

    if available == T:
        return noise_cpu[:, 0:T]

    offset = int(torch.randint(0, available - T + 1, (1,)).item())
    return noise_cpu[:, offset:offset + T]

def add_specific_noise(
    waveform: torch.Tensor,
    noise_path: str,
    snr_db: float,
    target_sr: int
):
    """Mix noise from ``noise_path`` into ``waveform`` at the given SNR.

    Args:
        waveform: Input tensor; its size along dimension 1 sets the length
            of the noise segment.
        noise_path: Noise file to load (cached after the first call).
        snr_db: Target signal-to-noise ratio in decibels.
        target_sr: Sample rate the noise is prepared at.

    Returns:
        ``waveform`` unchanged when the noise cannot be loaded; otherwise
        the noisy mixture clamped to ``[-1, 1]``.
    """
    cached_noise = _get_noise_cached(noise_path, target_sr)
    if cached_noise is None:
        return waveform

    n_samples = waveform.shape[1]
    segment = _take_random_segment(cached_noise, n_samples)
    noise = segment.to(device=waveform.device, dtype=waveform.dtype)

    # Mean powers are floored so the ratio below is always finite.
    sig_power = waveform.pow(2).mean().clamp(min=1e-12)
    noise_power = noise.pow(2).mean().clamp(min=1e-12)

    # Gain that makes sig_power / (gain^2 * noise_power) equal the linear SNR.
    snr_linear = 10.0 ** (snr_db / 10.0)
    gain = torch.sqrt(sig_power / (snr_linear * noise_power))

    return torch.clamp(waveform + gain * noise, -1.0, 1.0)

In [None]:
class VoiceCommandDatasetWithNoise(Dataset):
    """Dataset of fixed-length waveforms with optional file-noise augmentation.

    Each item is loaded from disk, averaged to one channel, resampled to
    ``sr`` when needed, padded or cropped to ``max_length`` samples and,
    for training datasets, optionally mixed with noise from ``noise_path``.

    Args:
        samples: Sequence of ``(path, class_name)`` pairs.
        class_to_idx: Mapping from class name to integer label.
        noise_path: Noise file used by the ``"file_noise"`` augmentation.
        is_training: Augmentation is only applied when true.
        sr: Target sample rate.
        max_length: Output length in samples; defaults to ``sr``.
        n_mfcc, n_fft, hop_length, n_mels: Accepted but not used by this class.
        augment: ``"file_noise"`` or ``"none"``.
        noise_prob: Probability of augmenting a given item.
        snr_range: ``(low, high)`` dB range sampled when ``fixed_snr_db`` is None.
        fixed_snr_db: Fixed SNR in dB overriding ``snr_range``.
        norm_mode, global_mean, global_std: Stored on the instance only.
        crop_mode: ``"left"`` crops from the start; any other value crops
            from the centre.
        return_path: Stored on the instance; ``__getitem__`` does not read it.
        seed: Seeds a private ``random.Random``; ``None`` uses the shared
            ``random`` module.

    Raises:
        FileNotFoundError: For a training dataset with ``augment="file_noise"``
            whose ``noise_path`` is empty or missing.
    """

    def __init__(self,
                 samples, class_to_idx,
                 noise_path=None, is_training=True,
                 sr=SR, max_length=None,
                 n_mfcc=N_MFCC, n_fft=400, hop_length=160, n_mels=None,
                 augment="file_noise", noise_prob=0.0,
                 snr_range=(5,20), fixed_snr_db=None,
                 norm_mode="none", global_mean=None, global_std=None,
                 crop_mode="left", return_path=False, seed=None):
        super().__init__()

        def _as_float_tensor(value):
            # Optional statistics become float32 tensors; None stays None.
            if value is None:
                return None
            return torch.tensor(value, dtype=torch.float32)

        self.samples = samples
        self.class_to_idx = class_to_idx
        self.noise_path = noise_path
        self.is_training = bool(is_training)
        self.sr = int(sr)
        self.max_length = int(sr if max_length is None else max_length)

        self.augment = str(augment)
        self.noise_prob = float(noise_prob)
        self.snr_range = snr_range
        self.fixed_snr_db = fixed_snr_db
        self.norm_mode = str(norm_mode)
        self.global_mean = _as_float_tensor(global_mean)
        self.global_std = _as_float_tensor(global_std)
        self.crop_mode = str(crop_mode)
        self.return_path = bool(return_path)
        self._rng = random if seed is None else random.Random(seed)

        # Fail early: file-noise training needs a readable noise file.
        wants_file_noise = self.is_training and self.augment == "file_noise"
        if wants_file_noise and not (self.noise_path and os.path.exists(self.noise_path)):
            raise FileNotFoundError(f"noise_path tidak valid: {self.noise_path}")

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.samples)

    def _crop(self, x_2d):
        """Zero-pad on the right or crop so dimension 1 equals ``max_length``."""
        n_samples = x_2d.shape[1]
        target = self.max_length

        if n_samples == target:
            return x_2d
        if n_samples < target:
            return F.pad(x_2d, (0, target - n_samples))

        if self.crop_mode == "left":
            start = 0
        else:
            start = max(0, (n_samples - target) // 2)
        return x_2d[:, start:start + target]

    def _maybe_augment(self, x_2d):
        """Mix in file noise with probability ``noise_prob`` (training only)."""
        if not self.is_training or self.augment == "none":
            return x_2d
        if self._rng.random() >= self.noise_prob:
            return x_2d

        # The SNR is drawn before checking the augmentation mode, so the
        # private RNG advances the same way regardless of that mode.
        if self.fixed_snr_db is None:
            snr_db = self._rng.uniform(*self.snr_range)
        else:
            snr_db = float(self.fixed_snr_db)

        if self.augment != "file_noise" or not self.noise_path:
            return x_2d
        return add_specific_noise(x_2d, self.noise_path, snr_db, target_sr=self.sr)

    def __getitem__(self, idx: int):
        """Return ``(waveform, label)``; the label is a ``torch.long`` scalar."""
        path, cname = self.samples[idx]
        label = int(self.class_to_idx[cname])

        wav, native_sr = torchaudio.load(path)
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
        if wav.shape[0] > 1:
            # Average the channels down to one.
            wav = wav.mean(0, keepdim=True)

        if native_sr != self.sr:
            wav = torchaudio.functional.resample(wav, native_sr, self.sr)

        wav = self._crop(wav).to(torch.float32)
        wav = self._maybe_augment(wav)

        flat = wav.squeeze(0).contiguous()
        return (flat, torch.tensor(label, dtype=torch.long))

In [None]:
from pathlib import Path
from collections import Counter

def collect_samples_4classes(root_dir: str, selected_classes, exts=(".wav", ".WAV")):
    """Collect ``(file_path, class_name)`` pairs for the selected classes.

    Each class is expected to be a sub-directory of ``root_dir``; only files
    directly inside it are considered (no recursion). Class directories that
    do not exist are skipped.

    Args:
        root_dir: Dataset root. ``None``, empty, or a non-directory path
            yields an empty list.
        selected_classes: Iterable of class (sub-directory) names.
        exts: Accepted file extensions, matched case-insensitively. A single
            string is also accepted. The default matches ``.wav`` files in
            any letter case.

    Returns:
        A list of ``(path, class_name)`` tuples sorted by path.
    """
    samples = []
    if not root_dir or not Path(root_dir).is_dir():
        return samples
    root = Path(root_dir)

    # Previously `exts` was ignored and ".wav" was hard-coded; honour it now.
    if isinstance(exts, str):
        exts = (exts,)
    allowed = {str(ext).lower() for ext in exts}

    for cname in selected_classes:
        cdir = root / cname
        if not cdir.is_dir():
            continue
        samples.extend(
            (str(entry), cname)
            for entry in cdir.iterdir()
            if entry.is_file() and entry.suffix.lower() in allowed
        )

    samples.sort(key=lambda item: item[0])
    return samples

# Every (path, class) pair for the selected classes under the dataset root.
all_samples = collect_samples_4classes(ROOT_DIR, SELECTED_CLASSES)

if VERBOSE_CONFIG:
    # Per-class counts, printed as a quick class-balance check.
    per_class = Counter(label for _, label in all_samples)
    print(f"[DATASET] classes={len(SELECTED_CLASSES)} samples={len(all_samples)} per_class={dict(per_class)}")

In [None]:
# Print a one-shot summary of the experiment configuration.
if globals().get('VERBOSE_CONFIG'):
    mfcc_all = {int(sr): mfcc_params_for_sr(int(sr)) for sr in SR_LIST}

    lines = [
        "[CONFIG]",
        f"ARCH={ARCH}",
        f"Device={device}",
        f"SR_LIST={SR_LIST}",
        "MFCC params per SR:",
    ]
    for _sr, v in sorted(mfcc_all.items()):
        lines.append(f"  SR={_sr}: n_fft={v['n_fft']} win={v['win_length']} hop={v['hop_length']}")
    lines += [
        f"Classes={len(SELECTED_CLASSES)} Samples={len(all_samples)}",
        f"Noise color={ACTIVE_NOISE_NAME} path={ACTIVE_NOISE_PATH if ACTIVE_NOISE_PATH else 'None'}",
        f"Noise prob={NOISE_PROB} SNR_RANGE={SNR_RANGE}",
        f"Batch size={BATCH_SIZE} Epochs={EPOCHS} KFold={N_SPLITS} Seeds={SEEDS}",
    ]
    print("\n".join(lines))

In [None]:
import os              
import torch           
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Make sure WHITE_PATH exists even if the noise-setup cell was not run.
if 'WHITE_PATH' not in globals():
    if 'NOISE_DIR' in globals() and 'WHITE_NOISE_FILE' in globals():
        WHITE_PATH = resolve_noise_path(NOISE_DIR, WHITE_NOISE_FILE)
    else:
        WHITE_PATH = None

def build_kfold_loaders_generic(
    samples, class_to_idx,
    sr=None, n_splits=5, seed=42,
    batch_size=64, num_workers=2, pin_memory=True,
    noise_path=None,
    max_length=None,
    noise_prob=0.0, snr_range=(5,20),
    augment_mode="file_noise",
    norm_mode="none",
    crop_mode="left"
):
    """Build stratified k-fold train/validation DataLoaders.

    Args:
        samples: Sequence of ``(path, class_name)`` pairs.
        class_to_idx: Mapping from class name to integer label.
        sr: Sample rate; defaults to the global ``SR`` (16000 if unset).
        n_splits: Number of folds.
        seed: Seeds the fold split, dataset RNGs and the shuffle generator.
        batch_size, num_workers, pin_memory: DataLoader settings. Pinning
            is only enabled when CUDA is available.
        noise_path: Noise file for the ``"file_noise"`` augmentation. If it
            is missing, augmentation is disabled instead of raising.
        max_length: Clip length in samples; defaults to ``sr``.
        noise_prob, snr_range: Augmentation probability and SNR range (dB).
        augment_mode: ``"file_noise"`` enables augmentation on train folds.
        norm_mode, crop_mode: Forwarded to the dataset.

    Returns:
        A list of dicts with keys ``"fold"`` (1-based), ``"train_loader"``
        and ``"val_loader"``; an empty list when ``samples`` is empty.
    """
    sr = int(globals().get("SR", 16000) if sr is None else sr)
    max_length = int(sr if max_length is None else max_length)

    def _should_log():
        # Log only in verbose, non-commit runs; tolerate missing globals.
        return ('VERBOSE_CONFIG' in globals() and VERBOSE_CONFIG
                and not globals().get('COMMIT_MODE', False))

    have_samples = bool(samples) and len(samples) != 0
    if not have_samples:
        if _should_log():
            print(f"[WARN] No samples found for sr={sr}. Skipping KFold construction.")
        return []

    framing = mfcc_params_for_sr(sr)
    n_fft_rule = framing["n_fft"]
    win_length_rule = framing["win_length"]
    hop_length_rule = framing["hop_length"]

    labels = np.array([class_to_idx[cname] for _, cname in samples], dtype=np.int64)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=int(seed))

    pm = bool(pin_memory and torch.cuda.is_available())
    n_workers = int(num_workers)
    loader_kwargs = {
        "batch_size": int(batch_size),
        "num_workers": n_workers,
        "pin_memory": pm,
        "persistent_workers": n_workers > 0,
        "drop_last": False,
    }
    if n_workers > 0:
        # prefetch_factor is only passed when worker processes are used.
        loader_kwargs["prefetch_factor"] = 4

    use_file_noise = (augment_mode == "file_noise")
    effective_noise_path = noise_path
    noise_file_ok = effective_noise_path is not None and os.path.exists(effective_noise_path)
    if use_file_noise and not noise_file_ok:
        if _should_log():
            print("[NOISE] Missing or invalid noise_path -> disabling file_noise augmentation.")
        use_file_noise = False
        effective_noise_path = None
        noise_prob = 0.0

    # Arguments shared by the train and validation datasets of every fold.
    shared_ds_kwargs = dict(
        sr=sr, max_length=max_length,
        n_mfcc=N_MFCC, n_fft=n_fft_rule, hop_length=hop_length_rule, n_mels=N_MELS,
        snr_range=snr_range,
        norm_mode=norm_mode, crop_mode=crop_mode,
        return_path=False,
    )
    train_augment = "file_noise" if use_file_noise else "none"
    train_noise_path = effective_noise_path if use_file_noise else None

    g_base = int(seed) * 1_000_003
    folds = []

    for fold_id, (tr_idx, va_idx) in enumerate(splitter.split(samples, labels), start=1):
        tr_s = [samples[i] for i in tr_idx]
        va_s = [samples[i] for i in va_idx]

        ds_tr = VoiceCommandDatasetWithNoise(
            tr_s, class_to_idx,
            noise_path=train_noise_path,
            is_training=True,
            augment=train_augment, noise_prob=noise_prob,
            seed=g_base + fold_id,
            **shared_ds_kwargs
        )
        ds_va = VoiceCommandDatasetWithNoise(
            va_s, class_to_idx,
            noise_path=None,
            is_training=False,
            augment="none", noise_prob=0.0,
            seed=g_base + 10_000 + fold_id,
            **shared_ds_kwargs
        )

        # Dedicated generator so the shuffle order depends on seed and fold.
        shuffle_gen = torch.Generator(device="cpu")
        shuffle_gen.manual_seed(g_base + fold_id)

        dl_tr = DataLoader(ds_tr, shuffle=True, generator=shuffle_gen, **loader_kwargs)
        dl_va = DataLoader(ds_va, shuffle=False, **loader_kwargs)

        if not COMMIT_MODE and ('VERBOSE_CONFIG' in globals() and VERBOSE_CONFIG):
            aug_str = train_augment
            print(f"[SR={sr} | seed={seed} | fold={fold_id}] "
                  f"train={len(tr_s)} val={len(va_s)} | n_fft={n_fft_rule} win={win_length_rule} hop={hop_length_rule} "
                  f"(workers={num_workers}, pin_memory={pm}) | augment={aug_str}")

        folds.append({"fold": fold_id, "train_loader": dl_tr, "val_loader": dl_va})

    return folds


def build_kfold_loaders_noise(
    samples, class_to_idx, n_splits=5, seed=42,
    batch_size=64, num_workers=2, pin_memory=True,
    noise_path=None, max_length=None,
    noise_prob=NOISE_PROB, snr_range=SNR_RANGE, sr=None
):
    """Build k-fold loaders with file-noise augmentation on the train folds.

    Wraps ``build_kfold_loaders_generic`` with ``augment_mode="file_noise"``,
    ``norm_mode="none"`` and ``crop_mode="left"``. When ``noise_path`` is
    ``None`` the module-level ``ACTIVE_NOISE_PATH`` is used.

    Returns:
        Whatever ``build_kfold_loaders_generic`` returns.
    """
    chosen_noise = ACTIVE_NOISE_PATH if noise_path is None else noise_path
    return build_kfold_loaders_generic(
        samples=samples,
        class_to_idx=class_to_idx,
        sr=sr,
        n_splits=n_splits,
        seed=seed,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        noise_path=chosen_noise,
        max_length=max_length,
        noise_prob=noise_prob,
        snr_range=snr_range,
        augment_mode="file_noise",
        norm_mode="none",
        crop_mode="left",
    )

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch-first sequence.

    Even feature indices hold ``sin(pos * w_k)`` and odd indices hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        d_model: Feature dimension. Odd values are supported; the final
            sine column then has no cosine partner.
        max_len: Number of positions precomputed.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model: int, max_len: int = 4096, dropout: float = 0.0):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With odd d_model there is one fewer odd column than even columns,
        # so trim the cosine block; assigning the full block would raise a
        # shape-mismatch error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Non-persistent: rebuilt on construction, never stored in state_dict.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the first ``x.size(1)`` encodings to ``x`` and apply dropout."""
        T = x.size(1)
        pe = self.pe[:, :T, :].to(dtype=x.dtype, device=x.device)
        return self.dropout(x + pe)

class MFCC_Transformer(nn.Module):
    """Transformer-encoder classifier over MFCC frame sequences.

    Pipeline: linear projection of each frame to ``d_model`` -> sinusoidal
    positional encoding -> pre-norm Transformer encoder -> mean over the
    time axis -> LayerNorm/Dropout/Linear classification head.

    Args:
        n_mfcc: Number of MFCC coefficients per frame.
        num_classes: Output classes; defaults to ``len(SELECTED_CLASSES)``.
        d_model: Model width; must be divisible by ``nhead``.
        nhead: Number of attention heads.
        num_layers: Number of encoder layers.
        dim_feedforward: Hidden size of each encoder feed-forward block.
        dropout: Dropout used in the encoder layers and the head.
        max_len: Maximum sequence length for the positional encoding.
    """

    def __init__(
        self,
        n_mfcc: int = N_MFCC,
        num_classes: int | None = None,
        d_model: int = 128,
        nhead: int = 4,
        num_layers: int = 2,
        dim_feedforward: int = 256,
        dropout: float = 0.3,
        max_len: int = 4096,
    ):
        super().__init__()
        n_outputs = len(SELECTED_CLASSES) if num_classes is None else num_classes

        assert d_model % nhead == 0, "d_model harus kelipatan nhead"
        self.n_mfcc = int(n_mfcc)

        # Sub-modules are registered in a fixed order on purpose: it decides
        # the order in which _init_weights visits (and re-initialises) them.
        self.input_proj = nn.Linear(self.n_mfcc, d_model, bias=True)
        self.posenc = PositionalEncoding(d_model, max_len=max_len, dropout=0.0)

        layer_template = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)

        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Dropout(dropout),
            nn.Linear(d_model, n_outputs),
        )

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform every ``nn.Linear`` weight and zero its bias."""
        linear_layers = (mod for mod in self.modules() if isinstance(mod, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a 3-D input.

        The input is used as-is when its last dimension equals ``n_mfcc``;
        otherwise, if its middle dimension equals ``n_mfcc``, the last two
        dimensions are swapped first.

        Raises:
            ValueError: If ``x`` is not 3-D or neither trailing dimension
                equals ``n_mfcc``.
        """
        if x.dim() != 3:
            raise ValueError(f"Expected 3D input [B,T,{self.n_mfcc}] or [B,{self.n_mfcc},T], got {tuple(x.shape)}")

        _, middle, last = x.shape
        if last != self.n_mfcc:
            if middle != self.n_mfcc:
                raise ValueError(f"Input last dim must be n_mfcc={self.n_mfcc}; got {tuple(x.shape)}")
            x = x.transpose(1, 2).contiguous()

        hidden = self.encoder(self.posenc(self.input_proj(x)))
        pooled = hidden.mean(dim=1)  # average over the time axis
        return self.head(pooled)

In [None]:
from contextlib import nullcontext
from torch.nn.utils import clip_grad_norm_

# Framing parameters for the default sample rate, exposed as module constants.
_mfcc_cfg = mfcc_params_for_sr(SR)
N_FFT, WIN_LENGTH, HOP_LENGTH = (
    _mfcc_cfg[key] for key in ("n_fft", "win_length", "hop_length")
)

# Placeholder; the training cell assigns the real MFCC transform.
mfcc = None

def _compute_feats_gpu(wav: torch.Tensor) -> torch.Tensor:
    """Apply the module-level ``mfcc`` transform and swap dimensions 1 and 2.

    Raises:
        RuntimeError: If ``mfcc`` has not been assigned yet.
    """
    if mfcc is None:
        raise RuntimeError("MFCC transform not initialized yet; run the training loop cell after defining SR-specific mfcc")
    return mfcc(wav).transpose(1, 2).contiguous()

def train_one_epoch(model, loader, optimizer, criterion, scheduler=None, grad_clip_norm=None):
    """Train ``model`` for one pass over ``loader``.

    Mixed precision follows the module-level AMP settings: autocast is used
    whenever the device is CUDA and ``AMP_DTYPE`` is set, and the gradient
    scaler is used only when ``SCALER`` is set. Previously autocast was tied
    to the scaler, so a configuration with ``AMP_DTYPE`` set but
    ``SCALER = None`` (the bfloat16 case) trained in full precision while
    ``evaluate`` autocast.

    Args:
        model: Module to train.
        loader: Iterable of ``(waveform, label)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking ``(logits, labels)``.
        scheduler: Optional LR scheduler. It is stepped once per batch,
            unless it is a ``ReduceLROnPlateau``, which is never stepped here.
        grad_clip_norm: Optional max gradient norm for clipping.

    Returns:
        Mean loss per sample over the samples actually seen (0.0 for an
        empty loader).
    """
    model.train()
    total_loss = 0.0
    n_seen = 0

    amp_dtype = globals().get("AMP_DTYPE")
    scaler = globals().get("SCALER")
    use_amp = (device.type == "cuda") and (amp_dtype is not None)
    use_scaler = use_amp and (scaler is not None)
    amp_ctx = (torch.autocast(device_type="cuda", dtype=amp_dtype) if use_amp else nullcontext())

    step_per_batch = (
        scheduler is not None
        and not isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
    )

    for wav, y in loader:
        wav = wav.to(device, non_blocking=NON_BLOCK).float()
        y   = y.to(device, non_blocking=NON_BLOCK).long()

        optimizer.zero_grad(set_to_none=True)
        with amp_ctx:
            feats  = _compute_feats_gpu(wav)
            logits = model(feats)
            loss   = criterion(logits, y)

        if use_scaler:
            scaler.scale(loss).backward()
            if grad_clip_norm is not None:
                # Gradients must be unscaled before their norm is measured.
                scaler.unscale_(optimizer)
                clip_grad_norm_(model.parameters(), max_norm=float(grad_clip_norm))
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            if grad_clip_norm is not None:
                clip_grad_norm_(model.parameters(), max_norm=float(grad_clip_norm))
            optimizer.step()

        if step_per_batch:
            scheduler.step()

        batch_size = y.size(0)
        total_loss += loss.item() * batch_size
        n_seen += batch_size

    # Divide by the samples seen rather than len(loader.dataset): identical
    # when every sample is yielded, correct when the loader drops batches,
    # and safe for an empty loader.
    return float(total_loss) / float(max(1, n_seen))


@torch.no_grad()
def evaluate(model, loader, criterion=None):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    On CUDA the forward pass runs under autocast with the module-level
    ``AMP_DTYPE``.

    Args:
        model: Module to evaluate (switched to eval mode).
        loader: Iterable of ``(waveform, label)`` batches.
        criterion: Optional loss; when ``None`` no loss is computed.

    Returns:
        ``(accuracy, macro_f1, avg_loss, y_true, y_pred)``. ``avg_loss`` is
        the summed loss divided by ``len(loader.dataset)``, or ``None`` when
        no criterion was given; ``y_true`` and ``y_pred`` are NumPy arrays.
    """
    model.eval()

    on_cuda = device.type == "cuda"
    autocast_ctx = torch.autocast(device_type="cuda", dtype=AMP_DTYPE) if on_cuda else nullcontext()

    track_loss = criterion is not None
    loss_sum = 0.0
    targets, predictions = [], []

    for wav, y in loader:
        wav = wav.to(device, non_blocking=NON_BLOCK).float()
        y = y.to(device, non_blocking=NON_BLOCK).long()

        with autocast_ctx:
            logits = model(_compute_feats_gpu(wav))
            if track_loss:
                loss_sum += criterion(logits, y).item() * y.size(0)

        targets.extend(y.tolist())
        predictions.extend(logits.argmax(dim=1).tolist())

    accuracy = accuracy_score(targets, predictions)
    macro_f1 = f1_score(targets, predictions, average="macro")
    if track_loss:
        avg_loss = float(loss_sum) / float(len(loader.dataset))
    else:
        avg_loss = None

    return (
        float(accuracy),
        float(macro_f1),
        avg_loss,
        np.array(targets, dtype=np.int64),
        np.array(predictions),
    )

In [None]:
# Switch to commit (batch) mode: from here on progress bars, plt.show and the
# training loop's console output are suppressed by the checks on this flag.
COMMIT_MODE = True

import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Turn off interactive mode.
plt.ioff()

from tqdm import tqdm as _tqdm

def tqdm(*args, **kwargs):
    """Wrapper around the real ``tqdm`` that forces ``disable=True`` in commit mode."""
    if COMMIT_MODE:
        kwargs = {**kwargs, "disable": True}
    return _tqdm(*args, **kwargs)

import pandas as pd
# Keep DataFrame reprs compact in notebook output.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_colwidth", 120)

# In commit mode, replace plt.show with a no-op so plotting cells that call
# it do nothing.
if COMMIT_MODE:
    def _no_show(*args, **kwargs):
        pass
    plt.show = _no_show

In [None]:
import io, sys, contextlib, os, time, numpy as np, pandas as pd, shutil, json
from pathlib import Path
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Re-normalise the sample-rate list (unique ints, ascending) before the sweep.
SR_LIST = sorted(set(int(s) for s in SR_LIST))

def _outdirs_for(SR: int, base_dir="/kaggle/working"):
    """Create and return the output directories for one sample rate.

    The layout is ``<base_dir>/kfold_outputs_sr<SR>/`` with the
    sub-directories ``per_seed_and_fold`` and ``summary``. Directories are
    created if missing; existing ones are left alone.

    Args:
        SR: Sample rate used in the root directory name.
        base_dir: Parent directory for all outputs. Defaults to the Kaggle
            working directory; pass another path to run elsewhere.

    Returns:
        ``(root, per_seed_and_fold_dir, summary_dir)`` as ``Path`` objects.
    """
    root = Path(base_dir) / f"kfold_outputs_sr{SR}"
    d_fold = root / "per_seed_and_fold"
    d_sum = root / "summary"
    for directory in (d_fold, d_sum):
        directory.mkdir(parents=True, exist_ok=True)
    return root, d_fold, d_sum

@contextlib.contextmanager
def mute_outputs(active: bool):
    """Context manager that swallows stdout and stderr when ``active`` is true.

    When ``active`` is false the body runs with both streams untouched.
    Captured text goes to an in-memory buffer that is never read.
    """
    if active:
        sink = io.StringIO()
        with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):
            yield
        return
    yield

# Mixed-precision setup: AMP is enabled exactly when CUDA is available.
USE_CUDA = torch.cuda.is_available()
AMP_ENABLE = USE_CUDA

if not AMP_ENABLE:
    AMP_DTYPE = None
    SCALER = None
else:
    # bfloat16 for compute capability >= 8; float16 otherwise. If the
    # capability query fails, fall back to 7 (i.e. float16).
    try:
        major_cc = torch.cuda.get_device_capability()[0]
    except Exception:
        major_cc = 7
    AMP_DTYPE = torch.float16 if major_cc < 8 else torch.bfloat16

    # A gradient scaler is only created for float16. If constructing it via
    # torch.amp fails, fall back to torch.cuda.amp.GradScaler.
    try:
        SCALER = None if AMP_DTYPE is not torch.float16 else torch.amp.GradScaler(device="cuda")
    except Exception:
        SCALER = torch.cuda.amp.GradScaler(enabled=(AMP_DTYPE is torch.float16))

# Accumulators filled by the training loop below:
#   results                - one row per (seed, sr, fold) with best val accuracy
#   summary_rows           - one row per (seed, sr) aggregated over folds
#   efficiency_rows_global - timing/throughput per (seed, sr, fold)
results = []
summary_rows = []
efficiency_rows_global = []

# Baseline seed; each seed iteration re-seeds again via set_seed(seed).
set_seed(0)

# Main experiment: for every seed and sample rate, run k-fold training and
# record the best validation accuracy per fold. All stdout/stderr inside the
# block is swallowed when COMMIT_MODE is true.
with mute_outputs(COMMIT_MODE):
    for seed in SEEDS:
        if not COMMIT_MODE and VERBOSE_CONFIG:
            print("\n" + "="*70)
            print(f"Running SEED = {seed} | ARCH = {ARCH}")
            print("="*70)
        set_seed(seed)

        for sr in SR_LIST:
            # NOTE: SR and MAX_LENGTH are module-level names and are rebound
            # on every iteration.
            SR = int(sr)
            ROOT_OUT, OUT_FOLD, OUT_SUMM = _outdirs_for(SR)
            MAX_LENGTH = SR

            folds = build_kfold_loaders_noise(
                all_samples, CLASS_TO_IDX,
                n_splits=N_SPLITS, seed=seed,
                batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
                noise_path=ACTIVE_NOISE_PATH,
                max_length=MAX_LENGTH, noise_prob=NOISE_PROB, snr_range=SNR_RANGE,
                sr=SR
            )

            # MFCC transform per SR (built once per SR and moved to the device)
            _params = mfcc_params_for_sr(SR)
            if not COMMIT_MODE and VERBOSE_CONFIG:
                print(f"[MFCC] SR={SR} -> n_fft={_params['n_fft']} win_length={_params['win_length']} hop_length={_params['hop_length']} n_mels={N_MELS}")
            mfcc = torchaudio.transforms.MFCC(
                sample_rate=SR, n_mfcc=N_MFCC,
                melkwargs={
                    "n_mels": N_MELS,
                    "n_fft": int(_params["n_fft"]),
                    "hop_length": int(_params["hop_length"]),
                    "win_length": int(_params["win_length"]),
                    "center": True, "f_min": 0.0, "f_max": SR/2
                }
            ).to(device)

            # This def runs at module scope, so it rebinds the global
            # _compute_feats_gpu that evaluate() calls.
            def _compute_feats_gpu(wav: torch.Tensor) -> torch.Tensor:
                return mfcc(wav).transpose(1, 2).contiguous()

            for fd in folds:
                fold_id      = fd["fold"]
                train_loader = fd["train_loader"]
                val_loader   = fd["val_loader"]

                # Per-run output directory and checkpoint paths.
                RUN_NAME = f"{ARCH}_seed{seed}_fold{fold_id}_sr{SR}"
                OUT_SUB  = str((ROOT_OUT / "per_seed_and_fold" / RUN_NAME).resolve())
                os.makedirs(OUT_SUB, exist_ok=True)
                BEST_CKPT = os.path.join(OUT_SUB, "best_model.pth")
                LAST_CKPT = os.path.join(OUT_SUB, "last_model.pth")

                # A fresh model, optimizer and scheduler for every fold.
                model = MFCC_Transformer(n_mfcc=N_MFCC, num_classes=len(SELECTED_CLASSES)).to(device)
                assert next(model.parameters()).is_cuda == USE_CUDA, "Model belum di CUDA!"
                n_params = int(sum(p.numel() for p in model.parameters()))

                optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
                # OneCycleLR is stepped once per batch, so its total step
                # count is epochs * batches-per-epoch.
                total_steps = max(1, EPOCHS * len(train_loader))
                scheduler = torch.optim.lr_scheduler.OneCycleLR(
                    optimizer, max_lr=1e-3, total_steps=total_steps
                )
                criterion = nn.CrossEntropyLoss().to(device)

                best_val_acc = -1.0
                # NOTE(review): these stay None if EPOCHS < 1, in which case
                # the confusion_matrix call below would receive None.
                best_true = best_pred = None

                history = []

                for ep in range(1, EPOCHS+1):
                    model.train()
                    tr_loss_sum, n_train = 0.0, 0
                    ep_t0 = time.time()

                    # Inline training step. NOTE(review): this duplicates
                    # train_one_epoch and applies no gradient clipping.
                    for wav, y in train_loader:
                        wav = wav.to(device, non_blocking=NON_BLOCK).float()
                        y   = y.to(device,  non_blocking=NON_BLOCK).long()
                        optimizer.zero_grad(set_to_none=True)

                        if AMP_ENABLE:
                            with torch.autocast(device_type='cuda', dtype=AMP_DTYPE):
                                feats  = _compute_feats_gpu(wav)
                                logits = model(feats)
                                loss   = criterion(logits, y)
                            # Use the gradient scaler when one was created;
                            # otherwise do a plain backward/step.
                            if SCALER is not None:
                                SCALER.scale(loss).backward()
                                SCALER.step(optimizer)
                                SCALER.update()
                            else:
                                loss.backward()
                                optimizer.step()
                        else:
                            feats  = _compute_feats_gpu(wav)
                            logits = model(feats)
                            loss   = criterion(logits, y)
                            loss.backward()
                            optimizer.step()

                        if scheduler is not None:
                            scheduler.step()
                        bs = y.size(0)
                        tr_loss_sum += loss.item() * bs
                        n_train += bs

                    tr_loss = tr_loss_sum / max(1, n_train)

                    val_acc, val_f1, val_loss, y_true, y_pred = evaluate(model, val_loader, criterion)

                    # Keep the predictions and weights of the best epoch so far
                    # (strict improvement in validation accuracy).
                    if val_acc > best_val_acc:
                        best_val_acc = float(val_acc)
                        best_true = y_true.copy()
                        best_pred = y_pred.copy()
                        torch.save(model.state_dict(), BEST_CKPT)

                    # ep_time covers both training and validation of this epoch.
                    ep_time = time.time() - ep_t0
                    thr = float(n_train) / ep_time if ep_time > 0 else 0.0
                    history.append({
                        "epoch": ep,
                        "train_loss": float(tr_loss),
                        "val_loss": float(val_loss),
                        "val_acc": float(val_acc),
                        "val_f1": float(val_f1),
                        "epoch_time_sec": float(ep_time),
                        "throughput_samples_per_sec": float(thr),
                    })

                    # Progress line on the first, every fifth and the last epoch.
                    if (ep % 5 == 0 or ep == 1 or ep == EPOCHS) and not COMMIT_MODE:
                        if USE_CUDA:
                            gpu_mb = torch.cuda.memory_allocated() / 1e6
                            print(f"[SR={SR} | GPU {gpu_mb:.1f} MB]", end=" ")
                        print(f"Seed {seed} | Fold {fold_id} | Epoch {ep:02d} "
                              f"| tr_loss={tr_loss:.4f} | va_loss={val_loss:.4f} "
                              f"| va_acc={val_acc:.4f} | va_f1={val_f1:.4f} | ep_time={ep_time:.2f}s | thr={thr:.1f}/s")

                torch.save(model.state_dict(), LAST_CKPT)

                # Confusion matrix (from the best epoch) and history saving
                cm = confusion_matrix(best_true, best_pred, labels=list(range(len(SELECTED_CLASSES))))
                cm_df = pd.DataFrame(cm, index=SELECTED_CLASSES, columns=SELECTED_CLASSES)
                cm_path = os.path.join(OUT_FOLD, f"cm_{ARCH}_seed{seed}_fold{fold_id}_sr{SR}.csv")
                cm_df.to_csv(cm_path)

                hist_df = pd.DataFrame(history)
                hist_df.to_csv(os.path.join(OUT_SUB, "history.csv"), index=False)

                # Efficiency row for optional later aggregation
                total_time = float(hist_df["epoch_time_sec"].sum()) if not hist_df.empty else 0.0
                avg_ep_time = float(hist_df["epoch_time_sec"].mean()) if not hist_df.empty else 0.0
                mean_thr = float(hist_df["throughput_samples_per_sec"].mean()) if not hist_df.empty else 0.0
                efficiency_rows_global.append({
                    "sr": SR,
                    "seed": seed,
                    "fold": fold_id,
                    "arch": ARCH,
                    "n_params": n_params,
                    "epochs": EPOCHS,
                    "total_time_sec": total_time,
                    "avg_epoch_time_sec": avg_ep_time,
                    "mean_throughput_samples_per_sec": mean_thr,
                })

                results.append({
                    "seed": seed,
                    "fold": fold_id,
                    "sr": int(SR),
                    "val_acc": float(best_val_acc),
                    "cm_path": cm_path,
                    "ckpt_path": BEST_CKPT,
                })

        # Aggregate per-seed summaries (mean/std of best accuracy over folds)
        for sr0 in SR_LIST:
            accs = [r["val_acc"] for r in results if r["seed"] == seed and r["sr"] == int(sr0)]
            if accs:
                summary_rows.append({
                    "seed": seed,
                    "sr": int(sr0),
                    "acc_mean_over_folds": float(np.mean(accs)),
                    "acc_std_over_folds":  float(np.std(accs)),
                    # NOTE(review): records the configured N_SPLITS, not len(accs).
                    "n_folds": N_SPLITS
                })

In [None]:
# Export per-fold results, per-seed summaries and best checkpoints for each
# sample rate.
df_results_all = pd.DataFrame(results)
df_seed_all    = pd.DataFrame(summary_rows)

for SR in SR_LIST:
    ROOT_OUT, OUT_FOLD, OUT_SUMM = _outdirs_for(SR)

    # Rows of this sample rate. An empty results frame has no "sr" column,
    # so it is only filtered when it has data; otherwise the lookup raises
    # KeyError when no fold was trained.
    if df_results_all.empty:
        df_res_sr = df_results_all.copy()
    else:
        df_res_sr = df_results_all[df_results_all["sr"] == int(SR)].copy()

    if not df_res_sr.empty:
        df_res_sr.to_csv(OUT_FOLD / f"transformer_kfold_results_per_fold_sr{SR}.csv", index=False)

    if not df_seed_all.empty:
        df_seed_sr = df_seed_all[df_seed_all["sr"] == int(SR)].copy()
        if len(df_seed_sr):
            df_seed_sr.to_csv(OUT_SUMM / f"transformer_kfold_summary_per_seed_sr{SR}.csv", index=False)

            # Mean and sample standard deviation (ddof=1) of the per-seed
            # fold-mean accuracies; the deviation is 0.0 for a single seed.
            mu_acc = float(df_seed_sr["acc_mean_over_folds"].mean())
            sd_acc = float(df_seed_sr["acc_mean_over_folds"].std(ddof=1)) if len(df_seed_sr) > 1 else 0.0

            df_sr_summary = pd.DataFrame([{
                "model": "Transformer",
                "sr": int(SR),
                "acc_mean": mu_acc,
                "acc_sd": sd_acc,
                "n_seeds": int(df_seed_sr["seed"].nunique()),
                "kfold": int(df_seed_sr["n_folds"].max()) if "n_folds" in df_seed_sr else 5
            }])
            df_sr_summary.to_csv(OUT_SUMM / f"transformer_kfold_multi_seed_summary_sr{SR}.csv", index=False)

    if not df_res_sr.empty:
        # Manifest of every best checkpoint, ordered by seed and fold.
        manifest_path = OUT_SUMM / f"transformer_best_checkpoints_per_seed_fold_sr{SR}.csv"
        cols = ["seed", "fold", "sr", "val_acc", "ckpt_path", "cm_path"]
        (df_res_sr[cols].sort_values(["seed", "fold"]).to_csv(manifest_path, index=False))

        # Best fold per seed (highest validation accuracy).
        idx_per_seed = df_res_sr.groupby("seed")["val_acc"].idxmax()
        df_best_per_seed = df_res_sr.loc[idx_per_seed].copy().sort_values(["seed"]).reset_index(drop=True)

        per_seed_csv = OUT_SUMM / f"transformer_best_per_seed_sr{SR}.csv"
        df_best_per_seed[["seed", "fold", "sr", "val_acc", "ckpt_path"]].to_csv(per_seed_csv, index=False)

        # Copy each seed's best checkpoint into the summary directory.
        # Copy failures are reported and skipped (best effort).
        for _, row in df_best_per_seed.iterrows():
            src = Path(row["ckpt_path"])
            dst = OUT_SUMM / f"bestmodel_seed{int(row['seed'])}_sr{int(SR)}.pth"
            try:
                dst.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(src, dst)
            except Exception as e:
                print(f"[WARN][SR={SR}] Gagal salin best per seed {int(row['seed'])}: {e}")

        # Overall best checkpoint for this sample rate.
        row_sr_best = df_res_sr.loc[df_res_sr["val_acc"].idxmax()]
        src = Path(row_sr_best["ckpt_path"])
        dst = ROOT_OUT / f"bestmodel_sr{int(SR)}.pth"
        try:
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dst)
        except Exception as e:
            print(f"[WARN][SR={SR}] Gagal salin best per SR: {e}")

# Copy the checkpoint with the highest validation accuracy across all rows of
# df_results_all to a fixed location. Best-effort: failures are only reported.
if not df_results_all.empty:
    best_idx = df_results_all["val_acc"].idxmax()
    row_global = df_results_all.loc[best_idx]
    src = Path(row_global["ckpt_path"])
    final_dst = Path("/kaggle/working", "bestmodel.pth")
    try:
        # Make sure the target directory exists before copying.
        final_dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, final_dst)
    except Exception as e:
        print(f"[WARN] Gagal salin global best: {e}")

# Export the best fold per seed as JSON/CSV, both per sampling rate and as one
# combined table under the "legacy" output directory.
legacy_dir = "/kaggle/working/kfold_outputs"
os.makedirs(legacy_dir, exist_ok=True)

all_best_rows = []

for SR in SR_LIST:
    ROOT_OUT, OUT_FOLD, OUT_SUMM = _outdirs_for(SR)

    # An empty results DataFrame has no "sr" column, so filtering on it would
    # raise KeyError; there is nothing to export in that case.
    if df_results_all.empty:
        continue

    df_res_sr = df_results_all[df_results_all["sr"] == int(SR)].copy()
    if df_res_sr.empty:
        continue

    # For each seed, keep the fold with the highest validation accuracy.
    idx_per_seed = df_res_sr.groupby("seed")["val_acc"].idxmax()
    df_best_per_seed = df_res_sr.loc[idx_per_seed].copy().sort_values(["seed"]).reset_index(drop=True)

    best_rows_sr = []
    for _, rec in df_best_per_seed.iterrows():
        # Cast to built-in types so the record is JSON-serializable.
        slim = {
            "seed": int(rec["seed"]),
            "fold": int(rec["fold"]),
            "sr":   int(rec["sr"]),
            "val_acc": float(rec["val_acc"]),
            "cm_path": rec.get("cm_path", None),
            "ckpt_path": rec.get("ckpt_path", None),
        }
        best_rows_sr.append(slim)
        # One JSON file per (seed, best fold).
        with open(
            OUT_SUMM / f"transformer_best_seed{slim['seed']}_fold{slim['fold']}_sr{SR}.json",
            "w",
            encoding="utf-8",
        ) as f:
            json.dump(slim, f, indent=2)

    # Per-SR table of the best fold per seed, as CSV and JSON.
    pd.DataFrame(best_rows_sr).to_csv(OUT_SUMM / f"transformer_best_per_seed_sr{SR}.csv", index=False)
    with open(OUT_SUMM / f"transformer_best_per_seed_sr{SR}.json", "w", encoding="utf-8") as f:
        json.dump(best_rows_sr, f, indent=2)

    all_best_rows.extend(best_rows_sr)

# Combined table across all sampling rates, written to the legacy directory.
if all_best_rows:
    df_best_all = pd.DataFrame(all_best_rows).sort_values(["sr", "seed"]).reset_index(drop=True)
    df_best_path_csv  = os.path.join(legacy_dir, "transformer_best_per_seed.csv")
    df_best_path_json = os.path.join(legacy_dir, "transformer_best_per_seed.json")
    df_best_all.to_csv(df_best_path_csv, index=False)
    with open(df_best_path_json, "w", encoding="utf-8") as f:
        json.dump(all_best_rows, f, indent=2)

# Outside commit mode, print the list of output paths for every sampling rate.
# The listing is static: it does not check that the files exist.
if not COMMIT_MODE:
    for SR in SR_LIST:
        ROOT_OUT, OUT_FOLD, OUT_SUMM = _outdirs_for(SR)
        report_lines = [
            f"\n[SR={SR}] Saved to:",
            f" ├─ {OUT_FOLD}/transformer_kfold_results_per_fold_sr{SR}.csv",
            f" ├─ {OUT_SUMM}/transformer_kfold_summary_per_seed_sr{SR}.csv",
            f" ├─ {OUT_SUMM}/transformer_kfold_multi_seed_summary_sr{SR}.csv",
            f" ├─ {OUT_SUMM}/transformer_best_checkpoints_per_seed_fold_sr{SR}.csv",
            f" ├─ {OUT_SUMM}/transformer_best_per_seed_sr{SR}.csv",
            f" ├─ {OUT_SUMM}/transformer_speed_efficiency_per_fold_sr{SR}.csv",
            f" ├─ {OUT_SUMM}/transformer_speed_efficiency_per_seed_sr{SR}.csv",
            f" ├─ {OUT_SUMM}/transformer_best_per_seed_sr{SR}.json",
            f" ├─ {ROOT_OUT}/bestmodel_sr{SR}.pth",
            " ├─ /kaggle/working/bestmodel.pth (global, opsional)",
            " └─ /kaggle/working/kfold_outputs/transformer_best_per_seed.csv|json (legacy)",
        ]
        # One entry per line, matching a sequence of individual print() calls.
        print(*report_lines, sep="\n")