In [None]:
# --- Environment & installs ---
# Colab-style setup cell: installs run through shell magics, so this cell must
# be executed before the import cell below.
# Hugging Face stack used by `load_dataset` (both pinned).
!pip install datasets==2.16.0
!pip install huggingface-hub==0.20.0
# System sox headers; presumably required by `torchaudio.sox_effects`, which the
# optional tempo-stretch branch of `Augment` calls — TODO confirm.
!apt-get install -y libsox-dev
# PyTorch family pinned together and pulled from the CUDA 12.1 wheel index.
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
# Mamba kernels are pinned; NOTE(review): scikit-learn and tensorboard are not
# pinned here, so their versions depend on when the cell is run.
!pip install causal-conv1d==1.4.0 mamba-ssm==2.2.2 scikit-learn tensorboard

In [None]:
# =========================================================
# Keyword-Spotting (Google Speech Commands v0.02) with Mamba
# Front-end: MFCC (F=40) -> Linear(40 -> d_model) -> Mamba * L -> Classifier
# =========================================================

from __future__ import annotations
import json, os, random, math, time
from pathlib import Path
from typing import Tuple, Dict, List

import torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from datasets import load_dataset
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score

from mamba_ssm import Mamba

# ---------------------------------------------------------
# 0) Repro & device
# ---------------------------------------------------------
def set_seed(seed: int = 42, deterministic: bool = False):
    """Seed the Python and PyTorch RNGs and configure cuDNN.

    Args:
        seed: Value passed to ``random.seed``, ``torch.manual_seed`` and
            ``torch.cuda.manual_seed_all``.
        deterministic: When ``False`` (default, the historical behaviour) cuDNN
            runs with the autotuner enabled and determinism disabled, which is
            faster but may make runs differ even with the same seed. When
            ``True`` the two cuDNN flags are flipped
            (``deterministic=True``, ``benchmark=False``) to favour
            repeatability over speed.
    """
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # The two cuDNN switches are always set as a pair so they never contradict
    # each other (benchmark mode picks kernels by timing, which is not stable).
    torch.backends.cudnn.deterministic = bool(deterministic)
    torch.backends.cudnn.benchmark = not deterministic

# Pick the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mixed precision is only switched on for CUDA; `run_experiment` reads this
# global together with `device`.
use_amp = (device.type == "cuda")
# Seed once at import time; the grid loop re-seeds before every run.
set_seed(42)

# Colab-only: mount Google Drive so the export cell at the end of the notebook
# can write under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ---------------------------------------------------------
# 1) Waveform-level augmentation (shift + noise)
# ---------------------------------------------------------
class Augment:
    """Random waveform augmentation: optional tempo stretch, time shift, Gaussian noise.

    Args:
        stretch: ``(low, high)`` range for the sox ``tempo`` factor. The default
            ``(1.0, 1.0)`` disables stretching entirely.
        shift_ms: Maximum absolute time shift in milliseconds (converted to
            samples with ``sr``). ``0`` disables shifting.
        noise: ``(low, high)`` range from which the noise standard deviation is
            drawn uniformly. Noise is skipped when ``high`` is not positive.
        sr: Sample rate in Hz.

    Calling the object accepts a 1-D ``[T]`` or 2-D ``[C, T]`` tensor and
    returns a tensor with the same number of dimensions.
    """

    def __init__(self,
                 stretch: Tuple[float,float]=(1.0,1.0),
                 shift_ms: int = 100,
                 noise: Tuple[float,float]=(0.,0.05),
                 sr: int = 16_000):
        self.sr      = sr
        self.stretch = stretch
        self.noise   = noise
        # Maximum shift expressed in samples.
        self.shift   = int(shift_ms * sr / 1000)

    def _shift(self, x: torch.Tensor):
        """Shift a ``[C, T]`` tensor by a random offset, zero-filling the gap."""
        max_offset = self.shift
        if max_offset == 0:
            return x
        offset = int(torch.randint(-max_offset, max_offset + 1, ()).item())
        if offset > 0:
            # Delay: pad on the left, drop the same number of samples on the right.
            return F.pad(x, (offset, 0))[:, :-offset]
        if offset < 0:
            # Advance: pad on the right, drop the leading samples.
            return F.pad(x, (0, -offset))[:, -offset:]
        return x

    def __call__(self, wav: torch.Tensor):
        """Apply stretch -> shift -> noise, in that order."""
        was_1d = wav.dim() == 1
        if was_1d:
            wav = wav.unsqueeze(0)

        if self.stretch != (1.0, 1.0):
            tempo = float(torch.empty(()).uniform_(*self.stretch))
            # Factors within 1e-3 of 1.0 are treated as "no stretch".
            if abs(tempo - 1.0) > 1e-3:
                wav, _ = torchaudio.sox_effects.apply_effects_tensor(
                    wav, self.sr, [["tempo", f"{tempo}"]]
                )

        wav = self._shift(wav)

        noise_range = self.noise
        if noise_range[1] > 0:
            sigma = float(torch.empty(()).uniform_(*noise_range))
            if sigma > 0:
                wav = wav + sigma * torch.randn_like(wav)

        if was_1d:
            return wav.squeeze(0)
        return wav

# ---------------------------------------------------------
# 2) Front-end: MFCC or Mel with SpecAug AFTER log (if mel)
#    For MFCC we mask on the MFCCs (computed from log-mel internally).
# ---------------------------------------------------------
class WaveToSpec:
    """Waveform -> feature front-end producing either log-mel or MFCC features.

    Args:
        feature_type: ``"mfcc"`` or ``"mel"`` (case-insensitive).
        sample_rate, n_fft, hop_length, n_mels: Mel filterbank settings, used by
            both feature types.
        n_mfcc: Number of cepstral coefficients (MFCC mode only).
        top_db: Passed to ``AmplitudeToDB`` in mel mode; not used in MFCC mode.
        apply_mask: Whether to apply frequency/time masking to the features.
        freq_mask_param, time_mask_param: Maximum mask widths handed to
            ``FrequencyMasking`` / ``TimeMasking``.

    Attributes:
        n_out: Feature dimension of the output (``n_mels`` or ``n_mfcc``).
    """

    def __init__(self,
                 feature_type: str = "mfcc",     # "mfcc" or "mel"
                 sample_rate: int = 16_000,
                 n_fft: int = 2048,
                 hop_length: int = 256,
                 n_mels: int = 128,
                 n_mfcc: int = 40,
                 top_db: int | None = 80,
                 apply_mask: bool = True,
                 freq_mask_param: int = 3,
                 time_mask_param: int = 12):
        kind = feature_type.lower()
        self.feature_type = kind
        assert kind in {"mel","mfcc"}
        self.apply_mask = apply_mask

        if kind == "mel":
            self.spec = T.MelSpectrogram(sample_rate, n_fft=n_fft, hop_length=hop_length,
                                         n_mels=n_mels, power=2.0)
            self.to_db = T.AmplitudeToDB(stype="power", top_db=top_db)
            self.n_out = n_mels
        else:
            mel_settings = {"n_fft": n_fft, "hop_length": hop_length, "n_mels": n_mels}
            self.spec = T.MFCC(sample_rate, n_mfcc=n_mfcc, melkwargs=mel_settings)
            # No explicit dB conversion object is kept in MFCC mode.
            self.to_db = None
            self.n_out = n_mfcc

        # Mask transforms are configured identically for both feature types.
        self.freq_mask = T.FrequencyMasking(freq_mask_param) if apply_mask else None
        self.time_mask = T.TimeMasking(time_mask_param) if apply_mask else None

    def __call__(self, wav: torch.Tensor) -> torch.Tensor:
        """Convert ``wav`` (``[T]`` or ``[1, T]``) into features shaped ``[1, F, T']``."""
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)            # [1, T]
        feats = self.spec(wav)

        if self.feature_type == "mel":
            # Log compression happens before masking in mel mode.
            feats = self.to_db(feats.clamp(min=1e-10))
            mask_passes = 1
        else:
            # MFCC mode applies each mask type twice.
            mask_passes = 2

        if self.apply_mask:
            # All frequency masks first, then all time masks.
            for _ in range(mask_passes):
                feats = self.freq_mask(feats)
            for _ in range(mask_passes):
                feats = self.time_mask(feats)

        return feats

In [None]:
# ---------------------------------------------------------
# 3) Dataset wrapper with dataset-level normalization
# ---------------------------------------------------------
class SpeechCommands(Dataset):
    """Map-style dataset turning raw audio samples into normalized feature sequences.

    Args:
        hf_split: Indexable collection whose items expose
            ``item["audio"]["array"]`` (a NumPy array) and ``item["label"]``.
        aug: Optional waveform augmentation callable; skipped when falsy.
        frontend: Callable mapping a fixed-length waveform to ``[1, F, T]`` features.
        wav_len: Number of samples every waveform is padded or cropped to.
        mean, std: Global statistics used to standardise the features.
    """

    def __init__(self, hf_split, aug: Augment | None, frontend: WaveToSpec,
                 wav_len: int = 16_000, mean: float = 0.0, std: float = 1.0):
        self.ds = hf_split
        self.aug = aug
        self.front = frontend
        self.wav_len = wav_len
        self.mean = mean
        self.std = std

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        """Return ``(features [T, F], label)`` for item ``idx``."""
        item = self.ds[idx]
        wav = torch.from_numpy(item["audio"]["array"]).float()

        # Force a fixed length: zero-pad short clips on the right, crop long ones.
        n_missing = self.wav_len - wav.numel()
        if n_missing > 0:
            wav = F.pad(wav, (0, n_missing))
        else:
            wav = wav[: self.wav_len]

        if self.aug:
            wav = self.aug(wav)

        feats = self.front(wav)                              # [1, F, T]
        # Small epsilon guards against a zero std.
        feats = (feats - self.mean) / (self.std + 1e-6)
        # Drop the leading singleton dim and put time first for the model.
        return feats.squeeze(0).transpose(0, 1), item["label"]

In [None]:
# ---------------------------------------------------------
# 4) Helper functions + stats
# ---------------------------------------------------------
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Collate ``(features [T_i, F], label)`` pairs into a zero-padded batch.

    Returns:
        ``(padded [B, T_max, F], labels [B], lengths [B])`` where ``lengths``
        holds the unpadded number of frames of each item (dtype ``long``).
    """
    sequences, labels = zip(*batch)
    frame_counts = [seq.size(0) for seq in sequences]
    padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)
    return padded, torch.tensor(labels), torch.tensor(frame_counts, dtype=torch.long)

@torch.no_grad()
def compute_dataset_stats(ds, frontend, wav_len=16_000):
    """Compute the global mean and (unbiased) std of the front-end features.

    The statistics are accumulated as running sums in float64, so memory use is
    constant in the dataset size instead of holding every feature frame (and a
    concatenated copy of them) in RAM.

    Args:
        ds: Iterable of samples exposing ``sample["audio"]["array"]`` (NumPy array).
        frontend: Callable mapping a fixed-length waveform to a feature tensor.
        wav_len: Number of samples each waveform is zero-padded or cropped to.

    Returns:
        ``(mean, std)`` as Python floats, taken over every feature value of
        every sample. ``std`` uses the ``n - 1`` denominator (matching
        ``Tensor.std()``) and is ``nan`` when only one value was seen.

    Raises:
        ValueError: If ``ds`` yields no feature values at all.
    """
    n_values = 0
    running_sum = 0.0
    running_sq_sum = 0.0

    for sample in ds:
        wav = torch.from_numpy(sample["audio"]["array"]).float()
        n_missing = wav_len - wav.numel()
        if n_missing > 0:
            wav = F.pad(wav, (0, n_missing))
        else:
            wav = wav[:wav_len]

        # float64 keeps the sum-of-squares formulation numerically safe.
        feats = frontend(wav).to(torch.float64)
        running_sum += feats.sum().item()
        running_sq_sum += feats.square().sum().item()
        n_values += feats.numel()

    if n_values == 0:
        raise ValueError("compute_dataset_stats: dataset produced no feature values")

    mean = running_sum / n_values
    if n_values < 2:
        return mean, float("nan")

    # Clamp at zero: rounding can push a near-zero variance slightly negative.
    variance = max(running_sq_sum - n_values * mean * mean, 0.0) / (n_values - 1)
    return mean, variance ** 0.5

In [None]:
# ---------------------------------------------------------
# 5) Model: Linear(40->d_model) + Mamba×L + classifier
# ---------------------------------------------------------
class MambaKWS(nn.Module):
    """Keyword-spotting classifier: input projection, residual Mamba stack, MLP head.

    Args:
        num_classes: Number of output classes.
        d_model: Width of the hidden representation.
        d_state, expand: Forwarded to every ``Mamba`` layer.
        n_layers: Number of residual Mamba blocks.
        feature_dim: Size of the last dimension of the input features.
        p_drop: Dropout probability used in the input projection.
    """

    def __init__(self, num_classes: int,
                 d_model=256, d_state=16, expand=2, n_layers=8, feature_dim=40, p_drop=0.1):
        super().__init__()

        self.proj = nn.Sequential(
            nn.Linear(feature_dim, d_model),
            nn.LayerNorm(d_model),
            nn.SiLU(),
            nn.Dropout(p_drop),
        )

        stack = []
        for depth in range(n_layers):
            # Dropout shrinks by 0.005 per layer, starting at 0.05, floored at 0.02.
            layer_drop = max(0.02, 0.05 - (depth * 0.005))
            stack.append(nn.ModuleDict({
                "norm": nn.LayerNorm(d_model),
                "mamba": Mamba(d_model=d_model, d_state=d_state, expand=expand),
                "dropout": nn.Dropout(layer_drop),
            }))
        self.blocks = nn.ModuleList(stack)

        self.pre_classifier_norm = nn.LayerNorm(d_model)

        hidden = d_model // 2
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(d_model, hidden),
            nn.SiLU(),
            nn.Dropout(0.05),
            nn.Linear(hidden, num_classes),
        )

    def forward(self, x, lengths: torch.Tensor | None = None):  # x: [B, T, F]
        """Return class logits ``[B, C]``.

        When ``lengths`` is given, time steps at index ``>= lengths[b]`` are
        excluded from the temporal average; otherwise all steps are averaged.
        """
        h = self.proj(x)                                        # [B, T, d_model]

        # Pre-norm residual blocks: h <- h + dropout(mamba(norm(h)))
        for layer in self.blocks:
            update = layer["norm"](h)
            update = layer["mamba"](update)
            update = layer["dropout"](update)
            h = h + update

        h = self.pre_classifier_norm(h)

        if lengths is None:
            pooled = h.mean(dim=1)
        else:
            # Length-aware mean over time; the time axis is not downsampled,
            # so `lengths` indexes it directly.
            positions = torch.arange(h.size(1), device=h.device)
            valid = (positions[None, :] < lengths[:, None]).float()      # [B, T]
            summed = (h * valid.unsqueeze(-1)).sum(dim=1)                # [B, d_model]
            # Clamp so an all-padding row divides by 1 instead of 0.
            n_valid = valid.sum(dim=1).clamp(min=1.0).unsqueeze(-1)      # [B, 1]
            pooled = summed / n_valid

        return self.classifier(pooled)                        # [B, C]

# ---------------------------------------------------------
# 6) Evaluation helpers (loss, acc, macro-F1)
# ---------------------------------------------------------
@torch.no_grad()
def evaluate(model, loader, device, criterion):
    """Run ``model`` over ``loader`` in eval mode and summarise its performance.

    Args:
        model: Module called as ``model(features, lengths=lengths)``.
        loader: Iterable of ``(features, labels, lengths)`` batches.
        device: Device every batch is moved to.
        criterion: Loss returning a per-batch mean.

    Returns:
        ``(mean_loss, accuracy_percent, macro_f1)``. The model is left in eval
        mode; the caller is responsible for switching back to train mode.
    """
    model.eval()
    n_seen = 0
    n_correct = 0
    loss_total = 0
    pred_chunks, true_chunks = [], []

    for feats, labels, lengths in loader:
        feats = feats.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)

        logits = model(feats, lengths=lengths)
        batch_size = feats.size(0)
        # Re-weight the batch mean by batch size so the final average is per-sample.
        loss_total += criterion(logits, labels).item() * batch_size

        predicted = logits.argmax(1)
        n_correct += (predicted == labels).sum().item()
        n_seen += batch_size

        pred_chunks.append(predicted.detach().cpu())
        true_chunks.append(labels.detach().cpu())

    y_pred = torch.cat(pred_chunks).numpy()
    y_true = torch.cat(true_chunks).numpy()
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return loss_total / n_seen, 100 * n_correct / n_seen, macro_f1

In [None]:
# ---------------------------------------------------------
# 7) Warmup + Cosine scheduler factory (per-batch)
# ---------------------------------------------------------
def make_warmup_cosine(total_steps: int, warmup_steps: int, min_ratio: float = 0.003):
    """Build an LR-multiplier function: linear warmup followed by cosine decay.

    Intended for ``torch.optim.lr_scheduler.LambdaLR`` stepped once per batch.

    Args:
        total_steps: Total number of scheduler steps in the run.
        warmup_steps: Number of initial steps over which the multiplier rises
            linearly from 0 to 1.
        min_ratio: Lower bound of the multiplier during and after the decay.

    Returns:
        ``lr_lambda(step) -> float``. For ``step >= total_steps`` the multiplier
        stays at ``min_ratio``; without the clamp the cosine would wrap around
        and the learning rate would climb back up if the scheduler were stepped
        past ``total_steps``.
    """
    decay_steps = max(1, total_steps - warmup_steps)

    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        # Clamp to 1.0 so the schedule holds its floor after the planned horizon.
        progress = min(1.0, float(step - warmup_steps) / float(decay_steps))
        return max(min_ratio, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return lr_lambda

In [None]:
# ---------------------------------------------------------
# 8) Training loop for a single config (called once per grid entry) + TensorBoard
# ---------------------------------------------------------
def run_experiment(cfg: Dict, ds_splits, global_outdir="checkpoints"):
    """Train one MambaKWS configuration, logging to TensorBoard and checkpointing.

    Reads the module-level ``device`` and ``use_amp`` globals.

    Args:
        cfg: Hyper-parameters. Required keys: ``d_model``, ``n_layers``,
            ``epochs``. Optional keys (default): ``base_lr`` (5e-4),
            ``weight_decay`` (1.8e-4), ``label_smoothing`` (0.07), ``d_state``
            (16), ``expand`` (2), ``feature_dim`` (40) and ``run_name``
            (``"d{d_model}_L{n_layers}"``). Set ``run_name`` explicitly when
            several configs share ``d_model``/``n_layers``; otherwise their
            logs and checkpoints overwrite each other.
        ds_splits: ``(train_dl, train_eval_dl, val_dl, n_classes)``; each loader
            yields ``(features, labels, lengths)`` batches.
        global_outdir: Directory receiving ``<run_name>_best.pt`` and
            ``<run_name>_last.pt``.

    Returns:
        Dict with ``run``, ``best_val_acc``, ``best_ckpt`` and ``last_ckpt``.
        ``best_ckpt`` is only written once validation accuracy exceeds 0.
    """
    d_model   = cfg["d_model"]
    n_layers  = cfg["n_layers"]
    epochs    = cfg["epochs"]
    base_lr   = cfg.get("base_lr", 5e-4)
    weight_decay = cfg.get("weight_decay", 1.8e-4)
    label_smoothing = cfg.get("label_smoothing", 0.07)
    d_state   = cfg.get("d_state", 16)
    expand    = cfg.get("expand", 2)
    feature_dim = cfg.get("feature_dim", 40)

    # Unpack datasets/loaders prepared outside
    train_dl, train_eval_dl, val_dl, n_classes = ds_splits

    # Model / optim / sched
    model = MambaKWS(n_classes, d_model=d_model, d_state=d_state, expand=expand,
                     n_layers=n_layers, feature_dim=feature_dim).to(device)
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=weight_decay, betas=(0.9, 0.999))

    # The scheduler is stepped once per batch, so its horizon is in batches.
    steps_per_epoch = len(train_dl)
    total_steps   = steps_per_epoch * epochs
    warmup_steps  = int(0.1 * total_steps)  # 10% warmup
    sched = torch.optim.lr_scheduler.LambdaLR(opt, make_warmup_cosine(total_steps, warmup_steps))

    run_name = cfg.get("run_name", f"d{d_model}_L{n_layers}")
    best_path = Path(f"{global_outdir}/{run_name}_best.pt")
    last_path = Path(f"{global_outdir}/{run_name}_last.pt")
    best_path.parent.mkdir(parents=True, exist_ok=True)
    best_val_acc = 0.0

    # `torch.cuda.amp.GradScaler(...)` is deprecated in favour of the
    # device-agnostic `torch.amp.GradScaler("cuda", ...)`.
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    # Loop-invariant: prefer bf16 when the GPU supports it, else fp16.
    amp_dtype = torch.bfloat16 if (use_amp and torch.cuda.is_bf16_supported()) else torch.float16

    # TensorBoard writer; closed in `finally` so event files are flushed even
    # when training is interrupted or raises.
    writer = SummaryWriter(log_dir=f"runs/mamba_kws/{run_name}")
    try:
        writer.add_text("hparams", json.dumps(cfg, indent=2))

        for epoch in range(1, epochs + 1):
            model.train()
            running_loss = correct = total = 0.0
            pbar = tqdm(train_dl, desc=f"[{run_name}] Epoch {epoch:02d}")

            for xb, yb, lb in pbar:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)
                lb = lb.to(device, non_blocking=True)

                with torch.amp.autocast(device_type=device.type,
                                        dtype=amp_dtype,
                                        enabled=use_amp):
                    # Defensive: zero out NaNs in the input batch if any appear.
                    if torch.isnan(xb).any():
                        xb = torch.nan_to_num(xb, nan=0.0)
                    logits = model(xb, lengths=lb)
                    loss = criterion(logits, yb)

                opt.zero_grad(set_to_none=True)
                scaler.scale(loss).backward()
                # Unscale before clipping so the threshold applies to true gradients.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.8)
                scaler.step(opt)
                scaler.update()
                sched.step()

                with torch.no_grad():
                    pred = logits.argmax(1)
                    correct += (pred == yb).sum().item()
                    total += yb.size(0)
                    running_loss += loss.item() * yb.size(0)

                pbar.set_postfix(train_loss=f"{running_loss/max(1,total):.3f}",
                                 train_acc=f"{100*correct/max(1,total):.1f}%",
                                 lr=f"{opt.param_groups[0]['lr']:.2e}")

            # End-of-epoch metrics on the un-augmented train split and on validation.
            tr_loss, tr_acc, tr_f1 = evaluate(model, train_eval_dl, device, criterion)
            val_loss, val_acc, val_f1 = evaluate(model, val_dl, device, criterion)

            writer.add_scalar("train/loss", tr_loss, epoch)
            writer.add_scalar("train/acc",  tr_acc,  epoch)
            writer.add_scalar("train/macro_f1", tr_f1, epoch)
            writer.add_scalar("val/loss",   val_loss, epoch)
            writer.add_scalar("val/acc",    val_acc,  epoch)
            writer.add_scalar("val/macro_f1", val_f1, epoch)
            writer.add_scalar("lr", opt.param_groups[0]['lr'], epoch)

            print(f"[{run_name}] Epoch {epoch:02d} — "
                  f"Train {tr_acc:.2f}% (loss {tr_loss:.3f}, F1 {tr_f1:.4f}) | "
                  f"Val {val_acc:.2f}% (loss {val_loss:.3f}, F1 {val_f1:.4f})")

            # Save best-by-val-acc
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save(model.state_dict(), best_path)
                print(f"[{run_name}] ** Saved new BEST ** (val_acc={best_val_acc:.2f}%) → {best_path}")

        # Save last
        torch.save(model.state_dict(), last_path)
        writer.add_text("checkpoints", f"best={str(best_path)} | last={str(last_path)}")
    finally:
        writer.close()

    return {"run": run_name, "best_val_acc": best_val_acc,
            "best_ckpt": str(best_path), "last_ckpt": str(last_path)}

In [None]:
# ---------------------------------------------------------
# 9) Data prep (MFCC; masks on train only) + config-grid training run
# ---------------------------------------------------------
# Driver cell: loads the dataset, builds front-ends / datasets / loaders,
# then trains every config in the grid and prints a summary. All names
# assigned here are notebook globals.
if __name__ == "__main__":
    # Load HF dataset
    # NOTE(review): the recorded output warns that `trust_remote_code=True`
    # will become mandatory for this dataset in the next major `datasets` release.
    ds = load_dataset("google/speech_commands", "v0.02")

    n_classes = len(ds["train"].features["label"].names)
    # NOTE(review): `sr` is not referenced again in this cell; the front-ends
    # and `Augment` below rely on their own 16 kHz defaults.
    sr = 16_000

    # Front-ends
    # `frontend_stats` and `frontend_eval` share the same mask-free settings;
    # only `frontend_train` enables masking.
    frontend_stats = WaveToSpec(feature_type="mfcc", n_mfcc=40, n_mels=128,
                                apply_mask=False)
    frontend_train = WaveToSpec(feature_type="mfcc", n_mfcc=40, n_mels=128,
                                apply_mask=True, freq_mask_param=5, time_mask_param=18)
    frontend_eval  = WaveToSpec(feature_type="mfcc", n_mfcc=40, n_mels=128,
                                apply_mask=False)

    # Waveform augmentations (train only)
    aug = Augment(shift_ms=120, noise=(0., 0.01))

    # Dataset-level normalization stats (computed on train, no aug)
    # This iterates over the whole train split, so it is re-done on every run
    # of this cell.
    print("Computing dataset stats (MFCC, no masks)...")
    train_mean, train_std = compute_dataset_stats(ds["train"], frontend_stats)
    print(f"Stats — mean={train_mean:.4f}, std={train_std:.4f}")

    # Datasets
    # All three datasets are normalized with the train statistics.
    # `train_eval_ds` re-reads the train split without augmentation or masks so
    # the reported train metrics are measured like the validation metrics.
    # The "test" split is not used anywhere in this cell.
    train_ds       = SpeechCommands(ds["train"], aug,  frontend_train, mean=train_mean, std=train_std)
    train_eval_ds  = SpeechCommands(ds["train"], None, frontend_eval,  mean=train_mean, std=train_std)  # no aug
    val_ds         = SpeechCommands(ds["validation"], None, frontend_eval, mean=train_mean, std=train_std)

    # Loaders
    # Shared loader settings; only the training loader shuffles.
    dl_kwargs = dict(batch_size=128, num_workers=2, pin_memory=True,
                     persistent_workers=True, collate_fn=collate_fn)
    train_dl      = DataLoader(train_ds, shuffle=True,  **dl_kwargs)
    train_eval_dl = DataLoader(train_eval_ds, shuffle=False, **dl_kwargs)
    val_dl        = DataLoader(val_ds, shuffle=False, **dl_kwargs)

    # Tuple layout expected by `run_experiment`.
    ds_splits = (train_dl, train_eval_dl, val_dl, n_classes)

    # -----------------------------------------------------
    # Grid: check configurations
    # -----------------------------------------------------
    outdir = "checkpoints"
    os.makedirs(outdir, exist_ok=True)

    # The grid currently holds a single point (d_model=64, n_layers=8); extend
    # the two lists below to sweep more sizes.
    configs = []
    for d_model in [64]:
        for n_layers in [8]:
            configs.append({
                "d_model": d_model,
                "n_layers": n_layers,
                "epochs": 100,
                "base_lr": 5e-4,
                "weight_decay": 1.8e-4,
                "label_smoothing": 0.07,
                "d_state": 16,
                "expand": 2
            })

    # Train each config in turn and collect the dict returned by `run_experiment`.
    results = []
    for cfg in configs:
        set_seed(42)  # keep runs comparable
        res = run_experiment(cfg, ds_splits, global_outdir=outdir)
        results.append(res)

    print("\n=== Summary ===")
    for r in results:
        print(r)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/229M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/112M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84848 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9982 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4890 [00:00<?, ? examples/s]

Computing dataset stats (MFCC, no masks)...
Stats — mean=-5.8153, std=47.6515


  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


[d64_L8] Epoch 01:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 01 — Train 3.96% (loss 3.501, F1 0.0099) | Val 3.67% (loss 3.502, F1 0.0091)
[d64_L8] ** Saved new BEST ** (val_acc=3.67%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 02:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 02 — Train 17.28% (loss 3.069, F1 0.0852) | Val 17.72% (loss 3.057, F1 0.0883)
[d64_L8] ** Saved new BEST ** (val_acc=17.72%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 03:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 03 — Train 36.50% (loss 2.510, F1 0.2465) | Val 37.91% (loss 2.478, F1 0.2564)
[d64_L8] ** Saved new BEST ** (val_acc=37.91%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 04:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 04 — Train 52.91% (loss 1.994, F1 0.4221) | Val 54.37% (loss 1.965, F1 0.4290)
[d64_L8] ** Saved new BEST ** (val_acc=54.37%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 05:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 05 — Train 63.86% (loss 1.645, F1 0.5560) | Val 65.18% (loss 1.617, F1 0.5693)
[d64_L8] ** Saved new BEST ** (val_acc=65.18%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 06:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 06 — Train 73.88% (loss 1.359, F1 0.6841) | Val 75.79% (loss 1.329, F1 0.7026)
[d64_L8] ** Saved new BEST ** (val_acc=75.79%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 07:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 07 — Train 78.99% (loss 1.184, F1 0.7451) | Val 79.83% (loss 1.157, F1 0.7539)
[d64_L8] ** Saved new BEST ** (val_acc=79.83%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 08:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 08 — Train 81.97% (loss 1.091, F1 0.7783) | Val 83.57% (loss 1.060, F1 0.7947)
[d64_L8] ** Saved new BEST ** (val_acc=83.57%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 09:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 09 — Train 85.37% (loss 0.991, F1 0.8187) | Val 86.19% (loss 0.973, F1 0.8271)
[d64_L8] ** Saved new BEST ** (val_acc=86.19%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 10:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 10 — Train 86.42% (loss 0.963, F1 0.8291) | Val 87.24% (loss 0.942, F1 0.8347)
[d64_L8] ** Saved new BEST ** (val_acc=87.24%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 11:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 11 — Train 87.64% (loss 0.917, F1 0.8436) | Val 88.52% (loss 0.895, F1 0.8510)
[d64_L8] ** Saved new BEST ** (val_acc=88.52%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 12:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 12 — Train 89.79% (loss 0.854, F1 0.8654) | Val 90.04% (loss 0.851, F1 0.8670)
[d64_L8] ** Saved new BEST ** (val_acc=90.04%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 13:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 13 — Train 90.46% (loss 0.830, F1 0.8727) | Val 90.98% (loss 0.814, F1 0.8753)
[d64_L8] ** Saved new BEST ** (val_acc=90.98%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 14:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 14 — Train 91.28% (loss 0.806, F1 0.8820) | Val 91.45% (loss 0.803, F1 0.8804)
[d64_L8] ** Saved new BEST ** (val_acc=91.45%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 15:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 15 — Train 92.01% (loss 0.785, F1 0.8891) | Val 92.20% (loss 0.781, F1 0.8885)
[d64_L8] ** Saved new BEST ** (val_acc=92.20%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 16:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 16 — Train 92.29% (loss 0.770, F1 0.8917) | Val 92.64% (loss 0.765, F1 0.8928)
[d64_L8] ** Saved new BEST ** (val_acc=92.64%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 17:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 17 — Train 92.86% (loss 0.751, F1 0.8987) | Val 92.87% (loss 0.749, F1 0.8970)
[d64_L8] ** Saved new BEST ** (val_acc=92.87%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 18:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 18 — Train 92.88% (loss 0.747, F1 0.8990) | Val 92.87% (loss 0.748, F1 0.8982)


[d64_L8] Epoch 19:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 19 — Train 93.36% (loss 0.731, F1 0.9022) | Val 93.21% (loss 0.730, F1 0.8982)
[d64_L8] ** Saved new BEST ** (val_acc=93.21%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 20:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 20 — Train 93.44% (loss 0.729, F1 0.9036) | Val 93.43% (loss 0.730, F1 0.9029)
[d64_L8] ** Saved new BEST ** (val_acc=93.43%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 21:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 21 — Train 93.99% (loss 0.712, F1 0.9099) | Val 93.45% (loss 0.722, F1 0.9051)
[d64_L8] ** Saved new BEST ** (val_acc=93.45%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 22:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 22 — Train 94.22% (loss 0.704, F1 0.9122) | Val 94.04% (loss 0.708, F1 0.9097)
[d64_L8] ** Saved new BEST ** (val_acc=94.04%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 23:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 23 — Train 94.03% (loss 0.707, F1 0.9112) | Val 94.03% (loss 0.711, F1 0.9105)


[d64_L8] Epoch 24:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 24 — Train 94.55% (loss 0.689, F1 0.9162) | Val 94.63% (loss 0.688, F1 0.9164)
[d64_L8] ** Saved new BEST ** (val_acc=94.63%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 25:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 25 — Train 94.54% (loss 0.687, F1 0.9148) | Val 94.39% (loss 0.693, F1 0.9145)


[d64_L8] Epoch 26:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 26 — Train 94.85% (loss 0.679, F1 0.9181) | Val 94.65% (loss 0.686, F1 0.9170)
[d64_L8] ** Saved new BEST ** (val_acc=94.65%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 27:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 27 — Train 95.09% (loss 0.671, F1 0.9219) | Val 94.83% (loss 0.680, F1 0.9187)
[d64_L8] ** Saved new BEST ** (val_acc=94.83%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 28:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 28 — Train 95.05% (loss 0.672, F1 0.9209) | Val 94.64% (loss 0.681, F1 0.9156)


[d64_L8] Epoch 29:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 29 — Train 95.18% (loss 0.668, F1 0.9219) | Val 94.81% (loss 0.680, F1 0.9171)


[d64_L8] Epoch 30:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 30 — Train 95.20% (loss 0.665, F1 0.9228) | Val 94.77% (loss 0.679, F1 0.9191)


[d64_L8] Epoch 31:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 31 — Train 95.43% (loss 0.657, F1 0.9245) | Val 95.01% (loss 0.672, F1 0.9199)
[d64_L8] ** Saved new BEST ** (val_acc=95.01%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 32:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 32 — Train 95.41% (loss 0.656, F1 0.9238) | Val 94.98% (loss 0.669, F1 0.9198)


[d64_L8] Epoch 33:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 33 — Train 95.67% (loss 0.652, F1 0.9267) | Val 95.13% (loss 0.672, F1 0.9219)
[d64_L8] ** Saved new BEST ** (val_acc=95.13%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 34:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 34 — Train 95.70% (loss 0.648, F1 0.9266) | Val 94.92% (loss 0.669, F1 0.9192)


[d64_L8] Epoch 35:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 35 — Train 95.72% (loss 0.646, F1 0.9272) | Val 94.78% (loss 0.671, F1 0.9174)


[d64_L8] Epoch 36:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 36 — Train 95.49% (loss 0.651, F1 0.9253) | Val 94.85% (loss 0.671, F1 0.9188)


[d64_L8] Epoch 37:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 37 — Train 95.82% (loss 0.643, F1 0.9282) | Val 94.77% (loss 0.670, F1 0.9178)


[d64_L8] Epoch 38:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 38 — Train 96.00% (loss 0.636, F1 0.9300) | Val 95.30% (loss 0.656, F1 0.9219)
[d64_L8] ** Saved new BEST ** (val_acc=95.30%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 39:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 39 — Train 96.00% (loss 0.637, F1 0.9306) | Val 95.40% (loss 0.660, F1 0.9239)
[d64_L8] ** Saved new BEST ** (val_acc=95.40%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 40:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 40 — Train 96.24% (loss 0.631, F1 0.9332) | Val 95.17% (loss 0.659, F1 0.9216)


[d64_L8] Epoch 41:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 41 — Train 96.12% (loss 0.634, F1 0.9317) | Val 95.18% (loss 0.660, F1 0.9226)


[d64_L8] Epoch 42:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 42 — Train 96.27% (loss 0.628, F1 0.9332) | Val 95.32% (loss 0.654, F1 0.9240)


[d64_L8] Epoch 43:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 43 — Train 96.28% (loss 0.627, F1 0.9335) | Val 95.27% (loss 0.656, F1 0.9228)


[d64_L8] Epoch 44:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 44 — Train 96.39% (loss 0.624, F1 0.9348) | Val 95.49% (loss 0.649, F1 0.9254)
[d64_L8] ** Saved new BEST ** (val_acc=95.49%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 45:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 45 — Train 96.26% (loss 0.627, F1 0.9327) | Val 95.46% (loss 0.654, F1 0.9245)


[d64_L8] Epoch 46:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 46 — Train 96.53% (loss 0.619, F1 0.9358) | Val 95.51% (loss 0.648, F1 0.9249)
[d64_L8] ** Saved new BEST ** (val_acc=95.51%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 47:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 47 — Train 96.50% (loss 0.619, F1 0.9356) | Val 95.54% (loss 0.646, F1 0.9260)
[d64_L8] ** Saved new BEST ** (val_acc=95.54%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 48:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 48 — Train 96.46% (loss 0.620, F1 0.9357) | Val 95.22% (loss 0.652, F1 0.9231)


[d64_L8] Epoch 49:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 49 — Train 96.72% (loss 0.612, F1 0.9376) | Val 95.67% (loss 0.642, F1 0.9266)
[d64_L8] ** Saved new BEST ** (val_acc=95.67%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 50:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 50 — Train 96.61% (loss 0.616, F1 0.9366) | Val 95.38% (loss 0.649, F1 0.9238)


[d64_L8] Epoch 51:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 51 — Train 96.67% (loss 0.611, F1 0.9372) | Val 95.45% (loss 0.645, F1 0.9238)


[d64_L8] Epoch 52:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 52 — Train 96.72% (loss 0.613, F1 0.9378) | Val 95.52% (loss 0.645, F1 0.9246)


[d64_L8] Epoch 53:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 53 — Train 96.82% (loss 0.609, F1 0.9388) | Val 95.33% (loss 0.650, F1 0.9223)


[d64_L8] Epoch 54:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 54 — Train 96.79% (loss 0.607, F1 0.9387) | Val 95.47% (loss 0.646, F1 0.9240)


[d64_L8] Epoch 55:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 55 — Train 96.93% (loss 0.605, F1 0.9400) | Val 95.66% (loss 0.643, F1 0.9271)


[d64_L8] Epoch 56:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 56 — Train 96.98% (loss 0.602, F1 0.9407) | Val 95.67% (loss 0.640, F1 0.9270)


[d64_L8] Epoch 57:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 57 — Train 96.99% (loss 0.603, F1 0.9403) | Val 95.62% (loss 0.640, F1 0.9261)


[d64_L8] Epoch 58:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 58 — Train 96.99% (loss 0.602, F1 0.9407) | Val 95.60% (loss 0.643, F1 0.9261)


[d64_L8] Epoch 59:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 59 — Train 97.09% (loss 0.599, F1 0.9418) | Val 95.72% (loss 0.635, F1 0.9274)
[d64_L8] ** Saved new BEST ** (val_acc=95.72%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 60:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 60 — Train 97.09% (loss 0.599, F1 0.9419) | Val 95.77% (loss 0.641, F1 0.9283)
[d64_L8] ** Saved new BEST ** (val_acc=95.77%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 61:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 61 — Train 97.16% (loss 0.596, F1 0.9425) | Val 95.85% (loss 0.636, F1 0.9283)
[d64_L8] ** Saved new BEST ** (val_acc=95.85%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 62:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 62 — Train 97.07% (loss 0.598, F1 0.9414) | Val 95.70% (loss 0.637, F1 0.9273)


[d64_L8] Epoch 63:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 63 — Train 97.27% (loss 0.594, F1 0.9433) | Val 95.95% (loss 0.634, F1 0.9297)
[d64_L8] ** Saved new BEST ** (val_acc=95.95%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 64:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 64 — Train 97.24% (loss 0.594, F1 0.9437) | Val 95.67% (loss 0.638, F1 0.9269)


[d64_L8] Epoch 65:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 65 — Train 97.30% (loss 0.592, F1 0.9438) | Val 95.90% (loss 0.633, F1 0.9291)


[d64_L8] Epoch 66:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 66 — Train 97.22% (loss 0.593, F1 0.9428) | Val 95.83% (loss 0.636, F1 0.9277)


[d64_L8] Epoch 67:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 67 — Train 97.29% (loss 0.592, F1 0.9440) | Val 96.01% (loss 0.633, F1 0.9311)
[d64_L8] ** Saved new BEST ** (val_acc=96.01%) → checkpoints/d64_L8_best.pt


[d64_L8] Epoch 68:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 68 — Train 97.35% (loss 0.591, F1 0.9445) | Val 95.82% (loss 0.634, F1 0.9287)


[d64_L8] Epoch 69:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 69 — Train 97.39% (loss 0.589, F1 0.9448) | Val 95.92% (loss 0.634, F1 0.9294)


[d64_L8] Epoch 70:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 70 — Train 97.39% (loss 0.589, F1 0.9448) | Val 95.97% (loss 0.635, F1 0.9294)


[d64_L8] Epoch 71:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 71 — Train 97.41% (loss 0.588, F1 0.9448) | Val 95.88% (loss 0.632, F1 0.9283)


[d64_L8] Epoch 72:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 72 — Train 97.47% (loss 0.586, F1 0.9457) | Val 95.93% (loss 0.631, F1 0.9294)


[d64_L8] Epoch 73:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 73 — Train 97.45% (loss 0.587, F1 0.9458) | Val 95.94% (loss 0.631, F1 0.9291)


[d64_L8] Epoch 74:   0%|          | 0/663 [00:00<?, ?it/s]

[d64_L8] Epoch 74 — Train 97.51% (loss 0.585, F1 0.9460) | Val 95.86% (loss 0.632, F1 0.9287)


[d64_L8] Epoch 75:   0%|          | 0/663 [00:00<?, ?it/s]

In [None]:
from tensorboard import notebook

# Start TensorBoard on the local "runs" directory; `run_experiment` writes its
# event files under runs/mamba_kws/<run_name>.
notebook.start("--logdir runs")

In [None]:
# Export checkpoints and TensorBoard logs to Google Drive.
# Requires Drive to be mounted at /content/drive (done in the setup cell).
SAVE_DIR = "/content/drive/MyDrive/kws_exports_small"
!mkdir -p "$SAVE_DIR"

# Zip folders straight to Drive paths
# NOTE(review): the sources are absolute /content/... paths, while training
# writes to the relative "checkpoints" and "runs" directories — this assumes
# the notebook's working directory is /content; confirm before relying on it.
!zip -qr "$SAVE_DIR/checkpoints_expand.zip" /content/checkpoints
!zip -qr "$SAVE_DIR/runs_expand.zip"        /content/runs

# Verify
!ls -lh "$SAVE_DIR"