In [None]:
# --- Environment & installs ---
# Colab setup cell: every package the later cells import is installed here.
# Pinned Hugging Face stack; `load_dataset` in the data-prep cell comes from `datasets`.
!pip install datasets==2.16.0
!pip install huggingface-hub==0.20.0
# System libsox — presumably needed by the `torchaudio.sox_effects` call in `Augment` (TODO confirm).
!apt-get install -y libsox-dev
# PyTorch / torchvision / torchaudio wheels pulled from the CUDA 12.1 wheel index.
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
# Mamba and its conv kernel are pinned; scikit-learn and tensorboard are NOT pinned here.
!pip install causal-conv1d==1.4.0 mamba-ssm==2.2.2 scikit-learn tensorboard

Collecting datasets==2.16.0
  Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.16.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.0)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.16.0)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.16.0)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.15-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# =========================================================
# Keyword-Spotting (Google Speech Commands v0.02) with Mamba
# Front-end: MFCC (F=40) -> Linear(40 -> d_model) -> Mamba * L -> Classifier
# =========================================================

from __future__ import annotations
import json, os, random, math, time
from pathlib import Path
from typing import Tuple, Dict, List

import torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from datasets import load_dataset
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score

from mamba_ssm import Mamba

# ---------------------------------------------------------
# 0) Repro & device
# ---------------------------------------------------------
def set_seed(seed: int = 42, deterministic: bool = False):
    """Seed every RNG the notebook can draw from and configure cuDNN.

    Args:
        seed: Seed applied to Python's ``random``, NumPy's global RNG and
            PyTorch (CPU generator plus all CUDA devices).
        deterministic: When True, request deterministic cuDNN kernels and turn
            the autotuner off. The default (False) keeps the original
            behaviour: autotuned, faster kernels that are not guaranteed to be
            bit-reproducible between runs.
    """
    # Local import: NumPy is not imported at the top of the notebook, and the
    # helper must stay self-contained.
    import numpy as np

    random.seed(seed)
    # NumPy's legacy seeding only accepts values in [0, 2**32).
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mixed precision (autocast + GradScaler in run_experiment) is enabled only on CUDA.
use_amp = (device.type == "cuda")
# Seed once at import time; the grid loop re-seeds before every run.
set_seed(42)

# Colab-only: mounts Google Drive at /content/drive.
# NOTE(review): no visible cell reads or writes under /content/drive — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd


Mounted at /content/drive


In [None]:
# ---------------------------------------------------------
# 1) Waveform-level augmentation (optional tempo stretch + time shift + additive noise)
# ---------------------------------------------------------
class Augment:
    """Random waveform augmentation: optional tempo change, time shift, Gaussian noise.

    Args:
        stretch: (low, high) range for the sox ``tempo`` factor; the default
            ``(1.0, 1.0)`` disables the tempo step entirely.
        shift_ms: Maximum shift in milliseconds (applied in either direction).
        noise: (low, high) range for the noise standard deviation; the noise
            step is skipped when the upper bound is not positive.
        sr: Sample rate in Hz, used to convert ``shift_ms`` to samples and
            passed to sox.
    """

    def __init__(self,
                 stretch: Tuple[float,float]=(1.0,1.0),
                 shift_ms: int = 100,
                 noise: Tuple[float,float]=(0.,0.05),
                 sr: int = 16_000):
        self.sr      = sr
        self.noise   = noise
        self.stretch = stretch
        # Maximum shift expressed in samples.
        self.shift   = int(shift_ms * sr / 1000)

    def _shift(self, x: torch.Tensor):
        """Shift a [C, T] waveform by a random offset, zero-filling the gap."""
        limit = self.shift
        if limit == 0:
            return x
        offset = int(torch.randint(-limit, limit + 1, ()).item())
        if offset > 0:
            # Delay: zeros enter on the left, the tail is dropped.
            return F.pad(x, (offset, 0))[:, :-offset]
        if offset < 0:
            # Advance: zeros enter on the right, the head is dropped.
            return F.pad(x, (0, -offset))[:, -offset:]
        return x

    def __call__(self, wav: torch.Tensor):
        """Augment a 1-D [T] or 2-D [C, T] waveform; the input rank is preserved."""
        was_1d = wav.dim() == 1
        if was_1d:
            wav = wav.unsqueeze(0)

        if self.stretch != (1.0, 1.0):
            tempo = float(torch.empty(()).uniform_(*self.stretch))
            # Factors within 1e-3 of 1.0 are treated as "no change".
            if abs(tempo - 1.0) > 1e-3:
                wav, _ = torchaudio.sox_effects.apply_effects_tensor(
                    wav, self.sr, [["tempo", f"{tempo}"]]
                )

        wav = self._shift(wav)

        if self.noise[1] > 0:
            sigma = float(torch.empty(()).uniform_(*self.noise))
            wav = wav + sigma * torch.randn_like(wav) if sigma > 0 else wav

        if was_1d:
            return wav.squeeze(0)
        return wav

# ---------------------------------------------------------
# 2) Front-end: MFCC or Mel with SpecAug AFTER log (if mel)
#    For MFCC we mask on the MFCCs (computed from log-mel internally).
# ---------------------------------------------------------
class WaveToSpec:
    """Waveform -> feature front-end producing either log-mel or MFCC features.

    ``feature_type="mel"`` computes a power mel spectrogram, converts it to dB
    and (optionally) applies one frequency mask followed by one time mask.
    ``feature_type="mfcc"`` computes MFCCs and (optionally) applies two
    frequency masks followed by two time masks directly on the coefficients.
    ``n_out`` exposes the size of the feature axis for the chosen type.
    """

    def __init__(self,
                 feature_type: str = "mfcc",     # "mfcc" or "mel"
                 sample_rate: int = 16_000,
                 n_fft: int = 2048,
                 hop_length: int = 256,
                 n_mels: int = 128,
                 n_mfcc: int = 40,
                 top_db: int | None = 80,
                 apply_mask: bool = True,
                 freq_mask_param: int = 3,
                 time_mask_param: int = 12):
        self.feature_type = feature_type.lower()
        assert self.feature_type in {"mel","mfcc"}
        self.apply_mask = apply_mask

        wants_mel = self.feature_type == "mel"
        if wants_mel:
            self.spec = T.MelSpectrogram(sample_rate, n_fft=n_fft, hop_length=hop_length,
                                         n_mels=n_mels, power=2.0)
            self.to_db = T.AmplitudeToDB(stype="power", top_db=top_db)
        else:
            self.spec = T.MFCC(sample_rate, n_mfcc=n_mfcc,
                                melkwargs=dict(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels))
            # No explicit dB conversion on the MFCC path (``top_db`` is unused here).
            self.to_db = None

        # The masking transforms are built the same way for both feature types.
        self.freq_mask = T.FrequencyMasking(freq_mask_param) if apply_mask else None
        self.time_mask = T.TimeMasking(time_mask_param) if apply_mask else None
        self.n_out = n_mels if wants_mel else n_mfcc

    def __call__(self, wav: torch.Tensor) -> torch.Tensor:
        """Convert a [T] or [1, T] waveform into a [1, F, T'] feature tensor."""
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)            # [1, T]
        feats = self.spec(wav)                # mel: [1, M, T], mfcc: [1, C, T]

        is_mel = self.feature_type == "mel"
        if is_mel:
            # Clamp before the log so exact zeros cannot occur.
            feats = self.to_db(feats.clamp(min=1e-10))

        if self.apply_mask:
            # mel: one pass of each mask; mfcc: two passes of each mask.
            passes = 1 if is_mel else 2
            for mask in (self.freq_mask, self.time_mask):
                for _ in range(passes):
                    feats = mask(feats)

        return feats                           # [1, F, T]

In [None]:
# ---------------------------------------------------------
# 3) Dataset wrapper with dataset-level normalization
# ---------------------------------------------------------
class SpeechCommands(Dataset):
    """Torch dataset over a Hugging Face split yielding normalized ``[T, F]`` features.

    Each item is padded/cropped to ``wav_len`` samples, optionally augmented,
    passed through ``frontend`` and standardized with the given scalar
    ``mean`` / ``std``.
    """

    def __init__(self, hf_split, aug: Augment | None, frontend: WaveToSpec,
                 wav_len: int = 16_000, mean: float = 0.0, std: float = 1.0):
        self.ds = hf_split
        self.aug = aug
        self.front = frontend
        self.wav_len, self.mean, self.std = wav_len, mean, std

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        """Return ``(features [T, F], label)`` for the example at ``idx``."""
        item = self.ds[idx]
        wav = torch.from_numpy(item["audio"]["array"]).float()

        # Right-pad short clips with zeros, truncate long ones.
        missing = self.wav_len - wav.numel()
        wav = F.pad(wav, (0, missing)) if missing > 0 else wav[: self.wav_len]

        if self.aug:
            wav = self.aug(wav)

        # Front-end output is [1, F, T]; the epsilon guards against std == 0.
        feats = (self.front(wav) - self.mean) / (self.std + 1e-6)
        return feats.squeeze(0).transpose(0, 1), item["label"]

In [None]:
# ---------------------------------------------------------
# 4) Helper functions + stats
# ---------------------------------------------------------
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Collate ``(features [T_i, F], label)`` pairs into a zero-padded batch.

    Returns:
        ``(padded [B, T_max, F], labels [B], lengths [B])`` where ``lengths``
        holds each sequence's unpadded number of frames as ``torch.long``.
    """
    sequences, labels = map(list, zip(*batch))
    lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.long)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)  # [B, T_max, F]
    return padded, torch.tensor(labels), lengths

@torch.no_grad()
def compute_dataset_stats(ds, frontend, wav_len=16_000):
    """Compute the global scalar mean and (unbiased) std of the front-end features.

    The statistics are accumulated as running sums in double precision, so
    memory use is independent of the dataset size (no per-sample feature
    tensors are retained or concatenated).

    Args:
        ds: Iterable of samples exposing ``sample["audio"]["array"]`` as a
            NumPy array; it is traversed exactly once.
        frontend: Callable mapping a 1-D waveform tensor to a feature tensor.
        wav_len: Each waveform is zero-padded or cropped to this many samples
            before the front-end is applied.

    Returns:
        ``(mean, std)`` as Python floats taken over every feature value.
        ``std`` is NaN when only a single value was seen.

    Raises:
        ValueError: If the dataset yields no feature values at all.
    """
    count = 0
    total = 0.0
    total_sq = 0.0
    for sample in ds:
        wav = torch.from_numpy(sample["audio"]["array"]).float()
        if wav.numel() < wav_len:
            wav = F.pad(wav, (0, wav_len - wav.numel()))
        else:
            wav = wav[:wav_len]
        # Layout is irrelevant for global statistics; only the values matter.
        feats = frontend(wav).double()
        count += feats.numel()
        total += feats.sum().item()
        total_sq += (feats * feats).sum().item()

    if count == 0:
        raise ValueError("compute_dataset_stats: dataset produced no feature values")

    mean = total / count
    if count < 2:
        return mean, float("nan")
    # Unbiased (N - 1) variance; clamp tiny negative values caused by rounding.
    variance = max((total_sq - total * total / count) / (count - 1), 0.0)
    return mean, math.sqrt(variance)

In [None]:
# ---------------------------------------------------------
# 5) Model: Linear(40→d_model) + Mamba×L + classifier
# ---------------------------------------------------------
class MambaKWS(nn.Module):
    """Keyword-spotting classifier: feature projection -> residual Mamba stack -> MLP head.

    Args:
        num_classes: Number of output logits.
        d_model: Width of the Mamba stack.
        d_state: ``d_state`` forwarded to every ``Mamba`` layer.
        expand: ``expand`` forwarded to every ``Mamba`` layer.
        n_layers: Number of pre-norm residual Mamba blocks.
        feature_dim: Size of the input feature axis.
        p_drop: Dropout probability used after the input projection.
    """

    def __init__(self, num_classes: int,
                 d_model=256, d_state=16, expand=2, n_layers=8, feature_dim=40, p_drop=0.1):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(feature_dim, d_model),
            nn.LayerNorm(d_model),
            nn.SiLU(),
            nn.Dropout(p_drop),
        )

        stack = []
        for depth in range(n_layers):
            # Block dropout decays with depth from 0.05 and is floored at 0.02.
            stack.append(nn.ModuleDict({
                "norm": nn.LayerNorm(d_model),
                "mamba": Mamba(d_model=d_model, d_state=d_state, expand=expand),
                "dropout": nn.Dropout(max(0.02, 0.05 - (depth * 0.005))),
            }))
        self.blocks = nn.ModuleList(stack)

        self.pre_classifier_norm = nn.LayerNorm(d_model)
        hidden = d_model // 2
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(d_model, hidden),
            nn.SiLU(),
            nn.Dropout(0.05),
            nn.Linear(hidden, num_classes),
        )

    def forward(self, x, lengths: torch.Tensor | None = None):  # x: [B, T, F]
        """Return class logits ``[B, C]``.

        When ``lengths`` is given, the time-average ignores frames at
        positions ``>= lengths[b]``; otherwise all frames are averaged.
        """
        h = self.proj(x)                                        # [B, T, d_model]
        for layer in self.blocks:
            # Pre-norm residual block.
            h = h + layer["dropout"](layer["mamba"](layer["norm"](h)))

        h = self.pre_classifier_norm(h)

        if lengths is None:
            pooled = h.mean(dim=1)
        else:
            positions = torch.arange(h.size(1), device=h.device)
            valid = (positions[None, :] < lengths[:, None]).float()          # [B, T]
            summed = (h * valid.unsqueeze(-1)).sum(dim=1)                    # [B, d_model]
            # Clamp so an all-padding row cannot divide by zero.
            pooled = summed / valid.sum(dim=1).clamp(min=1.0).unsqueeze(-1)

        return self.classifier(pooled)                        # [B, C]

# ---------------------------------------------------------
# 6) Evaluation helpers (loss, acc, macro-F1)
# ---------------------------------------------------------
@torch.no_grad()
def evaluate(model, loader, device, criterion):
    """Evaluate ``model`` over ``loader`` in eval mode.

    Returns:
        ``(mean loss per sample, accuracy in percent, macro-averaged F1)``.
    """
    model.eval()
    n_seen = 0
    n_correct = 0
    loss_total = 0
    pred_chunks, true_chunks = [], []

    for feats, labels, lengths in loader:
        feats = feats.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)

        logits = model(feats, lengths=lengths)
        batch_size = feats.size(0)
        # Weight the batch-mean loss by batch size so the result is a per-sample mean.
        loss_total += criterion(logits, labels).item() * batch_size

        preds = logits.argmax(1)
        n_correct += (preds == labels).sum().item()
        n_seen += batch_size
        pred_chunks.append(preds.detach().cpu())
        true_chunks.append(labels.detach().cpu())

    y_pred = torch.cat(pred_chunks).numpy()
    y_true = torch.cat(true_chunks).numpy()
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return loss_total / n_seen, 100 * n_correct / n_seen, macro_f1

In [None]:
# ---------------------------------------------------------
# 7) Warmup + Cosine scheduler factory (per-batch)
# ---------------------------------------------------------
def make_warmup_cosine(total_steps: int, warmup_steps: int, min_lr_ratio: float = 0.003):
    """Build a per-step LR multiplier: linear warmup followed by cosine decay.

    Args:
        total_steps: Total number of scheduler steps in the run.
        warmup_steps: Number of initial steps over which the multiplier rises
            linearly from 0 to 1.
        min_lr_ratio: Floor for the multiplier during and after the decay.

    Returns:
        ``lr_lambda(step) -> float`` suitable for ``LambdaLR``. For steps at or
        beyond ``total_steps`` the multiplier stays at the floor (the cosine
        phase is clamped so the rate cannot climb back up).
    """
    decay_steps = max(1, total_steps - warmup_steps)

    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        # Clamp: an unclamped cosine would start rising again past total_steps.
        progress = min(1.0, float(step - warmup_steps) / float(decay_steps))
        return max(min_lr_ratio, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return lr_lambda

In [None]:
# ---------------------------------------------------------
# 8) Training loop for a single config (called once per grid entry) + TensorBoard
# ---------------------------------------------------------
def run_experiment(cfg: Dict, ds_splits, global_outdir="checkpoints"):
    """Train one ``MambaKWS`` configuration and log per-epoch metrics to TensorBoard.

    Relies on the module-level ``device`` and ``use_amp`` globals.

    Args:
        cfg: Hyper-parameters. Required keys: ``d_model``, ``n_layers``,
            ``epochs``. Optional keys (defaults): ``base_lr`` (0.001),
            ``weight_decay`` (0.1), ``label_smoothing`` (0.1), ``d_state`` (16),
            ``expand`` (2).
        ds_splits: ``(train_dl, train_eval_dl, val_dl, n_classes)``.
        global_outdir: Directory receiving ``<run>_best.pt`` and ``<run>_last.pt``.

    Returns:
        Dict with ``run``, ``best_val_acc``, ``best_ckpt`` and ``last_ckpt``.
    """
    d_model   = cfg["d_model"]
    n_layers  = cfg["n_layers"]
    epochs    = cfg["epochs"]
    base_lr   = cfg.get("base_lr", 0.001)
    weight_decay = cfg.get("weight_decay", 0.1)
    label_smoothing = cfg.get("label_smoothing", 0.1)
    d_state   = cfg.get("d_state", 16)
    expand    = cfg.get("expand", 2)

    # Unpack datasets/loaders prepared outside
    train_dl, train_eval_dl, val_dl, n_classes = ds_splits

    # Model / optim / sched
    model = MambaKWS(n_classes, d_model=d_model, d_state=d_state, expand=expand, n_layers=n_layers).to(device)
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    # NOTE(review): weight decay is applied to every parameter, including norms,
    # biases and any tensors mamba_ssm may tag with `_no_weight_decay` — confirm
    # that this is intended.
    opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=weight_decay, betas=(0.9, 0.999))

    # The scheduler is stepped once per batch, so its horizon is in optimizer steps.
    steps_per_epoch = len(train_dl)
    total_steps   = steps_per_epoch * epochs
    warmup_steps  = int(0.12 * total_steps)  # 12% warmup
    sched = torch.optim.lr_scheduler.LambdaLR(opt, make_warmup_cosine(total_steps, warmup_steps))

    run_name = f"d{d_model}_L{n_layers}"
    best_val_acc, best_path = 0.0, Path(f"{global_outdir}/{run_name}_best.pt")
    best_path.parent.mkdir(parents=True, exist_ok=True)
    last_path = Path(f"{global_outdir}/{run_name}_last.pt")

    # Loop-invariant AMP settings: prefer bf16 when the GPU supports it.
    amp_dtype = torch.bfloat16 if (use_amp and torch.cuda.is_bf16_supported()) else torch.float16
    # `torch.cuda.amp.GradScaler(...)` is deprecated; this is the equivalent call.
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    # TensorBoard writer (closed in `finally` so an exception does not leak it)
    writer = SummaryWriter(log_dir=f"runs/mamba_kws/{run_name}")
    try:
        writer.add_text("hparams", json.dumps(cfg, indent=2))

        for epoch in range(1, epochs + 1):
            model.train()
            running_loss = correct = total = 0.0
            pbar = tqdm(train_dl, desc=f"[{run_name}] Epoch {epoch:02d}")

            for xb, yb, lb in pbar:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)
                lb = lb.to(device, non_blocking=True)

                with torch.amp.autocast(device_type=device.type, dtype=amp_dtype, enabled=use_amp):
                    # Defensive: sanitize a batch only if it actually contains NaNs.
                    if torch.isnan(xb).any():
                        xb = torch.nan_to_num(xb, nan=0.0)
                    logits = model(xb, lengths=lb)
                    loss = criterion(logits, yb)

                opt.zero_grad(set_to_none=True)
                scaler.scale(loss).backward()
                # Unscale first so clipping sees the true gradient magnitudes.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                scaler.step(opt)
                scaler.update()
                sched.step()

                with torch.no_grad():
                    pred = logits.argmax(1)
                    correct += (pred == yb).sum().item()
                    total += yb.size(0)
                    running_loss += loss.item() * yb.size(0)

                pbar.set_postfix(train_loss=f"{running_loss/max(1,total):.3f}",
                                 train_acc=f"{100*correct/max(1,total):.1f}%",
                                 lr=f"{opt.param_groups[0]['lr']:.2e}")

            # --- Epoch end: log train metrics on NO-AUG train split, and val ---
            tr_loss, tr_acc, tr_f1 = evaluate(model, train_eval_dl, device, criterion)
            val_loss, val_acc, val_f1 = evaluate(model, val_dl, device, criterion)

            writer.add_scalar("train/loss", tr_loss, epoch)
            writer.add_scalar("train/acc",  tr_acc,  epoch)
            writer.add_scalar("train/macro_f1", tr_f1, epoch)
            writer.add_scalar("val/loss",   val_loss, epoch)
            writer.add_scalar("val/acc",    val_acc,  epoch)
            writer.add_scalar("val/macro_f1", val_f1, epoch)
            writer.add_scalar("lr", opt.param_groups[0]['lr'], epoch)

            print(f"[{run_name}] Epoch {epoch:02d} — "
                  f"Train {tr_acc:.2f}% (loss {tr_loss:.3f}, F1 {tr_f1:.4f}) | "
                  f"Val {val_acc:.2f}% (loss {val_loss:.3f}, F1 {val_f1:.4f})")

            # Save best-by-val-acc
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save(model.state_dict(), best_path)
                print(f"[{run_name}] ** Saved new BEST ** (val_acc={best_val_acc:.2f}%) → {best_path}")

        # Save last
        torch.save(model.state_dict(), last_path)
        writer.add_text("checkpoints", f"best={str(best_path)} | last={str(last_path)}")
    finally:
        writer.close()

    return {"run": run_name, "best_val_acc": best_val_acc,
            "best_ckpt": str(best_path), "last_ckpt": str(last_path)}

In [None]:
# ---------------------------------------------------------
# 9) Data prep (MFCC; SpecAugment masks on the train split only) + config-grid run
# ---------------------------------------------------------
if __name__ == "__main__":
    # Load HF dataset. It is built by a loader script from the Hub, so opt in
    # explicitly: `datasets` warns that `trust_remote_code=True` will become
    # mandatory for script-based datasets.
    ds = load_dataset("google/speech_commands", "v0.02", trust_remote_code=True)

    n_classes = len(ds["train"].features["label"].names)
    sr = 16_000

    # Front-ends: one shared MFCC configuration; only the masking differs.
    mfcc_kwargs = dict(feature_type="mfcc", n_mfcc=40, n_mels=128)
    frontend_stats = WaveToSpec(apply_mask=False, **mfcc_kwargs)
    frontend_train = WaveToSpec(apply_mask=True, freq_mask_param=4, time_mask_param=15,
                                **mfcc_kwargs)
    frontend_eval  = WaveToSpec(apply_mask=False, **mfcc_kwargs)

    # Waveform augmentations (train only)
    aug = Augment(shift_ms=100, noise=(0., 0.01), stretch=(0.95, 1.05), sr=sr)

    # Dataset-level normalization stats (computed on train, no aug)
    print("Computing dataset stats (MFCC, no masks)...")
    train_mean, train_std = compute_dataset_stats(ds["train"], frontend_stats)
    print(f"Stats — mean={train_mean:.4f}, std={train_std:.4f}")

    # Datasets: every split is normalized with the TRAIN statistics.
    train_ds       = SpeechCommands(ds["train"], aug,  frontend_train, mean=train_mean, std=train_std)
    train_eval_ds  = SpeechCommands(ds["train"], None, frontend_eval,  mean=train_mean, std=train_std)  # no aug
    val_ds         = SpeechCommands(ds["validation"], None, frontend_eval, mean=train_mean, std=train_std)

    # Loaders
    dl_kwargs = dict(batch_size=128, num_workers=2, pin_memory=True,
                     persistent_workers=True, collate_fn=collate_fn)
    train_dl      = DataLoader(train_ds, shuffle=True,  **dl_kwargs)
    train_eval_dl = DataLoader(train_eval_ds, shuffle=False, **dl_kwargs)
    val_dl        = DataLoader(val_ds, shuffle=False, **dl_kwargs)

    ds_splits = (train_dl, train_eval_dl, val_dl, n_classes)

    # -----------------------------------------------------
    # Grid of configurations to train (currently a single point)
    # -----------------------------------------------------
    outdir = "checkpoints"
    os.makedirs(outdir, exist_ok=True)

    configs = []
    for d_model in [128]:
        for n_layers in [10]:
            configs.append({
                "d_model": d_model,
                "n_layers": n_layers,
                "epochs": 100,
                "base_lr": 0.001,
                "weight_decay": 0.1,
                "label_smoothing": 0.1,
                "d_state": 16,
                "expand": 2
            })

    results = []
    for cfg in configs:
        set_seed(42)  # keep runs comparable
        res = run_experiment(cfg, ds_splits, global_outdir=outdir)
        results.append(res)

    print("\n=== Summary ===")
    for r in results:
        print(r)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/229M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/112M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84848 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9982 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4890 [00:00<?, ? examples/s]

Computing dataset stats (MFCC, no masks)...
Stats — mean=-5.8153, std=47.6515


  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


[d128_L10] Epoch 01:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 01 — Train 17.22% (loss 3.154, F1 0.0906) | Val 18.83% (loss 3.140, F1 0.1006)
[d128_L10] ** Saved new BEST ** (val_acc=18.83%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 02:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 02 — Train 59.15% (loss 1.979, F1 0.4774) | Val 60.60% (loss 1.947, F1 0.4890)
[d128_L10] ** Saved new BEST ** (val_acc=60.60%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 03:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 03 — Train 76.31% (loss 1.426, F1 0.7151) | Val 77.60% (loss 1.395, F1 0.7265)
[d128_L10] ** Saved new BEST ** (val_acc=77.60%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 04:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 04 — Train 85.51% (loss 1.141, F1 0.8203) | Val 85.98% (loss 1.124, F1 0.8247)
[d128_L10] ** Saved new BEST ** (val_acc=85.98%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 05:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 05 — Train 88.92% (loss 1.033, F1 0.8573) | Val 89.22% (loss 1.020, F1 0.8579)
[d128_L10] ** Saved new BEST ** (val_acc=89.22%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 06:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 06 — Train 90.22% (loss 0.984, F1 0.8720) | Val 90.35% (loss 0.980, F1 0.8738)
[d128_L10] ** Saved new BEST ** (val_acc=90.35%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 07:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 07 — Train 92.44% (loss 0.917, F1 0.8949) | Val 92.69% (loss 0.907, F1 0.8963)
[d128_L10] ** Saved new BEST ** (val_acc=92.69%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 08:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 08 — Train 92.61% (loss 0.902, F1 0.8949) | Val 92.61% (loss 0.899, F1 0.8928)


[d128_L10] Epoch 09:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 09 — Train 92.88% (loss 0.891, F1 0.8984) | Val 93.34% (loss 0.882, F1 0.9020)
[d128_L10] ** Saved new BEST ** (val_acc=93.34%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 10:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 10 — Train 93.42% (loss 0.875, F1 0.9039) | Val 93.31% (loss 0.878, F1 0.9023)


[d128_L10] Epoch 11:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 11 — Train 93.90% (loss 0.857, F1 0.9106) | Val 93.58% (loss 0.867, F1 0.9059)
[d128_L10] ** Saved new BEST ** (val_acc=93.58%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 12:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 12 — Train 94.41% (loss 0.849, F1 0.9131) | Val 94.40% (loss 0.848, F1 0.9130)
[d128_L10] ** Saved new BEST ** (val_acc=94.40%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 13:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 13 — Train 94.11% (loss 0.846, F1 0.9096) | Val 93.46% (loss 0.861, F1 0.9013)


[d128_L10] Epoch 14:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 14 — Train 94.69% (loss 0.833, F1 0.9182) | Val 94.63% (loss 0.835, F1 0.9168)
[d128_L10] ** Saved new BEST ** (val_acc=94.63%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 15:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 15 — Train 95.00% (loss 0.821, F1 0.9205) | Val 94.47% (loss 0.832, F1 0.9149)


[d128_L10] Epoch 16:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 16 — Train 95.37% (loss 0.813, F1 0.9234) | Val 94.81% (loss 0.827, F1 0.9174)
[d128_L10] ** Saved new BEST ** (val_acc=94.81%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 17:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 17 — Train 95.31% (loss 0.813, F1 0.9243) | Val 94.82% (loss 0.831, F1 0.9189)
[d128_L10] ** Saved new BEST ** (val_acc=94.82%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 18:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 18 — Train 95.57% (loss 0.804, F1 0.9264) | Val 95.21% (loss 0.816, F1 0.9217)
[d128_L10] ** Saved new BEST ** (val_acc=95.21%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 19:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 19 — Train 95.92% (loss 0.793, F1 0.9301) | Val 95.17% (loss 0.810, F1 0.9213)


[d128_L10] Epoch 20:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 20 — Train 95.01% (loss 0.814, F1 0.9207) | Val 94.74% (loss 0.824, F1 0.9171)


[d128_L10] Epoch 21:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 21 — Train 96.06% (loss 0.788, F1 0.9318) | Val 95.35% (loss 0.805, F1 0.9252)
[d128_L10] ** Saved new BEST ** (val_acc=95.35%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 22:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 22 — Train 96.16% (loss 0.786, F1 0.9322) | Val 95.37% (loss 0.807, F1 0.9239)
[d128_L10] ** Saved new BEST ** (val_acc=95.37%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 23:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 23 — Train 95.94% (loss 0.791, F1 0.9313) | Val 95.11% (loss 0.813, F1 0.9224)


[d128_L10] Epoch 24:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 24 — Train 95.88% (loss 0.792, F1 0.9302) | Val 95.30% (loss 0.808, F1 0.9236)


[d128_L10] Epoch 25:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 25 — Train 96.42% (loss 0.779, F1 0.9351) | Val 95.81% (loss 0.795, F1 0.9281)
[d128_L10] ** Saved new BEST ** (val_acc=95.81%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 26:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 26 — Train 96.23% (loss 0.781, F1 0.9327) | Val 95.66% (loss 0.800, F1 0.9255)


[d128_L10] Epoch 27:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 27 — Train 96.45% (loss 0.776, F1 0.9352) | Val 95.65% (loss 0.796, F1 0.9266)


[d128_L10] Epoch 28:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 28 — Train 96.55% (loss 0.773, F1 0.9370) | Val 95.49% (loss 0.799, F1 0.9254)


[d128_L10] Epoch 29:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 29 — Train 96.62% (loss 0.771, F1 0.9372) | Val 96.08% (loss 0.789, F1 0.9310)
[d128_L10] ** Saved new BEST ** (val_acc=96.08%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 30:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 30 — Train 96.80% (loss 0.764, F1 0.9391) | Val 95.78% (loss 0.796, F1 0.9286)


[d128_L10] Epoch 31:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 31 — Train 96.86% (loss 0.763, F1 0.9400) | Val 95.65% (loss 0.794, F1 0.9259)


[d128_L10] Epoch 32:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 32 — Train 96.73% (loss 0.767, F1 0.9379) | Val 95.83% (loss 0.795, F1 0.9276)


[d128_L10] Epoch 33:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 33 — Train 96.75% (loss 0.765, F1 0.9387) | Val 95.91% (loss 0.794, F1 0.9293)


[d128_L10] Epoch 34:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 34 — Train 96.66% (loss 0.770, F1 0.9374) | Val 95.94% (loss 0.798, F1 0.9302)


[d128_L10] Epoch 35:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 35 — Train 96.74% (loss 0.766, F1 0.9385) | Val 95.88% (loss 0.792, F1 0.9294)


[d128_L10] Epoch 36:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 36 — Train 96.88% (loss 0.763, F1 0.9399) | Val 95.64% (loss 0.798, F1 0.9252)


[d128_L10] Epoch 37:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 37 — Train 96.80% (loss 0.764, F1 0.9390) | Val 95.69% (loss 0.798, F1 0.9273)


[d128_L10] Epoch 38:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 38 — Train 97.05% (loss 0.758, F1 0.9420) | Val 95.91% (loss 0.793, F1 0.9292)


[d128_L10] Epoch 39:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 39 — Train 97.12% (loss 0.755, F1 0.9422) | Val 96.06% (loss 0.788, F1 0.9313)


[d128_L10] Epoch 40:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 40 — Train 97.28% (loss 0.752, F1 0.9434) | Val 96.04% (loss 0.786, F1 0.9310)


[d128_L10] Epoch 41:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 41 — Train 97.38% (loss 0.750, F1 0.9452) | Val 96.13% (loss 0.788, F1 0.9305)
[d128_L10] ** Saved new BEST ** (val_acc=96.13%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 42:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 42 — Train 97.11% (loss 0.755, F1 0.9422) | Val 95.87% (loss 0.790, F1 0.9283)


[d128_L10] Epoch 43:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 43 — Train 97.18% (loss 0.753, F1 0.9434) | Val 95.85% (loss 0.790, F1 0.9293)


[d128_L10] Epoch 44:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 44 — Train 97.41% (loss 0.747, F1 0.9455) | Val 95.97% (loss 0.787, F1 0.9300)


[d128_L10] Epoch 45:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 45 — Train 97.19% (loss 0.752, F1 0.9435) | Val 95.85% (loss 0.790, F1 0.9280)


[d128_L10] Epoch 46:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 46 — Train 97.53% (loss 0.743, F1 0.9465) | Val 95.94% (loss 0.788, F1 0.9291)


[d128_L10] Epoch 47:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 47 — Train 97.15% (loss 0.752, F1 0.9429) | Val 95.78% (loss 0.797, F1 0.9276)


[d128_L10] Epoch 48:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 48 — Train 97.61% (loss 0.741, F1 0.9476) | Val 96.11% (loss 0.782, F1 0.9316)


[d128_L10] Epoch 49:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 49 — Train 97.61% (loss 0.740, F1 0.9474) | Val 95.72% (loss 0.794, F1 0.9262)


[d128_L10] Epoch 50:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 50 — Train 97.50% (loss 0.743, F1 0.9469) | Val 95.69% (loss 0.795, F1 0.9282)


[d128_L10] Epoch 51:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 51 — Train 97.73% (loss 0.737, F1 0.9493) | Val 96.15% (loss 0.783, F1 0.9329)
[d128_L10] ** Saved new BEST ** (val_acc=96.15%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 52:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 52 — Train 97.78% (loss 0.735, F1 0.9493) | Val 96.03% (loss 0.786, F1 0.9297)


[d128_L10] Epoch 53:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 53 — Train 97.77% (loss 0.737, F1 0.9497) | Val 96.19% (loss 0.784, F1 0.9327)
[d128_L10] ** Saved new BEST ** (val_acc=96.19%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 54:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 54 — Train 97.75% (loss 0.735, F1 0.9492) | Val 96.27% (loss 0.786, F1 0.9324)
[d128_L10] ** Saved new BEST ** (val_acc=96.27%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 55:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 55 — Train 97.97% (loss 0.730, F1 0.9509) | Val 96.40% (loss 0.779, F1 0.9334)
[d128_L10] ** Saved new BEST ** (val_acc=96.40%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 56:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 56 — Train 98.00% (loss 0.730, F1 0.9518) | Val 96.07% (loss 0.783, F1 0.9314)


[d128_L10] Epoch 57:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 57 — Train 97.99% (loss 0.728, F1 0.9515) | Val 96.14% (loss 0.788, F1 0.9308)


[d128_L10] Epoch 58:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 58 — Train 98.16% (loss 0.726, F1 0.9530) | Val 96.08% (loss 0.787, F1 0.9300)


[d128_L10] Epoch 59:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 59 — Train 97.99% (loss 0.729, F1 0.9516) | Val 96.11% (loss 0.786, F1 0.9308)


[d128_L10] Epoch 60:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 60 — Train 98.24% (loss 0.723, F1 0.9543) | Val 96.07% (loss 0.786, F1 0.9302)


[d128_L10] Epoch 61:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 61 — Train 98.28% (loss 0.720, F1 0.9542) | Val 96.14% (loss 0.778, F1 0.9309)


[d128_L10] Epoch 62:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 62 — Train 98.35% (loss 0.719, F1 0.9555) | Val 96.16% (loss 0.788, F1 0.9315)


[d128_L10] Epoch 63:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 63 — Train 98.39% (loss 0.719, F1 0.9557) | Val 96.15% (loss 0.780, F1 0.9314)


[d128_L10] Epoch 64:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 64 — Train 98.47% (loss 0.715, F1 0.9562) | Val 96.27% (loss 0.776, F1 0.9319)


[d128_L10] Epoch 65:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 65 — Train 98.44% (loss 0.716, F1 0.9564) | Val 96.15% (loss 0.784, F1 0.9311)


[d128_L10] Epoch 66:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 66 — Train 98.44% (loss 0.716, F1 0.9565) | Val 96.33% (loss 0.780, F1 0.9339)


[d128_L10] Epoch 67:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 67 — Train 98.54% (loss 0.714, F1 0.9574) | Val 96.21% (loss 0.780, F1 0.9326)


[d128_L10] Epoch 68:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 68 — Train 98.64% (loss 0.711, F1 0.9581) | Val 96.27% (loss 0.786, F1 0.9325)


[d128_L10] Epoch 69:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 69 — Train 98.78% (loss 0.707, F1 0.9597) | Val 96.33% (loss 0.778, F1 0.9336)


[d128_L10] Epoch 70:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 70 — Train 98.74% (loss 0.707, F1 0.9593) | Val 96.40% (loss 0.777, F1 0.9338)


[d128_L10] Epoch 71:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 71 — Train 98.79% (loss 0.706, F1 0.9596) | Val 96.24% (loss 0.782, F1 0.9313)


[d128_L10] Epoch 72:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 72 — Train 98.93% (loss 0.703, F1 0.9610) | Val 96.44% (loss 0.776, F1 0.9344)
[d128_L10] ** Saved new BEST ** (val_acc=96.44%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 73:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 73 — Train 98.87% (loss 0.704, F1 0.9605) | Val 96.41% (loss 0.780, F1 0.9344)


[d128_L10] Epoch 74:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 74 — Train 98.90% (loss 0.702, F1 0.9611) | Val 96.56% (loss 0.773, F1 0.9352)
[d128_L10] ** Saved new BEST ** (val_acc=96.56%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 75:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 75 — Train 99.04% (loss 0.699, F1 0.9623) | Val 96.57% (loss 0.776, F1 0.9355)
[d128_L10] ** Saved new BEST ** (val_acc=96.57%) → checkpoints/d128_L10_best.pt


[d128_L10] Epoch 76:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 76 — Train 99.02% (loss 0.699, F1 0.9620) | Val 96.40% (loss 0.779, F1 0.9339)


[d128_L10] Epoch 77:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 77 — Train 99.09% (loss 0.697, F1 0.9630) | Val 96.49% (loss 0.779, F1 0.9349)


[d128_L10] Epoch 78:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 78 — Train 99.04% (loss 0.699, F1 0.9627) | Val 96.30% (loss 0.782, F1 0.9332)


[d128_L10] Epoch 79:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 79 — Train 99.06% (loss 0.699, F1 0.9627) | Val 96.41% (loss 0.780, F1 0.9336)


[d128_L10] Epoch 80:   0%|          | 0/663 [00:00<?, ?it/s]

[d128_L10] Epoch 80 — Train 99.23% (loss 0.693, F1 0.9644) | Val 96.34% (loss 0.778, F1 0.9332)


[d128_L10] Epoch 81:   0%|          | 0/663 [00:00<?, ?it/s]

In [None]:
# Show the TensorBoard UI inline in the notebook.
# The argument string is written like a `tensorboard` command line; here it
# points the dashboard at the relative "runs" directory.
import tensorboard.notebook as notebook

notebook.start("--logdir runs")

In [None]:
# Export training artifacts (checkpoints + TensorBoard event files) to Google Drive.
import os  # repeated here so the export cell still works right after a kernel restart

DRIVE_MOUNT = "/content/drive"
SAVE_DIR = "/content/drive/MyDrive/kws_exports_medium"

# Guard: if Drive is not mounted, `mkdir -p` below would quietly create SAVE_DIR
# on the local (ephemeral) disk and the archives would be lost with the VM,
# even though the `ls` check at the end would look fine. Fail loudly instead.
if not os.path.ismount(DRIVE_MOUNT):
    raise RuntimeError(
        f"Google Drive is not mounted at {DRIVE_MOUNT}; run "
        "`from google.colab import drive; drive.mount('/content/drive')` "
        "before exporting."
    )

# `$SAVE_DIR` is expanded by IPython from the Python variable above.
!mkdir -p "$SAVE_DIR"

# Zip folders straight to Drive paths (-q: quiet, -r: recurse into directories)
!zip -qr "$SAVE_DIR/checkpoints_expand.zip" /content/checkpoints
!zip -qr "$SAVE_DIR/runs_expand.zip"        /content/runs

# Verify: list the export directory so the archive sizes can be checked by eye
!ls -lh "$SAVE_DIR"