In [None]:
# Environment setup for this notebook. All Python packages are version-pinned.
# Hugging Face dataset tooling used to download/load the speech dataset.
!pip install datasets==2.16.0
!pip install huggingface-hub==0.20.0
# System-level SoX development files (presumably needed by torchaudio's sox support -- TODO confirm).
!apt-get install -y libsox-dev
# PyTorch stack, installed from the CUDA 12.1 (cu121) wheel index.
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121

Collecting datasets==2.16.0
  Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.16.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.0)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.16.0)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.16.0)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.15-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Keyword‑Spotting project

from __future__ import annotations
import json, os, random
from pathlib import Path
from typing import Tuple, Dict

import torch, torchaudio
import torch.nn as nn
import os
import torch.nn.functional as F
import torchvision.models as tvm  # MobileNet V2 backbone
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm.notebook import tqdm

# -------- Mount Google Drive --------
# Nothing is saved here; the mount only makes /content/drive available so that
# checkpoints can be written under it later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class WaveToSpec:
    """Waveform -> log-Mel spectrogram **or** MFCC feature tensor.

    This class performs no normalisation; it returns dB-scaled Mel power
    (``feature_type="mel"``) or MFCCs (``feature_type="mfcc"``) as produced by
    the underlying torchaudio transforms.

    Parameters
    ----------
    feature_type : "mel" | "mfcc"
        Which front-end to build (case-insensitive).
        If "mfcc", `n_mfcc` controls the number of cepstral coeffs.
    sample_rate : int
        Sample rate of the incoming waveform, in Hz.
    n_fft : int
        FFT size. The analysis window length is left at the torchaudio
        default, i.e. equal to `n_fft`.
    hop_length : int
        Hop between successive frames, in samples.
    n_mels : int
        Number of Mel filterbank channels.
    n_mfcc : int
        Number of MFCC coefficients (only used for "mfcc").
    top_db : int | None
        Dynamic-range cut-off handed to ``AmplitudeToDB`` (only used for "mel").
    apply_mask : bool
        Enable SpecAugment-style frequency/time masking. Only honoured for
        "mel"; it is forced off for "mfcc".
    freq_mask_param, time_mask_param : int
        Maximum mask widths handed to ``FrequencyMasking`` / ``TimeMasking``.
    """
    def __init__(self,
                 feature_type: str = "mel",
                 sample_rate: int = 16_000,
                 n_fft: int = 2048,
                 hop_length: int = 128,
                 n_mels: int = 128,
                 n_mfcc: int = 40,
                 top_db: int | None = 80,
                 apply_mask: bool = True,
                 freq_mask_param: int = 15,
                 time_mask_param: int = 10):
        self.feature_type = feature_type.lower()
        assert self.feature_type in {"mel", "mfcc"}, \
            f"feature_type must be 'mel' or 'mfcc', got {feature_type!r}"
        self.apply_mask = apply_mask and self.feature_type == "mel"

        # Always define the optional stages so the attributes exist on every
        # instance, whichever branch is taken below.
        self.to_db = None
        self.freq_mask = self.time_mask = None

        if self.feature_type == "mel":
            # Keyword arguments are required here: MelSpectrogram's positional
            # order is (sample_rate, n_fft, win_length, hop_length, ...), so
            # passing hop_length/n_mels positionally would set win_length and
            # hop_length instead and silently ignore n_mels.
            self.spec = T.MelSpectrogram(sample_rate=sample_rate,
                                         n_fft=n_fft,
                                         hop_length=hop_length,
                                         n_mels=n_mels,
                                         power=2.0)
            self.to_db = T.AmplitudeToDB(stype="power", top_db=top_db)
            if self.apply_mask:
                self.freq_mask = T.FrequencyMasking(freq_mask_param)
                self.time_mask = T.TimeMasking(time_mask_param)
        else:
            self.spec = T.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc,
                               melkwargs=dict(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels))

    def __call__(self, wav: torch.Tensor) -> torch.Tensor:
        """Convert a waveform to features.

        A 1-D waveform gets a leading dimension added before the transform,
        so the result then has three dimensions ([1, F, T]).
        """
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
        feats = self.spec(wav)
        if self.apply_mask:
            # Two frequency masks and two time masks, applied on the power
            # spectrogram (i.e. before the dB conversion below).
            for _ in range(2):
                feats = self.freq_mask(feats)
                feats = self.time_mask(feats)
        if self.to_db is not None:
            # Clamp first so masked (zeroed) bins cannot produce log(0).
            feats = self.to_db(feats.clamp(min=1e-10))
        return feats   # [1, F, T]

# -------------------------
# 2) Waveform-level augs
# -------------------------
class Augment:
    """Random waveform-level augmentation: time shift followed by additive Gaussian noise.

    Parameters
    ----------
    stretch : (float, float)
        Stored on the instance but not used by any method of this class.
    shift_ms : int
        Maximum shift magnitude in milliseconds; converted to samples using `sr`.
    noise : (float, float)
        Range from which the noise standard deviation is drawn uniformly.
        Noise is disabled when the upper bound is not positive.
    sr : int
        Sample rate used for the milliseconds -> samples conversion.
    """
    def __init__(self, stretch: Tuple[float,float]=(1.0,1.0),
                 shift_ms: int = 100,
                 noise: Tuple[float,float]=(0.,0.005),
                 sr: int = 16_000):
        self.sr      = sr
        self.noise   = noise
        self.stretch = stretch
        self.shift   = int(shift_ms * sr / 1000)

    def _shift(self, x: torch.Tensor):
        """Shift a 2-D tensor along its last axis by a random offset, zero-filling the gap."""
        limit = self.shift
        if limit == 0:
            return x
        # Offset is drawn uniformly from [-limit, limit] (upper bound of randint is exclusive).
        offset = int(torch.randint(-limit, limit + 1, ()).item())
        if offset > 0:
            # Delay: pad on the left, drop the overflow on the right.
            return F.pad(x, (offset, 0))[:, :-offset]
        if offset < 0:
            # Advance: pad on the right, drop the leading samples.
            return F.pad(x, (0, -offset))[:, -offset:]
        return x

    def __call__(self, wav: torch.Tensor):
        """Augment `wav`; a 1-D input comes back 1-D, anything else keeps its shape."""
        was_1d = wav.dim() == 1
        sig = wav.unsqueeze(0) if was_1d else wav

        # No time-stretch step is applied here; only shift and noise.
        sig = self._shift(sig)

        bounds = self.noise
        if bounds[1] > 0:
            sigma = float(torch.empty(()).uniform_(*bounds))
            if sigma > 0:
                sig = sig + sigma * torch.randn_like(sig)

        if was_1d:
            return sig.squeeze(0)
        return sig

In [None]:
# -------------------------
# 3) Dataset + Collate
# -------------------------
class SpeechCommands(Dataset):
    """Wrap a Hugging Face split as a dataset of (feature, label) pairs.

    Each item is read as ``item["audio"]["array"]`` (a NumPy array) and
    ``item["label"]``. The waveform is zero-padded or cropped to exactly
    `wav_len` samples, optionally augmented, passed through `frontend`, and
    finally standardised per sample.
    """
    def __init__(self, hf_split, aug: Augment | None, frontend: WaveToSpec, wav_len: int = 16_000):
        self.ds = hf_split
        self.aug = aug
        self.front = frontend
        # Every waveform is forced to this many samples before feature extraction.
        self.wav_len = wav_len

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        record = self.ds[idx]
        wav = torch.from_numpy(record["audio"]["array"]).float()

        # Fix the length: right-pad short clips with zeros, truncate long ones.
        shortfall = self.wav_len - wav.numel()
        if shortfall > 0:
            wav = F.pad(wav, (0, shortfall))
        else:
            wav = wav[: self.wav_len]

        if self.aug:
            wav = self.aug(wav)

        feats = self.front(wav)                          # [1, F, T]
        # Per-sample standard score; the epsilon guards against zero variance.
        centre = feats.mean()
        spread = feats.std() + 1e-6
        return (feats - centre) / spread, record["label"]

def collate_pad(batch):
    """Collate (feature, label) pairs into one zero-padded batch.

    Features are right-padded with zeros along their last axis to the longest
    item in the batch. The output buffer is allocated as
    ``[len(batch), 1, feats[0].size(1), longest]`` with the dtype/device of the
    first feature; labels are returned as a ``torch.long`` tensor.
    """
    feats, labels = zip(*batch)
    n_items = len(feats)
    n_bins = feats[0].size(1)
    longest = max(f.size(-1) for f in feats)

    padded = feats[0].new_zeros(n_items, 1, n_bins, longest)
    # Each `slot` is a view into `padded`, so the assignment writes in place.
    for slot, f in zip(padded, feats):
        slot[:, :, : f.size(-1)] = f

    return padded, torch.tensor(labels, dtype=torch.long)


# -------------------------
# 4) Model
# -------------------------
class KeywordCNN(nn.Module):
    """Small three-stage CNN baseline with a dropout + linear head.

    Each stage is Conv(3x3, padding=1) -> BatchNorm -> ReLU. The first pool
    halves only the first spatial axis (kernel ``(2, 1)``), the second halves
    both, and a global average pool reduces the map to one 128-d vector.
    """

    @staticmethod
    def _stage(c_in: int, c_out: int):
        """Return the [conv, batch-norm, relu] layers of one stage."""
        return [nn.Conv2d(c_in, c_out, 3, padding=1), nn.BatchNorm2d(c_out), nn.ReLU()]

    def __init__(self, n_classes:int):
        super().__init__()
        # Layer order (and therefore the Sequential indices / state_dict keys)
        # is conv, bn, relu, pool for each of the three stages.
        layers = [
            *self._stage(1, 32),
            nn.MaxPool2d((2,1)),
            *self._stage(32, 64),
            nn.MaxPool2d(2),
            *self._stage(64, 128),
            nn.AdaptiveAvgPool2d(1),
        ]
        self.features = nn.Sequential(*layers)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(128, n_classes)

    def forward(self, x):
        """Map a ``[B, 1, H, W]`` input to ``[B, n_classes]`` logits."""
        pooled = self.features(x)
        embedding = pooled.flatten(1)
        return self.fc(self.dropout(embedding))

def build_mobilenet_v2(num_classes: int, alpha: float = 0.75, pretrained: bool = False) -> nn.Module:
    """Build a MobileNetV2 that takes 1-channel input and emits `num_classes` logits.

    Parameters
    ----------
    num_classes : int
        Size of the final linear layer.
    alpha : float
        Width multiplier forwarded to torchvision as ``width_mult``.
    pretrained : bool
        Load torchvision's default ImageNet weights. Those weights exist only
        for ``width_mult == 1.0``, so this requires ``alpha == 1.0``.

    Returns
    -------
    nn.Module
        The network with its stem convolution swapped for a 1-channel one and
        its classifier replaced by ``Dropout(0.2) -> Linear``.

    Raises
    ------
    ValueError
        If ``pretrained`` is requested with ``alpha != 1.0``. This combination
        could never load: the checkpoint tensor shapes do not match a
        width-scaled network.
    """
    import torchvision.models as tvm

    if pretrained and alpha != 1.0:
        raise ValueError(
            f"pretrained MobileNetV2 weights require alpha == 1.0, got alpha={alpha}"
        )

    weights = tvm.MobileNet_V2_Weights.DEFAULT if pretrained else None
    net = tvm.mobilenet_v2(width_mult=alpha, weights=weights)

    # Adapt the stem conv to 1 input channel, keeping its geometry.
    first = net.features[0][0]
    mono = nn.Conv2d(1, first.out_channels,
                     kernel_size=first.kernel_size,
                     stride=first.stride,
                     padding=first.padding,
                     bias=False)
    if pretrained:
        # Keep the pretrained stem instead of discarding it: summing the RGB
        # kernels gives the response the original conv would have to a
        # grey image replicated across the three channels.
        with torch.no_grad():
            mono.weight.copy_(first.weight.sum(dim=1, keepdim=True))
    net.features[0][0] = mono

    # Simple dropout + linear classifier sized for this task.
    in_feats = net.classifier[-1].in_features
    net.classifier = nn.Sequential(
        nn.Dropout(0.2),
        nn.Linear(in_feats, num_classes)
    )
    return net

# -------------------------
# 5) Eval helper
# -------------------------
@torch.no_grad()
def evaluate(model, loader, device, criterion):
    """Run one evaluation pass and return ``(mean_loss, accuracy_percent)``.

    The model is switched to eval mode and is left in eval mode on return.
    Loss is averaged per sample (each batch loss is weighted by its batch
    size). An empty `loader` yields ``(0.0, 0.0)`` instead of raising
    ``ZeroDivisionError``.
    """
    model.eval()
    n_seen = n_correct = 0
    loss_sum = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        batch_size = xb.size(0)
        # criterion returns a batch mean; re-weight so the final figure is per-sample.
        loss_sum += criterion(logits, yb).item() * batch_size
        n_correct += (logits.argmax(1) == yb).sum().item()
        n_seen += batch_size
    denom = max(1, n_seen)  # guard against an empty loader
    return loss_sum / denom, 100.0 * n_correct / denom

In [None]:
import math
# -------------------------
# 6) Main
# -------------------------
if __name__ == "__main__":
    # ---- dataset
    # This dataset is loaded through a loading script; pass trust_remote_code
    # explicitly, as the `datasets` library asks for in its own log message.
    ds = load_dataset("google/speech_commands", "v0.02", trust_remote_code=True)
    labels = ds["train"].features["label"].names
    n_classes = len(labels)

    # ---- run configuration
    feature_type = "mel"               # "mel" or "mfcc"
    Epochs = 100
    base_lr = 1e-3
    warmup_frac = 0.1                  # fraction of the real training steps spent in linear warmup
    wav_len = 16_000                   # samples per clip after pad/crop

    # train/eval frontends (SpecAugment only on train)
    frontend_train = WaveToSpec(
        feature_type=feature_type,
        n_mfcc=40, n_mels=128,
        apply_mask=True,
        freq_mask_param=14, time_mask_param=24
    )
    frontend_eval = WaveToSpec(
        feature_type=feature_type,
        n_mfcc=40, n_mels=128,
        apply_mask=False
    )

    # augs: shift + small noise; stretch OFF by default (1.0,1.0)
    aug = Augment(stretch=(1.0, 1.0), shift_ms=100, noise=(0., 0.005))

    train_ds = SpeechCommands(ds["train"],      aug,  frontend_train, wav_len=wav_len)
    val_ds   = SpeechCommands(ds["validation"], None, frontend_eval,  wav_len=wav_len)
    test_ds  = SpeechCommands(ds["test"],       None, frontend_eval,  wav_len=wav_len)

    dl_kwargs = dict(batch_size=128, num_workers=4, pin_memory=True, collate_fn=collate_pad)
    train_dl  = DataLoader(train_ds, shuffle=True,  **dl_kwargs)
    val_dl    = DataLoader(val_ds,   shuffle=False, **dl_kwargs)

    # ---- device / mixed precision (AMP is only enabled on CUDA)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_amp = (device.type == "cuda")
    scaler = torch.amp.GradScaler(device.type, enabled=use_amp)

    model = build_mobilenet_v2(n_classes, alpha=0.75, pretrained=False).to(device)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
    opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=5e-5, betas=(0.9, 0.999))

    # ---- LR schedule: per-batch linear warmup + cosine, plus epoch-level Plateau
    steps_per_epoch = len(train_dl)
    true_steps      = steps_per_epoch * Epochs
    # The cosine is stretched over 1.5x the real run length, so training ends
    # part-way down the curve rather than at its floor.
    total_steps     = steps_per_epoch * int(Epochs * 1.5)
    warmup_steps    = int(true_steps * warmup_frac)

    def lr_lambda(step):
        """LR multiplier for `step`: linear 0 -> 1 warmup, then cosine decay floored at 0.01."""
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        progress = float(step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))

    sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
    plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, mode="min", factor=0.5, patience=3, threshold=1e-3, cooldown=0, min_lr=1e-6
    )

    best_val_acc = 0.0
    BEST_PATH = Path("/content/best_kws_cnn.pt")
    CKPT_DIR = Path("/content/drive/MyDrive/kws_models")
    CKPT_DIR.mkdir(parents=True, exist_ok=True)

    global_step = 0
    for epoch in range(1, Epochs + 1):
        model.train()
        running_loss = correct = total = 0
        pbar = tqdm(train_dl, desc=f"Epoch {epoch:02d}")

        for xb, yb in pbar:
            xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
            with torch.amp.autocast(device.type, enabled=use_amp):
                logits = model(xb)
                loss = criterion(logits, yb)
            # Skip the whole update (and the scheduler step) for a non-finite loss.
            if not torch.isfinite(loss):
                continue

            opt.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            # Unscale before clipping so max_norm applies to the true gradients.
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.3)
            scaler.step(opt); scaler.update()
            sched.step()   # per-batch
            global_step += 1

            pred = logits.argmax(1)
            correct += (pred == yb).sum().item()
            total += yb.size(0)
            running_loss += loss.item() * yb.size(0)
            pbar.set_postfix(
                train_loss=f"{running_loss/max(1,total):.3f}",
                train_acc=f"{100*correct/max(1,total):.1f}%",
                lr=f"{opt.param_groups[0]['lr']:.2e}"
            )

        val_loss, val_acc = evaluate(model, val_dl, device, criterion)
        print(f"Epoch {epoch:02d} ➜ train {100*correct/max(1,total):.1f}% | "
              f"val {val_acc:.1f}% (loss {val_loss:.3f}) | lr {opt.param_groups[0]['lr']:.2e}")

        # After warmup, allow Plateau to adjust LR (epoch-level)
        if global_step >= warmup_steps:
            # Compare the LR immediately before and after plateau.step(), so the
            # message reflects a Plateau reduction and not the cosine decay
            # that happened during the epoch.
            prev_plateau_lr = opt.param_groups[0]['lr']
            plateau.step(val_loss)
            new_lr = opt.param_groups[0]['lr']
            if new_lr < prev_plateau_lr - 1e-12:
                # LambdaLR recomputes lr = base_lr * lr_lambda(step) on every
                # batch, which would overwrite the reduction on the very next
                # step. Fold the same ratio into its base LRs so it persists.
                shrink = new_lr / prev_plateau_lr
                sched.base_lrs = [base * shrink for base in sched.base_lrs]
                print(f"[Plateau] LR reduced: {prev_plateau_lr:.2e} -> {new_lr:.2e}")

        # Save best-by-accuracy
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), BEST_PATH)
            print(f"Saved new BEST CNN @ {best_val_acc:.1f}%")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Epoch 01:   0%|          | 0/663 [00:00<?, ?it/s]



Epoch 01 ➜ train 4.0% | val 6.0% (loss 3.478) | lr 1.00e-04
★ Saved new BEST CNN @ 6.0%


Epoch 02:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 02 ➜ train 15.5% | val 51.6% (loss 1.857) | lr 2.00e-04
★ Saved new BEST CNN @ 51.6%


Epoch 03:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 03 ➜ train 58.6% | val 80.8% (loss 0.977) | lr 3.00e-04
★ Saved new BEST CNN @ 80.8%


Epoch 04:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 04 ➜ train 74.9% | val 86.9% (loss 0.806) | lr 4.00e-04
★ Saved new BEST CNN @ 86.9%


Epoch 05:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 05 ➜ train 79.8% | val 88.9% (loss 0.737) | lr 5.00e-04
★ Saved new BEST CNN @ 88.9%


Epoch 06:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 06 ➜ train 82.1% | val 88.7% (loss 0.734) | lr 6.00e-04


Epoch 07:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 07 ➜ train 83.8% | val 91.3% (loss 0.659) | lr 7.00e-04
★ Saved new BEST CNN @ 91.3%


Epoch 08:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 08 ➜ train 84.6% | val 92.1% (loss 0.635) | lr 8.00e-04
★ Saved new BEST CNN @ 92.1%


Epoch 09:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 09 ➜ train 85.4% | val 92.3% (loss 0.634) | lr 9.00e-04
★ Saved new BEST CNN @ 92.3%


Epoch 10:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 10 ➜ train 86.4% | val 92.3% (loss 0.627) | lr 1.00e-03


Epoch 11:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 11 ➜ train 86.9% | val 93.1% (loss 0.604) | lr 1.00e-03
[Plateau] LR reduced: 1.00e-03 -> 1.00e-03
★ Saved new BEST CNN @ 93.1%


Epoch 12:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 12 ➜ train 87.6% | val 93.1% (loss 0.593) | lr 9.99e-04
[Plateau] LR reduced: 1.00e-03 -> 9.99e-04


Epoch 13:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 13 ➜ train 88.2% | val 93.9% (loss 0.565) | lr 9.99e-04
[Plateau] LR reduced: 9.99e-04 -> 9.99e-04
★ Saved new BEST CNN @ 93.9%


Epoch 14:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 14 ➜ train 88.7% | val 93.9% (loss 0.567) | lr 9.98e-04
[Plateau] LR reduced: 9.99e-04 -> 9.98e-04
★ Saved new BEST CNN @ 93.9%


Epoch 15:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 15 ➜ train 89.1% | val 94.1% (loss 0.568) | lr 9.97e-04
[Plateau] LR reduced: 9.98e-04 -> 9.97e-04
★ Saved new BEST CNN @ 94.1%


Epoch 16:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 16 ➜ train 89.3% | val 93.9% (loss 0.568) | lr 9.95e-04
[Plateau] LR reduced: 9.97e-04 -> 9.95e-04


Epoch 17:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 17 ➜ train 89.8% | val 94.5% (loss 0.547) | lr 9.94e-04
[Plateau] LR reduced: 9.95e-04 -> 9.94e-04
★ Saved new BEST CNN @ 94.5%


Epoch 18:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 18 ➜ train 90.2% | val 94.5% (loss 0.541) | lr 9.92e-04
[Plateau] LR reduced: 9.94e-04 -> 9.92e-04


Epoch 19:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 19 ➜ train 90.5% | val 94.9% (loss 0.533) | lr 9.90e-04
[Plateau] LR reduced: 9.92e-04 -> 9.90e-04
★ Saved new BEST CNN @ 94.9%


Epoch 20:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 20 ➜ train 90.7% | val 95.0% (loss 0.534) | lr 9.87e-04
[Plateau] LR reduced: 9.90e-04 -> 9.87e-04
★ Saved new BEST CNN @ 95.0%


Epoch 21:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 21 ➜ train 90.8% | val 94.8% (loss 0.535) | lr 9.85e-04
[Plateau] LR reduced: 9.87e-04 -> 9.85e-04


Epoch 22:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 22 ➜ train 91.0% | val 95.1% (loss 0.530) | lr 9.82e-04
[Plateau] LR reduced: 9.85e-04 -> 9.82e-04
★ Saved new BEST CNN @ 95.1%


Epoch 23:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 23 ➜ train 91.1% | val 94.8% (loss 0.539) | lr 9.79e-04
[Plateau] LR reduced: 9.82e-04 -> 9.79e-04


Epoch 24:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 24 ➜ train 91.4% | val 94.8% (loss 0.535) | lr 9.76e-04
[Plateau] LR reduced: 9.79e-04 -> 9.76e-04


Epoch 25:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 25 ➜ train 91.5% | val 95.6% (loss 0.514) | lr 9.72e-04
[Plateau] LR reduced: 9.76e-04 -> 9.72e-04
★ Saved new BEST CNN @ 95.6%


Epoch 26:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 26 ➜ train 91.8% | val 95.3% (loss 0.520) | lr 9.68e-04
[Plateau] LR reduced: 9.72e-04 -> 9.68e-04


Epoch 27:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 27 ➜ train 91.8% | val 95.2% (loss 0.519) | lr 9.64e-04
[Plateau] LR reduced: 9.68e-04 -> 9.64e-04


Epoch 28:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 28 ➜ train 91.9% | val 95.4% (loss 0.511) | lr 9.60e-04
[Plateau] LR reduced: 9.64e-04 -> 9.60e-04


Epoch 29:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 29 ➜ train 92.1% | val 95.1% (loss 0.519) | lr 9.55e-04
[Plateau] LR reduced: 9.60e-04 -> 9.55e-04


Epoch 30:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 30 ➜ train 92.1% | val 95.3% (loss 0.514) | lr 9.50e-04
[Plateau] LR reduced: 9.55e-04 -> 9.50e-04


Epoch 31:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 31 ➜ train 92.4% | val 95.5% (loss 0.513) | lr 9.46e-04
[Plateau] LR reduced: 9.50e-04 -> 9.46e-04


Epoch 32:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 32 ➜ train 92.5% | val 95.4% (loss 0.515) | lr 9.40e-04
[Plateau] LR reduced: 9.46e-04 -> 4.70e-04


Epoch 33:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 33 ➜ train 92.5% | val 95.5% (loss 0.512) | lr 9.35e-04


Epoch 34:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 34 ➜ train 92.6% | val 95.3% (loss 0.515) | lr 9.29e-04
[Plateau] LR reduced: 9.35e-04 -> 9.29e-04


Epoch 35:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 35 ➜ train 92.7% | val 95.4% (loss 0.516) | lr 9.23e-04
[Plateau] LR reduced: 9.29e-04 -> 9.23e-04


Epoch 36:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 36 ➜ train 92.9% | val 95.4% (loss 0.514) | lr 9.17e-04
[Plateau] LR reduced: 9.23e-04 -> 4.59e-04


Epoch 37:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 37 ➜ train 92.9% | val 95.6% (loss 0.512) | lr 9.11e-04


Epoch 38:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 38 ➜ train 93.1% | val 95.4% (loss 0.512) | lr 9.05e-04
[Plateau] LR reduced: 9.11e-04 -> 9.05e-04


Epoch 39:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 39 ➜ train 93.2% | val 95.8% (loss 0.508) | lr 8.98e-04
[Plateau] LR reduced: 9.05e-04 -> 8.98e-04
★ Saved new BEST CNN @ 95.8%


Epoch 40:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 40 ➜ train 93.2% | val 96.0% (loss 0.501) | lr 8.91e-04
[Plateau] LR reduced: 8.98e-04 -> 8.91e-04
★ Saved new BEST CNN @ 96.0%


Epoch 41:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 41 ➜ train 93.2% | val 95.6% (loss 0.505) | lr 8.84e-04
[Plateau] LR reduced: 8.91e-04 -> 8.84e-04


Epoch 42:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 42 ➜ train 93.4% | val 95.8% (loss 0.499) | lr 8.77e-04
[Plateau] LR reduced: 8.84e-04 -> 8.77e-04


Epoch 43:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 43 ➜ train 93.4% | val 95.8% (loss 0.504) | lr 8.69e-04
[Plateau] LR reduced: 8.77e-04 -> 8.69e-04


Epoch 44:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 44 ➜ train 93.5% | val 95.7% (loss 0.503) | lr 8.61e-04
[Plateau] LR reduced: 8.69e-04 -> 8.61e-04


Epoch 45:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 45 ➜ train 93.6% | val 95.7% (loss 0.497) | lr 8.54e-04
[Plateau] LR reduced: 8.61e-04 -> 8.54e-04


Epoch 46:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 46 ➜ train 93.8% | val 95.7% (loss 0.503) | lr 8.46e-04
[Plateau] LR reduced: 8.54e-04 -> 8.46e-04


Epoch 47:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 47 ➜ train 93.8% | val 95.8% (loss 0.499) | lr 8.37e-04
[Plateau] LR reduced: 8.46e-04 -> 8.37e-04


Epoch 48:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 48 ➜ train 93.9% | val 95.8% (loss 0.501) | lr 8.29e-04
[Plateau] LR reduced: 8.37e-04 -> 8.29e-04


Epoch 49:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 49 ➜ train 93.9% | val 96.0% (loss 0.498) | lr 8.20e-04
[Plateau] LR reduced: 8.29e-04 -> 4.10e-04
★ Saved new BEST CNN @ 96.0%


Epoch 50:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 50 ➜ train 94.0% | val 96.1% (loss 0.498) | lr 8.12e-04
★ Saved new BEST CNN @ 96.1%


Epoch 51:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 51 ➜ train 94.1% | val 95.8% (loss 0.501) | lr 8.03e-04
[Plateau] LR reduced: 8.12e-04 -> 8.03e-04


Epoch 52:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 52 ➜ train 94.1% | val 95.9% (loss 0.497) | lr 7.94e-04
[Plateau] LR reduced: 8.03e-04 -> 7.94e-04


Epoch 53:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 53 ➜ train 94.0% | val 96.0% (loss 0.494) | lr 7.85e-04
[Plateau] LR reduced: 7.94e-04 -> 7.85e-04


Epoch 54:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 54 ➜ train 94.3% | val 95.9% (loss 0.500) | lr 7.75e-04
[Plateau] LR reduced: 7.85e-04 -> 7.75e-04


Epoch 55:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 55 ➜ train 94.4% | val 96.0% (loss 0.497) | lr 7.66e-04
[Plateau] LR reduced: 7.75e-04 -> 7.66e-04


Epoch 56:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 56 ➜ train 94.3% | val 96.0% (loss 0.498) | lr 7.56e-04
[Plateau] LR reduced: 7.66e-04 -> 7.56e-04


Epoch 57:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 57 ➜ train 94.4% | val 96.3% (loss 0.494) | lr 7.47e-04
[Plateau] LR reduced: 7.56e-04 -> 3.73e-04
★ Saved new BEST CNN @ 96.3%


Epoch 58:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 58 ➜ train 94.5% | val 96.2% (loss 0.494) | lr 7.37e-04


Epoch 59:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 59 ➜ train 94.6% | val 96.0% (loss 0.490) | lr 7.27e-04
[Plateau] LR reduced: 7.37e-04 -> 7.27e-04


Epoch 60:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 60 ➜ train 94.7% | val 95.9% (loss 0.501) | lr 7.17e-04
[Plateau] LR reduced: 7.27e-04 -> 7.17e-04


Epoch 61:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 61 ➜ train 94.7% | val 95.8% (loss 0.501) | lr 7.07e-04
[Plateau] LR reduced: 7.17e-04 -> 7.07e-04


Epoch 62:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 62 ➜ train 94.9% | val 96.3% (loss 0.492) | lr 6.97e-04
[Plateau] LR reduced: 7.07e-04 -> 6.97e-04


Epoch 63:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 63 ➜ train 94.8% | val 95.9% (loss 0.498) | lr 6.86e-04
[Plateau] LR reduced: 6.97e-04 -> 3.43e-04


Epoch 64:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 64 ➜ train 95.0% | val 96.3% (loss 0.488) | lr 6.76e-04
★ Saved new BEST CNN @ 96.3%


Epoch 65:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 65 ➜ train 94.9% | val 96.0% (loss 0.505) | lr 6.65e-04
[Plateau] LR reduced: 6.76e-04 -> 6.65e-04


Epoch 66:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 66 ➜ train 95.0% | val 96.2% (loss 0.497) | lr 6.55e-04
[Plateau] LR reduced: 6.65e-04 -> 6.55e-04


Epoch 67:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 67 ➜ train 95.0% | val 96.1% (loss 0.497) | lr 6.44e-04
[Plateau] LR reduced: 6.55e-04 -> 6.44e-04


Epoch 68:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 68 ➜ train 95.1% | val 96.1% (loss 0.495) | lr 6.33e-04
[Plateau] LR reduced: 6.44e-04 -> 3.17e-04


Epoch 69:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 69 ➜ train 95.1% | val 96.2% (loss 0.493) | lr 6.22e-04


Epoch 70:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 70 ➜ train 95.1% | val 96.0% (loss 0.498) | lr 6.11e-04
[Plateau] LR reduced: 6.22e-04 -> 6.11e-04


Epoch 71:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 71 ➜ train 95.3% | val 96.0% (loss 0.498) | lr 6.00e-04
[Plateau] LR reduced: 6.11e-04 -> 6.00e-04


Epoch 72:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 72 ➜ train 95.3% | val 96.1% (loss 0.498) | lr 5.89e-04
[Plateau] LR reduced: 6.00e-04 -> 2.95e-04


Epoch 73:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 73 ➜ train 95.4% | val 96.2% (loss 0.498) | lr 5.78e-04


Epoch 74:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 74 ➜ train 95.5% | val 96.3% (loss 0.490) | lr 5.67e-04
[Plateau] LR reduced: 5.78e-04 -> 5.67e-04


Epoch 75:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 75 ➜ train 95.5% | val 96.2% (loss 0.492) | lr 5.56e-04
[Plateau] LR reduced: 5.67e-04 -> 5.56e-04


Epoch 76:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 76 ➜ train 95.6% | val 96.2% (loss 0.492) | lr 5.45e-04
[Plateau] LR reduced: 5.56e-04 -> 2.72e-04


Epoch 77:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 77 ➜ train 95.6% | val 96.1% (loss 0.496) | lr 5.34e-04


Epoch 78:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 78 ➜ train 95.7% | val 96.2% (loss 0.495) | lr 5.22e-04
[Plateau] LR reduced: 5.34e-04 -> 5.22e-04


Epoch 79:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 79 ➜ train 95.7% | val 96.2% (loss 0.494) | lr 5.11e-04
[Plateau] LR reduced: 5.22e-04 -> 5.11e-04


Epoch 80:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 80 ➜ train 95.8% | val 96.3% (loss 0.495) | lr 5.00e-04
[Plateau] LR reduced: 5.11e-04 -> 2.50e-04


Epoch 81:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 81 ➜ train 95.9% | val 96.2% (loss 0.494) | lr 4.89e-04


Epoch 82:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 82 ➜ train 95.8% | val 96.3% (loss 0.493) | lr 4.78e-04
[Plateau] LR reduced: 4.89e-04 -> 4.78e-04
★ Saved new BEST CNN @ 96.3%


Epoch 83:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 83 ➜ train 96.0% | val 96.2% (loss 0.498) | lr 4.66e-04
[Plateau] LR reduced: 4.78e-04 -> 4.66e-04


Epoch 84:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 84 ➜ train 95.9% | val 96.2% (loss 0.494) | lr 4.55e-04
[Plateau] LR reduced: 4.66e-04 -> 2.28e-04


Epoch 85:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 85 ➜ train 95.9% | val 96.2% (loss 0.496) | lr 4.44e-04


Epoch 86:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 86 ➜ train 96.0% | val 96.3% (loss 0.495) | lr 4.33e-04
[Plateau] LR reduced: 4.44e-04 -> 4.33e-04


Epoch 87:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 87 ➜ train 96.1% | val 96.3% (loss 0.492) | lr 4.22e-04
[Plateau] LR reduced: 4.33e-04 -> 4.22e-04


Epoch 88:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 88 ➜ train 96.1% | val 96.3% (loss 0.492) | lr 4.11e-04
[Plateau] LR reduced: 4.22e-04 -> 2.05e-04


Epoch 89:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 89 ➜ train 96.2% | val 96.2% (loss 0.494) | lr 4.00e-04


Epoch 90:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 90 ➜ train 96.1% | val 96.5% (loss 0.489) | lr 3.89e-04
[Plateau] LR reduced: 4.00e-04 -> 3.89e-04
★ Saved new BEST CNN @ 96.5%


Epoch 91:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 91 ➜ train 96.3% | val 96.4% (loss 0.490) | lr 3.78e-04
[Plateau] LR reduced: 3.89e-04 -> 3.78e-04


Epoch 92:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 92 ➜ train 96.2% | val 96.2% (loss 0.491) | lr 3.67e-04
[Plateau] LR reduced: 3.78e-04 -> 1.83e-04


Epoch 93:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 93 ➜ train 96.3% | val 96.5% (loss 0.489) | lr 3.56e-04
★ Saved new BEST CNN @ 96.5%


Epoch 94:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 94 ➜ train 96.3% | val 96.3% (loss 0.494) | lr 3.45e-04
[Plateau] LR reduced: 3.56e-04 -> 3.45e-04


Epoch 95:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 95 ➜ train 96.3% | val 96.3% (loss 0.492) | lr 3.35e-04
[Plateau] LR reduced: 3.45e-04 -> 3.35e-04


Epoch 96:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 96 ➜ train 96.5% | val 96.4% (loss 0.490) | lr 3.24e-04
[Plateau] LR reduced: 3.35e-04 -> 1.62e-04


Epoch 97:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 97 ➜ train 96.5% | val 96.3% (loss 0.489) | lr 3.14e-04


Epoch 98:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 98 ➜ train 96.5% | val 96.4% (loss 0.491) | lr 3.03e-04
[Plateau] LR reduced: 3.14e-04 -> 3.03e-04


Epoch 99:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 99 ➜ train 96.4% | val 96.4% (loss 0.489) | lr 2.93e-04
[Plateau] LR reduced: 3.03e-04 -> 2.93e-04


Epoch 100:   0%|          | 0/663 [00:00<?, ?it/s]

Epoch 100 ➜ train 96.6% | val 96.4% (loss 0.491) | lr 2.83e-04
[Plateau] LR reduced: 2.93e-04 -> 1.42e-04


In [None]:
    # ---- persist the final-epoch weights, then mirror the best checkpoint ----
    # The last-epoch state dict is written twice: once to local disk and once
    # to the Drive checkpoint directory.
    for last_target in ("/content/last_cnn_raw.pt", CKPT_DIR / "last_cnn_raw.pt"):
        torch.save(model.state_dict(), last_target)
    # The best-by-validation-accuracy file only exists if some epoch improved
    # on the initial best accuracy of 0.0, hence the existence check.
    if BEST_PATH.exists():
        import shutil
        drive_best = CKPT_DIR / "best_cnn.pt"
        shutil.copy2(BEST_PATH, drive_best)
        print(f"Copied BEST CNN to Drive: {CKPT_DIR/'best_cnn.pt'}")

Copied BEST CNN to Drive: /content/drive/MyDrive/kws_models/best_cnn.pt
