## Детекция фраз "не слышу" / "не слышно"

Бинарная классификация звонков: метка 1, если в аудио есть хотя бы одна из целевых фраз.

In [1]:
import json
import random
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchaudio.functional as AF

from tqdm import tqdm

import sys
sys.path.append("../mel_spectrogram")
from mel_spectrogram import compute_mel_spectrogram, load_audio


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Seed the Python, NumPy and PyTorch random generators with the same fixed value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
print("device =", DEVICE)


device = cpu


### Хелперы

In [3]:
def _compute_deltas(m: np.ndarray) -> np.ndarray:
    d = np.zeros_like(m)
    d[:, 1:] = m[:, 1:] - m[:, :-1]
    return d


def _resample(wav: np.ndarray, sr: int, target_sr: int) -> tuple[np.ndarray, int]:
    """Поменять sample_rate."""
    if sr == target_sr:
        return wav, sr
    tensor = torch.tensor(wav, dtype=torch.float32)
    resampled = AF.resample(tensor, sr, target_sr)
    return resampled.numpy(), target_sr


def compute_mel_cached(
    path: Path,
    cache_dir: Path | None,
    *,
    target_sr: int,
    n_mels: int,
    window_ms: float,
    hop_ms: float,
    f_min: float,
    f_max: float | None,
) -> np.ndarray:
    """Рассчитать mel-спектрограмму. Если есть закэшированная, взять ее, иначе закэшировать."""
    cache_path = None
    if cache_dir is not None:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_path = cache_dir / f"{path.stem}_sr{target_sr}_m{n_mels}_w{int(window_ms)}_h{int(hop_ms)}.npz"
        if cache_path.exists():
            return np.load(cache_path)["mel"]

    wav, sr = load_audio(str(path), mono=True)
    wav, sr = _resample(wav, sr, target_sr)
    mel = compute_mel_spectrogram(
        wav,
        sr,
        window_ms=window_ms,
        hop_ms=hop_ms,
        n_mels=n_mels,
        f_min=f_min,
        f_max=f_max,
        norm="slaney",
        log_scale=True,
        eps=1e-12,
    ).mel_spectrogram

    if cache_path is not None:
        np.savez(cache_path, mel=mel)
    return mel


def window_mel(mel: np.ndarray, start_frame: int, target_frames: int) -> np.ndarray:
    """Cut a window of ``target_frames`` columns starting at ``start_frame``.

    If the window runs past the end of *mel*, the missing columns on the
    right are filled with the minimum value of *mel*. The input array is
    not modified.

    Args:
        mel: 2-D array whose axis 1 is the frame axis.
        start_frame: Index of the first frame of the window.
        target_frames: Number of frames in the returned window.

    Returns:
        Array with ``mel.shape[0]`` rows and ``target_frames`` columns.
    """
    end_frame = start_frame + target_frames
    missing = end_frame - mel.shape[1]
    if missing > 0:
        # The minimum requires a full pass over the spectrogram, so compute
        # it only when padding is actually needed.
        mel = np.pad(
            mel,
            ((0, 0), (0, missing)),
            mode="constant",
            constant_values=float(mel.min()),
        )
    return mel[:, start_frame:end_frame]


def _pool_time(x: np.ndarray, out_T: int) -> np.ndarray:
    """Усреднить по времени до out_T бинов."""
    T = x.shape[-1]
    if T == out_T:
        return x
    edges = np.linspace(0, T, out_T + 1).astype(int)
    pooled = []
    for i in range(out_T):
        a, b = int(edges[i]), int(edges[i + 1])
        pooled.append(x[..., a:b].mean(axis=-1, keepdims=True))
    return np.concatenate(pooled, axis=-1)


def features_from_mel(mel: np.ndarray) -> torch.Tensor:
    """Turn one mel window into the flat feature vector fed to the DNN.

    The window, its first differences and its second differences are
    stacked as three channels, pooled to 50 time bins, standardised along
    the time axis (mean 0, std 1, with 1e-5 added to the std), flattened
    and returned as a float32 tensor of length ``3 * n_mels * 50``.
    """
    delta = _compute_deltas(mel)
    delta2 = _compute_deltas(delta)
    # (3, n_mels, T) -> (3, n_mels, 50)
    stacked = _pool_time(np.stack((mel, delta, delta2)), out_T=50)
    time_mean = stacked.mean(axis=-1, keepdims=True)
    time_std = stacked.std(axis=-1, keepdims=True) + 1e-5
    normalised = (stacked - time_mean) / time_std
    flat = normalised.reshape(-1).astype(np.float32)
    return torch.from_numpy(flat)


def collect_audio_files(audio_dir: Path) -> list[Path]:
    """Return the ``*.opus`` files located directly in *audio_dir*, sorted."""
    return sorted(audio_dir.glob("*.opus"))


def load_bounds(path: Path) -> dict[str, tuple[float, float]]:
    """Load the target-phrase boundaries from a JSON file.

    The file must contain an object that maps a recording id to a sequence
    whose first two items are the start and end of the target phrase
    (presumably in seconds — the dataset compares them with durations in
    seconds). Both values are converted to ``float``.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's locale encoding.
    """
    raw = json.loads(path.read_text(encoding="utf-8"))
    return {uid: (float(span[0]), float(span[1])) for uid, span in raw.items()}


def train_val_split(files: list[Path], val_ratio: float = 0.15, seed: int = 42):
    """Shuffle *files* with a private seeded RNG and split them into train/val.

    The validation part holds ``int(len(files) * val_ratio)`` items, but
    never fewer than one slot. The input list is left untouched.

    Returns:
        ``(train_files, val_files)``.
    """
    shuffled = list(files)
    random.Random(seed).shuffle(shuffled)
    val_count = max(1, int(len(shuffled) * val_ratio))
    val_part = shuffled[:val_count]
    train_part = shuffled[val_count:]
    return train_part, val_part


### Датасет

In [4]:
class CallQualityDataset(Dataset):
    def __init__(
        self,
        files: list[Path],
        bounds: dict[str, tuple[float, float]],
        segment_seconds: float = 2.5,
        n_segments_per_file: int = 4,
        target_sr: int = 16000,
        n_mels: int = 48,
        window_ms: float = 25.0,
        hop_ms: float = 10.0,
        cache_dir: Path | None = None,
    ):
        self.files = files
        self.bounds = bounds
        self.segment_seconds = segment_seconds
        self.n_segments_per_file = n_segments_per_file
        self.target_sr = target_sr
        self.n_mels = n_mels
        self.window_ms = window_ms
        self.hop_ms = hop_ms
        self.cache_dir = cache_dir

        self.frames_per_sec = 1000.0 / hop_ms
        self.target_frames = int(round(segment_seconds * self.frames_per_sec))
        self.feature_dim = 3 * n_mels * 50

    def __len__(self):
        return len(self.files)

    def _start_time(self, total_sec: float, label: int, bounds: tuple[float, float] | None) -> float:
        if label == 0 or bounds is None:
            # Если нет таргета, взять случайное начало
            return random.uniform(0.0, max(total_sec - self.segment_seconds, 0.0))
        start, end = bounds
        center = 0.5 * (start + end)
        st = max(0.0, center - self.segment_seconds / 2)
        return min(st, max(total_sec - self.segment_seconds, 0.0))

    def __getitem__(self, idx: int):
        path = self.files[idx]
        uid = path.stem
        label = 1 if uid in self.bounds else 0
        bounds = self.bounds.get(uid)

        mel = compute_mel_cached(
            path,
            cache_dir=self.cache_dir,
            target_sr=self.target_sr,
            n_mels=self.n_mels,
            window_ms=self.window_ms,
            hop_ms=self.hop_ms,
            f_min=50.0,
            f_max=7800.0,
        )

        total_sec = mel.shape[1] / self.frames_per_sec

        starts = []
        if label == 1 and bounds is not None:
            starts.append(self._start_time(total_sec, label, bounds))
            for _ in range(max(self.n_segments_per_file - 1, 0)):
                starts.append(random.uniform(0.0, max(total_sec - self.segment_seconds, 0.0)))
        else:
            for _ in range(max(self.n_segments_per_file, 1)):
                starts.append(random.uniform(0.0, max(total_sec - self.segment_seconds, 0.0)))

        feats = []
        for start_sec in starts:
            start_frame = int(start_sec * self.frames_per_sec)
            window = window_mel(mel, start_frame, self.target_frames)
            feats.append(features_from_mel(window))
        x = torch.stack(feats, dim=0)  # (n_segments, feature_dim)
        y = torch.tensor(label, dtype=torch.float32)
        return x, y


### Модель

In [5]:
class DNNKWS(nn.Module):
    """Fully connected classifier that outputs one logit per feature vector.

    Layout: ``LayerNorm`` over the input, then for every width in ``hidden``
    a ``Linear -> ReLU -> Dropout`` group, then a final ``Linear`` to one unit.
    """

    def __init__(self, input_dim: int, hidden=(256, 128), dropout: float = 0.2):
        super().__init__()
        widths = [input_dim, *hidden]
        # Input normalisation comes first.
        stack = [nn.LayerNorm(input_dim)]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)))
        stack.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return logits with the trailing singleton dimension removed."""
        logits = self.net(x)
        return logits.squeeze(-1)


### Обучение и инференс

In [6]:
def compute_pos_weight(files: list[Path], bounds: dict[str, tuple[float, float]]):
    """Return the negatives-to-positives ratio as a one-element float32 tensor.

    A file is positive when its stem is a key of *bounds*. Both counts are
    clamped to at least 1, so the ratio is always finite and non-zero.
    """
    n_pos = sum(f.stem in bounds for f in files)
    n_neg = len(files) - n_pos
    ratio = max(n_neg, 1) / max(n_pos, 1)
    return torch.tensor([ratio], dtype=torch.float32)


def train_epoch(model, loader, opt, pos_weight=None):
    """Run one optimisation pass over *loader* and return the mean loss per item.

    The model is switched to training mode. The loss is
    ``BCEWithLogitsLoss`` (optionally with ``pos_weight``) applied to
    file-level logits; the sum of per-batch losses weighted by batch size is
    divided by ``len(loader.dataset)``.
    """
    model.train()
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    running = 0.0
    for batch_x, batch_y in loader:
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)
        # batch_x is (B, K, D) because there are K windows per file;
        # max-pool the window logits to get one logit per file.
        file_logits, _ = model(batch_x).max(dim=1)
        batch_loss = loss_fn(file_logits, batch_y)
        opt.zero_grad()
        batch_loss.backward()
        opt.step()
        running += batch_loss.item() * batch_y.size(0)
    return running / len(loader.dataset)


@torch.no_grad()
def predict_file(
    model,
    path: Path,
    segment_seconds: float = 2.5,
    hop_seconds: float = 1.0,
    target_sr: int = 16000,
    n_mels: int = 48,
    window_ms: float = 25.0,
    hop_ms: float = 10.0,
    cache_dir: Path | None = None,
) -> float:
    """Score one file: the highest window probability over a sliding window.

    Windows of ``segment_seconds`` start at frame 0 and advance by
    ``hop_seconds`` (at least one frame) while the start lies inside the
    spectrogram. Each window is scored separately with ``sigmoid(model(.))``.
    Returns 0.0 for an empty spectrogram. The model's train/eval mode is not
    changed here.
    """
    mel = compute_mel_cached(
        path,
        cache_dir=cache_dir,
        target_sr=target_sr,
        n_mels=n_mels,
        window_ms=window_ms,
        hop_ms=hop_ms,
        f_min=50.0,
        f_max=7800.0,
    )
    frames_per_sec = 1000.0 / hop_ms
    target_frames = int(round(segment_seconds * frames_per_sec))
    hop_frames = max(1, int(round(hop_seconds * frames_per_sec)))

    window_probs = []
    for first_frame in range(0, mel.shape[1], hop_frames):
        segment = window_mel(mel, first_frame, target_frames)
        vector = features_from_mel(segment).to(DEVICE)
        logit = model(vector.unsqueeze(0))
        window_probs.append(torch.sigmoid(logit).item())
    return max(window_probs, default=0.0)


def _metrics_from_counts(tp: int, fp: int, fn: int, tn: int):
    pos = tp + fn
    neg = tn + fp
    far = fp / max(neg, 1)
    frr = fn / max(pos, 1)
    score = 0.0
    if (1 - frr + 1 - far) > 0:
        score = 2 * (1 - frr) * (1 - far) / (1 - frr + 1 - far)
    return score, far, frr


def evaluate_on_files(
    model,
    files: list[Path],
    bounds: dict[str, tuple[float, float]],
    threshold: float = 0.5,
    thresholds: list[float] | None = None,
    segment_seconds: float = 2.5,
    hop_seconds: float = 1.0,
    target_sr: int = 16000,
    n_mels: int = 48,
    window_ms: float = 25.0,
    hop_ms: float = 10.0,
    cache_dir: Path | None = None,
):
    was_training = model.training
    model.eval()

    y_true_list = []
    prob_list = []
    for path in files:
        y_true = 1 if path.stem in bounds else 0
        prob = predict_file(
            model,
            path,
            segment_seconds=segment_seconds,
            hop_seconds=hop_seconds,
            target_sr=target_sr,
            n_mels=n_mels,
            window_ms=window_ms,
            hop_ms=hop_ms,
            cache_dir=cache_dir,
        )
        y_true_list.append(y_true)
        prob_list.append(prob)

    y_true_arr = np.asarray(y_true_list, dtype=np.int8)
    prob_arr = np.asarray(prob_list, dtype=np.float32)

    cand_thresholds = thresholds or [threshold]
    best = None
    for th in cand_thresholds:
        y_pred = prob_arr >= float(th)
        tp = int(((y_true_arr == 1) & y_pred).sum())
        fp = int(((y_true_arr == 0) & y_pred).sum())
        fn = int(((y_true_arr == 1) & (~y_pred)).sum())
        tn = int(((y_true_arr == 0) & (~y_pred)).sum())
        score, far, frr = _metrics_from_counts(tp, fp, fn, tn)
        if best is None or score > best["score"]:
            best = {
                "threshold": float(th),
                "score": float(score),
                "far": float(far),
                "frr": float(frr),
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }

    if was_training:
        model.train()

    return best


def make_submission(
    model,
    files: list[Path],
    out_path: Path,
    threshold: float = 0.5,
    segment_seconds: float = 2.5,
    hop_seconds: float = 1.0,
    target_sr: int = 16000,
    n_mels: int = 48,
    window_ms: float = 25.0,
    hop_ms: float = 10.0,
    cache_dir: Path | None = None,
):
    was_training = model.training
    model.eval()
    rows = ["id,label"]
    for path in files:
        prob = predict_file(
            model,
            path,
            segment_seconds=segment_seconds,
            hop_seconds=hop_seconds,
            target_sr=target_sr,
            n_mels=n_mels,
            window_ms=window_ms,
            hop_ms=hop_ms,
            cache_dir=cache_dir,
        )
        label = 1 if prob >= threshold else 0
        rows.append(f"{path.stem},{label}")
    out_path.write_text("\n".join(rows))
    if was_training:
        model.train()
    return out_path


## Обучение

In [7]:
# Locations of the data, the annotation file and the spectrogram cache.
TRAIN_ROOT = Path("./train_opus")
TEST_ROOT = Path("./test_opus")
TRAIN_AUDIO = TRAIN_ROOT / "audio"
TEST_AUDIO = TEST_ROOT / "audio"
BOUNDS_PATH = TRAIN_ROOT / "word_bounds.json"
CACHE_DIR = Path("./mel_cache")

# Window length, sliding-window step at inference, and number of mel bands.
segment_seconds = 2.5
hop_seconds = 1.0
n_mels = 48

# Load the annotations, list the training audio and hold out 15% for validation.
bounds = load_bounds(BOUNDS_PATH)
all_files = collect_audio_files(TRAIN_AUDIO)
train_files, val_files = train_val_split(all_files, val_ratio=0.15, seed=SEED)

train_ds = CallQualityDataset(
    train_files,
    bounds=bounds,
    segment_seconds=segment_seconds,
    n_segments_per_file=4,
    target_sr=16000,
    n_mels=n_mels,
    cache_dir=CACHE_DIR,
)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=0)

model = DNNKWS(input_dim=train_ds.feature_dim, hidden=(512, 256), dropout=0.1).to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Weight of the positive class in the loss: negatives / positives among the training files.
pos_weight = compute_pos_weight(train_files, bounds).to(DEVICE)

best_threshold = 0.5
# Candidate decision thresholds: 0.05, 0.10, ..., 0.95.
threshold_grid = np.linspace(0.05, 0.95, 19).tolist()

# 15 epochs; after each one the validation files are scored and the threshold is re-tuned.
for epoch in tqdm(range(1, 16), leave=False):
    loss = train_epoch(model, train_loader, opt, pos_weight=pos_weight)
    metrics = evaluate_on_files(
        model,
        val_files,
        bounds,
        thresholds=threshold_grid,
        segment_seconds=segment_seconds,
        hop_seconds=hop_seconds,
        target_sr=16000,
        n_mels=n_mels,
        cache_dir=CACHE_DIR,
    )
    # Overwritten every epoch, so after the loop this is the threshold that
    # scored best for the final model, not the best value across all epochs.
    best_threshold = metrics["threshold"]
    print(
        f"epoch={epoch:02d} loss={loss:.4f} score={metrics['score']:.4f} "
        f"far={metrics['far']:.4f} frr={metrics['frr']:.4f} thr={best_threshold:.2f}"
    )


  7%|▋         | 1/15 [03:12<45:01, 192.98s/it]

epoch=01 loss=0.5007 score=0.5829 far=0.3508 frr=0.4710 thr=0.40


 13%|█▎        | 2/15 [06:27<42:02, 194.05s/it]

epoch=02 loss=0.4143 score=0.6277 far=0.3508 frr=0.3925 thr=0.35


 20%|██        | 3/15 [09:39<38:37, 193.12s/it]

epoch=03 loss=0.3768 score=0.6453 far=0.3637 frr=0.3455 thr=0.25


 27%|██▋       | 4/15 [12:50<35:14, 192.19s/it]

epoch=04 loss=0.3452 score=0.6585 far=0.2793 frr=0.3938 thr=0.30


 33%|███▎      | 5/15 [15:57<31:43, 190.39s/it]

epoch=05 loss=0.3270 score=0.6636 far=0.3175 frr=0.3542 thr=0.35


 40%|████      | 6/15 [19:03<28:19, 188.80s/it]

epoch=06 loss=0.3117 score=0.6764 far=0.3281 frr=0.3191 thr=0.15


 47%|████▋     | 7/15 [22:10<25:04, 188.08s/it]

epoch=07 loss=0.2995 score=0.6733 far=0.2579 frr=0.3838 thr=0.35


 53%|█████▎    | 8/15 [25:20<22:00, 188.68s/it]

epoch=08 loss=0.2869 score=0.6874 far=0.2820 frr=0.3406 thr=0.25


 60%|██████    | 9/15 [28:31<18:57, 189.52s/it]

epoch=09 loss=0.2801 score=0.6818 far=0.3058 frr=0.3301 thr=0.20


 67%|██████▋   | 10/15 [31:44<15:52, 190.50s/it]

epoch=10 loss=0.2730 score=0.6828 far=0.2973 frr=0.3360 thr=0.20


 73%|███████▎  | 11/15 [34:56<12:43, 190.94s/it]

epoch=11 loss=0.2615 score=0.6870 far=0.2716 frr=0.3499 thr=0.15


 80%|████████  | 12/15 [38:06<09:32, 190.83s/it]

epoch=12 loss=0.2569 score=0.6910 far=0.2649 frr=0.3482 thr=0.25


 87%|████████▋ | 13/15 [41:15<06:20, 190.30s/it]

epoch=13 loss=0.2511 score=0.6936 far=0.2756 frr=0.3347 thr=0.30


 93%|█████████▎| 14/15 [44:26<03:10, 190.44s/it]

epoch=14 loss=0.2439 score=0.6938 far=0.2813 frr=0.3294 thr=0.25


                                                

epoch=15 loss=0.2374 score=0.6901 far=0.2652 frr=0.3495 thr=0.25




In [8]:
# Bare expression: the notebook prints the module tree of the network.
model

DNNKWS(
  (net): Sequential(
    (0): LayerNorm((7200,), eps=1e-05, elementwise_affine=True)
    (1): Linear(in_features=7200, out_features=512, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.1, inplace=False)
    (7): Linear(in_features=256, out_features=1, bias=True)
  )
)

## Сабмит

In [9]:
# Score every test recording with the final model and write the predicted labels to CSV.
test_files = collect_audio_files(TEST_AUDIO)
out_csv = make_submission(
    model,
    test_files,
    # NOTE(review): the file name is spelled "submition.csv"; confirm that
    # whatever consumes the file expects this exact name before renaming it.
    out_path=Path("submition.csv"),
    # Threshold tuned on the validation split after the last training epoch.
    threshold=best_threshold,
    segment_seconds=segment_seconds,
    hop_seconds=hop_seconds,
    target_sr=16000,
    n_mels=n_mels,
    cache_dir=CACHE_DIR,
)
print(f"Сохранено в {out_csv} с {len(test_files)} записями")


Сохранено в submition.csv с 27000 записями
