In [None]:
import librosa
import numpy as np
import matplotlib
import os
import glob
import random
import soundfile as sf
from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

In [None]:
def load_fix_audio(path, target_sr=16000, target_len_sec=3.0):
    """Load an audio file as mono and force it to a fixed number of samples.

    The file is read with ``librosa.load`` at ``target_sr`` Hz. A clip shorter
    than the target is zero-padded at the end; otherwise it is cut to the
    target length.

    Args:
        path: Location of the audio file, passed straight to ``librosa.load``.
        target_sr: Sample rate requested from ``librosa.load``.
        target_len_sec: Desired clip duration in seconds.

    Returns:
        Array holding exactly ``int(target_sr * target_len_sec)`` samples.
    """
    samples, _ = librosa.load(path, sr=target_sr, mono=True)

    wanted = int(target_sr * target_len_sec)
    missing = wanted - len(samples)

    # Too short: append silence. Otherwise: keep only the leading samples.
    if missing > 0:
        return np.pad(samples, (0, missing), mode='constant')
    return samples[:wanted]

In [None]:
def split_dataset(root_dir, test_size=0.15, val_size=0.15, min_per_class=2):
    """Collect ``*.wav`` files under ``root_dir`` and split them into train/val/test.

    A file gets label 1 ("barbie") when the substring ``"barbie"`` occurs in its
    path string, otherwise label 0 ("puppy"). Both splits are stratified by
    label and use ``random_state=42``.

    Args:
        root_dir: Directory that is searched recursively for ``*.wav`` files.
        test_size: Fraction of all files that goes to the test split.
        val_size: Fraction of all files that goes to the validation split.
        min_per_class: Minimum number of files each class must have.

    Returns:
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test)`` where every
        ``X_*`` is a list of path strings and every ``y_*`` a list of 0/1 labels.

    Raises:
        ValueError: If either class has fewer than ``min_per_class`` files.
    """
    # Sort the paths: rglob order depends on the filesystem, and the seeded
    # split is only reproducible when its input order is stable.
    files = sorted(str(p) for p in Path(root_dir).rglob("*.wav"))

    # NOTE: the match is on the whole path string, so a root_dir that itself
    # contains "barbie" would label every file as 1.
    labels = [1 if "barbie" in f else 0 for f in files]

    n_barbie = sum(labels)
    n_puppy = len(labels) - n_barbie
    print(f"Всего: {len(files)} файлов (barbie={n_barbie}, puppy={n_puppy})")

    if min(n_barbie, n_puppy) < min_per_class:
        raise ValueError(
            f"Need at least {min_per_class} files per class, "
            f"got barbie={n_barbie}, puppy={n_puppy}"
        )

    holdout = test_size + val_size
    X_train, X_temp, y_train, y_temp = train_test_split(
        files, labels, test_size=holdout, stratify=labels, random_state=42
    )

    # In train_test_split, `test_size` sizes the SECOND returned subset, which
    # is bound to X_test here, so it must be the test share of the hold-out
    # part (the original passed the validation share, swapping the two sizes).
    rel_test = test_size / holdout

    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=rel_test, stratify=y_temp, random_state=42
    )

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)


In [None]:
def extract_logmel(y, sr=16000, n_mels=64):
    """Compute a log-scaled mel spectrogram of a waveform.

    Args:
        y: Audio samples, forwarded to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y`` in Hz.
        n_mels: Number of mel bands.

    Returns:
        The mel spectrogram converted with ``librosa.power_to_db`` using
        ``ref=np.max``, i.e. dB values relative to the clip's own maximum.
    """
    power_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(power_spec, ref=np.max)

In [None]:
# Waveform augmentation pipeline shared by the dataset and load_and_augment().
# Every transform is configured with p=0.5 (audiomentations' per-transform
# application probability).
augment_transform = Compose([
    # Additive Gaussian noise, amplitude between 0.001 and 0.015.
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    # Tempo change with a rate between 0.8 and 1.25.
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    # Pitch change of up to 2 semitones down or up.
    PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    # Time shift with the library's default range
    # (defaults differ between audiomentations versions — TODO confirm).
    Shift(p=0.5),
])

def load_and_augment(path, sr=16000):
    """Load an audio file and return one randomly augmented version of it.

    Unlike ``load_fix_audio`` this helper does not pad or trim the clip.

    Args:
        path: Location of the audio file, passed to ``librosa.load``.
        sr: Sample rate used both for loading and for the augmentation call.

    Returns:
        The result of applying ``augment_transform`` to the loaded samples.
    """
    waveform, _unused_sr = librosa.load(path, sr=sr)
    return augment_transform(samples=waveform, sample_rate=sr)


In [None]:
class AudioDataset(Dataset):
    """Dataset that turns audio files into normalised feature tensors.

    Each item is loaded with ``load_fix_audio``, optionally augmented with
    ``augment_transform``, converted to a feature map, standardised to zero
    mean / unit variance per item and returned with a leading channel axis.

    Args:
        files: Sequence of audio file paths.
        labels: Sequence of integer class labels, aligned with ``files``.
        feature_type: Feature extractor to use; only ``"logmel"`` is implemented.
        augment: When true, apply ``augment_transform`` to the waveform.
        sr: Sample rate in Hz used for loading, augmentation and feature
            extraction (previously hard-coded to 16000 in the augmentation call).

    Raises:
        ValueError: If ``files`` and ``labels`` differ in length or
            ``feature_type`` is not supported.
    """

    SUPPORTED_FEATURES = ("logmel",)

    def __init__(self, files, labels, feature_type="logmel", augment=False, sr=16000):
        if len(files) != len(labels):
            raise ValueError(
                f"files and labels must have the same length "
                f"({len(files)} != {len(labels)})"
            )
        # Fail at construction time instead of on the first item access.
        if feature_type not in self.SUPPORTED_FEATURES:
            raise ValueError("Unknown feature type")

        self.files = files
        self.labels = labels
        self.feature_type = feature_type
        self.augment = augment
        self.sr = sr

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for item ``idx``.

        ``features`` is a float32 tensor with a leading channel axis of size 1;
        ``label`` is a scalar ``torch.long`` tensor.
        """
        path = self.files[idx]
        label = self.labels[idx]

        y = load_fix_audio(path, target_sr=self.sr)

        if self.augment:
            n_samples = len(y)
            y = augment_transform(samples=y, sample_rate=self.sr)
            # Keep the clip length fixed even if a transform changed it, so
            # every item yields the same feature shape.
            if len(y) < n_samples:
                y = np.pad(y, (0, n_samples - len(y)), mode='constant')
            elif len(y) > n_samples:
                y = y[:n_samples]

        if self.feature_type == "logmel":
            feat = extract_logmel(y, sr=self.sr)
        else:
            # Reachable only if feature_type was changed after construction.
            raise ValueError("Unknown feature type")

        # Per-item standardisation; the epsilon guards against a constant map.
        feat = (feat - feat.mean()) / (feat.std() + 1e-8)
        feat = torch.tensor(feat, dtype=torch.float32).unsqueeze(0)

        return feat, torch.tensor(label, dtype=torch.long)


In [None]:
class MLP(nn.Module):
    """Fully connected classifier over a flattened 2-D feature map.

    Layout: Flatten, then two blocks of Linear -> BatchNorm1d -> ReLU ->
    Dropout(0.3), then a final Linear layer producing 2 logits.

    Args:
        input_shape: ``(freq, time)`` size of one feature map; their product
            is the width of the first linear layer.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, input_shape, hidden_size=256):
        super().__init__()

        n_freq, n_frames = input_shape
        flat_dim = n_freq * n_frames

        layers = [nn.Flatten()]
        # Two identical hidden blocks; only the input width of the first differs.
        for in_features in (flat_dim, hidden_size):
            layers += [
                nn.Linear(in_features, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
        layers.append(nn.Linear(hidden_size, 2))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class logits for a batch ``x``."""
        logits = self.model(x)
        return logits

In [None]:
# Seed the Python, NumPy and PyTorch RNGs so shuffling, weight initialisation
# and augmentation are repeatable after Restart & Run All
# (assumes audiomentations draws from these global RNGs — TODO confirm).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

root_dir = "data"
(train_files, train_labels), (val_files, val_labels), (test_files, test_labels) = split_dataset(root_dir)

feature_type = "logmel"

# Only the training set is augmented; validation and test see clean audio.
train_ds = AudioDataset(train_files, train_labels, feature_type=feature_type, augment=True)
val_ds   = AudioDataset(val_files,   val_labels,   feature_type=feature_type, augment=False)
test_ds  = AudioDataset(test_files,  test_labels,  feature_type=feature_type, augment=False)

# Shuffle only the training batches so evaluation order stays fixed.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl   = DataLoader(val_ds,   batch_size=16, shuffle=False)
test_dl  = DataLoader(test_ds,  batch_size=16, shuffle=False)


Всего: 98 файлов (barbie=50, puppy=48)


In [None]:
# Sanity check: number of files that ended up in the training split.
len(train_ds)

68

In [None]:
# Derive the MLP input size from one real item instead of hard-coding it.
# train_ds was built with augment=True, so this item is augmented; only its
# shape is used here.
sample_x, _ = train_ds[0]
# Drop the leading channel axis that AudioDataset adds with unsqueeze(0).
input_shape = sample_x.squeeze(0).shape
input_shape

torch.Size([64, 94])

In [None]:
# Build the classifier for the measured feature shape and move it to `device`.
model = MLP(input_shape=input_shape, hidden_size=256).to(device)

# Cross-entropy over the two logits; Adam with a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def run_epoch(model, dataloader, optimizer=None, loss_fn=None, device=None):
    """Run one pass over ``dataloader`` and return ``(avg_loss, avg_accuracy)``.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch. Without one the model is put in eval mode and the pass runs
    with autograd disabled, so no computation graph is built.

    Args:
        model: Module mapping a batch of inputs to class logits.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer to step; ``None`` selects evaluation mode.
        loss_fn: Callable ``(logits, targets) -> loss``. Defaults to the
            notebook-level ``criterion``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        Tuple of sample-weighted mean loss and accuracy, both Python floats.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    is_training = optimizer is not None

    if loss_fn is None:
        loss_fn = criterion  # notebook-level loss
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train(is_training)

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Gradients are only tracked when the weights are actually being updated.
    with torch.set_grad_enabled(is_training):
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)

            if is_training:
                optimizer.zero_grad()

            logits = model(x)
            loss = loss_fn(logits, y)

            if is_training:
                loss.backward()
                optimizer.step()

            # The loss is a batch mean, so weight it by the batch size to get
            # a correct per-sample average when the last batch is smaller.
            batch_size = y.size(0)
            total_loss += loss.item() * batch_size
            total_correct += (logits.argmax(dim=1) == y).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        raise ValueError("dataloader yielded no samples")

    avg_loss = total_loss / total_samples
    avg_acc = total_correct / total_samples
    return avg_loss, avg_acc

In [None]:
# Train for a fixed number of epochs and report train/validation metrics
# after each one.
num_epochs = 15

for epoch in range(1, num_epochs + 1):
    # Passing the optimizer makes run_epoch train; optimizer=None makes it evaluate.
    train_loss, train_acc = run_epoch(model, train_dl, optimizer)
    val_loss, val_acc = run_epoch(model, val_dl, optimizer=None)

    print(
        f"Epoch {epoch:02d} | "
        f"train_loss={train_loss:.4f}, train_acc={train_acc:.3f} | "
        f"val_loss={val_loss:.4f}, val_acc={val_acc:.3f}"
    )

Epoch 01 | train_loss=0.6010, train_acc=0.647 | val_loss=0.6160, val_acc=0.600
Epoch 02 | train_loss=0.7575, train_acc=0.500 | val_loss=0.5863, val_acc=0.667
Epoch 03 | train_loss=0.7216, train_acc=0.574 | val_loss=0.6048, val_acc=0.600
Epoch 04 | train_loss=0.6836, train_acc=0.603 | val_loss=0.6602, val_acc=0.533
Epoch 05 | train_loss=0.7230, train_acc=0.662 | val_loss=0.7342, val_acc=0.533
Epoch 06 | train_loss=0.7144, train_acc=0.559 | val_loss=0.7167, val_acc=0.467
Epoch 07 | train_loss=0.7191, train_acc=0.632 | val_loss=0.6645, val_acc=0.600
Epoch 08 | train_loss=0.6779, train_acc=0.618 | val_loss=0.6582, val_acc=0.667
Epoch 09 | train_loss=0.7070, train_acc=0.529 | val_loss=0.6516, val_acc=0.533
Epoch 10 | train_loss=0.6929, train_acc=0.662 | val_loss=0.6401, val_acc=0.667
Epoch 11 | train_loss=0.6730, train_acc=0.618 | val_loss=0.6364, val_acc=0.600
Epoch 12 | train_loss=0.6510, train_acc=0.691 | val_loss=0.6299, val_acc=0.600
Epoch 13 | train_loss=0.6400, train_acc=0.632 | val_

In [None]:
# Final evaluation on the held-out test split (no optimizer, so no weight updates).
test_loss, test_acc = run_epoch(model, test_dl, optimizer=None)
print(f"\nTest: loss={test_loss:.4f}, acc={test_acc:.3f}")


Test: loss=0.6481, acc=0.467


In [None]:
def predict(model, dataloader, device):
    """Run inference over ``dataloader`` and gather the results as NumPy arrays.

    The model is switched to eval mode and evaluated with autograd disabled.

    Args:
        model: Module mapping a batch of inputs to class logits.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        ``(preds, labels, probs)``: argmax class per sample, the targets as
        yielded by the loader, and the softmax probabilities per sample.
    """
    model.eval()
    pred_chunks, label_chunks, prob_chunks = [], [], []

    with torch.no_grad():
        for inputs, targets in dataloader:
            scores = model(inputs.to(device))

            prob_chunks.append(torch.softmax(scores, dim=1).cpu().numpy())
            pred_chunks.append(scores.argmax(dim=1).cpu().numpy())
            label_chunks.append(targets.numpy())

    def _merge(chunks):
        # An empty loader yields an empty array rather than an error.
        return np.concatenate(chunks) if chunks else np.array([])

    return _merge(pred_chunks), _merge(label_chunks), _merge(prob_chunks)

# Collect test-set predictions for the reports below.
# NOTE: this rebinds `test_labels` (a list returned by split_dataset) to the
# NumPy array of labels returned by predict().
test_preds, test_labels, test_probs = predict(model, test_dl, device)

In [None]:
# Compare the class balance of the ground truth with that of the predictions.
# minlength=2 keeps one count per class (index 0 and index 1) even when class 1
# never occurs, e.g. when the model predicts only class 0.
print("Распределение в тестовой выборке:")
print(f"  True labels:  unique={np.unique(test_labels)}, counts={np.bincount(test_labels, minlength=2)}")
print(f"  Predictions:  unique={np.unique(test_preds)}, counts={np.bincount(test_preds, minlength=2)}")

Распределение в тестовой выборке:
  True labels:  unique=[0 1], counts=[8 7]
  Predictions:  unique=[0 1], counts=[6 9]


In [None]:
# Display names indexed by label id: 0 = puppy, 1 = barbie (split_dataset
# assigns 1 to paths containing "barbie").
# NOTE(review): the stored output of this cell (support 0 / 15) disagrees with
# the label counts printed by the previous cell (8 / 7), so it looks stale —
# re-run the notebook with Restart & Run All before trusting it.
class_names = ['puppy', 'barbie']

# labels=[0, 1] fixes the row order to match class_names; zero_division=0 is
# sklearn's setting for the value reported when a metric is undefined.
print(classification_report(
    test_labels,
    test_preds,
    labels=[0, 1],
    target_names=class_names,
    zero_division=0
))


              precision    recall  f1-score   support

       puppy       0.00      0.00      0.00         0
      barbie       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       0.50      0.50      0.50        15
weighted avg       1.00      1.00      1.00        15

