In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Libraries for processing sounds
import librosa
from IPython.display import Audio
import random

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
df = pd.read_csv('data/Animal_Sound_processed.csv')
len(df)

650

In [8]:
from ClassesData.AnimalSoundDataset import AnimalSoundDataset

dataset_train = AnimalSoundDataset(df, split='train', split_ratio=0.8, seed=42)
dataset_val = AnimalSoundDataset(df, split='val', split_ratio=0.8, seed=42)

Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
train set contains 2327 segments.
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
val set contains 582 segments.


In [9]:
# in a paper there are 20000 samples in the training set
# and they use 1000 for batch
# we have 2000 samples in the training set
# therefore we use 128 for batch
batch_size = 128

In [10]:
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

print(f"Number of batches in train loader: {len(train_loader)}")

for batch in train_loader:
    x_batch, y_batch = batch
    print(x_batch.shape)  # [batch_size, 1, 128, 400]
    print(y_batch.shape)  # [batch_size]
    break

Number of batches in train loader: 19
torch.Size([128, 2, 60, 41])
torch.Size([128])


In [11]:
input_dim = dataset_train[0][0].shape[0]  # e.g., (2, 60, 41) for short segments
n_classes = len(dataset_train.classes)
print(f"Input dimension: {input_dim}, Number of classes: {n_classes}")

hyperparameters = dict(
    input_dim=input_dim,
    output_dim=n_classes,
    hidden_layers_size=5000,
    activation='relu',
    kernel_size_conv=[(57, 6), (1, 3)],
    kernel_size_pool=[(4, 3), (1, 3)],
    stride_conv=[(1, 1), (1, 1)],
    stride_pool=[(1, 3), (1, 3)],
    filters=[80, 80],
    batch_normalization=False,
    dropout_rate=0.5,
    # trainer hyperparameters
    learning_rate=0.002,
    weight_decay=0.001,
    momentum=0.9,
    nesterov=True,

    # questionable hyperparameters
    #batch_size=batch_size,
    max_epoch=10
)

Input dimension: 2, Number of classes: 13


In [12]:
from ClassesML.AudioModel import AudioModel
model = AudioModel(hyperparameters).to(device)

ModuleNotFoundError: No module named 'Utilities'

In [None]:
import torch.nn.functional as F
def predict_clip(model, segment_tensors, device, method='prob'):
    """
    Generate prediction for a whole clip given its segments.

    segment_tensors: tensor of shape (n_segments, channels, mel_bands, frames)
    method: 'majority' or 'prob'
    """
    model.eval()
    segment_tensors = segment_tensors.to(device)
    with torch.no_grad():
        outputs = model(segment_tensors)  # (n_segments, num_classes)
        probs = F.softmax(outputs, dim=1)

    if method == 'majority':
        preds = torch.argmax(probs, dim=1)
        counts = torch.bincount(preds)
        clip_pred = torch.argmax(counts).item()
    elif method == 'prob':
        avg_probs = probs.mean(dim=0)
        clip_pred = torch.argmax(avg_probs).item()
    else:
        raise ValueError("method must be 'majority' or 'prob'")

    return clip_pred


In [None]:
from ClassesML.AudioTrainer import AudioTrainer
trainer = AudioTrainer(model, train_loader, val_loader, hyperparameters, device=device)
trainer.train()

  return F.conv2d(
Epoch 1/10: 100%|██████████| 19/19 [00:14<00:00,  1.31it/s]


Epoch 1: Train Loss: 2.4937, Train Acc: 15.5714, Val Loss: 2.4500, Val Acc: 31.7545


Epoch 2/10: 100%|██████████| 19/19 [00:13<00:00,  1.38it/s]


Epoch 2: Train Loss: 2.3670, Train Acc: 22.1253, Val Loss: 2.3961, Val Acc: 33.0268


Epoch 3/10: 100%|██████████| 19/19 [00:14<00:00,  1.34it/s]


Epoch 3: Train Loss: 2.2532, Train Acc: 32.8447, Val Loss: 2.2888, Val Acc: 40.7054


Epoch 4/10: 100%|██████████| 19/19 [00:14<00:00,  1.34it/s]


Epoch 4: Train Loss: 2.1162, Train Acc: 36.2003, Val Loss: 2.1402, Val Acc: 42.7098


Epoch 5/10: 100%|██████████| 19/19 [00:15<00:00,  1.22it/s]


Epoch 5: Train Loss: 1.9345, Train Acc: 39.3718, Val Loss: 1.9822, Val Acc: 46.3795


Epoch 6/10: 100%|██████████| 19/19 [00:14<00:00,  1.32it/s]


Epoch 6: Train Loss: 1.7857, Train Acc: 43.6535, Val Loss: 1.8247, Val Acc: 50.0491


Epoch 7/10: 100%|██████████| 19/19 [00:15<00:00,  1.24it/s]


Epoch 7: Train Loss: 1.6350, Train Acc: 47.3595, Val Loss: 1.7168, Val Acc: 55.2545


Epoch 8/10: 100%|██████████| 19/19 [00:15<00:00,  1.25it/s]


Epoch 8: Train Loss: 1.5320, Train Acc: 50.8832, Val Loss: 1.6361, Val Acc: 56.3750


Epoch 9/10: 100%|██████████| 19/19 [00:14<00:00,  1.29it/s]


Epoch 9: Train Loss: 1.4670, Train Acc: 54.3085, Val Loss: 1.5613, Val Acc: 58.9777


Epoch 10/10: 100%|██████████| 19/19 [00:16<00:00,  1.18it/s]


Epoch 10: Train Loss: 1.4204, Train Acc: 55.4366, Val Loss: 1.5488, Val Acc: 65.2009


### Evaluate the model using the 5-fold cross validation

In [None]:
import warnings
def extract_segments_with_deltas(file_path, variant='short', silence_threshold=-80):
    """
    Extracts 2-channel (log-mel + delta) spectrogram segments from an audio file.

    Parameters:
    - file_path (str): Path to audio file.
    - variant (str): 'short' (41 frames, 50% overlap) or 'long' (101 frames, 90% overlap).
    - silence_threshold (float): dB threshold for discarding low-energy segments.

    Returns:
    - np.ndarray: Array of shape (n_segments, 2, 60, frames_per_segment)
    """
    # Config
    sr = 22050
    n_fft = 1024
    hop_length = 512
    n_mels = 60

    if variant == 'short':
        frames_per_segment = 41
        overlap = 0.5
    elif variant == 'long':
        frames_per_segment = 101
        overlap = 0.9
    else:
        raise ValueError("variant must be 'short' or 'long'")

    try:
        # Load audio in mono
        y, _ = librosa.load(file_path, sr=sr, mono=True)

        # Skip empty or very short files
        if len(y) < n_fft:
            warnings.warn(f"File too short to process: {file_path}")
            return np.empty((0, 2, 60, frames_per_segment))

        # Compute log-mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels
        )
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize safely
        mean = np.mean(log_mel_spec)
        std = np.std(log_mel_spec)
        if std == 0:
            warnings.warn(f"Zero std encountered in file: {file_path}")
            return np.empty((0, 2, 60, frames_per_segment))

        log_mel_spec = (log_mel_spec - mean) / std

        # Compute deltas
        delta_spec = librosa.feature.delta(log_mel_spec)

        # Segmenting
        step = int(frames_per_segment * (1 - overlap))
        segments = []

        for start in range(0, log_mel_spec.shape[1] - frames_per_segment + 1, step):
            seg = log_mel_spec[:, start:start + frames_per_segment]
            delta = delta_spec[:, start:start + frames_per_segment]

            # Skip silent segments
            if np.mean(seg) < silence_threshold:
                continue

            stacked = np.stack([seg, delta], axis=0)
            segments.append(stacked)

        return np.stack(segments) if segments else np.empty((0, 2, 60, frames_per_segment))

    except Exception as e:
        warnings.warn(f"Failed to process {file_path}: {e}")
        return np.empty((0, 2, 60, frames_per_segment))

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Load your dataset
df = pd.read_csv('data/Animal_Sound_processed.csv')  # Update path if needed

# Prepare 5-Fold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_val_idx, val_idx) in enumerate(kf.split(df)):
    df_train_val = df.iloc[train_val_idx].reset_index(drop=True)
    df_test = df.iloc[val_idx].reset_index(drop=True)

    print(f"Fold {fold + 1}")
    print(f"Train/Val samples: {len(df_train_val)}")
    print(f"Test samples: {len(df_test)}")

    # Create datasets for this fold
    dataset_train = AnimalSoundDataset(df_train_val, split='train', split_ratio=0.75, seed=42)
    dataset_val = AnimalSoundDataset(df_train_val, split='val', split_ratio=0.75, seed=42)
    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

    # Initialize model and trainer
    model = AudioModel(hyperparameters).to(device)
    trainer = AudioTrainer(model, train_loader, val_loader, hyperparameters, device=device)
    trainer.train()
    # Save the model for this fold
    model_save_path = f'model_fold_{fold + 1}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}\n")

    # Evaluate on test set
    for path, label in zip(df_test['path'], df_test['name']):

        test_segments = extract_segments_with_deltas(path, variant='short')
        predicted_sound = predict_clip(model, torch.tensor(test_segments, dtype=torch.float32).to(device), device, method='prob')


        predicted_label = dataset_train.classes[predicted_sound]

        if predicted_label == label:
            correct_predictions += 1
        total_predictions += 1
    accuracy = correct_predictions / total_predictions * 100 if total_predictions > 0 else 0
    print(f"Fold {fold + 1} - Test Accuracy: {accuracy:.4f}\n")





Fold 1
Train/Val samples: 520
Test samples: 130
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2301
train set contains 1725 segments.
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2301
val set contains 576 segments.


Epoch 1/10: 100%|██████████| 14/14 [00:12<00:00,  1.13it/s]


Epoch 1: Train Loss: 2.5115, Train Acc: 14.2619, Val Loss: 2.4871, Val Acc: 17.0312


Epoch 2/10: 100%|██████████| 14/14 [00:10<00:00,  1.31it/s]


Epoch 2: Train Loss: 2.3844, Train Acc: 20.1552, Val Loss: 2.4146, Val Acc: 26.7188


Epoch 3/10: 100%|██████████| 14/14 [00:11<00:00,  1.24it/s]


Epoch 3: Train Loss: 2.3181, Train Acc: 30.0983, Val Loss: 2.3757, Val Acc: 29.6875


Epoch 4/10: 100%|██████████| 14/14 [00:12<00:00,  1.13it/s]


Epoch 4: Train Loss: 2.2362, Train Acc: 30.3279, Val Loss: 2.2976, Val Acc: 29.3750


Epoch 5/10: 100%|██████████| 14/14 [00:10<00:00,  1.30it/s]


Epoch 5: Train Loss: 2.1220, Train Acc: 31.9068, Val Loss: 2.2100, Val Acc: 31.5625


Epoch 6/10: 100%|██████████| 14/14 [00:11<00:00,  1.22it/s]


Epoch 6: Train Loss: 2.0089, Train Acc: 34.6906, Val Loss: 2.1211, Val Acc: 36.5625


Epoch 7/10: 100%|██████████| 14/14 [00:12<00:00,  1.15it/s]


Epoch 7: Train Loss: 1.9123, Train Acc: 38.0891, Val Loss: 2.0181, Val Acc: 38.2812


Epoch 8/10: 100%|██████████| 14/14 [00:12<00:00,  1.10it/s]


Epoch 8: Train Loss: 1.8286, Train Acc: 39.7468, Val Loss: 1.9478, Val Acc: 44.8438


Epoch 9/10: 100%|██████████| 14/14 [00:13<00:00,  1.01it/s]


Epoch 9: Train Loss: 1.7320, Train Acc: 44.4004, Val Loss: 1.8659, Val Acc: 49.0625


Epoch 10/10: 100%|██████████| 14/14 [00:14<00:00,  1.00s/it]


Epoch 10: Train Loss: 1.6196, Train Acc: 47.2464, Val Loss: 1.7664, Val Acc: 51.5625
Model saved to model_fold_1.pth

Fold 1 - Test Accuracy: 45.5944

Fold 2
Train/Val samples: 520
Test samples: 130
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2332
train set contains 1749 segments.
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2332
val set contains 583 segments.


Epoch 1/10: 100%|██████████| 14/14 [00:16<00:00,  1.18s/it]


Epoch 1: Train Loss: 2.4994, Train Acc: 16.3275, Val Loss: 2.4806, Val Acc: 14.3794


Epoch 2/10: 100%|██████████| 14/14 [00:14<00:00,  1.05s/it]


Epoch 2: Train Loss: 2.3797, Train Acc: 19.5378, Val Loss: 2.4148, Val Acc: 26.3182


Epoch 3/10: 100%|██████████| 14/14 [00:14<00:00,  1.05s/it]


Epoch 3: Train Loss: 2.2972, Train Acc: 31.5671, Val Loss: 2.3594, Val Acc: 29.6633


Epoch 4/10:  79%|███████▊  | 11/14 [00:13<00:03,  1.21s/it]
