In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader

# Libraries for processing sounds
import librosa
from IPython.display import Audio
import random

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [3]:
# Metadata table read from the processed CSV; the trailing expression
# displays its row count as the cell output.
df = pd.read_csv('data/Animal_Sound_processed.csv')
df.shape[0]

650

In [4]:
from ClassesData.AnimalSoundDataset import AnimalSoundDataset

# Build the train and validation datasets from the same table with the same
# split ratio and seed; only the requested split differs. The generator is
# consumed in order, so the 'train' dataset is constructed first.
dataset_train, dataset_val = (
    AnimalSoundDataset(df, split=split_name, split_ratio=0.8, seed=42)
    for split_name in ('train', 'val')
)

Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
train set contains 2327 segments.
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
val set contains 582 segments.


In [5]:
# Batch-size rationale: a reference paper trains on about 20,000 samples
# with a batch size of 1000. Our training set is roughly ten times smaller
# (about 2,000 samples), so the batch size is scaled down accordingly
# and set to 128.
batch_size = 128

In [6]:
# Training batches are drawn in shuffled order; validation keeps a fixed order.
train_loader = DataLoader(dataset_train, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset_val, shuffle=False, batch_size=batch_size)

print(f"Number of batches in train loader: {len(train_loader)}")

# Sanity check: look at the first training batch only and show its shapes.
# In the recorded run these were [128, 2, 60, 41] for the features and [128]
# for the labels.
for batch in train_loader:
    (x_batch, y_batch) = batch
    print(x_batch.shape, y_batch.shape, sep="\n")
    break

Number of batches in train loader: 19
torch.Size([128, 2, 60, 41])
torch.Size([128])


In [9]:
# input_dim is the size of the first axis of one sample's feature array
# (2 in the recorded run, where a batch has shape [128, 2, 60, 41]).
first_features = dataset_train[0][0]
input_dim = first_features.shape[0]
n_classes = len(dataset_train.classes)
print(f"Input dimension: {input_dim}, Number of classes: {n_classes}")

# Baseline configuration shared by the model and the trainer.
hyperparameters = {
    # model hyperparameters
    'input_dim': input_dim,
    'output_dim': n_classes,
    'hidden_layers_size': 5000,
    'activation': 'relu',
    'kernel_size_conv': [(57, 6), (1, 3)],
    'kernel_size_pool': [(4, 3), (1, 3)],
    'stride_conv': [(1, 1), (1, 1)],
    'stride_pool': [(1, 3), (1, 3)],
    'filters': [80, 80],
    'batch_normalization': False,
    'dropout_rate': 0.5,

    # trainer hyperparameters
    'learning_rate': 0.002,
    'weight_decay': 0.001,
    'momentum': 0.9,
    'nesterov': True,

    # questionable hyperparameters
    # (batch_size is deliberately not part of this dict; it is set on the
    # DataLoaders instead)
    'max_epoch': 2,

    # early stopping and scheduler
    'patience_lr': False,
    'early_stopping': False,
}

Input dimension: 2, Number of classes: 13


In [None]:
from sklearn.model_selection import ParameterSampler
import pandas as pd
from ClassesML.AudioTrainer import AudioTrainer
from ClassesML.AudioModel import AudioModel

# Random search over the hyperparameters.
#
# Every hyperparameter starts pinned to its baseline value (a one-element
# list); only the entries overridden below are actually searched.
hyperparameter_choices = {name: [value] for name, value in hyperparameters.items()}

hyperparameter_choices['learning_rate'] = [0.01, 0.005, 0.001, 0.0005, 0.0001]
#hyperparameter_choices['activation'] = ['relu', 'sigmoid', 'tanh']
#hyperparameter_choices['filters'] = [[8, 16, 32],[16, 32, 64]]

# Seed the sampler so the same configurations are drawn on every run.
hyperparameter_try = list(
    ParameterSampler(hyperparameter_choices, n_iter=2, random_state=42)
)

metric_list = []

for hyperparam in hyperparameter_try:
    model = AudioModel(hyperparam).to(device)

    # The trainer must receive the *sampled* configuration; passing the
    # baseline `hyperparameters` here would train every candidate with the
    # same learning rate and make the search meaningless.
    trainer = AudioTrainer(model, train_loader, val_loader, hyperparam, device=device)

    train_accuracy_list, valid_accuracy_list = trainer.train()

    # Score each candidate by its validation accuracy after the last epoch.
    final_valid_accuracy = valid_accuracy_list[-1]
    metric_list.append(final_valid_accuracy)
    hyperparam['metric'] = final_valid_accuracy

# Sort candidates by ascending metric (best configuration is the last row).
idx = np.argsort(metric_list)
hyperparameter_sorted = [hyperparameter_try[i] for i in idx]

# Use a dedicated name so the dataset frame `df` is not overwritten, and
# print with pandas itself: `tabulate` is never imported in this notebook.
search_results_df = pd.DataFrame(hyperparameter_sorted)
print(search_results_df.to_string())

Epoch 1/2:   0%|          | 0/19 [00:00<?, ?it/s]

Epoch 1/2: 100%|██████████| 19/19 [04:04<00:00, 12.89s/it]


Epoch 1: Train Loss: 2.4893, Train Acc: 15.9182, Val Loss: 2.4467, Val Acc: 31.4911


Epoch 2/2: 100%|██████████| 19/19 [03:40<00:00, 11.61s/it]


Epoch 2: Train Loss: 2.3700, Train Acc: 26.8718, Val Loss: 2.4034, Val Acc: 35.0312


Epoch 1/2:  16%|█▌        | 3/19 [00:47<04:03, 15.19s/it]

In [8]:
from ClassesML.AudioModel import AudioModel
# Build the network from the baseline hyperparameters, then move it to the
# selected device.
model = AudioModel(hyperparameters)
model = model.to(device)

In [9]:
import torch.nn.functional as F
def predict_clip(model, segment_tensors, device, method='prob'):
    """
    Generate a single class prediction for a whole clip given its segments.

    Parameters:
    - model: module called on the batch of segments; expected to return
      per-class scores of shape (n_segments, num_classes). It is switched to
      eval mode and left in eval mode.
    - segment_tensors (torch.Tensor): tensor of shape
      (n_segments, channels, mel_bands, frames) holding the clip's segments.
    - device: device the segments are moved to before inference.
    - method (str): 'majority' (each segment votes for its most probable
      class; ties go to the lowest class index) or 'prob' (class with the
      highest probability averaged over segments).

    Returns:
    - int: index of the predicted class.

    Raises:
    - ValueError: if `method` is not 'majority' or 'prob', or if
      `segment_tensors` contains no segments.
    """
    # Validate arguments before running the (potentially expensive) forward pass.
    if method not in ('majority', 'prob'):
        raise ValueError("method must be 'majority' or 'prob'")
    if segment_tensors.shape[0] == 0:
        # Without this guard, 'prob' would take the argmax of an all-NaN mean
        # and 'majority' would fail on an empty vote count.
        raise ValueError("segment_tensors is empty: cannot predict a clip with no segments")

    model.eval()
    segment_tensors = segment_tensors.to(device)
    with torch.no_grad():
        outputs = model(segment_tensors)  # (n_segments, num_classes)
        probs = F.softmax(outputs, dim=1)

    if method == 'majority':
        preds = torch.argmax(probs, dim=1)
        counts = torch.bincount(preds)
        return torch.argmax(counts).item()

    # method == 'prob'
    avg_probs = probs.mean(dim=0)
    return torch.argmax(avg_probs).item()


In [10]:
from ClassesML.AudioTrainer import AudioTrainer
# Train the baseline model (built from `hyperparameters`) on the train loader,
# with the validation loader passed alongside it.
trainer = AudioTrainer(model, train_loader, val_loader, hyperparameters, device=device)
# Elsewhere in this notebook train() is unpacked as
# (train_accuracy_list, valid_accuracy_list); the return value is not kept here.
trainer.train()

Epoch 1/10:   0%|          | 0/19 [00:00<?, ?it/s]

  return F.conv2d(
Epoch 1/10: 100%|██████████| 19/19 [01:09<00:00,  3.64s/it]


Epoch 1: Train Loss: 2.4912, Train Acc: 16.6691, Val Loss: 2.4530, Val Acc: 34.4330


Epoch 2/10: 100%|██████████| 19/19 [01:00<00:00,  3.16s/it]


Epoch 2: Train Loss: 2.3699, Train Acc: 27.7943, Val Loss: 2.4016, Val Acc: 33.6518


Epoch 3/10: 100%|██████████| 19/19 [01:58<00:00,  6.25s/it]


Epoch 3: Train Loss: 2.2754, Train Acc: 31.7828, Val Loss: 2.3065, Val Acc: 39.6384


Epoch 4/10: 100%|██████████| 19/19 [01:49<00:00,  5.76s/it]


Epoch 4: Train Loss: 2.1343, Train Acc: 34.6968, Val Loss: 2.1717, Val Acc: 45.3393


Epoch 5/10: 100%|██████████| 19/19 [02:05<00:00,  6.58s/it]


Epoch 5: Train Loss: 1.9472, Train Acc: 39.6936, Val Loss: 2.0148, Val Acc: 43.1518


Epoch 6/10: 100%|██████████| 19/19 [01:52<00:00,  5.93s/it]


Epoch 6: Train Loss: 1.7954, Train Acc: 43.4300, Val Loss: 1.8620, Val Acc: 47.6027


Epoch 7/10: 100%|██████████| 19/19 [01:59<00:00,  6.28s/it]


Epoch 7: Train Loss: 1.6509, Train Acc: 46.0205, Val Loss: 1.7639, Val Acc: 53.4062


Epoch 8/10: 100%|██████████| 19/19 [02:40<00:00,  8.44s/it]


Epoch 8: Train Loss: 1.5719, Train Acc: 49.8498, Val Loss: 1.6646, Val Acc: 56.2455


Epoch 9/10: 100%|██████████| 19/19 [01:44<00:00,  5.50s/it]


Epoch 9: Train Loss: 1.4881, Train Acc: 52.4582, Val Loss: 1.6073, Val Acc: 61.1116


Epoch 10/10: 100%|██████████| 19/19 [01:24<00:00,  4.45s/it]


Epoch 10: Train Loss: 1.4215, Train Acc: 55.3311, Val Loss: 1.5432, Val Acc: 62.4152


### Evaluate the model using 5-fold cross-validation

In [11]:
import warnings
def extract_segments_with_deltas(file_path, variant='short', silence_threshold=-80):
    """
    Extracts 2-channel (log-mel + delta) spectrogram segments from an audio file.

    The clip is loaded as mono at 22050 Hz and converted to a 60-band log-mel
    spectrogram in dB relative to the clip's peak. The spectrogram is z-score
    normalised over the whole clip, its deltas are computed, and both are cut
    into fixed-length overlapping windows.

    Parameters:
    - file_path (str): Path to audio file.
    - variant (str): 'short' (41 frames, 50% overlap) or 'long' (101 frames, 90% overlap).
    - silence_threshold (float): dB threshold for discarding low-energy
      segments. A window is dropped when the mean of its *un-normalised* dB
      values is below this threshold. The dB values are relative to the clip
      peak (so they are <= 0), and with librosa's documented default
      `top_db=80` they are floored at -80, so the default of -80 discards
      nothing; pass e.g. -60 to actually filter quiet windows.

    Returns:
    - np.ndarray: Array of shape (n_segments, 2, 60, frames_per_segment).
      n_segments is 0 when the file is too short, the spectrogram is
      constant, no window survives, or processing fails (a warning is issued
      instead of raising).

    Raises:
    - ValueError: if `variant` is not 'short' or 'long'.
    """
    # Config
    sr = 22050
    n_fft = 1024
    hop_length = 512
    n_mels = 60

    if variant == 'short':
        frames_per_segment = 41
        overlap = 0.5
    elif variant == 'long':
        frames_per_segment = 101
        overlap = 0.9
    else:
        raise ValueError("variant must be 'short' or 'long'")

    # Shape of the "no segments" result, kept in sync with n_mels.
    empty_shape = (0, 2, n_mels, frames_per_segment)

    try:
        # Load audio in mono
        y, _ = librosa.load(file_path, sr=sr, mono=True)

        # Skip empty or very short files
        if len(y) < n_fft:
            warnings.warn(f"File too short to process: {file_path}")
            return np.empty(empty_shape)

        # Compute log-mel spectrogram (dB relative to the clip's peak)
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels
        )
        log_mel_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize safely
        mean = np.mean(log_mel_db)
        std = np.std(log_mel_db)
        if std == 0:
            warnings.warn(f"Zero std encountered in file: {file_path}")
            return np.empty(empty_shape)

        log_mel_spec = (log_mel_db - mean) / std

        # Compute deltas
        delta_spec = librosa.feature.delta(log_mel_spec)

        # Segmenting
        step = int(frames_per_segment * (1 - overlap))
        segments = []

        for start in range(0, log_mel_spec.shape[1] - frames_per_segment + 1, step):
            stop = start + frames_per_segment

            # Skip silent segments. The test must use the dB values: after
            # z-scoring, the values are no longer in dB, so comparing them
            # with a dB threshold could never trigger.
            if np.mean(log_mel_db[:, start:stop]) < silence_threshold:
                continue

            seg = log_mel_spec[:, start:stop]
            delta = delta_spec[:, start:stop]
            segments.append(np.stack([seg, delta], axis=0))

        return np.stack(segments) if segments else np.empty(empty_shape)

    except Exception as e:
        # Best effort by design: one unreadable file must not abort a whole
        # evaluation run, so report it and return an empty result.
        warnings.warn(f"Failed to process {file_path}: {e}")
        return np.empty(empty_shape)

In [12]:
import pandas as pd
from sklearn.model_selection import KFold

# Load your dataset
df = pd.read_csv('data/Animal_Sound_processed.csv')  # Update path if needed

# Prepare 5-Fold cross-validator (splits at clip level, i.e. rows of df)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Clip-level test accuracy (in percent) of every completed fold.
fold_accuracies = []

for fold, (train_val_idx, test_idx) in enumerate(kf.split(df)):
    df_train_val = df.iloc[train_val_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)

    print(f"Fold {fold + 1}")
    print(f"Train/Val samples: {len(df_train_val)}")
    print(f"Test samples: {len(df_test)}")

    # Create datasets for this fold (the held-out test clips are never seen here)
    dataset_train = AnimalSoundDataset(df_train_val, split='train', split_ratio=0.75, seed=42)
    dataset_val = AnimalSoundDataset(df_train_val, split='val', split_ratio=0.75, seed=42)
    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

    # Initialize a fresh model and trainer so no weights leak between folds
    model = AudioModel(hyperparameters).to(device)
    trainer = AudioTrainer(model, train_loader, val_loader, hyperparameters, device=device)
    trainer.train()

    # Save the model for this fold
    model_save_path = f'model_fold_{fold + 1}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}\n")

    # Evaluate on test set, one prediction per clip.
    # The counters must be reset for every fold; without initialisation the
    # first `+= 1` raises NameError.
    correct_predictions = 0
    total_predictions = 0
    skipped_clips = 0

    for path, label in zip(df_test['path'], df_test['name']):
        test_segments = extract_segments_with_deltas(path, variant='short')

        # Clips that are unreadable or too short yield no segments; there is
        # nothing to feed the model, so they are excluded from the accuracy
        # and reported separately.
        if len(test_segments) == 0:
            skipped_clips += 1
            continue

        # predict_clip moves the tensor to `device` itself.
        segment_batch = torch.tensor(test_segments, dtype=torch.float32)
        predicted_sound = predict_clip(model, segment_batch, device, method='prob')

        # Map the class index back to its name using this fold's class list.
        predicted_label = dataset_train.classes[predicted_sound]

        if predicted_label == label:
            correct_predictions += 1
        total_predictions += 1

    accuracy = correct_predictions / total_predictions * 100 if total_predictions > 0 else 0
    fold_accuracies.append(accuracy)
    if skipped_clips:
        print(f"Fold {fold + 1} - Skipped {skipped_clips} test clips with no usable segments")
    print(f"Fold {fold + 1} - Test Accuracy: {accuracy:.4f}\n")

if fold_accuracies:
    print(f"Mean test accuracy over {len(fold_accuracies)} folds: {np.mean(fold_accuracies):.4f}")

        

   

Fold 1
Train/Val samples: 520
Test samples: 130
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2301
train set contains 1725 segments.
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2301
val set contains 576 segments.


Epoch 1/10: 100%|██████████| 14/14 [01:09<00:00,  4.95s/it]


Epoch 1: Train Loss: 2.5135, Train Acc: 14.4019, Val Loss: 2.4880, Val Acc: 13.4375


Epoch 2/10: 100%|██████████| 14/14 [01:10<00:00,  5.04s/it]


Epoch 2: Train Loss: 2.3944, Train Acc: 20.1552, Val Loss: 2.4219, Val Acc: 32.5000


Epoch 3/10: 100%|██████████| 14/14 [01:03<00:00,  4.52s/it]


Epoch 3: Train Loss: 2.3307, Train Acc: 30.7350, Val Loss: 2.3866, Val Acc: 31.8750


Epoch 4/10: 100%|██████████| 14/14 [01:19<00:00,  5.66s/it]


Epoch 4: Train Loss: 2.2393, Train Acc: 30.3773, Val Loss: 2.3064, Val Acc: 31.2500


Epoch 5/10: 100%|██████████| 14/14 [01:42<00:00,  7.32s/it]


Epoch 5: Train Loss: 2.1259, Train Acc: 32.6826, Val Loss: 2.2020, Val Acc: 34.8438


Epoch 6/10: 100%|██████████| 14/14 [02:17<00:00,  9.81s/it]


Epoch 6: Train Loss: 2.0200, Train Acc: 34.9925, Val Loss: 2.1197, Val Acc: 37.3438


Epoch 7/10: 100%|██████████| 14/14 [01:35<00:00,  6.79s/it]


Epoch 7: Train Loss: 1.9225, Train Acc: 38.2904, Val Loss: 2.0336, Val Acc: 39.6875


Epoch 8/10: 100%|██████████| 14/14 [02:27<00:00, 10.54s/it]


Epoch 8: Train Loss: 1.8318, Train Acc: 40.1209, Val Loss: 1.9652, Val Acc: 47.1875


Epoch 9/10: 100%|██████████| 14/14 [03:50<00:00, 16.46s/it]


Epoch 9: Train Loss: 1.7164, Train Acc: 44.7407, Val Loss: 1.8695, Val Acc: 45.7812


Epoch 10/10:  21%|██▏       | 3/14 [00:46<02:51, 15.61s/it]


KeyboardInterrupt: 