In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader

# Libraries for processing sounds
import librosa
from IPython.display import Audio
import random

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [3]:
# Load the pre-processed clip index and display its number of rows.
df = pd.read_csv('data/Animal_Sound_processed.csv')
df.shape[0]

650

In [4]:
from ClassesData.AnimalSoundDataset import AnimalSoundDataset

# Build the training and validation datasets from the same frame, using the
# same split ratio and seed for both; only the requested split differs.
dataset_train, dataset_val = (
    AnimalSoundDataset(df, split=split_name, split_ratio=0.8, seed=42)
    for split_name in ('train', 'val')
)

Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
train set contains 2327 segments.
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
val set contains 582 segments.


In [5]:
# Batch size is scaled down from a reference paper's setup: the paper trains
# on about 20,000 samples with batches of 1,000, whereas this training split
# holds only a couple of thousand segments, so a much smaller batch of 128
# is used here.
batch_size = 128

In [6]:
# Wrap both splits in DataLoaders; only the training data is reshuffled.
train_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=dataset_val, batch_size=batch_size, shuffle=False)

print(f"Number of batches in train loader: {len(train_loader)}")

# Sanity check: pull a single batch and show the tensor shapes it yields.
batch = next(iter(train_loader), None)
if batch is not None:
    x_batch, y_batch = batch
    print(x_batch.shape)  # features, e.g. [128, 2, 60, 41] in the recorded run
    print(y_batch.shape)  # labels, one per sample: [batch_size]

Number of batches in train loader: 19
torch.Size([128, 2, 60, 41])
torch.Size([128])


In [None]:
# Size of the leading dimension of one sample's feature tensor (2 in the
# recorded run) and the number of target classes.
input_dim = dataset_train[0][0].shape[0]
n_classes = len(dataset_train.classes)
print(f"Input dimension: {input_dim}, Number of classes: {n_classes}")

# Baseline configuration; the same dict is handed to both the model and the
# trainer further down the notebook.
hyperparameters = {
    # model hyperparameters
    'input_dim': input_dim,
    'output_dim': n_classes,
    'hidden_layers_size': 5000,
    'activation': 'relu',
    'kernel_size_conv': [(57, 6), (1, 3)],
    'kernel_size_pool': [(4, 3), (1, 3)],
    'stride_conv': [(1, 1), (1, 1)],
    'stride_pool': [(1, 3), (1, 3)],
    'filters': [80, 80],
    'batch_normalization': False,
    'dropout_rate': 0.5,

    # trainer hyperparameters
    'learning_rate': 0.002,
    'weight_decay': 0.001,
    'momentum': 0.9,
    'nesterov': True,

    # provisional settings; the batch size is deliberately left out of this
    # dict and is set on the DataLoaders instead
    'max_epoch': 2,

    # early stopping and learning-rate scheduler
    'patience_lr': 5,
    'early_stopping': True,
}

Input dimension: 2, Number of classes: 13


In [None]:
from sklearn.model_selection import ParameterSampler
import pandas as pd
from ClassesML.AudioTrainer import AudioTrainer
from ClassesML.AudioModel import AudioModel

# Random search around the baseline `hyperparameters`.
# Every key starts as a single-choice list holding the baseline value; the
# keys listed below are then widened to the candidate values to explore.
hyperparameter_choices = {name: [value] for name, value in hyperparameters.items()}

hyperparameter_choices['learning_rate'] = [0.005, 0.001]
hyperparameter_choices['batch_size'] = [64, 128, 256]
hyperparameter_choices['max_epoch'] = [100]
hyperparameter_choices['hidden_layers_size'] = [1000, 5000]
hyperparameter_choices['patience_lr'] = [5, 10, 15]
hyperparameter_choices['momentum'] = [0.9, 0.95, 0.85]
hyperparameter_choices['weight_decay'] = [0.001, 0.002]

# A fixed random_state makes the 20 sampled configurations reproducible
# across kernel restarts (same seed value as the dataset split).
hyperparameter_try = list(
    ParameterSampler(hyperparameter_choices, n_iter=20, random_state=42)
)

metric_list = []

for hyperparam in hyperparameter_try:

    model = AudioModel(hyperparam).to(device)

    # Search-local loader names: the notebook-level `train_loader` /
    # `val_loader` are reused by later cells and must not end up bound to
    # whatever batch size happened to be sampled last.
    search_train_loader = DataLoader(dataset_train, batch_size=hyperparam['batch_size'], shuffle=True)
    search_val_loader = DataLoader(dataset_val, batch_size=hyperparam['batch_size'], shuffle=False)

    trainer = AudioTrainer(model, search_train_loader, search_val_loader, hyperparam, device=device)

    # NOTE(review): the score recorded is the accuracy of the LAST epoch; if the
    # trainer restores earlier weights on stopping, this may not be the accuracy
    # of the restored model — confirm against AudioTrainer.
    train_accuracy_list, valid_accuracy_list = trainer.train()
    metric_list.append(valid_accuracy_list[-1])
    hyperparam['val_accuracy'] = valid_accuracy_list[-1]
    hyperparam['train_accuracy'] = train_accuracy_list[-1]

# Order the tried configurations by validation accuracy, ascending (best last).
idx = np.argsort(metric_list)
hyperparameter_sorted = [hyperparameter_try[i] for i in idx]
# NOTE: this rebinds `df`, which until now held the raw clip index.
df = pd.DataFrame.from_dict(hyperparameter_sorted)

Epoch 1/2: 100%|██████████| 19/19 [01:11<00:00,  3.77s/it]


Epoch 1: Train Loss: 2.4760, Train Acc: 18.2727, Val Loss: 2.4360, Val Acc: 38.8571
Epoch 0 - Keeping weights


Epoch 2/2: 100%|██████████| 19/19 [01:12<00:00,  3.83s/it]


Epoch 2: Train Loss: 2.3576, Train Acc: 28.8222, Val Loss: 2.3772, Val Acc: 39.0670
Max epoch reached - Stop training - Restoring weights


Epoch 1/2: 100%|██████████| 19/19 [01:13<00:00,  3.88s/it]


Epoch 1: Train Loss: 2.4834, Train Acc: 15.2835, Val Loss: 2.4429, Val Acc: 37.2188
Epoch 0 - Keeping weights


Epoch 2/2: 100%|██████████| 19/19 [01:08<00:00,  3.61s/it]


Epoch 2: Train Loss: 2.3678, Train Acc: 27.1131, Val Loss: 2.3917, Val Acc: 39.9509
Max epoch reached - Stop training - Restoring weights


In [16]:
from tabulate import tabulate

# Print the frame currently bound to `df` as a psql-style text table, using
# its column names as headers.
print(tabulate(tabular_data=df, headers='keys', tablefmt='psql'))

+----+----------------+------------------+------------------+---------------+--------------+------------+------------+-------------+-----------------+--------------------+--------------------+-------------+----------------------+-----------+------------------+----------------+-----------------------+--------------+----------+
|    |   weight_decay | stride_pool      | stride_conv      | patience_lr   |   output_dim | nesterov   |   momentum |   max_epoch |   learning_rate | kernel_size_pool   | kernel_size_conv   |   input_dim |   hidden_layers_size | filters   | early_stopping   |   dropout_rate | batch_normalization   | activation   |   metric |
|----+----------------+------------------+------------------+---------------+--------------+------------+------------+-------------+-----------------+--------------------+--------------------+-------------+----------------------+-----------+------------------+----------------+-----------------------+--------------+----------|
|  0 |          

In [13]:
# Use the baseline configuration for the final model. This binds a second name
# to the same dict object (no copy), so any in-place change made through one
# name is visible through the other.
# NOTE(review): the random-search results are not consulted here — confirm
# whether the best sampled configuration should be used instead.
hyperparameter_final = hyperparameters

In [14]:
from ClassesML.AudioModel import AudioModel
# Build the final model from the chosen configuration, then move it to the
# selected device.
model_final = AudioModel(hyperparameter_final)
model_final = model_final.to(device)



In [15]:
import torch.nn.functional as F
def predict_clip(model, segment_tensors, device, method='prob'):
    """
    Generate a single class prediction for a whole clip from its segments.

    Parameters:
    - model: module called as ``model(segment_tensors)``; its output is treated
      as per-segment class logits of shape (n_segments, num_classes).
    - segment_tensors: tensor of shape (n_segments, channels, mel_bands, frames).
    - device: device the segments are moved to before the forward pass.
    - method: 'majority' (each segment votes for its arg-max class) or
      'prob' (arg-max of the class probabilities averaged over segments).

    Returns:
    - int: index of the predicted class.

    Raises:
    - ValueError: if `method` is not 'majority' or 'prob', or if
      `segment_tensors` contains no segments.

    Note: the model is switched to eval mode and left in that mode.
    """
    # Validate cheap preconditions before paying for a forward pass.
    if method not in ('majority', 'prob'):
        raise ValueError("method must be 'majority' or 'prob'")
    # An empty clip has no meaningful prediction: averaging zero rows yields
    # NaNs (whose arg-max is an arbitrary class) and voting has no votes.
    if segment_tensors.shape[0] == 0:
        raise ValueError("segment_tensors must contain at least one segment")

    model.eval()
    segment_tensors = segment_tensors.to(device)
    with torch.no_grad():
        outputs = model(segment_tensors)  # (n_segments, num_classes)
        probs = F.softmax(outputs, dim=1)

    if method == 'majority':
        preds = torch.argmax(probs, dim=1)
        # minlength keeps one slot per class even if the top classes get no votes.
        counts = torch.bincount(preds, minlength=probs.shape[1])
        clip_pred = torch.argmax(counts).item()
    else:  # method == 'prob'
        avg_probs = probs.mean(dim=0)
        clip_pred = torch.argmax(avg_probs).item()

    return clip_pred


In [16]:
from ClassesML.AudioTrainer import AudioTrainer
# Train the final model with the notebook-level loaders and the final
# configuration; the return value of train() is shown as the cell output.
trainer_final = AudioTrainer(
    model_final,
    train_loader,
    val_loader,
    hyperparameter_final,
    device=device,
)
trainer_final.train()

Epoch 1/2:  16%|█▌        | 3/19 [00:16<01:29,  5.60s/it]


KeyboardInterrupt: 

### Evaluate the model using 5-fold cross-validation

In [17]:
import warnings
def extract_segments_with_deltas(file_path, variant='short', silence_threshold=-80):
    """
    Extracts 2-channel (log-mel + delta) spectrogram segments from an audio file.

    The clip is loaded as mono at 22050 Hz, converted to a 60-band log-mel
    spectrogram (dB relative to its peak), z-score normalised over the whole
    clip, and cut into fixed-length, overlapping windows. Each window is
    stacked with the matching window of the delta features.

    Parameters:
    - file_path (str): Path to audio file.
    - variant (str): 'short' (41 frames, 50% overlap) or 'long' (101 frames, 90% overlap).
    - silence_threshold (float): dB threshold for discarding low-energy segments.
      It is compared with the mean of the segment's un-normalised dB values
      (0 dB is the clip's peak). With librosa's default 80 dB floor the default
      of -80 is expected to discard nothing.

    Returns:
    - np.ndarray: Array of shape (n_segments, 2, 60, frames_per_segment).
      An empty array (n_segments == 0) is returned, usually with a warning,
      when the file is too short, has zero variance, yields no segment, or
      fails to process.

    Raises:
    - ValueError: if `variant` is neither 'short' nor 'long'.
    """
    # Config
    sr = 22050
    n_fft = 1024
    hop_length = 512
    n_mels = 60

    if variant == 'short':
        frames_per_segment = 41
        overlap = 0.5
    elif variant == 'long':
        frames_per_segment = 101
        overlap = 0.9
    else:
        raise ValueError("variant must be 'short' or 'long'")

    # Shape shared by every "no segments" result.
    empty_shape = (0, 2, n_mels, frames_per_segment)

    try:
        # Load audio in mono
        y, _ = librosa.load(file_path, sr=sr, mono=True)

        # Skip empty or very short files
        if len(y) < n_fft:
            warnings.warn(f"File too short to process: {file_path}")
            return np.empty(empty_shape)

        # Compute log-mel spectrogram (dB relative to the clip's peak)
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels
        )
        log_mel_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize safely
        mean = np.mean(log_mel_db)
        std = np.std(log_mel_db)
        if std == 0:
            warnings.warn(f"Zero std encountered in file: {file_path}")
            return np.empty(empty_shape)

        log_mel_spec = (log_mel_db - mean) / std

        # Compute deltas on the normalised spectrogram
        delta_spec = librosa.feature.delta(log_mel_spec)

        # Segmenting; the step is kept >= 1 so range() never gets a zero step.
        step = max(1, int(frames_per_segment * (1 - overlap)))
        last_start = log_mel_spec.shape[1] - frames_per_segment
        segments = []

        for start in range(0, last_start + 1, step):
            stop = start + frames_per_segment

            # Skip silent segments. The threshold is in dB, so it is checked
            # against the dB spectrogram, not the z-scored one (whose values
            # are no longer in dB).
            if np.mean(log_mel_db[:, start:stop]) < silence_threshold:
                continue

            seg = log_mel_spec[:, start:stop]
            delta = delta_spec[:, start:stop]
            segments.append(np.stack([seg, delta], axis=0))

        return np.stack(segments) if segments else np.empty(empty_shape)

    except Exception as e:
        # Best effort: one unreadable file must not abort a whole evaluation run.
        warnings.warn(f"Failed to process {file_path}: {e}")
        return np.empty(empty_shape)

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Load your dataset
df = pd.read_csv('data/Animal_Sound_processed.csv')  # Update path if needed

# Prepare 5-Fold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Clip-level test accuracy (in percent) of every fold.
fold_accuracies = []

for fold, (train_val_idx, test_idx) in enumerate(kf.split(df)):
    df_train_val = df.iloc[train_val_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)

    print(f"Fold {fold + 1}")
    print(f"Train/Val samples: {len(df_train_val)}")
    print(f"Test samples: {len(df_test)}")

    # Create datasets for this fold (the held-out test clips are never seen here)
    dataset_train = AnimalSoundDataset(df_train_val, split='train', split_ratio=0.75, seed=42)
    dataset_val = AnimalSoundDataset(df_train_val, split='val', split_ratio=0.75, seed=42)
    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

    # Initialize a FRESH model for every fold and train that model. Reusing a
    # model from an earlier cell or fold would leak this fold's test clips
    # into training.
    model_final = AudioModel(hyperparameter_final).to(device)
    trainer_final = AudioTrainer(model_final, train_loader, val_loader, hyperparameter_final, device=device)
    trainer_final.train()

    # Save the model for this fold
    model_save_path = f'model_fold_{fold + 1}.pth'
    torch.save(model_final.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}\n")

    # Evaluate on test set; the counters are reset for every fold.
    correct_predictions = 0
    total_predictions = 0
    for path, label in zip(df_test['path'], df_test['name']):
        total_predictions += 1

        test_segments = extract_segments_with_deltas(path, variant='short')
        if len(test_segments) == 0:
            # No usable segment (unreadable or too-short clip): nothing can be
            # predicted, so the clip is counted as a miss.
            continue

        predicted_sound = predict_clip(
            model_final,
            torch.tensor(test_segments, dtype=torch.float32),
            device,
            method='prob',
        )
        predicted_label = dataset_train.classes[predicted_sound]

        if predicted_label == label:
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions * 100 if total_predictions > 0 else 0
    fold_accuracies.append(accuracy)
    print(f"Fold {fold + 1} - Test Accuracy: {accuracy:.4f}\n")

print(f"Mean test accuracy over {len(fold_accuracies)} folds: {np.mean(fold_accuracies):.4f}")

        

   

Fold 1
Train/Val samples: 520
Test samples: 130
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2301
train set contains 1725 segments.
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
