In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader

# Libraries for processing sounds
import librosa
from IPython.display import Audio
import random

In [2]:
# Run on the GPU when a CUDA device is usable, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)


In [3]:
from ClassesData.AnimalSoundDataset import AnimalSoundDataset

data_path = 'data/Animal_Sound_processed.csv'

# Both splits are built with the same ratio and seed. In the recorded run the two
# splits add up to the full set (2327 + 582 = 2909 segments).
# NOTE(review): presumably the shared seed is what keeps train/val disjoint —
# confirm in AnimalSoundDataset.
split_options = {"split_ratio": 0.8, "seed": 42}
dataset_train = AnimalSoundDataset(data_path, split='train', **split_options)
dataset_val = AnimalSoundDataset(data_path, split='val', **split_options)

Loading dataset from data/Animal_Sound_processed.csv...
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
train set contains 2327 segments.
Loading dataset from data/Animal_Sound_processed.csv...
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
val set contains 582 segments.


In [4]:
# Batch-size rationale:
# the reference paper trains on about 20,000 samples with a batch size of 1,000.
# Our training split is much smaller (2,327 segments in the recorded run, i.e. ~2,000),
# so the batch size is scaled down to 128, which keeps roughly the same ~5%
# batch-to-dataset ratio.
batch_size = 128

In [5]:
# Training batches are shuffled; validation batches keep the dataset order.
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

print(f"Number of batches in train loader: {len(train_loader)}")

# Sanity check: pull a single batch and print the tensor shapes.
for batch in train_loader:
    x_batch, y_batch = batch
    print(x_batch.shape)  # [batch_size, 2, 60, 41] in the recorded run
    print(y_batch.shape)  # [batch_size]
    break  # one batch is enough for the shape check

Number of batches in train loader: 19
torch.Size([128, 2, 60, 41])
torch.Size([128])


In [None]:
# input_dim is the size of the first axis of one sample, not the full shape:
# it is 2 in the recorded run, where a batch has shape [128, 2, 60, 41].
first_segment = dataset_train[0][0]
input_dim = first_segment.shape[0]
n_classes = len(dataset_train.classes)
print(f"Input dimension: {input_dim}, Number of classes: {n_classes}")

hyperparameters = {
    # --- model hyperparameters ---
    "input_dim": input_dim,
    "output_dim": n_classes,
    "hidden_layers_size": 5000,
    "activation": 'relu',
    # One entry per convolution / pooling stage.
    "kernel_size_conv": [(57, 6), (1, 3)],
    "kernel_size_pool": [(4, 3), (1, 3)],
    "stride_conv": [(1, 1), (1, 1)],
    "stride_pool": [(1, 3), (1, 3)],
    "filters": [80, 80],
    "batch_normalization": False,
    "dropout_rate": 0.5,
    # --- trainer hyperparameters ---
    "learning_rate": 0.002,
    "weight_decay": 0.001,
    "momentum": 0.9,
    "nesterov": True,
    # --- questionable hyperparameters ---
    # batch_size is not passed here; it is already applied when the DataLoaders are built.
    "max_epoch": 10,
}

Input dimension: 2, Number of classes: 13


In [7]:
from ClassesML.AudioModel import AudioModel

# Build the network from the hyperparameter dict, then move it to the selected device.
model = AudioModel(hyperparameters)
model = model.to(device)

In [8]:
def predict_clip(model, segment_tensors, device, method='prob'):
    """
    Predict a single class index for a whole clip from its segments.

    Every segment is classified by ``model``; the per-segment results are then
    combined into one clip-level decision.

    Args:
        model: callable network returning logits of shape (n_segments, num_classes).
            It is switched to eval mode and left in eval mode afterwards.
        segment_tensors: tensor of shape (n_segments, channels, mel_bands, frames)
            holding all segments of one clip. Must contain at least one segment.
        device: torch device the segments are moved to before inference.
        method: aggregation strategy.
            'majority' - each segment votes for its argmax class; the most
                         frequent class wins (ties resolve to the lowest index).
            'prob'     - softmax probabilities are averaged over the segments
                         and the class with the highest mean wins.

    Returns:
        int: predicted class index for the clip.

    Raises:
        ValueError: if ``method`` is not 'majority' or 'prob', or if
            ``segment_tensors`` contains no segments.
    """
    # Validate up front so a typo in `method` fails before the forward pass.
    if method not in ('majority', 'prob'):
        raise ValueError("method must be 'majority' or 'prob'")
    # An empty clip would otherwise yield NaN means or an opaque argmax error.
    if segment_tensors.shape[0] == 0:
        raise ValueError("segment_tensors must contain at least one segment")

    model.eval()
    segment_tensors = segment_tensors.to(device)
    with torch.no_grad():
        outputs = model(segment_tensors)  # (n_segments, num_classes)
        # torch.softmax avoids the `F` alias, which this notebook never imports.
        probs = torch.softmax(outputs, dim=1)

    if method == 'majority':
        preds = torch.argmax(probs, dim=1)
        counts = torch.bincount(preds)
        clip_pred = torch.argmax(counts).item()
    else:  # method == 'prob'
        avg_probs = probs.mean(dim=0)
        clip_pred = torch.argmax(avg_probs).item()

    return clip_pred


In [9]:
from ClassesML.AudioTrainer import AudioTrainer
# Hand the model, both data loaders and the hyperparameter dict to the project trainer.
# NOTE(review): the trainer presumably reads learning_rate, weight_decay, momentum,
# nesterov and max_epoch from `hyperparameters` — confirm in ClassesML/AudioTrainer.
trainer = AudioTrainer(model, train_loader, val_loader, hyperparameters, device=device)
# Start training. The recorded log shows 10 epochs of 19 batches each, with
# train/validation loss and accuracy printed after every epoch.
trainer.train()

  return F.conv2d(
Epoch 1/10: 100%|██████████| 19/19 [00:16<00:00,  1.17it/s]


Epoch 1: Train Loss: 2.4859, Train Acc: 16.6637, Val Loss: 2.4513, Val Acc: 29.3304


Epoch 2/10: 100%|██████████| 19/19 [00:16<00:00,  1.14it/s]


Epoch 2: Train Loss: 2.3841, Train Acc: 26.6197, Val Loss: 2.3971, Val Acc: 41.4062


Epoch 3/10: 100%|██████████| 19/19 [00:17<00:00,  1.07it/s]


Epoch 3: Train Loss: 2.2838, Train Acc: 31.8364, Val Loss: 2.3146, Val Acc: 34.5893


Epoch 4/10: 100%|██████████| 19/19 [00:17<00:00,  1.07it/s]


Epoch 4: Train Loss: 2.1178, Train Acc: 36.0108, Val Loss: 2.1661, Val Acc: 40.2366


Epoch 5/10: 100%|██████████| 19/19 [00:20<00:00,  1.08s/it]


Epoch 5: Train Loss: 1.9535, Train Acc: 39.6006, Val Loss: 2.0198, Val Acc: 43.9821


Epoch 6/10: 100%|██████████| 19/19 [00:18<00:00,  1.01it/s]


Epoch 6: Train Loss: 1.7851, Train Acc: 42.9956, Val Loss: 1.8449, Val Acc: 48.0179


Epoch 7/10: 100%|██████████| 19/19 [00:19<00:00,  1.03s/it]


Epoch 7: Train Loss: 1.6523, Train Acc: 47.3362, Val Loss: 1.7446, Val Acc: 56.5848


Epoch 8/10: 100%|██████████| 19/19 [00:18<00:00,  1.01it/s]


Epoch 8: Train Loss: 1.5246, Train Acc: 52.4510, Val Loss: 1.6517, Val Acc: 55.5938


Epoch 9/10: 100%|██████████| 19/19 [00:20<00:00,  1.07s/it]


Epoch 9: Train Loss: 1.4705, Train Acc: 53.4736, Val Loss: 1.5792, Val Acc: 57.9107


Epoch 10/10: 100%|██████████| 19/19 [00:19<00:00,  1.01s/it]


Epoch 10: Train Loss: 1.4024, Train Acc: 56.6165, Val Loss: 1.5550, Val Acc: 57.4196
