In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader

# Libraries for processing sounds
import librosa
from IPython.display import Audio
import random

In [2]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [7]:
from AnimalSoundDataset import AnimalSoundDataset

data_path = 'data/Animal_Sound_processed.csv'

# Both splits are built from the same CSV with the same ratio and seed.
# Presumably this keeps the train/val subsets complementary -- confirm in AnimalSoundDataset.
split_kwargs = dict(split_ratio=0.8, seed=42)
dataset_train = AnimalSoundDataset(data_path, split='train', **split_kwargs)
dataset_val = AnimalSoundDataset(data_path, split='val', **split_kwargs)

Loading dataset from data/Animal_Sound_processed.csv...
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
train set contains 2327 segments.
Loading dataset from data/Animal_Sound_processed.csv...
Classes found: ['bear', 'cat', 'chicken', 'cow', 'dog', 'dolphin', 'donkey', 'elephant', 'frog', 'horse', 'lion', 'monkey', 'sheep']
Total segments extracted: 2909
val set contains 582 segments.


In [8]:
# Shuffled mini-batches of 32 training segments.
train_loader = DataLoader(dataset_train, shuffle=True, batch_size=32)
# val_loader = DataLoader(dataset_val, batch_size=32, shuffle=False)

print(f"Number of batches in train loader: {len(train_loader)}")

# Peek at the first batch only, to check the tensor shapes coming out of the loader.
for x_batch, y_batch in train_loader:
    print(x_batch.shape)  # [batch_size, 2, 60, 41] in the recorded run
    print(y_batch.shape)  # [batch_size]
    break

Number of batches in train loader: 73
torch.Size([32, 2, 60, 41])
torch.Size([32])


In [9]:
# Size of the first axis of one training sample (2 in the recorded run,
# where a batch has shape [32, 2, 60, 41]).
input_dim = dataset_train[0][0].shape[0]
n_classes = len(dataset_train.classes)
print(f"Input dimension: {input_dim}, Number of classes: {n_classes}")

# Configuration dict consumed by AudioModel and AudioTrainer.
# The list-valued entries have one item per stage (two stages here);
# presumably each tuple is (height, width) -- confirm in AudioModel.
hyperparameters = dict(
    # Use the value measured from the dataset instead of a hard-coded 2, so the
    # model's input size cannot drift away from the data if the features change.
    input_dim=input_dim,
    output_dim=n_classes,
    hidden_layers_size=100,  # should be 5000 but do not want to kill my laptop
    activation='relu',
    kernel_size_conv=[(57, 6), (1, 3)],
    kernel_size_pool=[(4, 3), (1, 3)],
    stride_conv=[(1, 1), (1, 1)],
    stride_pool=[(1, 3), (1, 3)],
    filters=[80, 80],
    batch_normalization=False,
    dropout_rate=0.5,
    learning_rate=0.01,
    batch_size=32,
    max_epoch=10
)

Input dimension: 2, Number of classes: 13


In [10]:
from ClassesML.AudioModel import AudioModel
# Build the network from the hyperparameter dict, then move it to the selected device.
model = AudioModel(hyperparameters)
model = model.to(device)

In [11]:
# Smoke test: push a single batch through the model and check the output shape.
# Gradients are not needed for a shape check, so autograd tracking is disabled
# to avoid building a computation graph that is never used.
with torch.no_grad():
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        output = model(x_batch)
        print("Output shape:", output.shape)  # Should be [batch_size, n_classes]
        break

Output shape: torch.Size([32, 13])


  return F.conv2d(


In [12]:
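# NOTE: everything in this cell is commented out and kept for reference only.
# It is a hand-written training/validation loop; training is run through
# AudioTrainer in the following cell instead. Re-enabling it also requires
# `val_loader`, which is currently commented out where `train_loader` is created.
# import torch.optim as optim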
# from Utilities.Utilities import *
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=hyperparameters['learning_rate'])
# def run():
    
#     train_accuracy_dict = {}
#     valid_accuracy_dict = {}
    
#     for epoch in range(hyperparameters['max_epoch']):
#         # Train
#         model.train()
#         total_loss = 0.0
#         total_accuracy = 0.0
        
        
#         for batch in train_loader:
#             x_batch, y_batch = batch

#             x = x_batch.to(device)
#             y = y_batch.to(device)

#             # Forward pass
#             y_hat = model(x)
#             loss = criterion(y_hat, y)

#             # Backward pass
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
            
#             # Calculate accuracy
#             batch_accuracy = Utilities.compute_accuracy(y, y_hat)
#             total_accuracy += batch_accuracy

#         train_loss = total_loss / len(train_loader)
#         train_accuracy = total_accuracy / len(train_loader)
#         print("Epoch: ", str(epoch + 1) + "/" + str(hyperparameters['max_epoch']))
#         print("Train Loss: ", str(train_loss) + " - Training Accuracy: " + str(train_accuracy))

#         # Validation
    
#         # very important to set the model to eval mode
#         # we do not want to update the weights
#         # we want the current state of the model
#         model.eval() 
#         total_loss = 0.0
#         total_accuracy = 0.0
#         for batch in val_loader:
#             x_batch, y_batch = batch

#             x = x_batch.to(device)
#             y = y_batch.to(device)
#             # Forward pass
#             y_hat = model(x)
#             loss = criterion(y_hat, y)
#             total_loss += loss.item()
            
#             # Calculate accuracy
#             batch_accuracy = Utilities.compute_accuracy(y, y_hat)
#             total_accuracy += batch_accuracy

#         valid_loss = total_loss / len(val_loader)
#         valid_accuracy = total_accuracy / len(val_loader)
#         print("Epoch: ", str(epoch + 1) + "/" + str(hyperparameters['max_epoch']))
#         print("Validation Loss: ", str(valid_loss) + " - Validation Accuracy: " + str(valid_accuracy))
#         train_accuracy_dict[epoch] = train_accuracy
#         valid_accuracy_dict[epoch] = valid_accuracy
            
#     train_accuracy_list = [train_accuracy_dict[i] for i in train_accuracy_dict.keys()]
#     valid_accuracy_list = [valid_accuracy_dict[i] for i in valid_accuracy_dict.keys()]
#     return train_accuracy_list, valid_accuracy_list

In [13]:
from ClassesML.AudioTrainer import AudioTrainer
# Hand the model, both dataset splits and the hyperparameters to the trainer,
# then start the training run on the selected device.
trainer = AudioTrainer(
    model,
    dataset_train,
    dataset_val,
    hyperparameters,
    device=device,
)
trainer.train()

Epoch 1/10: 100%|██████████| 73/73 [00:11<00:00,  6.44it/s]


Epoch 1: Train Loss: 0.1056, Train Acc: 30.2904, Val Loss: 1.4555, Val Acc: 6.5430


Epoch 2/10:  34%|███▍      | 25/73 [00:03<00:07,  6.44it/s]


KeyboardInterrupt: 