## Deep Learning-Based Birdcall Classification

In [None]:
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os

def set_seed(seed_value=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed_value: Integer passed unchanged to each seeding function and
            written (as a string) to the ``PYTHONHASHSEED`` environment variable.

    Side effects:
        Seeds ``random``, NumPy and PyTorch (CPU and CUDA generators), switches
        cuDNN to deterministic mode with benchmarking off, and sets
        ``os.environ['PYTHONHASHSEED']``.
    """
    # Seed in a fixed order: Python built-in, NumPy, PyTorch CPU,
    # PyTorch current GPU, PyTorch all GPUs.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    os.environ['PYTHONHASHSEED'] = str(seed_value)


set_seed(42)  # Call with the desired seed value


In [None]:
!pip install python_speech_features

Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python_speech_features
  Building wheel for python_speech_features (setup.py) ... [?25l[?25hdone
  Created wheel for python_speech_features: filename=python_speech_features-0.6-py3-none-any.whl size=5870 sha256=f888c04f58b35491cb68d20f6db00e7dca2ca0dc4c6fb93f7d039ffaa54a5f7f
  Stored in directory: /root/.cache/pip/wheels/5a/9e/68/30bad9462b3926c29e315df16b562216d12bdc215f4d240294
Successfully built python_speech_features
Installing collected packages: python_speech_features
Successfully installed python_speech_features-0.6


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.wavfile import read
from IPython.lib.display import Audio
from scipy.fftpack import fft, ifft
from scipy.io import loadmat
import scipy.signal as sgnl
import scipy.io.wavfile as wav
import sys
import wave
import operator
import scipy
from python_speech_features import mfcc

## Model Training


---
Dataset Preparation: Construct a dataset that includes results from MFCC (Mel Frequency Cepstral Coefficients) and Fourier Transforms.

Model Definition: Define models such as LSTM (Long Short-Term Memory) or CNN (Convolutional Neural Network).

Model Training: Use the dataset to train the model.

Inference and Evaluation: Perform inference using the trained model and evaluate its performance.

Library Installation and Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#Setting the Current Directory
%cd /content/drive/My Drive/birdcall

Mounted at /content/drive
/content/drive/My Drive/birdcall


Data setup and preprocessing using MFCC

Model Definition and Training

---
LSTM Classifier

In [None]:
import torch
import torch.nn as nn
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


## Audio Data Preprocessing Steps
1. Convert MP3 data to WAV format.
2. Perform Fourier Transform and high-pass filtering.
3. Extract MFCC (Mel Frequency Cepstral Coefficients)
4. Transform the data into a format suitable for input into the model.

# Setting Up Training Data

In [None]:
import os
import shutil
import glob
import random

# Directory layout: recordings are MOVED (not copied) out of source_dir.
source_dir = './wav'  # Change this to the directory path where your .wav files are located.
train_dir = './train'
val_dir = './val'


def split_train_val(paths, train_fraction=0.8):
    """Shuffle ``paths`` and split them into a training and a validation list.

    The paths are sorted before shuffling. ``glob.glob`` returns files in a
    filesystem-dependent order, so without the sort the same random seed could
    still yield a different split on another machine or run.

    Args:
        paths: Iterable of file paths (strings).
        train_fraction: Fraction of the files assigned to the training list;
            the cut index is ``int(len(paths) * train_fraction)``.

    Returns:
        Tuple ``(train_paths, val_paths)`` of two lists.
    """
    ordered = sorted(paths)
    # Uses the module-level ``random`` state, so a prior ``random.seed(...)``
    # makes the split repeatable.
    random.shuffle(ordered)
    cut = int(len(ordered) * train_fraction)
    return ordered[:cut], ordered[cut:]


def move_files(paths, destination_dir):
    """Move every file in ``paths`` into ``destination_dir``, keeping its base name."""
    for path in paths:
        shutil.move(path, os.path.join(destination_dir, os.path.basename(path)))


# Create train and val directories if they do not exist.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# Load the list of .wav files in a stable (sorted) order.
wav_files = sorted(glob.glob(os.path.join(source_dir, '*.wav')))

# Split the files into an 80:20 ratio.
train_files, val_files = split_train_val(wav_files, train_fraction=0.8)

# Move the files to their respective directories. Re-running this cell after
# the files were moved finds nothing left in source_dir and moves 0 files.
move_files(train_files, train_dir)
move_files(val_files, val_dir)

print(f'Moved {len(train_files)} files to {train_dir}')
print(f'Moved {len(val_files)} files to {val_dir}')


Moved 619 files to ./train
Moved 155 files to ./val


In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


Convert MP3 files to WAV files and save them by dividing them into train and val files with a 9:1 ratio

# Preprocessing
---


1. Load WAV files and generate labels through file names.
2. Perform Fourier Transform.
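3. Load and apply the filter stored in `./low_filter/highpass(500).mat` (the code refers to it as a low-pass filter, while the file name and the overview above say high-pass — confirm which is intended).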
4. Extract MFCC (Mel Frequency Cepstral Coefficients) features.
5. Transform the data into a format suitable for model training.

In [None]:
import os
import numpy as np
from scipy.io import wavfile, loadmat
from scipy.fft import fft, fftfreq, ifftshift
from scipy.fftpack import fftshift
from python_speech_features import mfcc
import torch
from torch.utils.data import TensorDataset, DataLoader

def apply_fourier_transform(input_audio, Fs):
    """Compute the zero-centred DFT of a signal and its matching frequency axis.

    Args:
        input_audio: 1-D sequence of samples.
        Fs: Sampling rate of ``input_audio``.

    Returns:
        Tuple ``(f_transform, frequencies)``: the DFT after ``fftshift`` and the
        frequency of every bin, in the same units as ``Fs`` and in the same
        (shifted) order. Both have ``len(input_audio)`` entries.
    """
    N = len(input_audio)
    f_transform = fftshift(fft(input_audio, N))
    # Bin k of an N-point DFT sits at k * Fs / N. The shifted fftfreq axis
    # reflects that spacing and places DC exactly at 0; a linspace from -Fs/2
    # to +Fs/2 has spacing Fs / (N - 1) and mislabels every bin.
    frequencies = fftshift(fftfreq(N, d=1.0 / Fs))
    return f_transform, frequencies

import scipy.signal as signal

def apply_lowpass_filter(input_audio, b, a):
    """Filter audio with the linear filter described by ``b`` and ``a``.

    Despite the name, the function applies whatever coefficients it is given;
    the frequency response is decided entirely by ``b`` and ``a``.

    Args:
        input_audio: Array of samples, either 1-D ``(samples,)`` or 2-D
            ``(samples, channels)``.
        b: Numerator coefficients, passed to ``scipy.signal.lfilter``.
        a: Denominator coefficients, passed to ``scipy.signal.lfilter``.

    Returns:
        The filtered signal, with the same shape as ``input_audio``.
    """
    # Filter along the time axis (axis 0). lfilter's default axis=-1 would run
    # the filter ACROSS channels for (samples, channels) input. For 1-D input
    # axis 0 and axis -1 are the same.
    return signal.lfilter(b, a, input_audio, axis=0)

def extract_mfcc_features(filtered_audio, Fs):
    """Compute MFCC features for an already-filtered signal.

    Thin wrapper around ``python_speech_features.mfcc``; every option other
    than the sample rate is left at that library's default.

    Args:
        filtered_audio: Audio samples to analyse.
        Fs: Sampling rate of ``filtered_audio``.

    Returns:
        Whatever ``mfcc`` returns (presumably a ``(frames, coefficients)``
        array — confirm against the library documentation).
    """
    return mfcc(filtered_audio, samplerate=Fs)

# Load the filter coefficients.
# NOTE(review): the helper name and earlier comments call this a "low-pass"
# filter, but the coefficient file is named 'highpass(500).mat' — confirm which
# response is actually intended.
filter_data = loadmat('./low_filter/highpass(500).mat')  # load the filter coefficients
Coeffs = filter_data['ba'].astype(np.float64)  # obtaining filter coefficients
b = Coeffs[0, :]  # numerator coefficients: first ROW of the 'ba' matrix
a = 1             # denominator of 1, i.e. no feedback terms

# Dictionary to map labels to their indices
labels_index = {}

wav_files_dir = './train'  # Directory where WAV files are stored
# os.listdir() returns names in arbitrary order. Sorting makes the
# label -> index mapping identical on every run, which is required for a saved
# checkpoint to decode to the right species in a later session.
# NOTE(review): checkpoints trained with a differently ordered mapping will not
# line up with this one.
wav_files = sorted(f for f in os.listdir(wav_files_dir) if f.endswith('.wav'))
if not wav_files:
    raise ValueError(f"No .wav files found in {wav_files_dir!r}; run the train/val split first.")

features_list = []  # List to store features
labels_list = []  # List to store labels

for wav_file in wav_files:
    # Extract label: everything before the last '_' (drops the trailing
    # '_<number>.wav' part of the file name).
    label = '_'.join(wav_file.split('_')[:-1])
    # First time a label is seen it receives the next free index.
    label_index = labels_index.setdefault(label, len(labels_index))

    file_path = os.path.join(wav_files_dir, wav_file)

    # Load WAV file
    Fs, input_audio = wavfile.read(file_path)

    # Apply the loaded filter
    filtered_audio = apply_lowpass_filter(input_audio, b, a)

    # Extract MFCC
    mfcc_features = extract_mfcc_features(filtered_audio, Fs)

    # Add to list
    features_list.append(mfcc_features)
    labels_list.append(label_index)

# Recordings differ in length; cut every feature sequence to the shortest one
# so they can be stacked into a single array.
min_length = min(len(feat) for feat in features_list)
trimmed_features_list = [feat[:min_length] for feat in features_list]

# Convert features and labels to NumPy arrays
features_array = np.array(trimmed_features_list)
labels_array = np.array(labels_list)

# Convert to PyTorch tensors
features_tensor = torch.tensor(features_array, dtype=torch.float)
labels_tensor = torch.tensor(labels_array, dtype=torch.long)

# Create TensorDataset
dataset = TensorDataset(features_tensor, labels_tensor)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


In [None]:
# Check the label-to-index mapping built from the training file names
for label, index in labels_index.items():
    print(f"Label '{label}' is mapped to index {index}")

Label 'Velvet_Scoter_Melanitta_fusca' is mapped to index 0
Label 'Long_tailed_Duck_Clangula_hyemalis' is mapped to index 1
Label 'Leach's_Storm_Petrel_Hydrobates_leucorhous' is mapped to index 2
Label 'European_Turtle_Dove_Streptopelia_turtur' is mapped to index 3
Label 'Black_legged_Kittiwake_Rissa_tridactyla' is mapped to index 4
Label 'Balearic_Shearwater_Puffinus_mauretanicus' is mapped to index 5
Label 'Atlantic_Puffin_Fratercula_arctica' is mapped to index 6
Label 'Aquatic_Warbler_Acrocephalus_paludicola' is mapped to index 7
Label 'Great_Bustard_Otis_tarda' is mapped to index 8
Label 'Horned_Grebe_Podiceps_auritus' is mapped to index 9


In [None]:
# Load Validation Data
val_dir = './val'  # Directory where validation data is stored
# Sorted so the validation set is assembled in the same order on every run.
val_wav_files = sorted(f for f in os.listdir(val_dir) if f.endswith('.wav'))

val_features_list = []  # List to store features of validation data
val_labels_list = []  # List to store labels of validation data
skipped_val_files = []  # Files whose label never occurred in the training data

for wav_file in val_wav_files:
    label = '_'.join(wav_file.split('_')[:-1])  # Extract label from file name
    if label not in labels_index:
        # The model has no output for this label, so the file cannot be scored.
        skipped_val_files.append(wav_file)
        continue

    file_path = os.path.join(val_dir, wav_file)
    # Load WAV file
    Fs, input_audio = wavfile.read(file_path)

    # Apply the same filter that was used for the training data
    filtered_audio = apply_lowpass_filter(input_audio, b, a)

    # Extract MFCC
    mfcc_features = extract_mfcc_features(filtered_audio, Fs)

    label_index = labels_index[label]  # Convert label name to label index
    val_features_list.append(mfcc_features)  # Add extracted features to the list
    val_labels_list.append(label_index)  # Add label index to the list

# Report skipped files instead of dropping them silently.
if skipped_val_files:
    print(f'Skipped {len(skipped_val_files)} validation file(s) whose label is not in the training data')

if not val_features_list:
    raise ValueError(f"No usable validation .wav files found in {val_dir!r}.")

# Shortest validation sequence. Kept under its own name so the training
# cell's `min_length` is not overwritten.
val_min_length = min(len(feat) for feat in val_features_list)

# Trim all features to the shortest length
trimmed_val_features_list = [feat[:val_min_length] for feat in val_features_list]

# Convert features and labels to NumPy arrays
val_features_array = np.array(trimmed_val_features_list)
val_labels_array = np.array(val_labels_list)

# Convert to PyTorch tensors
val_features_tensor = torch.tensor(val_features_array, dtype=torch.float)
val_labels_tensor = torch.tensor(val_labels_array, dtype=torch.long)

val_dataset = TensorDataset(val_features_tensor, val_labels_tensor)
# No shuffling for evaluation: a fixed batch composition keeps the per-batch
# averaged validation loss repeatable and does not consume random numbers
# between training epochs.
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Define the Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMSoundClassifier(nn.Module):
    """Sequence classifier: stacked LSTM followed by a two-layer MLP head.

    Data flow in ``forward``: the input sequence goes through the LSTM, the
    top layer's output at the LAST time step is taken, dropout is applied,
    then ``fc1`` -> ReLU -> ``fc2`` produces one logit per class.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers (default 3).
        dropout: Dropout probability applied to the last-step LSTM output
            before the fully connected layers (default 0.5).
    """

    def __init__(self, input_size, hidden_size, num_classes, num_layers=3, dropout=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True: inputs and outputs are (batch, time, features).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Classification head: widen to 2 * hidden_size, then project to classes.
        self.fc1 = nn.Linear(hidden_size, hidden_size * 2)
        self.fc2 = nn.Linear(hidden_size * 2, num_classes)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Tensor of shape ``(batch, time, input_size)``.
        """
        # With no explicit state, nn.LSTM starts from zero hidden/cell states
        # already matching the input's device and dtype. Building float32
        # zeros by hand only added a per-call allocation and broke
        # non-float32 inputs.
        out, _ = self.lstm(x)

        # Keep only the final time step of the top LSTM layer, then regularise.
        out = self.dropout(out[:, -1, :])

        out = F.relu(self.fc1(out))
        return self.fc2(out)

# Model Training

In [None]:
import torch.optim as optim
import torch.nn.functional as F
import copy

# --- Training configuration ---------------------------------------------
hidden_size = 512  # Width of the LSTM hidden state
num_epochs = 50    # Upper bound on epochs; early stopping may end sooner
patience = 5       # Number of epochs to allow without performance improvement
checkpoint_path = '4.19.pth'  # Where the best weights are saved; name is adjustable

# Set up the model, loss function, and optimizer
model = LSTMSoundClassifier(
    input_size=features_tensor.size(2),
    hidden_size=hidden_size,
    num_classes=len(labels_index),
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0005)


def run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    Args:
        model: Network to train or evaluate.
        loader: Iterable yielding ``(features, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: When given, the model is put in training mode and updated
            after every batch. When ``None``, the model is put in eval mode
            and gradients are disabled.

    Returns:
        Tuple of the batch losses averaged over the number of batches, and
        the fraction of correctly classified samples.

    Raises:
        ValueError: If ``loader`` yields no samples, since neither average
            would be defined.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum, correct, total = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        raise ValueError('The data loader produced no samples.')
    return loss_sum / len(loader), correct / total


best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
patience_counter = 0  # Counter for epochs without performance improvement

for epoch in range(num_epochs):
    train_loss, train_acc = run_epoch(model, dataloader, criterion, optimizer)
    val_loss, epoch_acc = run_epoch(model, val_dataloader, criterion)

    # Keep a copy of the weights with the best validation accuracy so far.
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model_wts = copy.deepcopy(model.state_dict())
        patience_counter = 0  # Reset counter on performance improvement
    else:
        patience_counter += 1  # Increase counter if no performance improvement

    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {epoch_acc:.4f}')

    # Check for early stopping condition
    if patience_counter >= patience:
        print("Early stopping")
        break

# Restore the best weights and save them.
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), checkpoint_path)

Epoch 1: Train Loss: 1.6926, Train Accuracy: 0.4443, Val Loss: 1.7684, Val Accuracy: 0.5665
Epoch 2: Train Loss: 1.2363, Train Accuracy: 0.5987, Val Loss: 1.3113, Val Accuracy: 0.6897
Epoch 3: Train Loss: 1.0303, Train Accuracy: 0.6792, Val Loss: 1.3685, Val Accuracy: 0.7192
Epoch 4: Train Loss: 0.9204, Train Accuracy: 0.7034, Val Loss: 1.7142, Val Accuracy: 0.6749
Epoch 5: Train Loss: 0.8579, Train Accuracy: 0.7195, Val Loss: 1.5537, Val Accuracy: 0.7635
Epoch 6: Train Loss: 0.7886, Train Accuracy: 0.7503, Val Loss: 1.3924, Val Accuracy: 0.7635
Epoch 7: Train Loss: 0.7113, Train Accuracy: 0.7584, Val Loss: 1.7369, Val Accuracy: 0.7833
Epoch 8: Train Loss: 0.6507, Train Accuracy: 0.7987, Val Loss: 1.6134, Val Accuracy: 0.7537
Epoch 9: Train Loss: 0.5489, Train Accuracy: 0.8134, Val Loss: 1.6567, Val Accuracy: 0.7833
Epoch 10: Train Loss: 0.5304, Train Accuracy: 0.8295, Val Loss: 1.8698, Val Accuracy: 0.7783
Epoch 11: Train Loss: 0.4593, Train Accuracy: 0.8470, Val Loss: 1.9129, Val Acc

## Predicting the species name and its probability for a test sample

In [None]:
import torch.nn.functional as F

# Preprocessing for a single test recording: filter, then extract MFCCs.
def preprocess_audio(file_path, b, a):
    """Load a WAV file and turn it into MFCC features for inference.

    Uses the same helpers as the training pipeline (``apply_lowpass_filter``
    and ``extract_mfcc_features``) so that the two cannot drift apart.

    Args:
        file_path: Path of the WAV file to read.
        b: Numerator filter coefficients.
        a: Denominator filter coefficients.

    Returns:
        The MFCC features returned by ``extract_mfcc_features``.
    """
    # Load WAV file
    Fs, input_audio = wavfile.read(file_path)

    # Apply the filter described by (b, a)
    filtered_audio = apply_lowpass_filter(input_audio, b, a)

    # NOTE(review): training features are trimmed to a common length before
    # being fed to the model, while these are used at full length — confirm
    # this difference is intended.
    return extract_mfcc_features(filtered_audio, Fs)

# Test sample file path
test_file_path = './test/Black_Legged_Kittiwake_2_TEST.wav'

# Apply preprocessing to the test sample
test_feature = preprocess_audio(test_file_path, b, a)

# Convert to a float tensor and add a leading batch axis of size 1.
# (Wrapping the ndarray in a Python list and calling torch.tensor on it is
# slow and triggers a PyTorch warning.)
test_feature_tensor = torch.as_tensor(test_feature, dtype=torch.float).unsqueeze(0)

# Define model (kept on the CPU in this cell)
model = LSTMSoundClassifier(input_size=test_feature_tensor.size(2), hidden_size=512, num_classes=len(labels_index))
# map_location='cpu' lets a checkpoint saved from a GPU model load into this
# CPU model even on a machine without CUDA.
# NOTE(review): the training cell saves to '4.19.pth'; confirm that
# './3.23.pth' is the checkpoint intended here and that it was trained with
# the current labels_index ordering.
model.load_state_dict(torch.load('./3.23.pth', map_location='cpu'))
model.eval()  # Set to inference mode
threshold = 0.5  # Set the threshold for probability

# Reverse mapping so a predicted index can be turned back into a label name.
index_to_label = {index: label for label, index in labels_index.items()}

# Perform inference on the test sample
with torch.no_grad():
    output = model(test_feature_tensor)  # test_feature_tensor is the preprocessed test data
    probabilities = F.softmax(output, dim=1)  # Convert model output to probabilities using softmax
    max_probs, predicted_indices = torch.max(probabilities, dim=1)  # Highest probability and its class index per sample

# The batch holds exactly one sample, so .item() extracts its scalar values.
predicted_prob = max_probs.item()
predicted_index = predicted_indices.item()

# Treat predictions with probability lower than the threshold as 'non-match'
if predicted_prob < threshold:
    print("Non-match")
else:
    # Retrieve the name of the predicted class
    predicted_label = index_to_label[predicted_index]
    print(f"Predicted label: {predicted_label}, with probability: {predicted_prob:.4f}")

Predicted label: Black_legged_Kittiwake_Rissa_tridactyla, with probability: 0.8980


In [None]:
print(predicted_prob)

In [None]:
import torch.nn.functional as F

# Load test data sample.
# `preprocess_audio` is already defined in the previous inference cell; it is
# reused here rather than defined a second time.

# Path of the test sample file
test_file_path = './test/Long_tailed_duck_2_TEST.wav'

# Apply preprocessing to the test sample
test_feature = preprocess_audio(test_file_path, b, a)

# Convert to a float tensor, add a leading batch axis of size 1 and move it to
# the selected device. (Wrapping the ndarray in a Python list and calling
# torch.tensor on it is slow and triggers a PyTorch warning.)
test_feature_tensor = torch.as_tensor(test_feature, dtype=torch.float).unsqueeze(0).to(device)

# Define the model
# NOTE(review): hidden_size=256 differs from the 512 used in the training
# cell; presumably it matches how 'best_model_1.pth' was trained — confirm.
model = LSTMSoundClassifier(input_size=test_feature_tensor.size(2), hidden_size=256, num_classes=len(labels_index)).to(device)
# map_location=device loads the checkpoint straight onto the device in use,
# whichever device it was saved from.
# NOTE(review): the predicted name is only meaningful if labels_index has the
# same ordering as when this checkpoint was trained — confirm.
model.load_state_dict(torch.load('./best_model_1.pth', map_location=device))
model.eval()  # Set to inference mode
threshold = 0.6  # Set the threshold for probability

# Reverse mapping so a predicted index can be turned back into a label name.
index_to_label = {index: label for label, index in labels_index.items()}

# Perform inference on the test sample
with torch.no_grad():
    output = model(test_feature_tensor)  # test_feature_tensor is the preprocessed test data
    probabilities = F.softmax(output, dim=1)  # Convert the model output to probabilities using the softmax function
    max_probs, predicted_indices = torch.max(probabilities, dim=1)  # Highest probability and its class index per sample

# The batch holds exactly one sample, so .item() extracts its scalar values.
predicted_prob = max_probs.item()
predicted_index = predicted_indices.item()

# Treat predictions with a probability lower than the threshold as 'non-match'
if predicted_prob < threshold:
    print("Non-match")
else:
    # Retrieve the name of the predicted class
    predicted_label = index_to_label[predicted_index]
    print(f"Predicted label: {predicted_label}, with probability: {predicted_prob:.4f}")

Predicted label: Black_legged_Kittiwake_Rissa_tridactyla, with probability: 0.9997
