<a href="https://colab.research.google.com/github/audreychristensen/Bird_Audio_CNN/blob/main/2.1%20Model%20for%20Subset%20of%20Species.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle

import h5py
import numpy as np
import os
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

import librosa
import librosa.display

import IPython.display as ipd
from PIL import Image

import soundfile as sf
import scipy.io.wavfile as wave
import scipy.ndimage as ndimage
import scipy.stats as stats
from scipy import interpolate
import traceback
import tensorflow as tf

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

In [None]:
# Root folder for every input and output file used below (presumably a Google Drive
# folder mounted in Colab - confirm the drive is mounted before running).
base_dir = '/content/drive/MyDrive/F2024/Applied Data Science/Project 3/'

In [None]:
# HDF5 file of spectrograms; it is read later by BirdSpectrogramDataset, one group per species.
hdf5_path = base_dir + 'output_spectrograms_final.h5'
# CSV whose first two columns are turned into the species -> label mapping in the next cell.
birds_df = pd.read_csv(base_dir + 'bird_dict.csv')

In [None]:
# Map column 0 of the CSV to column 1 (presumably species name -> integer class index;
# verify against the CSV). Later cells use the keys as HDF5 group names and the values as labels.
bird_dict = dict(zip(birds_df.iloc[:, 0], birds_df.iloc[:, 1]))

In [None]:
# Set runtime to GPU

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the Dataset and Model

In [None]:
class BirdSpectrogramDataset(Dataset):
    """In-memory dataset of labelled spectrograms read from an HDF5 file.

    The file is expected to contain one top-level group per species, each
    holding one dataset per spectrogram. Every spectrogram is loaded eagerly
    in ``__init__`` so the file does not have to stay open afterwards.

    Args:
        hdf5_path: Path of the HDF5 file to read.
        bird_dict: Mapping from group (species) name to class label.
        transform: Optional callable applied to a spectrogram in
            ``__getitem__`` before it is returned.

    Attributes:
        data: List of spectrogram arrays, in file iteration order.
        labels: List of class labels, parallel to ``data``.
    """

    def __init__(self, hdf5_path, bird_dict, transform=None):
        self.hdf5_path = hdf5_path
        self.bird_dict = bird_dict
        self.transform = transform
        self.data = []
        self.labels = []

        with h5py.File(hdf5_path, 'r') as f:
            # Iterate through all groups (bird species)
            for species_name, species_group in f.items():
                # Groups without a label are skipped: giving them a sentinel
                # such as -1 would produce targets that nn.CrossEntropyLoss
                # rejects as out of range.
                if species_name not in bird_dict:
                    continue
                species_label = bird_dict[species_name]
                # Loop through spectrograms in each species group
                for spectrogram in species_group.values():
                    # `[()]` reads the whole HDF5 dataset into memory.
                    self.data.append(spectrogram[()])
                    self.labels.append(species_label)

    def __len__(self):
        """Return the number of loaded spectrograms."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` for ``idx``, applying ``transform`` if set."""
        spectrogram = self.data[idx]
        label = self.labels[idx]

        if self.transform:
            spectrogram = self.transform(spectrogram)

        return spectrogram, label


In [None]:
# Preprocessing applied to every spectrogram returned by BirdSpectrogramDataset:
# convert to a tensor, then compute (x - 0.5) / 0.5 per channel.
# NOTE(review): the inference helper `spectrograms_to_tensor` further down does not
# apply this normalisation - confirm both paths feed the model the same value range.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])  # single-channel mean/std; adjust to the real data statistics if needed
])

In [None]:
# Load every spectrogram into memory once.
dataset = BirdSpectrogramDataset(hdf5_path, bird_dict, transform=transform)

# Split sample *indices* (not the dataset object itself) so the split is cheap,
# stratified by label, and reproducible across kernel restarts via a fixed seed.
train_idx, val_idx = train_test_split(
    np.arange(len(dataset)),
    test_size=0.2,
    stratify=dataset.labels,
    random_state=42,
)
train_dataset = torch.utils.data.Subset(dataset, train_idx.tolist())
val_dataset = torch.utils.data.Subset(dataset, val_idx.tolist())

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
class CNNModel(nn.Module):
    """Three-stage convolutional classifier for single-channel spectrograms.

    Each stage is a 3x3 convolution (padding 1), ReLU and 2x2 max-pooling, so
    the spatial size is floor-halved three times. Two fully connected layers
    with dropout in between produce the class logits.

    Args:
        num_classes: Number of output logits.
        input_size: ``(height, width)`` of the input spectrograms. The default
            ``(256, 512)`` gives ``128 * 32 * 64`` flattened features.

    Layer attribute names are part of the saved ``state_dict`` keys and must
    not be renamed.
    """

    def __init__(self, num_classes, input_size=(256, 512)):
        super(CNNModel, self).__init__()
        height, width = input_size
        # Three 2x2 poolings with stride 2 floor-divide each spatial dim by 8.
        self.flat_features = 128 * (height // 8) * (width // 8)

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(self.flat_features, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x`` of shape ``(batch, 1, H, W)``."""
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = self.relu(self.conv3(x))
        x = self.pool(x)
        # Flatten per sample. Unlike view(-1, N), this never regroups the batch
        # dimension: a wrongly sized input fails loudly in fc1 instead.
        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [None]:
# One output logit per entry in bird_dict.
# NOTE(review): this assumes the labels in bird_dict are exactly 0..len-1 - confirm against the CSV.
num_classes = len(bird_dict)
model = CNNModel(num_classes).to(device)

In [None]:
# Cross-entropy over the raw logits returned by the model, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train Model

In [None]:
# Train for a fixed number of epochs, reporting the mean batch loss and the
# training accuracy after each epoch. No validation is run inside this loop.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # enable dropout
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_accuracy = 100 * correct / total
    # The reported loss is the average of per-batch losses, not a per-sample average.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.2f}%')


Epoch [1/10], Loss: 0.1863, Accuracy: 93.76%
Epoch [2/10], Loss: 0.1025, Accuracy: 97.41%
Epoch [3/10], Loss: 0.1309, Accuracy: 96.20%
Epoch [4/10], Loss: 0.1098, Accuracy: 96.34%
Epoch [5/10], Loss: 0.0810, Accuracy: 98.00%
Epoch [6/10], Loss: 0.0691, Accuracy: 97.56%
Epoch [7/10], Loss: 0.0343, Accuracy: 98.83%
Epoch [8/10], Loss: 0.0243, Accuracy: 99.32%
Epoch [9/10], Loss: 0.0277, Accuracy: 99.12%
Epoch [10/10], Loss: 0.0719, Accuracy: 98.15%


# Quick Validate

In [None]:
# Overall accuracy on the held-out validation loader.
model.eval()  # disable dropout
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

val_accuracy = 100 * correct / total
print(f'Validation Accuracy: {val_accuracy:.2f}%')

Validation Accuracy: 67.25%


# Validation Accuracy per Species:

In [None]:
# Overall and per-species accuracy on the validation loader.
model.eval()  # disable dropout
correct = 0
total = 0
species_correct = {species: 0 for species in bird_dict.keys()}
species_total = {species: 0 for species in bird_dict.keys()}

# Build the label -> species lookup once instead of searching bird_dict for
# every sample. setdefault keeps the first species if a label is duplicated.
label_to_species = {}
for species_name, species_label in bird_dict.items():
    label_to_species.setdefault(species_label, species_name)

with torch.no_grad():  # no gradients are needed for evaluation
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Track correct and total for each species
        for true_label, predicted_label in zip(labels.tolist(), predicted.tolist()):
            species_name = label_to_species[true_label]
            species_total[species_name] += 1
            if predicted_label == true_label:
                species_correct[species_name] += 1

val_accuracy = 100 * correct / total
print(f'Validation Accuracy: {val_accuracy:.2f}%')

# Calculate and print % of correct classifications per species
# (a species with no validation samples is reported as 0%).
print("\nPercentage of Correct Classifications per Species:")
for species in bird_dict.keys():
    species_accuracy = 100 * species_correct[species] / species_total[species] if species_total[species] > 0 else 0
    print(f"Species: {species}, Correct: {species_accuracy:.2f}%")


Validation Accuracy: 67.25%

Percentage of Correct Classifications per Species:
Species: AmericanCrow, Correct: 81.82%
Species: AmericanRedstart, Correct: 69.61%
Species: AmericanRobin, Correct: 78.00%
Species: AmericanYellowWarbler, Correct: 41.18%
Species: BarnSwallow, Correct: 65.42%
Species: noise, Correct: 100.00%


In [None]:
# Inverse of bird_dict: value -> key, used later to turn a predicted class index into a species name.
class_to_species = {v: k for k, v in bird_dict.items()}

In [None]:
# Persist the class -> species lookup.
# NOTE(review): this writes to the current working directory, whereas the model weights
# below are written under base_dir - confirm that is intended.
with open('class_to_species.pkl', 'wb') as f:
    pickle.dump(class_to_species, f)

In [None]:
# Save only the learned parameters (state_dict); loading them requires re-creating CNNModel first.
torch.save(model.state_dict(), base_dir + 'full_bird_model.pth')

# Now we try to predict a bird from a full-length recording! I am using a recording more than 1 minute long, because I can be sure that a recording of that length did not make it into our training data.

In [None]:
def getMelSpec(path, seconds = 5, overlap = 4, minlen = 3, winlen=0.05, winstep=0.0097, NFFT=840, sr_target=44100):
    """Yield fixed-size mel spectrograms for overlapping windows of an audio file.

    The recording is loaded at ``sr_target`` and cut into windows of
    ``seconds`` length that start every ``seconds - overlap`` seconds. Windows
    shorter than ``minlen`` seconds are dropped; if none remain, the whole
    signal is used as a single window.

    Args:
        path: Audio file passed to ``librosa.load``.
        seconds: Window length in seconds.
        overlap: Overlap between consecutive windows in seconds; must be
            smaller than ``seconds``.
        minlen: Minimum window length in seconds.
        winlen: STFT window length in seconds.
        winstep: STFT hop in seconds.
        NFFT: Lower bound for the FFT size; the FFT size actually used is
            ``max(NFFT, int(winlen * sr))``.
        sr_target: Sample rate the audio is resampled to.

    Yields:
        One array per kept window, resized with ``cv2.resize`` to 256 rows by
        512 columns.

    Raises:
        ValueError: If the window step is not positive. Because this is a
            generator, the error surfaces on first iteration.
    """
    y, sr = librosa.load(path, sr=sr_target)
    print(f"shape: {y.shape}", f'sr: {sr}')

    step = int((seconds - overlap) * sr)
    if step <= 0:
        raise ValueError("overlap must be smaller than seconds so the window step is positive")
    window_length = int(seconds * sr)
    # minlen is given in seconds, so convert it before comparing with sample counts.
    min_samples = int(minlen * sr)
    n_fft = max(NFFT, int(winlen * sr))
    hop_length = int(winstep * sr)

    sig_splits = []
    for start in range(0, len(y), step):
        split = y[start:start + window_length]
        if len(split) >= min_samples:
            sig_splits.append(split)

    # Recording shorter than the minimum window: fall back to the whole signal.
    if len(sig_splits) == 0:
        sig_splits.append(y)

    for split_sig in sig_splits:
        # compute mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=split_sig,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=128*2
        )

        # Convert to dB scale
        # NOTE(review): melspectrogram returns a power spectrogram by default, for which
        # librosa documents power_to_db. amplitude_to_db is kept here on the assumption that
        # the training spectrograms were produced the same way - confirm before changing.
        mel_spec_db = librosa.amplitude_to_db(mel_spec, ref=np.max)

        # resize to fixed shape (cv2 takes (width, height))
        mel_spec_resized = cv2.resize(mel_spec_db, (512, 256))

        yield mel_spec_resized

def filter_isolated_cells(array, struct):
    """Return a copy of ``array`` with every connected region whose values sum to 1 set to 0.

    Regions are found with ``ndimage.label`` using ``struct`` as the
    connectivity structure. The input array is not modified.
    """
    cleaned = np.copy(array)

    # Label connected regions; label 0 is the background.
    region_labels, region_count = ndimage.label(cleaned, structure=struct)

    # Sum of the original values inside each label, index 0 included.
    region_sums = np.array(
        ndimage.sum(array, region_labels, range(region_count + 1))
    )

    # Map the per-label "sum equals one" flag back onto the pixel grid.
    single_pixel_mask = (region_sums == 1)[region_labels]
    cleaned[single_pixel_mask] = 0

    return cleaned


def hasBird(spec, threshold=16):
    """Heuristically decide whether a spectrogram contains signal.

    The spectrogram is blurred, binarised against its row and column medians,
    cleaned up morphologically, cropped along the first axis, and the number
    of rows that still contain signal is compared with ``threshold``.

    Args:
        spec: 2-D spectrogram array. It is copied, not modified. The crop in
            step 5 assumes more than 144 rows, and ``cv2.medianBlur`` with a
            5x5 kernel presumably needs an 8-bit or float32 image - verify
            against the caller.
        threshold: Minimum count of signal rows for a positive result.

    Returns:
        ``(bird, thresh)``: ``bird`` is True when ``thresh >= threshold``;
        ``thresh`` is the count of rows with signal after dilation.
    """
    # working copy
    img = spec.copy()

    # STEP 1: Median blur
    img = cv2.medianBlur(img, 5)

    # STEP 2: Median threshold - keep only cells at least 3x their row median
    # and 4x their column median, then binarise.
    col_median = np.median(img, axis=0, keepdims=True)
    row_median = np.median(img, axis=1, keepdims=True)

    img[img < row_median * 3] = 0
    img[img < col_median * 4] = 0
    img[img > 0] = 1

    # STEP 3: Remove singles
    img = filter_isolated_cells(img, struct=np.ones((3, 3)))

    # STEP 4: Morph Closing
    img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, np.ones((5, 5), np.float32))

    # STEP 5: Frequency crop - drop the first 128 rows and the last 16 rows
    img = img[128:-16, :]

    # STEP 6: Count rows with signal. Only the row count is used for the
    # decision, so the unused column count is no longer computed.
    row_max = np.max(img, axis=1)
    # ndimage.binary_dilation replaces the deprecated ndimage.morphology namespace.
    row_max = ndimage.binary_dilation(row_max, iterations=2).astype(row_max.dtype)
    thresh = row_max.sum()

    # STEP 7: Apply threshold (Default = 16)
    bird = bool(thresh >= threshold)

    return bird, thresh

In [None]:
# Path of the recording used for the full-length prediction demo below.
bird = '/content/drive/MyDrive/F2024/Applied Data Science/Project 3/bird_calls_highest_quality/AmericanRobin/543354.mp3'

In [None]:
# getMelSpec is a generator; list() forces it to run and collects one spectrogram per window.
spectrograms = list(getMelSpec(bird))

shape: (2840832,) sr: 44100


In [None]:
import torch
import numpy as np


def spectrograms_to_tensor(spectrograms, device=None, mean=0.5, std=0.5):
    """Stack spectrograms into a normalised float32 batch of shape ``(N, 1, H, W)``.

    Args:
        spectrograms: Iterable of equally shaped 2-D arrays.
        device: Target device. ``None`` selects CUDA when available, else CPU.
        mean: Value subtracted from every element. The default matches the
            ``Normalize(mean=[0.5], std=[0.5])`` used for the training data.
        std: Value every element is divided by after subtracting ``mean``.

    Returns:
        A float32 tensor on ``device`` with a channel axis inserted at dim 1.

    Raises:
        ValueError: If ``spectrograms`` is empty.
    """
    batch = np.asarray(list(spectrograms), dtype=np.float32)
    if batch.size == 0:
        raise ValueError("spectrograms is empty; nothing to convert")

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    spectrograms_tensor = torch.from_numpy(batch).unsqueeze(1)
    # Apply the same normalisation the training pipeline used, so the model
    # sees inputs on the scale it was trained on.
    spectrograms_tensor = (spectrograms_tensor - mean) / std
    return spectrograms_tensor.to(device)

def predict_from_spectrograms(model, spectrograms, device='cuda'):
    """Run ``model`` on all spectrograms at once and return the predicted class index per spectrogram.

    The model is switched to evaluation mode and the forward pass runs
    without gradient tracking.
    """
    batch = spectrograms_to_tensor(spectrograms, device)
    model.eval()

    with torch.no_grad():
        logits = model(batch)
        # Index of the largest logit in each row.
        predicted_classes = torch.max(logits, 1)[1]

    return predicted_classes

# Extract one spectrogram per window and classify each window independently.
spectrograms = list(getMelSpec(bird))
# Use the device resolved at the top of the notebook (the one the model lives on)
# instead of hard-coding 'cuda', so this cell also runs on a CPU-only runtime.
predictions = predict_from_spectrograms(model, spectrograms, device=device)

print(f'Predictions: {predictions}')


shape: (2840832,) sr: 44100
Predictions: tensor([4, 4, 1, 3, 3, 3, 3, 5, 3, 4, 4, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 3, 3, 1, 5, 3,
        5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')


In [None]:
from collections import Counter

In [None]:
def get_most_frequent_species(predictions, class_to_species):
    """Return the most frequently predicted species and how often it was predicted.

    Args:
        predictions: 1-D tensor of predicted class indices, one per window.
        class_to_species: Mapping from class index to species name.

    Returns:
        ``(species, confidence, prediction_counts)``: the name of the most
        common class, the percentage of predictions equal to that class, and
        a ``Counter`` of all predicted class indices (plain Python ints).

    Raises:
        ValueError: If ``predictions`` is empty.
    """
    # tolist() yields plain Python ints, which keeps the Counter keys readable.
    predicted_labels = predictions.detach().cpu().tolist()
    if not predicted_labels:
        raise ValueError("predictions is empty; cannot determine the most frequent species")

    prediction_counts = Counter(predicted_labels)

    # Ties are resolved in favour of the class that was seen first.
    most_frequent_index, most_frequent_count = prediction_counts.most_common(1)[0]

    species = class_to_species[most_frequent_index]

    confidence = (most_frequent_count / len(predicted_labels)) * 100

    return species, confidence, prediction_counts

In [None]:
# Majority vote over the per-window predictions. `confidence` is the percentage of
# windows assigned to the winning class, not a model probability.
species, confidence, prediction_counts = get_most_frequent_species(predictions, class_to_species)

print(f'Most frequent species: {species} in {confidence:.2f}% of recording')
print(f'Prediction counts: {prediction_counts}')



Most frequent species: AmericanRobin with 61.54% confidence
Prediction counts: Counter({3: 40, 1: 14, 4: 8, 5: 3})
