<a href="https://colab.research.google.com/github/archit436/Birds_Classifier/blob/main/Data_Processing/DP_Stage2_XC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

0. Setup


In [1]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
# Import Relevant Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
import os
import librosa
import soundfile as sf
from pydub import AudioSegment
# Copied from Prarathona's DP Pipeline for Baseline models.
import cv2
from skimage.feature import hog
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import torch
import torchaudio

1. Data Acquisition from Xeno Canto


In [3]:
# Make Google Drive visible to the Colab runtime so the shared team folder
# can be read from and written to.
from google.colab import drive
drive.mount('/content/drive')

# Stage 1 exported its per-class statistics as a CSV; load that table here,
# since every later cell looks up birds by row in this dataframe.
csv_path = "/content/drive/MyDrive/APS360_Team_15/Data_Processing/chosen_classes_80_data_stats.csv"
birds_df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows as the cell output.
birds_df.head(n=5)

Mounted at /content/drive


Unnamed: 0,Index,Class ID,Images Count,XC Recordings Count,Species Name,XC Species Name
0,0,315,116,169,Gadwall (Breeding male),Gadwall
1,1,317,120,243,Mallard (Breeding male),Mallard
2,2,333,105,112,Common Goldeneye (Breeding male),Common Goldeneye
3,3,352,120,283,Black-crowned Night-Heron (Adult),Black-crowned Night Heron
4,4,366,101,127,Common Gallinule (Adult),Common Gallinule


In [None]:
# We will now query into Xeno-Canto API to download the audio files for the birds.
# We create a directory called Xeno Canto to store all the audio files.
xeno_canto_dir = "/content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto"
os.makedirs(xeno_canto_dir, exist_ok=True)

# Iterate through all the birds manually, due to timeout issues with Xeno Canto.
# Change this row index and re-run the cell for each bird.
it_bird = 106

# Extract basic data about the bird from the dataframe.
bird_id = birds_df["Class ID"][it_bird]
bird_xc_name = birds_df["XC Species Name"][it_bird]
bird_xc_recordings_count = birds_df["XC Recordings Count"][it_bird]

# Make a subdirectory for the current bird using its Class ID.
bird_dir = os.path.join(xeno_canto_dir, str(bird_id))
os.makedirs(bird_dir, exist_ok=True)

# Get the bird name and query the Xeno-Canto API.
# As before, we only query for high quality recordings (quality rating A).
search_query = "+".join(bird_xc_name.split())
encoded_search_query = f"{search_query}+q:A"
url = f"https://www.xeno-canto.org/api/2/recordings?query={encoded_search_query}"
# A timeout stops the cell from hanging forever, and raise_for_status() stops
# us from trying to parse an HTTP error page as JSON.
search_response = requests.get(url, timeout=60)
search_response.raise_for_status()
search_data = search_response.json()

# Use the search query response to download all the audio files.
# We limit the recordings to 500 per species, if they have more available.
# Slice the list that was actually returned rather than trusting the count in
# the CSV: if the response holds fewer entries than the CSV claims, indexing
# by the CSV count would raise an IndexError part-way through the download.
recordings = search_data["recordings"][:min(500, bird_xc_recordings_count)]
downloaded_count = 0
for it_rec, recording in enumerate(recordings):
    audio_url = recording["file"]
    try:
        audio_response = requests.get(audio_url, timeout=120)
        audio_response.raise_for_status()
    except requests.RequestException as e:
        # Do not write an error body to disk as if it were audio; report the
        # index so the recording can be re-acquired later.
        print(f"Failed to download recording {it_rec} for {bird_xc_name}: {e}")
        continue
    with open(f"{bird_dir}/{it_rec}.mp3", "wb") as f:
        f.write(audio_response.content)
    downloaded_count += 1
# Report the number of files that were really written.
print(f"Downloaded {downloaded_count} recordings for {bird_xc_name}.")

Downloaded 337 recordings for Red-winged Blackbird.


In [None]:
# Code to reacquire any faulty recordings.
# Set the class and the recording index (the number in "<index>.mp3") to redo.
bird_class_id = 987
file_id = 241

# Find the name corresponding to this bird, failing with a clear message if
# the class ID is not in the dataframe.
matching_names = birds_df[birds_df["Class ID"] == bird_class_id]["XC Species Name"].values
if len(matching_names) == 0:
    raise ValueError(f"Class ID {bird_class_id} was not found in birds_df.")
bird_name = matching_names[0]

# Make the query and get the search response.
search_query = "+".join(bird_name.split())
encoded_search_query = f"{search_query}+q:A"
url = f"https://www.xeno-canto.org/api/2/recordings?query={encoded_search_query}"
search_response = requests.get(url, timeout=60)
search_response.raise_for_status()
search_data = search_response.json()

# Retrieve the audio file. Validate the index first so a stale file_id gives
# an explicit error instead of an opaque one.
recordings = search_data["recordings"]
if not 0 <= file_id < len(recordings):
    raise IndexError(
        f"file_id {file_id} is out of range: the response holds {len(recordings)} recordings."
    )
audio_url = recordings[file_id]["file"]
# This cell exists to repair a bad file, so a failed download must raise
# rather than overwrite the file with an error page.
audio_response = requests.get(audio_url, timeout=120)
audio_response.raise_for_status()

# Define the directory to write to and then download the audio file.
xeno_canto_dir = "/content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto"
bird_dir = os.path.join(xeno_canto_dir, str(bird_class_id))
os.makedirs(bird_dir, exist_ok=True)
with open(f"{bird_dir}/{file_id}.mp3", "wb") as f:
    f.write(audio_response.content)
print(f"Downloaded {file_id}.mp3 for {bird_name}.")
print(recordings[file_id])

Downloaded 241.mp3 for Red-winged Blackbird.
{'id': '314546', 'gen': 'Agelaius', 'sp': 'phoeniceus', 'ssp': 'stereus/fortis', 'group': 'birds', 'en': 'Red-winged Blackbird', 'rec': 'Nick Komar', 'cnt': 'United States', 'loc': 'Fort Collins, Larimer County, Colorado', 'lat': '40.5717', 'lng': '-105.1289', 'alt': '1600', 'type': 'song, atypical male song', 'sex': 'male', 'stage': '', 'method': 'field recording', 'url': '//xeno-canto.org/314546', 'file': 'https://xeno-canto.org/314546/download', 'file-name': 'XC314546-160501_006 RWBL male song alt 0710 Ft Collins CO nk.mp3', 'sono': {'small': '//xeno-canto.org/sounds/uploaded/ODAYZINCOA/ffts/XC314546-small.png', 'med': '//xeno-canto.org/sounds/uploaded/ODAYZINCOA/ffts/XC314546-med.png', 'large': '//xeno-canto.org/sounds/uploaded/ODAYZINCOA/ffts/XC314546-large.png', 'full': '//xeno-canto.org/sounds/uploaded/ODAYZINCOA/ffts/XC314546-full.png'}, 'osci': {'small': '//xeno-canto.org/sounds/uploaded/ODAYZINCOA/wave/XC314546-small.png', 'med': '

2. Data Processing - MP3 to Processed WAV

In [4]:
# Turn one mp3 recording into a fixed-length, normalised mono waveform tensor.
# The defaults give a 16kHz sample rate and 160000 samples, i.e. 10 seconds.
def convert_mp3_to_wav(mp3_file_path, max_length = 160000, target_sample_rate=16000):
  """Load an mp3 file and return a processed waveform tensor.

  Steps, in order: load, average the channels down to one, resample to
  `target_sample_rate`, normalise with the mean and standard deviation of the
  whole recording, then zero-pad or truncate to `max_length` samples.

  Args:
    mp3_file_path: Path of the audio file handed to `torchaudio.load`.
    max_length: Number of samples in the returned waveform.
    target_sample_rate: Sample rate (Hz) the audio is resampled to.

  Returns:
    A tensor whose second dimension is exactly `max_length`; the first
    dimension is the single averaged channel.
  """
  audio, native_rate = torchaudio.load(mp3_file_path)

  # Collapse multi-channel audio to a single channel by averaging.
  channel_count = audio.shape[0]
  if channel_count > 1:
    audio = audio.mean(dim=0, keepdim=True)

  # Resample only when the file is not already at the target rate.
  if native_rate != target_sample_rate:
    audio = torchaudio.transforms.Resample(
        orig_freq=native_rate,
        new_freq=target_sample_rate,
    )(audio)

  # Zero mean and unit variance; the epsilon guards against a silent clip
  # whose standard deviation is zero.
  centred = audio - audio.mean()
  audio = centred / (audio.std() + 1e-8)

  # Long recordings are cut to their first `max_length` samples.
  sample_count = audio.shape[1]
  if sample_count >= max_length:
    return audio[:, :max_length]

  # Short recordings are right-padded with zeros up to `max_length`.
  return torch.nn.functional.pad(audio, (0, max_length - sample_count))

In [None]:
# Now we use the function to generate a .pt file for each class that is a tensor
# containing all processed wav file values.

# Start by defining the directories.
data_dir = "/content/drive/MyDrive/APS360_Team_15/Data"
output_dir = os.path.join(data_dir, "Xeno_Canto_WAV_Tensors_10sec")
os.makedirs(output_dir, exist_ok=True)
# Iterate through the birds, starting from row 55 of the dataframe.
for it_bird in range(55, birds_df.shape[0]):
  print(f"Converting bird num {it_bird}.")
  # Create an output list to store all tensors for this bird.
  wav_tensors_list = []
  # Extract the details of the bird.
  bird_id = birds_df["Class ID"][it_bird]
  mp3_dir = os.path.join(data_dir, "Xeno_Canto", str(bird_id))
  # Skip birds whose recordings were never downloaded. (The previous check
  # tested bird_id against the very column it was read from, so it could
  # never trigger.)
  if not os.path.isdir(mp3_dir):
    print(f"No recordings directory found for bird {bird_id}; skipping.")
    continue
  # Iterate through all the mp3 files for this bird. os.listdir gives no
  # ordering guarantee, so sort to make the saved tensor reproducible.
  failed_files = []
  for file_name in sorted(os.listdir(mp3_dir)):
    if not file_name.endswith(".mp3"):
      continue
    # Define the path to the mp3 file.
    mp3_file_path = os.path.join(mp3_dir, file_name)
    # Convert the mp3 file to a tensor. One unreadable file should not abort
    # the whole run; record it so it can be re-acquired.
    try:
      wav_tensor = convert_mp3_to_wav(mp3_file_path)
    except Exception as e:
      print(f"Could not convert {mp3_file_path}: {e}")
      failed_files.append(file_name)
      continue
    # Add tensor to the list for this bird.
    wav_tensors_list.append(wav_tensor)
  if failed_files:
    print(f"Bird {bird_id}: {len(failed_files)} file(s) failed: {failed_files}")
  # torch.cat raises on an empty list, so skip classes with no usable audio.
  if not wav_tensors_list:
    print(f"No usable mp3 files for bird {bird_id}; nothing saved.")
    continue
  # Convert the list to a single tensor, and save to a .pt file.
  bird_wav_tensor = torch.cat(wav_tensors_list, dim=0)
  print(bird_wav_tensor.shape)
  output_file = os.path.join(output_dir, f"{bird_id}.pt")
  torch.save(bird_wav_tensor, output_file)

Converting bird num 55.
torch.Size([499, 160000])
Converting bird num 56.
torch.Size([106, 160000])
Converting bird num 57.
torch.Size([202, 160000])
Converting bird num 58.
torch.Size([284, 160000])
Converting bird num 59.
torch.Size([172, 160000])
Converting bird num 60.
torch.Size([459, 160000])
Converting bird num 61.
torch.Size([183, 160000])
Converting bird num 62.
torch.Size([194, 160000])
Converting bird num 63.
torch.Size([500, 160000])
Converting bird num 64.
torch.Size([117, 160000])
Converting bird num 65.
torch.Size([105, 160000])
Converting bird num 66.
torch.Size([211, 160000])
Converting bird num 67.
torch.Size([159, 160000])
Converting bird num 68.
torch.Size([149, 160000])
Converting bird num 69.
torch.Size([108, 160000])
Converting bird num 70.
torch.Size([101, 160000])
Converting bird num 71.
torch.Size([175, 160000])
Converting bird num 72.
torch.Size([130, 160000])
Converting bird num 73.
torch.Size([150, 160000])
Converting bird num 74.
torch.Size([211, 160000])


3. Processed WAV to Spectrograms

In [24]:
# Define a function to convert a wav_tensor to a normalised, db-scaled mel
# spectrogram resized to a fixed image size.
def audio_to_spectogram(wav_tensor, img_size=(128, 128), sample_rate=16000,
                        n_fft=1024, hop_length=512):
  """Convert a waveform tensor into a fixed-size db-scaled mel spectrogram.

  Args:
    wav_tensor: Waveform tensor; a 1-D tensor of samples is the case used by
      this notebook.
    img_size: (height, width) of the output. The height is also used as the
      number of mel bins.
    sample_rate: Sample rate (Hz) of `wav_tensor`. Defaults to 16000.
    n_fft: FFT size passed to the mel spectrogram transform.
    hop_length: Hop length passed to the mel spectrogram transform.

  Returns:
    For a 1-D input, a tensor of shape [1, img_size[0], img_size[1]].
    Returns None if any step raises; the error and traceback are printed.
  """
  import traceback

  try:
    # Compute the mel spectrogram.
    mel_spec = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=img_size[0]
    )(wav_tensor)

    # Convert to db scale to allow both quiet and loud parts to show up effectively.
    db_spec = torchaudio.transforms.AmplitudeToDB(top_db=80)(mel_spec)

    # Normalise the spectrogram to zero mean and unit variance.
    db_spec_norm = (db_spec - db_spec.mean()) / (db_spec.std() + 1e-8)

    # A 1-D waveform gives a 2-D [n_mels, time_frames] spectrogram; add the
    # channel dimension so it becomes [1, n_mels, time_frames].
    if db_spec_norm.dim() == 2:
      db_spec_norm = db_spec_norm.unsqueeze(0)

    # Bilinear interpolation needs a batch dimension, so add one for the
    # resize and strip it again afterwards.
    db_spec_resized = torch.nn.functional.interpolate(
        db_spec_norm.unsqueeze(0),
        size=img_size,
        mode='bilinear',
        align_corners=False
    ).squeeze(0)

    return db_spec_resized

  except Exception as e:
    # Best effort: callers treat None as "skip this sample".
    print(f"Error in audio_to_spectogram: {e}")
    traceback.print_exc()
    return None

In [25]:
# We want to convert the processed wavs obtained from mp3s to spectrograms.
# Define some directories.
wav_dir = "/content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto_WAV_Tensors_10sec"
spec_dir = "/content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto_WAV_Spectrograms_10sec"
os.makedirs(spec_dir, exist_ok=True)
# Not used in this cell: audio_to_spectogram applies its own default sample rate.
sr = 16000
# Iterate through each .pt file, in sorted order so the run log is reproducible.
for file_name in sorted(os.listdir(wav_dir)):
  # Ignore anything that is not a saved tensor file, before logging progress.
  if not file_name.endswith(".pt"):
    continue
  print(f"Processing file {file_name}")
  # Extract the file path.
  file_path = os.path.join(wav_dir, file_name)
  try:
    # Load the .pt file containing all tensors for this bird.
    wav_tensors = torch.load(file_path)
    print(f"Loaded tensors with shape {wav_tensors.shape}")

    # Convert each audio sample to a spectrogram, keeping only the valid ones
    # (audio_to_spectogram returns None on failure).
    spectograms = []
    for i in range(wav_tensors.shape[0]):
      spec = audio_to_spectogram(wav_tensors[i])
      if spec is not None:
        spectograms.append(spec)

    # Make dropped samples visible instead of silently shrinking the class.
    skipped_count = wav_tensors.shape[0] - len(spectograms)
    if skipped_count:
      print(f"Skipped {skipped_count} sample(s) in {file_name} that failed conversion")

    # Stack all spectograms into a single tensor and output as a .pt file.
    if spectograms:
      spec_tensor = torch.stack(spectograms)
      print(f"Stacking tensors with shape {spec_tensor.shape}")

      # Save the spectogram file under the same name as its source file.
      output_file = os.path.join(spec_dir, file_name)
      torch.save(spec_tensor, output_file)
      print(f"Saved spectogram tensor to {output_file}")
    else:
      print(f"No valid spectograms found in {file_name}")

  except Exception as e:
    # Best effort per file: the try covers loading, stacking and saving, so
    # the message says "processing" rather than "loading".
    print(f"Error processing file {file_name}: {e}")


Processing file 315.pt
Loaded tensors with shape torch.Size([167, 160000])
Stacking tensors with shape torch.Size([167, 1, 128, 128])
Saved spectogram tensor to /content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto_WAV_Spectrograms_10sec/315.pt
Processing file 317.pt
Loaded tensors with shape torch.Size([240, 160000])
Stacking tensors with shape torch.Size([240, 1, 128, 128])
Saved spectogram tensor to /content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto_WAV_Spectrograms_10sec/317.pt
Processing file 333.pt
Loaded tensors with shape torch.Size([114, 160000])
Stacking tensors with shape torch.Size([114, 1, 128, 128])
Saved spectogram tensor to /content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto_WAV_Spectrograms_10sec/333.pt
Processing file 352.pt
Loaded tensors with shape torch.Size([285, 160000])
Stacking tensors with shape torch.Size([285, 1, 128, 128])
Saved spectogram tensor to /content/drive/MyDrive/APS360_Team_15/Data/Xeno_Canto_WAV_Spectrograms_10sec/352.pt
Processing file 366.