Stage 1 of data processing filters the image and audio data and chooses classes that have a good balance of data of both types. The output of this stage is
a CSV file with the selected classes and associated data statistics (number of audio samples and images). The selection criteria are currently birds with at least 100 images and at least 75 recordings.


0. Setup


In [1]:
# Import Relevant Libraries
import os
import pandas as pd
import requests
import urllib.parse

1. NABirds Images Dataset


In [2]:
# Root folder of the images dataset; each subdirectory name is used as a class key.
main_dir = "../Data/nabirds/images"

# Map every subdirectory name to the number of entries it contains.
# Anything in main_dir that is not a directory is ignored.
image_counts = {
    entry: len(os.listdir(os.path.join(main_dir, entry)))
    for entry in os.listdir(main_dir)
    if os.path.isdir(os.path.join(main_dir, entry))
}

In [3]:
# Build the dataframe column by column from the counts dictionary.
birds_df = pd.DataFrame({
    "Class ID": list(image_counts.keys()),
    "Images Count": list(image_counts.values()),
})
# Class ID is the unique identifier of each class; store it as int64.
birds_df["Class ID"] = birds_df["Class ID"].astype("int64")

# Total number of images across all classes, as a sanity check.
birds_df["Images Count"].sum()

48562

In [4]:
# Summary statistics (count, mean, std, quartiles, ...) of the per-class image counts.
image_count_stats = birds_df["Images Count"].describe()
image_count_stats

count    555.000000
mean      87.499099
std       27.294713
min       13.000000
25%       66.000000
50%       91.000000
75%      114.000000
max      120.000000
Name: Images Count, dtype: float64

In [5]:
# Build a lookup from integer class ID to class name, to be used for adding
# the species name to the dataframe.
class_mapping = {}
# classes.txt holds one "<class id> <class name>" pair per line.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("../Data/nabirds/classes.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise fail the two-value unpacking below.
        if not line:
            continue
        # Split on the first space only: class names may contain spaces.
        class_id, class_name = line.split(" ", 1)
        # Store the key as an int so it matches the int64 "Class ID" column.
        class_mapping[int(class_id)] = class_name

In [6]:
# Look up each Class ID in class_mapping and store the result as the species name.
species_names = birds_df["Class ID"].map(class_mapping)
birds_df["Species Name"] = species_names
birds_df.head()

Unnamed: 0,Class ID,Images Count,Species Name
0,987,120,Red-winged Blackbird (Female/juvenile)
1,973,119,White-crowned Sparrow (Immature)
2,342,31,Pacific Loon (Breeding)
3,514,96,Long-billed Curlew
4,974,113,Golden-crowned Sparrow (Immature)


In [7]:
# Keep only the classes with at least 100 images (the comparison is
# inclusive: a class with exactly 100 images is kept), ordered from the
# most images to the fewest.
has_enough_images = birds_df["Images Count"] >= 100
birds_df = birds_df.loc[has_enough_images].sort_values(by="Images Count", ascending=False)

In [8]:
# Number of rows (classes) left in the filtered dataframe.
len(birds_df)

231

2. Xeno Canto Audio Dataset


In [9]:
# Derive the name used for Xeno Canto from the species name: remove any
# parenthesised text, then trim the surrounding whitespace that is left behind.
parenthetical = r"\(.*\)"
xc_names = birds_df["Species Name"].str.replace(parenthetical, "", regex=True)
birds_df["XC Species Name"] = xc_names.str.strip()
birds_df.head()

Unnamed: 0,Class ID,Images Count,Species Name,XC Species Name
0,987,120,Red-winged Blackbird (Female/juvenile),Red-winged Blackbird
369,835,120,Marsh Wren,Marsh Wren
354,511,120,Willet,Willet
346,529,120,Mourning Dove,Mourning Dove
340,947,120,Red-eyed Vireo,Red-eyed Vireo


In [10]:
# Keep a single row per XC Species Name: drop_duplicates keeps the first
# occurrence of each name and drops the later ones.
# .copy() makes the result an independent dataframe, so that assigning to its
# columns afterwards does not raise pandas' SettingWithCopyWarning.
birds_df = birds_df.drop_duplicates(subset=["XC Species Name"]).copy()
# Return the count of classes in the dataframe now.
birds_df.shape[0]

205

In [11]:
# Species names (left) and the alternative names (right) that are more
# suitable for Xeno Canto.
xc_name_overrides = {
    "Gray Catbird": "Grey Catbird",
    "Western Scrub-Jay": "California Scrub Jay",
    "Black-crowned Night-Heron": "Black-crowned Night Heron",
    "European Starling": "Common Starling",
    "Blue-gray Gnatcatcher": "Blue-grey Gnatcatcher",
    "American Pipit": "Buff-bellied Pipit",
    "Yellow-rumped Warbler": "Myrtle Warbler",
    "Eurasian Collared-Dove": "Eurasian Collared Dove",
    "Common Raven": "Northern Raven",
    "Eastern Wood-Pewee": "Eastern Wood Pewee",
    "Rock Pigeon": "Rock Dove",
    "Western Wood-Pewee": "Western Wood Pewee",
    "Northern Pygmy-Owl": "Northern Pygmy Owl",
}
# Swap in the alternative name wherever the whole value matches a key above.
birds_df["XC Species Name"] = birds_df["XC Species Name"].replace(xc_name_overrides)

In [12]:
# Query the Xeno-Canto API once per XC species name and collect the
# "numRecordings" value it reports for each, in dataframe row order.
XC_API_URL = "https://www.xeno-canto.org/api/2/recordings"
# Upper bound (seconds) on each request, so one stalled connection cannot
# hang the whole loop.
XC_TIMEOUT_SECONDS = 30

xeno_canto_counts = []
for bird in birds_df["XC Species Name"]:
    # Collapse runs of whitespace, then add the "q:A" filter to only search
    # for recordings with high quality.
    search_terms = " ".join(bird.split())
    # URL-encode the query: spaces become "+", and ":" is kept literal so the
    # "q:A" filter looks the same as before. Any other special character in a
    # species name (e.g. an apostrophe) is percent-encoded.
    encoded_query = urllib.parse.quote_plus(f"{search_terms} q:A", safe=":")
    url = f"{XC_API_URL}?query={encoded_query}"
    # Send a GET request to the API, and capture the response.
    response = requests.get(url, timeout=XC_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()
    # Extract the num of recordings (stored as returned by the API).
    xeno_canto_counts.append(data["numRecordings"])

In [13]:
# Attach the collected recording counts as a new column (one value per row,
# in row order), then store the column as int64.
count_column = "XC Recordings Count"
birds_df[count_column] = xeno_canto_counts
birds_df[count_column] = birds_df[count_column].astype("int64")
# Preview the first rows.
birds_df.head()

Unnamed: 0,Class ID,Images Count,Species Name,XC Species Name,XC Recordings Count
0,987,120,Red-winged Blackbird (Female/juvenile),Red-winged Blackbird,337
369,835,120,Marsh Wren,Marsh Wren,186
354,511,120,Willet,Willet,70
346,529,120,Mourning Dove,Mourning Dove,49
340,947,120,Red-eyed Vireo,Red-eyed Vireo,130


In [14]:
# Report how many classes reach each candidate threshold; a class is counted
# when its XC Recordings Count is greater than or equal to the threshold.
recording_counts = birds_df["XC Recordings Count"]
for threshold in (20, 40, 50, 75, 100):
    print(f"Classes with {threshold} recordings: ", birds_df[recording_counts >= threshold].shape[0])
# Also show the general statistics of the column.
recording_counts.describe()

Classes with 20 recordings:  192
Classes with 40 recordings:  164
Classes with 50 recordings:  141
Classes with 75 recordings:  106
Classes with 100 recordings:  80


count    205.000000
mean     122.214634
std      134.039659
min        2.000000
25%       45.000000
50%       77.000000
75%      141.000000
max      831.000000
Name: XC Recordings Count, dtype: float64

In [15]:
# Keep only the species with at least 75 recordings (i.e. filter out those
# with fewer than 75).
# .copy() makes the filtered result an independent dataframe, so that adding
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
birds_df = birds_df[birds_df["XC Recordings Count"] >= 75].copy()
birds_df.shape[0]

106

In [17]:
# Number the rows 0..N-1 in their current order. This happens before the
# Class ID sort below, so "Index" reflects the pre-sort row order.
birds_df["Index"] = range(len(birds_df))
# Column order used in the saved file, chosen for readability.
column_order = ["Index", "Class ID", "Images Count", "XC Recordings Count",
                "Species Name", "XC Species Name"]
# Reorder the columns and sort the rows by Class ID.
birds_df = birds_df[column_order].sort_values(by="Class ID")
# Write the result to CSV without the dataframe's own index.
birds_df.to_csv("chosen_classes_data_stats.csv", index=False)