0. Dataset Statistics


In [5]:
# Import Relevant Libraries
import os
import pandas as pd

In [6]:
# We start by first evaluating the dataset to see how many images are in each class.
# This will help us to understand the distribution of the images in the dataset.

# Define the directory of the images.
main_dir = "../Data/nabirds/images"

# Define a dictionary to store the counts of the images.
image_counts = {}

# Loop through the subdirectories in the main directory.
for sub_dir in os.listdir(main_dir):
    # Define the path to the subdirectory.
    sub_dir_path = os.path.join(main_dir, sub_dir)
    # Error check to ensure the subdirectory is a directory.
    if not os.path.isdir(sub_dir_path):
        continue
    # Count the number of images in the subdirectory.
    image_count = len(os.listdir(sub_dir_path))
    # Store the count of images in the dictionary.
    image_counts[sub_dir] = image_count

In [29]:
# Now we import this information into a pandas dataframe and store it in a csv file.
df = pd.DataFrame(list(image_counts.items()), columns=["Class ID", "Images Count"])
# Convert Class ID to int64.
df["Class ID"] = df["Class ID"].astype("int64")

df["Images Count"].sum()

48562

In [32]:
# Now we add the species name to the csv file.
# We start by storing this information in a dictionary.
class_mapping = {}
# We refer to the classes.txt file that contains class IDs and class names.
with open("../Data/nabirds/classes.txt", "r") as f:
    # Iterate through each line.
    for line in f:
        class_id, class_name = line.strip().split(" ", 1)
        class_mapping[class_id] = class_name

# Convert the ditionary keys to int64.
class_mapping = {int(k): v for k, v in class_mapping.items()}

In [34]:
# Use the dictionary to map the class IDs to the species names in the dataframe.
df["Spcies Name"] = df["Class ID"].map(class_mapping)
df

Unnamed: 0,Class ID,Images Count,Spcies Name
0,987,120,Red-winged Blackbird (Female/juvenile)
1,973,119,White-crowned Sparrow (Immature)
2,342,31,Pacific Loon (Breeding)
3,514,96,Long-billed Curlew
4,974,113,Golden-crowned Sparrow (Immature)
...,...,...,...
550,339,71,Gambel's Quail (Male)
551,550,99,Acorn Woodpecker
552,762,119,Black-throated Blue Warbler (Adult Male)
553,908,116,Dickcissel


In [38]:
# Put the data in descending order by images count.
df = df.sort_values(by="Images Count", ascending=False)
# Now we save the dataframe to a csv file.
df.to_csv("nabirds_images_count.csv", index=False)