In [2]:
import pandas as pd
import os 
import xml.etree.ElementTree as ET
import PIL
annotation_path = 'resources/stanford-dogs-dataset/annotations'
image_path = 'resources/stanford-dogs-dataset/images'

## Exploratory Analysis

The first thing we need to do is make sure that our data is good. 

[ ] How many images are there?

[ ] How many categories?

[ ] Are images evenly distributed?


In [3]:
# Create a list of all the directories with breed images
breed_folders = os.listdir(image_path)
breed_frequency = [] # We will use this to create a dataframe of photo counts for each breed
for breed in breed_folders:
    breed_name = breed.split('-')[1]
    photo_count = len(os.listdir(f'{image_path}/{breed}'))
    breed_frequency.append({'breed': breed_name, 'photos':photo_count})
breed_frequency_df = pd.DataFrame(breed_frequency)

breed_frequency_df

Unnamed: 0,breed,photos
0,Chihuahua,152
1,Japanese_spaniel,185
2,Maltese_dog,252
3,Pekinese,149
4,Shih,214
...,...,...
115,standard_poodle,159
116,Mexican_hairless,155
117,dingo,156
118,dhole,150


In [4]:
# Dogs with the most photos
breed_frequency_df.sort_values(by='photos', ascending=False).head(5)

Unnamed: 0,breed,photos
2,Maltese_dog,252
9,Afghan_hound,239
26,Scottish_deerhound,232
107,Pomeranian,219
88,Bernese_mountain_dog,218


In [5]:
# Dogs with the least photos
breed_frequency_df.sort_values(by='photos', ascending=True).head(5)

Unnamed: 0,breed,photos
17,redbone,148
3,Pekinese,149
74,malinois,150
70,Irish_water_spaniel,150
71,kuvasz,150


In [6]:
print(f"There are {breed_frequency_df['photos'].sum()} images of {len(breed_frequency_df['breed'])} dog breeds in our dataset.")
print(f"On average there are {breed_frequency_df['photos'].mean()} photos for each breed.")
print(f"The most photos a breed has is {breed_frequency_df['photos'].max()} images and the least amount of photos for a breed is {breed_frequency_df['photos'].min()} photos.")
print(f"This means that the largest group is only {round((breed_frequency_df['photos'].max()/breed_frequency_df['photos'].min()),2)} times larger than the smallest group.")

There are 20580 images of 120 dog breeds in our dataset.
On average there are 171.5 photos for each breed.
The most photos a breed has is 252 images and the least amount of photos for a breed is 148 photos.
This means that the largest group is only 1.7 times larger than the smallest group.


With this in mind we can conclude that we have a fairly balanced dataset to start with, and we can decide to work on augmenting the smaller sets later. 

## Data Structure

**TO DO**
- Explain how the dataset is structured
- Explain what the annotations tell us
- Explain what the bound boxes, poses etc are
- Explain how we will be storing data in the tables `dog_images` and `dog_metadata`
- Extract and gather the information from the annotations
- Create the DataFrames and save as `.csv` files.

In [9]:
annotation_path = "resources/stanford-dogs-dataset/annotations"
image_path = "resources/stanford-dogs-dataset/images"

breed_annotation_folders = os.listdir(annotation_path)
breed_image_folders = os.listdir(image_path)
extra_imgs = []
extra_annotations = []
annotations = []

for breed in breed_folders:
    breed_folder_path = os.path.join(annotation_path, breed)
    breed_folder_path_image = os.path.join(image_path, breed)
    # Loop through all annotation files in the breed folder
    for file in os.listdir(breed_folder_path):
        file_path = os.path.join(breed_folder_path, file)
        # Parse XML file
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        # Extract filename
        filename = root.find("filename").text + ".jpg"  # Ensure it matches image files
        
        if '%s' in filename:
            filename = file.replace('.xml','.jpg')
            
        # Extract bounding box (first object in the file)
        objects = root.findall("object")
        for obj in objects:
            breed_name = obj.find("name").text  # Get breed label

            bbox = obj.find("bndbox")
            xmin, ymin, xmax, ymax = (
                int(bbox.find("xmin").text),
                int(bbox.find("ymin").text),
                int(bbox.find("xmax").text),
                int(bbox.find("ymax").text),
            )

            # Extract additional metadata (pose, truncated, difficult)
            pose = obj.find("pose").text if obj.find("pose") is not None else "Unspecified"
            truncated = int(obj.find("truncated").text) if obj.find("truncated") is not None else 0
            difficult = int(obj.find("difficult").text) if obj.find("difficult") is not None else 0

            # Append all data to list
            annotations.append([filename, breed_name.lower(), breed, xmin, ymin, xmax, ymax, pose, truncated, difficult])

# Convert to a Pandas DataFrame
df = pd.DataFrame(annotations, columns=["filename", "breed_name", "folder", "xmin", "ymin", "xmax", "ymax", "pose", "truncated", "difficult"])

# Save as CSV for easy access
df.to_csv("resources/csv/dog_annotations.csv", index=False)

df.head()

Unnamed: 0,filename,breed_name,folder,xmin,ymin,xmax,ymax,pose,truncated,difficult
0,n02085620_10074.jpg,chihuahua,n02085620-Chihuahua,25,10,276,498,Unspecified,0,0
1,n02085620_10131.jpg,chihuahua,n02085620-Chihuahua,49,9,393,493,Unspecified,0,0
2,n02085620_10621.jpg,chihuahua,n02085620-Chihuahua,142,43,335,250,Unspecified,0,0
3,n02085620_1073.jpg,chihuahua,n02085620-Chihuahua,0,27,312,498,Unspecified,0,0
4,n02085620_10976.jpg,chihuahua,n02085620-Chihuahua,90,104,242,452,Unspecified,0,0


In [15]:
breed_groups = pd.read_csv('resources/csv/akc-dog-breed-groups.csv')

# Merge on 'breed_name' column
df = df.merge(breed_groups[['group', 'breed_name']], on='breed_name', how='left')

df.head()

df.to_csv('resources/csv/dog_annotations_with_groups.csv', index=False)

In [16]:
df['group'].value_counts()

group
Hound                       3840
Terrier                     3765
Working                     3576
Toy                         3016
Sporting                    2958
Herding                     2047
Non-Sporting                1774
Wild                         562
Foundation Stock Service     324
Name: count, dtype: int64

In [17]:
df[['group', 'breed_name']].drop_duplicates().to_csv('resources/csv/dog-breed-groups.csv', index=False)