In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import pathlib
import sys

In [3]:
# go to parent directory
import paths

REPO_DIR = paths.get_repo_path()
ROOT_DIR = REPO_DIR / "Stanford Dogs"

DATA_BASE_PATH = paths.get_data_path() / "stanford-dogs-dataset"
IMAGES_PATH = DATA_BASE_PATH / "images/Images"
ANNOTATIONS_PATH = DATA_BASE_PATH / "annotations/Annotation"
NEW_ANNOTATIONS_PATH = DATA_BASE_PATH / "annotations/New-Annotation"

COCO_FORMAT_DATA_PATH = DATA_BASE_PATH / "coco_format"

RANDOM_SEED = 42

os.chdir(REPO_DIR)

In [4]:
DATA_BASE_PATH

PosixPath('/Users/vineetmahajan/Code/AI/datasets/stanford-dogs-dataset')

In [3]:
np.random.seed(RANDOM_SEED)

In [4]:
breed_dir_name = [
        breed 
        for breed in sorted(os.listdir(IMAGES_PATH))
        if not breed.startswith(".") and os.path.isdir(IMAGES_PATH / breed)
]

len(breed_dir_name), breed_dir_name[:5]

(120,
 ['n02085620-Chihuahua',
  'n02085782-Japanese_spaniel',
  'n02085936-Maltese_dog',
  'n02086079-Pekinese',
  'n02086240-Shih-Tzu'])

In [5]:
dogs_df = pd.DataFrame(columns=["breed", "image_path", "annotation_path"])

for breed_dir in breed_dir_name:
    breed_name = " ".join(breed_dir.replace("_", "-").split("-")[1:]).title()
    
    breed_images_dir_path = IMAGES_PATH / breed_dir
    breed_annotations_dir_path = ANNOTATIONS_PATH / breed_dir
    
    breed_images_name = [
            image 
            for image in sorted(os.listdir(breed_images_dir_path)) 
            if not image.startswith(".") and image.endswith((".jpg", ".jpeg", ".png"))
    ]
    breed_annotations_name = [
            image.split(".")[0]
            for image in breed_images_name
    ]
    
    breed_images_path = [
            breed_images_dir_path / image 
            for image in breed_images_name
            if os.path.isfile(breed_images_dir_path / image)
    ]
    breed_annotations_path = [
            breed_annotations_dir_path / annotation 
            for annotation in breed_annotations_name
            if os.path.isfile(breed_annotations_dir_path / annotation)
    ]
    
    dogs_df = pd.concat([dogs_df, pd.DataFrame({"breed": breed_name, "image_path": breed_images_path, "annotation_path": breed_annotations_path})])
    
    
dogs_df

Unnamed: 0,breed,image_path,annotation_path
0,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
1,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
2,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
4,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
...,...,...,...
164,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
165,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
166,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
167,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...


In [6]:
dogs_df_path = DATA_BASE_PATH / "dogs_df.csv"
dogs_df = dogs_df.sort_values(by=["breed", "image_path"])
dogs_df.to_csv(dogs_df_path, index=False)

dogs_df = pd.read_csv(dogs_df_path)
dogs_df

Unnamed: 0,breed,image_path,annotation_path
0,Affenpinscher,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
1,Affenpinscher,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
2,Affenpinscher,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3,Affenpinscher,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
4,Affenpinscher,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
...,...,...,...
20575,Yorkshire Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20576,Yorkshire Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20577,Yorkshire Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20578,Yorkshire Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...


In [7]:
import json

breeds = dogs_df["breed"].unique()
breeds_dict = {i: breed for i, breed in enumerate(breeds)}

with open(ROOT_DIR / "breeds_dict.json", "w") as f:
    json.dump(breeds_dict, f, indent=2)

In [8]:
os.makedirs(NEW_ANNOTATIONS_PATH, exist_ok=True)
breed_id_dict = {breed: i for i, breed in breeds_dict.items()}
breed_id_dict["Affenpinscher"]

0

In [9]:
dogs_df.breed.unique()

array(['Affenpinscher', 'Afghan Hound', 'African Hunting Dog', 'Airedale',
       'American Staffordshire Terrier', 'Appenzeller',
       'Australian Terrier', 'Basenji', 'Basset', 'Beagle',
       'Bedlington Terrier', 'Bernese Mountain Dog',
       'Black And Tan Coonhound', 'Blenheim Spaniel', 'Bloodhound',
       'Bluetick', 'Border Collie', 'Border Terrier', 'Borzoi',
       'Boston Bull', 'Bouvier Des Flandres', 'Boxer',
       'Brabancon Griffon', 'Briard', 'Brittany Spaniel', 'Bull Mastiff',
       'Cairn', 'Cardigan', 'Chesapeake Bay Retriever', 'Chihuahua',
       'Chow', 'Clumber', 'Cocker Spaniel', 'Collie',
       'Curly Coated Retriever', 'Dandie Dinmont', 'Dhole', 'Dingo',
       'Doberman', 'English Foxhound', 'English Setter',
       'English Springer', 'Entlebucher', 'Eskimo Dog',
       'Flat Coated Retriever', 'French Bulldog', 'German Shepherd',
       'German Short Haired Pointer', 'Giant Schnauzer',
       'Golden Retriever', 'Gordon Setter', 'Great Dane',
      

In [10]:
import xmltodict

new_annotations = []
for annotation_path in dogs_df["annotation_path"]:
    annotation_path = pathlib.Path(annotation_path)
    new_annotation_path = NEW_ANNOTATIONS_PATH / annotation_path.parent.name
    os.makedirs(new_annotation_path, exist_ok=True)
    annotation_name = annotation_path.name
    
    annotation_data = xmltodict.parse(annotation_path.read_text())
    image_w, image_h = (
        int(annotation_data["annotation"]["size"]["width"]), 
        int(annotation_data["annotation"]["size"]["height"])
    )
    final_data = ""
    objects = annotation_data["annotation"]["object"]
    if not isinstance(objects, list):
        objects = [objects]
        
    for obj in objects:
        breed = obj["name"]
        xmin, ymin, xmax, ymax = (
            int(obj["bndbox"]["xmin"]), 
            int(obj["bndbox"]["ymin"]), 
            int(obj["bndbox"]["xmax"]), 
            int(obj["bndbox"]["ymax"])
        )
        obj_h = ymax - ymin
        obj_w = xmax - xmin
        class_id = breed_id_dict[breed.replace("_", " ").replace("-", " ").title()]
        x, y, w, h = (
            (xmin + obj_w/2) / image_w,
            (ymin + obj_h/2) / image_h,
            (obj_w) /image_w ,
            (obj_h) / image_h
        )
            
        final_data += f"{class_id} {x} {y} {w} {h}\n"
    
    new_annotation_path = new_annotation_path / (annotation_name+".txt")
    new_annotation_path.write_text(final_data)
    new_annotations.append(new_annotation_path)
    

In [11]:
dogs_df["new_annotation_path"] = new_annotations
dogs_df.to_csv(dogs_df_path, index=False)

In [12]:
# shuffle data
dogs_df = dogs_df.sample(frac=1).reset_index(drop=True)

dogs_df

Unnamed: 0,breed,image_path,annotation_path,new_annotation_path
0,Samoyed,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
1,Borzoi,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
2,West Highland White Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3,Basenji,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
4,Weimaraner,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
...,...,...,...,...
20575,Labrador Retriever,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20576,Lhasa,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20577,Chow,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20578,American Staffordshire Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...


In [13]:
test_split = 0.15

test_df = dogs_df.sample(frac=test_split)
train_df = dogs_df.drop(test_df.index)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df_path = DATA_BASE_PATH / "train_df.csv"
test_df_path = DATA_BASE_PATH / "test_df.csv"

train_df.to_csv(train_df_path, index=False)
test_df.to_csv(test_df_path, index=False)

In [14]:
val_split = 0.15
val_df = train_df.sample(frac=val_split)
train_df = train_df.drop(val_df.index)

In [15]:
len(train_df), len(val_df), len(test_df)

(14869, 2624, 3087)

In [16]:
# train_val_split = 0.15
# train_test_split = 0.15
# # shuffle the data
# dogs_df = dogs_df.sample(frac=1).reset_index(drop=True)

# test_df = dogs_df.sample(frac=train_test_split)
# train_val_df = dogs_df.drop(test_df.index)
# val_df = train_val_df.sample(frac=train_val_split)
# train_df = train_val_df.drop(val_df.index)

In [17]:
val_df


Unnamed: 0,breed,image_path,annotation_path,new_annotation_path
4578,Malinois,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
9869,Clumber,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
12055,Mexican Hairless,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
16652,Border Collie,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
4418,Bull Mastiff,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
...,...,...,...,...
12128,Otterhound,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
406,Miniature Pinscher,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
11133,Miniature Schnauzer,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
2451,Briard,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...


In [18]:
test_df

Unnamed: 0,breed,image_path,annotation_path,new_annotation_path
0,Greater Swiss Mountain Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
1,Bluetick,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
2,Tibetan Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3,Irish Wolfhound,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
4,Miniature Pinscher,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
...,...,...,...,...
3082,English Springer,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3083,Great Dane,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3084,Norwich Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3085,German Short Haired Pointer,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...


In [19]:
train_df

Unnamed: 0,breed,image_path,annotation_path,new_annotation_path
0,Samoyed,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
1,West Highland White Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
2,Basenji,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3,Weimaraner,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
4,Whippet,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
...,...,...,...,...
17488,Labrador Retriever,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
17489,Lhasa,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
17490,Chow,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
17491,American Staffordshire Terrier,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...


In [20]:
COCO_FORMAT_DATA_PATH

PosixPath('/Users/vineetmahajan/Code/AI/datasets/stanford-dogs-dataset/coco_format')

In [21]:
coco_train_path = COCO_FORMAT_DATA_PATH / "train"
coco_val_path = COCO_FORMAT_DATA_PATH / "val"
coco_test_path = COCO_FORMAT_DATA_PATH / "test"

In [22]:
try: 

    for image_path, label_path in zip(train_df["image_path"], train_df["new_annotation_path"]):
        image_path = pathlib.Path(image_path)
        label_path = pathlib.Path(label_path)
        images_dir_path = coco_train_path / "images"
        labels_dir_path = coco_train_path / "labels"
        os.makedirs(images_dir_path, exist_ok=True)
        os.makedirs(labels_dir_path, exist_ok=True)
        
        os.symlink(image_path, images_dir_path / image_path.name)
        os.symlink(label_path, labels_dir_path / label_path.name)

    for image_path, label_path in zip(val_df["image_path"], val_df["new_annotation_path"]):
        image_path = pathlib.Path(image_path)
        label_path = pathlib.Path(label_path)
        images_dir_path = coco_val_path / "images"
        labels_dir_path = coco_val_path / "labels"
        os.makedirs(images_dir_path, exist_ok=True)
        os.makedirs(labels_dir_path, exist_ok=True)
        
        os.symlink(image_path, images_dir_path / image_path.name)
        os.symlink(label_path, labels_dir_path / label_path.name)

    for image_path, label_path in zip(test_df["image_path"], test_df["new_annotation_path"]):
        image_path = pathlib.Path(image_path)
        label_path = pathlib.Path(label_path)
        images_dir_path = coco_test_path / "images"
        labels_dir_path = coco_test_path / "labels"
        os.makedirs(images_dir_path, exist_ok=True)
        os.makedirs(labels_dir_path, exist_ok=True)
        
        os.symlink(image_path, images_dir_path / image_path.name)
        os.symlink(label_path, labels_dir_path / label_path.name)

except FileExistsError:
    pass

In [23]:
coco_format_dataset_details = {
    "path": str(COCO_FORMAT_DATA_PATH),
    "train": "train",
    "val": "val",
    "test": "test",
    
    "names": breeds_dict
}

coco_format_dataset_details

{'path': '/Users/vineetmahajan/Code/AI/datasets/stanford-dogs-dataset/coco_format',
 'train': 'train',
 'val': 'val',
 'test': 'test',
 'names': {0: 'Affenpinscher',
  1: 'Afghan Hound',
  2: 'African Hunting Dog',
  3: 'Airedale',
  4: 'American Staffordshire Terrier',
  5: 'Appenzeller',
  6: 'Australian Terrier',
  7: 'Basenji',
  8: 'Basset',
  9: 'Beagle',
  10: 'Bedlington Terrier',
  11: 'Bernese Mountain Dog',
  12: 'Black And Tan Coonhound',
  13: 'Blenheim Spaniel',
  14: 'Bloodhound',
  15: 'Bluetick',
  16: 'Border Collie',
  17: 'Border Terrier',
  18: 'Borzoi',
  19: 'Boston Bull',
  20: 'Bouvier Des Flandres',
  21: 'Boxer',
  22: 'Brabancon Griffon',
  23: 'Briard',
  24: 'Brittany Spaniel',
  25: 'Bull Mastiff',
  26: 'Cairn',
  27: 'Cardigan',
  28: 'Chesapeake Bay Retriever',
  29: 'Chihuahua',
  30: 'Chow',
  31: 'Clumber',
  32: 'Cocker Spaniel',
  33: 'Collie',
  34: 'Curly Coated Retriever',
  35: 'Dandie Dinmont',
  36: 'Dhole',
  37: 'Dingo',
  38: 'Doberman',


In [24]:
# dict to yaml
import yaml

with open(ROOT_DIR / "detection/dataset.yaml", "w") as f:
    yaml.dump(coco_format_dataset_details, f, default_flow_style=False)