In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import pathlib
import sys
import xmltodict, json

## Contents

### Directories

In [73]:
# Move the working directory up one level before importing `paths`.
# NOTE(review): presumably `paths` is a project-local module that lives in the
# parent directory, which is why the chdir must come first — confirm.
os.chdir("../")
import paths

# Base locations, all derived from the helpers exposed by `paths`.
REPO_DIR = paths.get_repo_path()
ROOT_DIR = REPO_DIR / "Stanford Dogs"
DATA_BASE_PATH = paths.get_data_path() / "stanford-dogs-dataset"
# Holds one sub-directory per breed (it is listed per breed further below).
IMAGES_PATH = DATA_BASE_PATH / "images/images"
# Mirrors the per-breed layout of IMAGES_PATH, one annotation file per image.
ANNOTATIONS_PATH = DATA_BASE_PATH / "annotations/Annotation"


# Switch the working directory to REPO_DIR for the cells that follow.
os.chdir(REPO_DIR)

## Directory structure to CSV file

In [74]:
# Names of the per-breed sub-directories under IMAGES_PATH, in sorted order.
# Hidden entries (leading ".") and anything that is not a directory are dropped.
breed_dir_name = sorted(
    entry
    for entry in os.listdir(IMAGES_PATH)
    if os.path.isdir(IMAGES_PATH / entry) and not entry.startswith(".")
)

def xml_to_json(xml_path):
    """Convert one XML annotation file into a JSON file stored next to it.

    Args:
        xml_path: ``pathlib.Path``-like object pointing at the XML file
            (``.parent`` and ``.stem`` are read from it).

    Returns:
        Path of the JSON file that was written: same directory and stem as
        ``xml_path``, with a ``.json`` suffix.
    """
    json_path = xml_path.parent / (xml_path.stem + ".json")

    # Hand the parser a binary stream so the document is decoded according to
    # its own XML declaration instead of the platform's locale encoding.
    with open(xml_path, "rb") as xml_file:
        json_data = xmltodict.parse(xml_file)

    # Explicit encoding keeps the output identical across platforms.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(json_data, json_file, indent=2)
    return json_path

# Build `dogs_df` with one row per image: breed label, image path and the path
# of the matching annotation file.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame with `pd.concat` inside the loop copies every
# previous row on each iteration.
dog_records = []
skipped_image_count = 0

for breed_dir in breed_dir_name:
    # "n02085620-Chihuahua" -> "Chihuahua": drop the leading id token, treat
    # "_" and "-" as word separators, then title-case the result.
    breed_name = " ".join(breed_dir.replace("_", "-").split("-")[1:]).title()

    breed_images_dir_path = IMAGES_PATH / breed_dir
    breed_annotations_dir_path = ANNOTATIONS_PATH / breed_dir

    for image_name in sorted(os.listdir(breed_images_dir_path)):
        if image_name.startswith(".") or not image_name.endswith((".jpg", ".jpeg", ".png")):
            continue

        image_path = breed_images_dir_path / image_name
        # The annotation file shares the image's name up to the first ".".
        annotation_path = breed_annotations_dir_path / image_name.split(".")[0]

        # Keep an image only together with its annotation. Filtering the two
        # lists independently would make their lengths differ (and the
        # DataFrame constructor fail) as soon as one annotation is missing.
        if not (os.path.isfile(image_path) and os.path.isfile(annotation_path)):
            skipped_image_count += 1
            continue

        dog_records.append(
            {
                "breed": breed_name,
                "image_path": image_path,
                "annotation_path": annotation_path,
            }
        )

if skipped_image_count:
    print(f"Skipped {skipped_image_count} image(s) without a matching image/annotation file pair")

# `columns=` keeps the expected schema even when no record was collected.
dogs_df = pd.DataFrame(dog_records, columns=["breed", "image_path", "annotation_path"])


dogs_df

Unnamed: 0,breed,image_path,annotation_path
0,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
1,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
2,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
3,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
4,Chihuahua,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
...,...,...,...
20575,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20576,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20577,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...
20578,African Hunting Dog,/Users/vineetmahajan/Code/AI/datasets/stanford...,/Users/vineetmahajan/Code/AI/datasets/stanford...


## XML annotation to JSON annotation

In [75]:
# Convert every XML annotation referenced in `dogs_df` to JSON, collecting the
# resulting JSON paths in row order.
annotation_json_path = list(map(xml_to_json, dogs_df["annotation_path"]))

# Record the JSON paths next to the XML ones and persist the table as CSV.
dogs_df["annotation_json_path"] = annotation_json_path
dogs_df.to_csv(DATA_BASE_PATH / "dogs_df.csv", index=False)

print(dogs_df.shape)
dogs_df.columns

(20580, 4)


Index(['breed', 'image_path', 'annotation_path', 'annotation_json_path'], dtype='object')