In [1]:
import pathlib
import random
import shutil
import xml.etree.ElementTree as ET

import pandas as pd
import sklearn.model_selection
import tqdm

In [2]:
# Dataset layout: the raw files live under root_path, the cleaned train/test
# split is written to root_path / "data" / {"train", "test"}.
root_path = pathlib.Path("~/datasets/labeled_mask_dataset/").expanduser()
clean_data_path = root_path / "data"
clean_train_data_path = clean_data_path / "train"
clean_test_data_path = clean_data_path / "test"

# Create both split directories up front (a no-op when they already exist).
for split_path in (clean_train_data_path, clean_test_data_path):
    split_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Collect the raw annotation and image files. Both listings are sorted so the
# positional pairing below is deterministic.
annotations = sorted((root_path / "annotations/").rglob("*.xml"))
images = sorted((root_path / "images/").rglob("*.jpg"))

# zip() pairs purely by position and silently truncates to the shorter input,
# so a missing or extra file would shift every following pair and attach the
# wrong labels to images. Fail fast instead of producing a corrupted dataset.
if len(images) != len(annotations):
    raise ValueError(
        "Found {} images but {} annotations under {}".format(
            len(images), len(annotations), root_path
        )
    )
mismatched_pairs = [
    (image.name, annotation.name)
    for image, annotation in zip(images, annotations)
    if image.stem != annotation.stem
]
if mismatched_pairs:
    raise ValueError(
        "Image/annotation names do not line up, first mismatch: {}".format(
            mismatched_pairs[0]
        )
    )

all_files = list(zip(images, annotations))
len(annotations), len(images), len(all_files)

(1370, 1370, 1370)

In [4]:
# Hold out 20% of the (image, annotation) pairs for testing; the fixed
# random_state keeps the split identical across re-runs.
train, test = sklearn.model_selection.train_test_split(
    all_files,
    random_state=42,
    test_size=0.2,
)
len(train), len(test)

(1096, 274)

In [5]:
def create_clean_data(subset, path):
    """Copy the (image, annotation) file pairs of ``subset`` below ``path``.

    Images are copied to ``path / "images"`` and annotations to
    ``path / "annotations"``; both directories are created if missing and
    every file keeps its original base name.

    Args:
        subset: iterable of ``(image_path, annotation_path)`` pairs.
        path: destination directory for this split.
    """
    annotations_dir = path / "annotations"
    images_dir = path / "images"
    for directory in (annotations_dir, images_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # tqdm only wraps the iterable to show a progress bar.
    for image_file, annotation_file in tqdm.tqdm(subset):
        shutil.copyfile(image_file, images_dir / image_file.name)
        shutil.copyfile(annotation_file, annotations_dir / annotation_file.name)


# Materialise both splits on disk (test first, then train).
for split_files, split_path in (
    (test, clean_test_data_path),
    (train, clean_train_data_path),
):
    create_clean_data(split_files, split_path)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:00<00:00, 3360.28it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1096/1096 [00:00<00:00, 4161.40it/s]


In [6]:
def xml_to_csv(annotations):
    """Flatten the XML annotation files under ``annotations`` into a table.

    Every ``<object>`` element found in every ``*.xml`` file (searched
    recursively) becomes one row.

    Args:
        annotations: ``pathlib.Path`` of the directory holding the XML files.

    Returns:
        A ``(xml_df, classes_names)`` tuple: ``xml_df`` is a DataFrame with the
        columns ``filename, width, height, class, xmin, ymin, xmax, ymax`` and
        ``classes_names`` is the sorted list of distinct class names.
    """
    column_name = ["filename", "width", "height", "class", "xmin", "ymin", "xmax", "ymax"]
    box_tags = ("xmin", "ymin", "xmax", "ymax")

    classes_names = set()
    xml_list = []

    # rglob yields files in a filesystem-dependent order; sorting makes the
    # row order of the resulting table reproducible.
    for xml_file in sorted(annotations.rglob("*.xml")):
        root = ET.parse(xml_file).getroot()
        objects = root.findall("object")
        if not objects:
            # Nothing to emit; file-level tags are only needed when rows exist.
            continue

        # File-level fields are the same for every object: read them once.
        filename = root.find("filename").text
        size = root.find("size")
        width = int(size.find("width").text)
        height = int(size.find("height").text)

        for member in objects:
            # Look children up by tag name, not by position: the order and
            # number of optional tags (pose, truncated, occluded, difficult)
            # differ between annotation tools.
            class_name = member.find("name").text
            box = member.find("bndbox")
            classes_names.add(class_name)
            xml_list.append(
                (filename, width, height, class_name)
                + tuple(int(box.find(tag).text) for tag in box_tags)
            )

    xml_df = pd.DataFrame(xml_list, columns=column_name)

    return xml_df, sorted(classes_names)


# Build one label table (plus class list) per split from the copied annotations.
train_annotations_path = clean_train_data_path / "annotations"
test_annotations_path = clean_test_data_path / "annotations"
train_df, train_classes = xml_to_csv(train_annotations_path)
test_df, test_classes = xml_to_csv(test_annotations_path)

In [7]:
# Persist each split's label table next to its images/annotations folders.
for labels_df, split_path in (
    (train_df, clean_train_data_path),
    (test_df, clean_test_data_path),
):
    labels_df.to_csv(split_path / "labels.csv", index=False)

In [8]:
# Build the label map from the classes seen in EITHER split. Using only the
# test classes would drop any class that happens to occur solely in the
# training annotations, leaving it without an id.
label_map_classes = sorted(set(train_classes) | set(test_classes))

# One "item { ... }" entry per class, ids are 1-based, entries separated by a
# blank line and no trailing whitespace.
pbtxt_content = "\n\n".join(
    "item {{\n    id: {0}\n    name: '{1}'\n}}".format(class_id, class_name)
    for class_id, class_name in enumerate(label_map_classes, start=1)
)
with open(clean_data_path / "label_map.pbtxt", "w") as f:
    f.write(pbtxt_content)