### Table 1: Dataset Overview

This notebook creates the dataset overview table (Table 1): for each dataset used in this study, it reads the label file, counts its annotations, and writes the resulting table to a CSV file.

In [1]:
import json
from pathlib import Path

import pandas as pd

In [4]:
# Location of the label files; change this to match the local data layout.
base_dir = Path("G:/3D-GeoInfo-2025/data/1_base_data/labels")
# File the resulting table is written to.
csv_output_file_path = "./datasets.csv"

# Keys of every dataset record, in the order the values appear in the rows below.
_DATASET_FIELDS = ("LabelFile", "Site", "Plot", "Size", "Labeling", "Usage")

# One record per label file; "LabelFile" is given relative to base_dir.
# The "Labeling" codes mirror the sub-directory of the label file:
# ML = manual_labeling, MC = manual_correction, AL = automatic_labeling.
datasets = [
    dict(zip(_DATASET_FIELDS, row))
    for row in (
        # small plot that was labeled using all three labeling approaches
        ("./manual_labeling/s1_p1_small_coco.json", 1, "Plot 1.0", "50 x 50", "ML", "train (small)"),
        ("./manual_correction/s1_p1_small_coco.json", 1, "Plot 1.0", "50 x 50", "MC", "train (small)"),
        ("./automatic_labeling/s1_p1_small_coco.json", 1, "Plot 1.0", "50 x 50", "AL", "train (small)"),
        # additional manually labeled data
        ("./manual_labeling/s1_p1_ext_ml_coco.json", 1, "Plot 1.1", "100 x 100", "ML", "train (ext.)"),
        # additional manually corrected data
        ("./manual_correction/s1_p1_ext_mc_coco.json", 1, "Plot 1.2", "120 x 80", "MC", "train (ext.)"),
        ("./manual_correction/s1_p2_mc_coco.json", 1, "Plot 2", "120 x 120", "MC", "train (ext.)"),
        ("./manual_correction/s2_p1_mc_coco.json", 2, "Plot 1", "120 x 120", "MC", "train (ext.)"),
        # additional automatically labeled data
        ("./automatic_labeling/s1_p1_ext_al_coco.json", 1, "Plot 1.3", "200 x 100", "AL", "train (ext.)"),
        ("./automatic_labeling/s1_p2_al_coco.json", 1, "Plot 2", "150 x 100", "AL", "train (ext.)"),
        ("./automatic_labeling/s1_p3_al_coco.json", 1, "Plot 3", "100 x 100", "AL", "train (ext.)"),
        # test set
        ("./manual_labeling/s3_p1_coco.json", 3, "Plot 1", "50 x 50", "ML", "test"),
    )
]

# Build one table row per configured dataset: the configured metadata plus the
# number of entries under "annotations" in the dataset's label file.
dataset_metadata_list = []

for dataset in datasets:
    label_file_path = base_dir / dataset["LabelFile"]
    # Raise explicitly instead of using `assert`: assertions are stripped when
    # Python runs with -O, and FileNotFoundError names the actual problem.
    if not label_file_path.exists():
        raise FileNotFoundError(f"{label_file_path} does not exist.")
    with open(label_file_path, "r", encoding="utf-8") as f:
        labels = json.load(f)
    # NOTE: the entries of `datasets` are updated in place, so `datasets` itself
    # also carries the added and rewritten values after this loop.
    dataset["NumTrees"] = len(labels["annotations"])
    # The replacement strings are LaTeX markup (escaped underscore, math-mode
    # multiplication sign) - presumably so that the CSV values can be used
    # directly in a LaTeX table.
    dataset["Plot"] = dataset["Plot"].replace("_", r"\_")
    dataset["Size"] = dataset["Size"].replace("x", r"$\times$")
    dataset_metadata_list.append(dataset)

# Columns follow the key order of the records; the row index is not written.
dataset_metadata = pd.DataFrame(dataset_metadata_list)
dataset_metadata.to_csv(csv_output_file_path, index=False)