In [9]:
import pyarrow as pa
from pathlib import Path

import gzip
import json
from typing import Dict
import numpy as np


In [10]:
def _load_json_gz(path: Path) -> Dict:
    with gzip.open(path, "rt") as f:
        data = json.load(f)

    return data


In [11]:
from curses import meta
from typing import List

from tqdm import tqdm

from asim.common.geometry.bounding_box.bounding_box import BoundingBoxSE3Index
from asim.common.vehicle_state.ego_vehicle_state import EgoVehicleStateIndex
from asim.dataset.arrow.multiple_table import save_arrow_tables


def get_metadata_table(location: str) -> pa.Table:
    """Build the single-row Arrow metadata table for a CARLA log.

    The table has one column per metadata entry (``dataset``, ``location``,
    ``vehicle_name``, ``version``), each holding exactly one value.

    Args:
        location: Value stored in the ``location`` column.

    Returns:
        A ``pa.Table`` with one row and one column per metadata key.
    """
    import asim

    metadata = {
        "dataset": "carla",
        "location": location,
        "vehicle_name": "carla",
        "version": str(asim.__version__),
    }

    # Column order follows the insertion order of the dict above.
    column_names = list(metadata.keys())
    columns = [pa.array([pa.scalar(entry)]) for entry in metadata.values()]
    return pa.Table.from_arrays(columns, column_names)


def boxes_path_to_arrow(boxes_path: Path, arrow_path: Path) -> None:
    """Convert the per-frame files of one log into a single Arrow file.

    Every entry of ``boxes_path`` is read, in sorted path order, as a
    gzip-compressed JSON frame. The per-frame fields are gathered into a
    recording table, a one-row metadata table is built from the ``location``
    field of the last frame, and both tables are written to
    ``arrow_path / "carla" / "<name of boxes_path's parent>.arrow"``.

    Args:
        boxes_path: Directory holding the per-frame files of one log.
        arrow_path: Root output directory; its ``carla`` sub-directory is
            created when missing.

    Raises:
        ValueError: If ``boxes_path`` contains no entries.
    """
    sorted_paths = sorted(boxes_path.iterdir())
    if not sorted_paths:
        # Without a single frame there is no "location" for the metadata table;
        # fail with a clear message instead of an unbound-variable error later on.
        raise ValueError(f"No frame files found in {boxes_path}.")

    timestamp_log: List[int] = []

    detections_state_log: List[List[List[float]]] = []
    detections_token_log: List[List[str]] = []
    detections_type_log: List[List[int]] = []

    ego_states_log: List[List[float]] = []

    traffic_light_ids_log: List[List[int]] = []
    traffic_light_types_log: List[List[int]] = []
    scenario_tags_log: List[List[str]] = []

    # NOTE: every directory entry is read; no extension filter is applied. A check
    # like `box_path.suffix == ".json.gz"` would never match, because `Path.suffix`
    # only yields the final suffix (".gz").
    for box_path in tqdm(sorted_paths):
        data = _load_json_gz(box_path)
        timestamp_log.append(data["timestamp"])
        detections_state_log.append(data["detections_state"])
        detections_token_log.append(data["detections_token"])
        detections_type_log.append(data["detections_types"])
        ego_states_log.append(data["ego_state"])
        traffic_light_ids_log.append(data["traffic_light_ids"])
        traffic_light_types_log.append(data["traffic_light_types"])
        scenario_tags_log.append(data["scenario_tag"])

    # `data` still holds the last frame; reuse it rather than decompressing the
    # same file a second time just to read its location.
    location = data["location"]

    recording_data = {
        "timestamp": timestamp_log,
        "detections_state": detections_state_log,
        "detections_token": detections_token_log,
        "detections_type": detections_type_log,
        "ego_states": ego_states_log,
        "traffic_light_ids": traffic_light_ids_log,
        "traffic_light_types": traffic_light_types_log,
        "scenario_tag": scenario_tags_log,
    }

    # Fixed-size inner lists: one bounding-box / ego-state vector per entry,
    # sized by the corresponding index enum.
    recording_schema = pa.schema(
        [
            ("timestamp", pa.int64()),
            ("detections_state", pa.list_(pa.list_(pa.float64(), len(BoundingBoxSE3Index)))),
            ("detections_token", pa.list_(pa.string())),
            ("detections_type", pa.list_(pa.int16())),
            ("ego_states", pa.list_(pa.float64(), len(EgoVehicleStateIndex))),
            ("traffic_light_ids", pa.list_(pa.int64())),
            ("traffic_light_types", pa.list_(pa.int16())),
            ("scenario_tag", pa.list_(pa.string())),
        ]
    )

    tables: Dict[str, pa.Table] = {}
    tables["recording_table"] = pa.Table.from_pydict(recording_data, schema=recording_schema)
    tables["metadata_table"] = get_metadata_table(location)

    # The output file is named after the log directory that contains `boxes_path`.
    log_file_path = arrow_path / "carla" / f"{boxes_path.parent.name}.arrow"
    log_file_path.parent.mkdir(parents=True, exist_ok=True)

    save_arrow_tables(tables, log_file_path)


# Root directory the converted Arrow logs are written to.
arrow_path = Path("/home/daniel/asim_workspace/data")
# Root directory of the recorded CARLA logs (one sub-directory per log).
data_path = Path("/home/daniel/carla_workspace/data/")

# Convert every log that has a "boxes" sub-directory; report and skip the rest.
for log_path in data_path.iterdir():
    boxes_path = log_path / "boxes"
    if boxes_path.exists():
        print(f"Processing {boxes_path}...")
        boxes_path_to_arrow(boxes_path, arrow_path)
    else:
        print(f"Boxes path {boxes_path} does not exist, skipping.")

Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route6_06_03_21_18_13/boxes...


100%|██████████| 1206/1206 [00:00<00:00, 20598.83it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route8_06_03_21_19_34/boxes...


100%|██████████| 1951/1951 [00:00<00:00, 19775.89it/s]

Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route0_06_03_21_01_18/boxes...



100%|██████████| 1941/1941 [00:00<00:00, 19904.07it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route2_06_03_21_03_30/boxes...


100%|██████████| 1818/1818 [00:00<00:00, 18853.18it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route1_06_03_21_02_12/boxes...


100%|██████████| 2502/2502 [00:00<00:00, 14033.79it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route7_06_03_21_18_44/boxes...


100%|██████████| 1807/1807 [00:00<00:00, 22874.87it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route5_06_03_21_17_39/boxes...


100%|██████████| 1109/1109 [00:00<00:00, 19888.84it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route9_06_03_21_20_22/boxes...


100%|██████████| 481/481 [00:00<00:00, 19845.76it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route3_06_03_21_04_21/boxes...


100%|██████████| 1850/1850 [00:00<00:00, 16960.65it/s]


Processing /home/daniel/carla_workspace/data/Town12_Rep0_longest6_route4_06_03_21_05_18/boxes...


100%|██████████| 1522/1522 [00:00<00:00, 19155.00it/s]
