In [1]:
import matplotlib.pyplot as plt
from shapely.geometry import LineString, Polygon, Point
import numpy as np

from typing import List
import os
from pathlib import Path

from nuplan.database.nuplan_db_orm.nuplandb import LidarBox

import pyarrow as pa
import pyarrow.ipc as ipc

import numpy as np
from tqdm import tqdm

from asim.common.geometry.base import StateSE3
from asim.common.geometry.bounding_box.bounding_box import BoundingBoxSE3
from asim.common.geometry.constants import DEFAULT_ROLL, DEFAULT_PITCH

In [2]:
from asim.dataset.maps.abstract_map import MapSurfaceType

In [3]:
from nuplan.database.nuplan_db_orm.nuplandb import NuPlanDB

In [4]:
from asim.dataset.dataset_specific.nuplan.data_conversion import NuPlanDataset


# Resolve the "mini" split directory below the NUPLAN_DATA_ROOT environment variable.
NUPLAN_DATA_ROOT = Path(os.environ["NUPLAN_DATA_ROOT"])
SPLIT_PATH = NUPLAN_DATA_ROOT / "nuplan-v1.1" / "splits" / "mini"

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so the
# listing is sorted to make `idx` refer to the same file on every machine and run.
# NOTE(review): with a sorted listing, index 14 may select a different log than the
# one shown in previously recorded output -- re-run the notebook and confirm.
db_files = sorted(SPLIT_PATH.iterdir())
idx = 14

# Open the selected log database and print which log / map it belongs to.
log_db = NuPlanDB(NUPLAN_DATA_ROOT, str(db_files[idx]), None)
print(idx, log_db.log_name, log_db.log.map_version)

14 2021.08.17.16.57.11_veh-08_01200_01636 us-pa-pittsburgh-hazelwood


In [1]:

import asim


# Show which version of the imported asim package is in use.
print(asim.__version__)

0.0.1


In [53]:


# Hard-coded lidar_pc token; in the visible cells it is only referenced by the
# commented-out scenario-tag lookup below.
lidar_pc_token = "b1459a3e87ec58fd"

# Collect the token of every lidar_pc record of the log, in iteration order.
all_lidar_pc_tokens = [lidar_pc.token for lidar_pc in log_db.lidar_pc]
# print(all_lidar_pc_tokens)

# 1. Scenario tag (currently disabled)
# log_db.scenario_tag.select_one(lidar_pc_token=lidar_pc_token).type


# 2. Traffic lights: fetch the status rows attached to the lidar_pc at index 1
#    and print each lane connector id (as int) together with its status.
traffic_lights = log_db.traffic_light_status.select_many(lidar_pc_token=all_lidar_pc_tokens[1])
for traffic_light in traffic_lights:
    print(int(traffic_light.lane_connector_id), traffic_light.status)

20166 green
18404 red
18405 red
20169 red
20170 red
20168 red


In [57]:
from asim.dataset.observation.agent_datatypes import BoundingBoxType


# Display the integer behind the CZONE_SIGN member of BoundingBoxType.
int(BoundingBoxType.CZONE_SIGN.value)

5

In [9]:
from asim.dataset.observation.agent_datatypes import BoundingBoxType


# Lookup from category name string to BoundingBoxType member.
# The conversion loop further down indexes it with `lidar_box.category.name`,
# so a category name missing here raises KeyError at that point.
name_mapping = {
    "vehicle": BoundingBoxType.VEHICLE,
    "bicycle": BoundingBoxType.BICYCLE,
    "pedestrian": BoundingBoxType.PEDESTRIAN,
    "traffic_cone": BoundingBoxType.TRAFFIC_CONE,
    "barrier": BoundingBoxType.BARRIER,
    "czone_sign": BoundingBoxType.CZONE_SIGN,
    "generic_object": BoundingBoxType.GENERIC_OBJECT,
}

In [10]:
from nuplan.common.geometry.compute import get_pacifica_parameters


# Log-level identifiers read from the open log database.
log_name = log_db.log_name
log_token = log_db.log.token
map_location = log_db.log.map_version
vehicle_name = log_db.log.vehicle_name


# Per-frame accumulators: entry i of every list belongs to the i-th lidar_pc
# visited by the conversion loop below.
time_us_log: List[int] = []  # lidar_pc.timestamp of each frame

bb_ego_log: List[pa.Array] = []  # ego bounding box of each frame, stored as a pa.array
bb_frame_log: List[List[pa.Array]] = []  # per frame: one pa.array per lidar box
bb_track_log: List[List[str]] = []  # per frame: track token of each lidar box
bb_types_log: List[List[int]] = []  # per frame: integer box type of each lidar box

ego_states_log: List[List[float]] = []  # not filled by the loop in this cell


# The vehicle dimensions do not depend on the frame, so they are looked up once
# here instead of on every loop iteration.
vehicle_parameters = get_pacifica_parameters()

# Number of frames to convert before stopping early (replaces the bare `> 9` check).
MAX_FRAMES = 10

for lidar_pc in tqdm(log_db.lidar_pc, dynamic_ncols=True):
    # 1. time_us
    time_us_log.append(lidar_pc.timestamp)

    # Per-frame containers; entries at the same index describe the same lidar box.
    bb_frame: List[pa.Array] = []
    bb_track: List[str] = []
    bb_types: List[int] = []

    for lidar_box in lidar_pc.lidar_boxes:
        lidar_box: LidarBox
        # Only yaw is taken from the box; roll and pitch use the package defaults.
        center = StateSE3(
            x=lidar_box.x,
            y=lidar_box.y,
            z=lidar_box.z,
            roll=DEFAULT_ROLL,
            pitch=DEFAULT_PITCH,
            yaw=lidar_box.yaw,
        )
        bounding_box_se3 = BoundingBoxSE3(center, lidar_box.length, lidar_box.width, lidar_box.height)

        bb_frame.append(pa.array(bounding_box_se3.array))
        bb_track.append(lidar_box.track_token)
        bb_types.append(int(name_mapping[lidar_box.category.name]))

    bb_frame_log.append(bb_frame)
    bb_track_log.append(bb_track)
    bb_types_log.append(bb_types)

    # 2. ego_states: full orientation comes from the ego pose quaternion.
    yaw_pitch_roll = lidar_pc.ego_pose.quaternion.yaw_pitch_roll
    yaw, pitch, roll = yaw_pitch_roll
    ego_bounding_box_se3 = BoundingBoxSE3(
        center=StateSE3(
            x=lidar_pc.ego_pose.x,
            y=lidar_pc.ego_pose.y,
            z=lidar_pc.ego_pose.z,
            roll=roll,
            pitch=pitch,
            yaw=yaw,
        ),
        length=vehicle_parameters.length,
        width=vehicle_parameters.width,
        height=vehicle_parameters.height,
    )

    bb_ego_log.append(pa.array(ego_bounding_box_se3.array))

    # Stop once MAX_FRAMES frames have been collected.
    if len(bb_ego_log) >= MAX_FRAMES:
        break


# Option 1: List Column Approach
# One row per converted frame; every per-box quantity is stored as a list column.
# Bounding boxes are declared as fixed-size lists of 9 float64 values.
list_schema = pa.schema(
    [
        pa.field("time_us", pa.int64()),
        pa.field("bb_frame", pa.list_(pa.list_(pa.float64(), 9))),
        pa.field("bb_track", pa.list_(pa.string())),
        pa.field("bb_types", pa.list_(pa.int32())),
        pa.field("bb_ego", pa.list_(pa.float64(), 9)),
    ]
)

# Column name -> per-frame values gathered by the conversion loop.
list_data = {
    "time_us": time_us_log,
    "bb_frame": bb_frame_log,
    "bb_track": bb_track_log,
    "bb_types": bb_types_log,
    "bb_ego": bb_ego_log,
}

# Create a PyArrow Table
list_table = pa.Table.from_pydict(list_data, schema=list_schema)


# Hard-coded sample metadata (these values are literals, not read from log_db).
metadata = {
    "recording_id": "drive_20250515_001",
    "location": "Mountain View, CA",
    "weather": "sunny",
    "sensor_config": "standard_suite_v3",
}

# Column names and the matching Arrow scalars, in dictionary order.
metadata_fields = list(metadata.keys())
metadata_values = [pa.scalar(entry) for entry in metadata.values()]

# Single-row table: each metadata key becomes a column holding one value.
metadata_table = pa.Table.from_arrays(
    [pa.array([scalar]) for scalar in metadata_values],
    names=metadata_fields,
)

# schema = {
#     "timeseries": list_table.schema,
#     "metadata": metadata_table.schema
# }
# schema_batch = pa.record_batch([pa.array([str(schema)])], ["schema"])

# # Write to Arrow file
# # with pa.OSFile(f"{log_name}.arrow", "wb") as sink:
# #     writer = pa.RecordBatchFileWriter(sink, list_table.schema)
# #     writer.write_table(list_table)
# #     writer.close()
# schema_batch


  0%|          | 9/8720 [00:00<00:39, 218.56it/s]


In [11]:
import pyarrow as pa
import pyarrow.ipc as ipc
import mmap
import struct

def save_arrow_tables(tables_dict, filename):
    """Write several Arrow tables into one container file.

    File layout (all integers little-endian):

    * ``uint32`` number of tables
    * table of contents (TOC), one entry per table:
      ``uint32`` name length, UTF-8 name bytes, ``uint64`` offset, ``uint64`` size
    * the tables themselves, each serialized with the Arrow IPC file writer

    Offsets stored in the TOC are absolute byte positions inside ``filename``.

    Args:
        tables_dict: Mapping of table name to table; entries are written in
            iteration order.
        filename: Path of the file to create or overwrite.
    """
    # Encode the names up front: their byte lengths are needed to size the TOC.
    encoded_names = [name.encode('utf-8') for name in tables_dict]

    with open(filename, 'wb') as f:
        # Write header: number of tables
        f.write(struct.pack('<I', len(tables_dict)))

        # Reserve room for the TOC.  Each entry is name_len (4) + name bytes +
        # offset (8) + size (8).  The name bytes must be included; reserving only
        # 20 bytes per entry lets the TOC written below run into, and overwrite,
        # the first bytes of the first table.
        toc_start = f.tell()
        toc_size = sum(4 + len(name_bytes) + 8 + 8 for name_bytes in encoded_names)
        f.seek(toc_start + toc_size)

        toc_entries = []
        for name_bytes, table in zip(encoded_names, tables_dict.values()):
            offset = f.tell()

            # Serialize the table into an in-memory buffer using Arrow IPC.
            sink = pa.BufferOutputStream()
            with ipc.new_file(sink, table.schema) as writer:
                writer.write_table(table)

            buffer = sink.getvalue()
            f.write(buffer.to_pybytes())

            toc_entries.append((name_bytes, offset, len(buffer)))

        # Go back and fill in the TOC now that every offset and size is known.
        data_end = f.tell()
        f.seek(toc_start)
        for name_bytes, offset, size in toc_entries:
            f.write(struct.pack('<I', len(name_bytes)))
            f.write(name_bytes)
            f.write(struct.pack('<Q', offset))
            f.write(struct.pack('<Q', size))

        # Leave the file position at the end of the written data.
        f.seek(data_end)

def load_arrow_table(filename, table_name):
    """Load one named table from a container file written by ``save_arrow_tables``.

    The table of contents is scanned entry by entry; when the requested name is
    found, that table's byte range is read and decoded with the Arrow IPC file
    reader.

    Args:
        filename: Path of the container file.
        table_name: Name of the table to load.

    Returns:
        The result of ``read_all()`` on the Arrow IPC reader for that table.

    Raises:
        KeyError: If no table of contents entry is named ``table_name``.
    """
    with open(filename, 'rb') as f:
        # Read number of tables
        num_tables = struct.unpack('<I', f.read(4))[0]

        # Read TOC to find our table
        for _ in range(num_tables):
            name_len = struct.unpack('<I', f.read(4))[0]
            name = f.read(name_len).decode('utf-8')
            offset, size = struct.unpack('<QQ', f.read(16))

            if name == table_name:
                # Read the table's bytes through the handle that is already open.
                # Opening the file a second time and mapping all of it only to
                # copy this slice out again gains nothing.
                f.seek(offset)
                table_data = f.read(size)
                reader = ipc.open_file(pa.py_buffer(table_data))
                return reader.read_all()

    raise KeyError(f"Table '{table_name}' not found")

def list_tables(filename):
    """Return the table names recorded in a container file's table of contents.

    Only the header and the table of contents are read; names are returned in
    the order in which they are stored.
    """
    names = []
    with open(filename, 'rb') as stream:
        (count,) = struct.unpack('<I', stream.read(4))

        while len(names) < count:
            (label_len,) = struct.unpack('<I', stream.read(4))
            names.append(stream.read(label_len).decode('utf-8'))
            # Step over the two 8-byte fields (offset and size) of this entry.
            stream.seek(16, 1)

    return names

In [12]:
# Usage
tables = {'table1': list_table, 'table2': metadata_table}
save_arrow_tables(tables, 'multi_table.arrow')

In [17]:
# Read the table stored under the name "table2" back from the container file and display it.
load_arrow_table("multi_table.arrow", "table2")

pyarrow.Table
recording_id: string
location: string
weather: string
sensor_config: string
----
recording_id: [["drive_20250515_001"]]
location: [["Mountain View, CA"]]
weather: [["sunny"]]
sensor_config: [["standard_suite_v3"]]

In [27]:
class ArrowMultiTableFile:
    """Read-only accessor for a multi-table container written by ``save_arrow_tables``.

    The file is memory-mapped on construction and its table of contents (TOC)
    is parsed once; tables are then decoded on demand by name.  Use the object
    as a context manager, or call ``close()``, to release the mapping and the
    file handle.
    """

    def __init__(self, filename):
        """Open ``filename``, map it read-only and index its table of contents.

        If mapping or TOC parsing fails (for example on an empty or truncated
        file), whatever was opened so far is closed before the error propagates,
        so a failed construction does not leak the file handle.
        """
        self.filename = filename
        self._file = open(filename, 'rb')
        try:
            self._mmap = mmap.mmap(self._file.fileno(), 0, access=mmap.ACCESS_READ)
            self._parse_toc()
        except Exception:
            self.close()
            raise

    def _parse_toc(self):
        """Fill ``self._tables`` with ``name -> (offset, size)`` read from the TOC."""
        self._tables = {}
        self._mmap.seek(0)
        num_tables = struct.unpack('<I', self._mmap.read(4))[0]

        for _ in range(num_tables):
            # Entry layout: uint32 name length, name bytes, uint64 offset, uint64 size.
            name_len = struct.unpack('<I', self._mmap.read(4))[0]
            name = self._mmap.read(name_len).decode('utf-8')
            offset = struct.unpack('<Q', self._mmap.read(8))[0]
            size = struct.unpack('<Q', self._mmap.read(8))[0]
            self._tables[name] = (offset, size)

    def get_table(self, name):
        """Decode and return the table stored under ``name``.

        Slicing an mmap returns a ``bytes`` object, so the table is decoded from
        an in-memory copy of its region and stays usable after ``close()``.

        Raises:
            KeyError: If ``name`` is not listed in the table of contents.
        """
        if name not in self._tables:
            raise KeyError(f"Table '{name}' not found")

        offset, size = self._tables[name]
        table_data = self._mmap[offset:offset + size]
        reader = ipc.open_file(pa.py_buffer(table_data))
        return reader.read_all()

    def list_tables(self):
        """Return the table names in the order they appear in the TOC."""
        return list(self._tables.keys())

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close the memory map and the file handle, if they were created."""
        # hasattr guards cover a constructor that failed part-way through.
        if hasattr(self, '_mmap'):
            self._mmap.close()
        if hasattr(self, '_file'):
            self._file.close()

# Usage: write three named tables into one container file.
tables = {'table1': list_table, 'table2': metadata_table, 'table3': metadata_table}
save_arrow_tables(tables, 'multi_table.arrow')

# Read every table back through the mmap-backed reader.
# get_table() slices the mapping, which yields a bytes copy of the table's
# region, so the returned tables remain valid after the reader is closed.
with ArrowMultiTableFile('multi_table.arrow') as reader:
    table1 = reader.get_table('table1')
    table2 = reader.get_table('table2')
    table3 = reader.get_table('table3')  # previously read 'table2' by mistake
    print(reader.list_tables())

['table1', 'table2', 'table3']


In [28]:
# Display the `table3` variable assigned in the previous cell.
table3

pyarrow.Table
recording_id: string
location: string
weather: string
sensor_config: string
----
recording_id: [["drive_20250515_001"]]
location: [["Mountain View, CA"]]
weather: [["sunny"]]
sensor_config: [["standard_suite_v3"]]