In [1]:
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

In [None]:
# 1. Read the Parquet files matching the glob (only the columns needed) and
#    replace the separate `date` / `time` columns with one parsed `datetime`.
ddf = dd.read_parquet(
    "sampled_data/20230*.parquet",
    columns=['deviceid', 'date', 'time', 'lon', 'lat'],
)
ddf = ddf.assign(
    datetime=dd.to_datetime(
        ddf['date'].astype(str) + ' ' + ddf['time'].astype(str),
        format='%d.%m.%Y %H:%M:%S',
    )
).drop(columns=['date', 'time'])

# 2. Read the zoning layer a single time at module level (not inside the
#    per-partition function) and reproject it to EPSG:4326.
zoning = gpd.read_file("maps/zoning.geojson")
zoning = zoning.to_crs(epsg=4326)

# 3. Define zone-tagging function for partitions
def tag_zones_partition(df):
    """Attach a ``zone`` column to one partition of point records.

    Each row's (``lon``, ``lat``) pair is turned into a point in EPSG:4326
    and spatially joined (``predicate='within'``) against the module-level
    ``zoning`` GeoDataFrame. The index label of the matching ``zoning`` row
    is stored in ``zone``; rows that fall within no zone get ``NaN``. When a
    point lies within several zones, the first match returned by the join
    is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Partition containing at least ``lon`` and ``lat`` columns. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        The input columns in their original order followed by a float64
        ``zone`` column, with exactly one output row per input row and a
        fresh 0..n-1 index.
    """
    # Work on a re-indexed copy: the caller's frame stays untouched and the
    # result carries a plain 0..n-1 index.
    out = df.reset_index(drop=True)

    # Empty partitions need no spatial join; just add the typed column.
    if len(out) == 0:
        out['zone'] = pd.Series(index=out.index, dtype='float64')
        return out

    # Build the point layer as a separate frame keyed by row *position*.
    # The incoming index is not guaranteed to be unique within a partition,
    # so it cannot serve as a row identifier, and keeping the geometry off
    # `out` means the result never depends on whether the GeoDataFrame
    # constructor shares data with its input.
    points = gpd.GeoDataFrame(
        {'__row_id': pd.RangeIndex(len(out))},
        geometry=gpd.points_from_xy(out['lon'], out['lat']),
        crs="EPSG:4326",
    )
    joined = gpd.sjoin(points, zoning[['geometry']], how='left', predicate='within')

    # A point within several zones yields several joined rows; keep the
    # first one so every input row maps to exactly one zone value.
    first_match = (
        joined.drop_duplicates('__row_id')
        .set_index('__row_id')['index_right']
    )

    # Restore input order explicitly and force float64 so the dtype is the
    # same whether or not this partition contains unmatched points.
    out['zone'] = first_match.reindex(points['__row_id']).to_numpy(dtype='float64')
    return out


# 4. Describe the output schema of the partition function for Dask
#    (column name -> dtype, in output column order).
meta = dict(
    deviceid='object',
    lon='float64',
    lat='float64',
    datetime='datetime64[ns]',
    zone='float64',  # float so that points matching no zone can hold NaN
)

# 5. Run the zone tagging lazily on every partition.
tagged = ddf.map_partitions(tag_zones_partition, meta=meta)

# 6. Write the tagged data out as Parquet, without the index.
tagged.to_parquet(
    "meeting/zoned_gps.parquet",
    engine="pyarrow",
    write_index=False,
)
