In [None]:
import pandas as pd
import geopandas as gpd

from update_vars import ANALYSIS_DATE, GCS_PATH, BORDER_BUFFER_METERS, GEOM_SUBFOLDER

from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from calitp_data_analysis.geography_utils import CA_NAD83Albers_m
from functools import cache
from segment_speed_utils import helpers
import importlib

@cache
def gcs_pandas():
    """Return a GCSPandas client, creating it on the first call only.

    `functools.cache` memoizes this zero-argument function, so every later
    call hands back the same instance instead of constructing a new one.
    """
    client = GCSPandas()
    return client

@cache
def gcs_geopandas():
    """Return a GCSGeoPandas client, creating it on the first call only.

    `functools.cache` memoizes this zero-argument function, so every later
    call hands back the same instance instead of constructing a new one.
    """
    client = GCSGeoPandas()
    return client

# Refactor scripts to accept other geometries

## `prepare_tracts_borders`

In [None]:
# tract_borders = gcs_geopandas().read_parquet(f'{GCS_PATH}borders_{ANALYSIS_DATE}.parquet')

In [None]:
# tract_sstb = gcs_geopandas().read_parquet(f'{GCS_PATH}shape_stops_tracts_borders_{ANALYSIS_DATE}.parquet')

### uza geoms

In [None]:
# Load the urbanized-area map from GCS, keep only the columns used below,
# and lowercase the column names.
uza_cols = ['NAME', 'UACE20', 'geometry']
uza = (
    gcs_geopandas()
    .read_parquet(f'{GCS_PATH}input_geoms/ca_uza_map.parquet')[uza_cols]
    .rename(columns=str.lower)
)

In [None]:
# uza.explore()

In [None]:
uza.head(3)

In [None]:
# Reproject to CA_NAD83Albers_m before the geometry is handed to prepare_tracts_borders.
# NOTE(review): the name suggests a metre-based California Albers CRS — confirm in geography_utils.
uza = uza.to_crs(CA_NAD83Albers_m)

In [None]:
# `importlib` is already imported in the notebook's first cell, so it is not re-imported here
# (matching the other reload cells). Reloading picks up edits made to prepare_tracts_borders.py
# while the kernel is running.
import prepare_tracts_borders
importlib.reload(prepare_tracts_borders)

In [None]:
# Run find_borders on the urbanized areas, with 'uace20' as the id column
# (lowercased from 'UACE20' when the map was loaded).
borders = prepare_tracts_borders.find_borders(uza, id_col='uace20')

In [None]:
borders.shape

In [None]:
# Relative path: this copy is written to the notebook's working directory, not to GCS.
borders.to_parquet(f'uza_borders_{ANALYSIS_DATE}.parquet')

### with transit routes

In [None]:
# Scheduled GTFS tables for ANALYSIS_DATE, limited to the columns needed for the joins below.
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)
st = helpers.import_scheduled_stop_times(
    analysis_date=ANALYSIS_DATE,
    columns=['feed_key', 'trip_id', 'stop_id'],
    get_pandas=True,
)
trips = helpers.import_scheduled_trips(
    ANALYSIS_DATE,
    columns=['shape_array_key', 'trip_id', 'feed_key'],
)
stops = helpers.import_scheduled_stops(
    ANALYSIS_DATE,
    columns=['feed_key', 'stop_id', 'geometry'],
)

# stops -> stop_times -> trips, reduced to one row per (feed_key, shape_array_key, stop_id);
# any row that still contains a null in any column is dropped.
shape_stops = (
    stops
    .merge(st, on=['feed_key', 'stop_id'])
    .merge(trips, on=['feed_key', 'trip_id'])
    .drop_duplicates(subset=['feed_key', 'shape_array_key', 'stop_id'])
    .dropna()
)

In [None]:
# Run find_shapes_in_areas_borders on the shape/stop pairs together with the urbanized areas
# and their borders; 'uace20' is the id column. The column selection in the following cells
# expects a 'tsi_segment_id' column in the result.
shape_stops_areas_borders = prepare_tracts_borders.find_shapes_in_areas_borders(shape_stops, uza, borders, id_col='uace20')

In [None]:
keep_cols = ['feed_key', 'stop_id', 'trip_id', 'shape_array_key', 'tsi_segment_id', 'geometry']

In [None]:
shape_stops_areas_borders = shape_stops_areas_borders[keep_cols]

In [None]:
# Upload the urbanized-area borders to GCS, under GEOM_SUBFOLDER inside GCS_PATH.
gcs_geopandas().geo_data_frame_to_parquet(borders, f"{GCS_PATH}{GEOM_SUBFOLDER}borders_{ANALYSIS_DATE}.parquet")

In [None]:
# Relative path: written to the notebook's working directory only; the matching GCS upload
# in the next cell is commented out.
shape_stops_areas_borders.to_parquet(f'shape_stops_areas_borders_{ANALYSIS_DATE}.parquet')

In [None]:
# gcs_geopandas().geo_data_frame_to_parquet(shape_stops_areas_borders, f"{GCS_PATH}{GEOM_SUBFOLDER}shape_stops_areas_borders_{ANALYSIS_DATE}.parquet")

## `define_tsi_segments`

* Need an option to group by shape or not.

In [None]:
# tract_segments = gcs_geopandas().read_parquet(f'{GCS_PATH}tsi_segments_{ANALYSIS_DATE}.parquet')

In [None]:
# tract_segments.head(10000).explore(column='tsi_segment_id')

In [None]:
# Reload the scheduled shapes, plus the scheduled trips that have a shape_id.
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)

trip_cols = [
    'gtfs_dataset_key',
    'name',
    'trip_id',
    'shape_id',
    'shape_array_key',
    'route_id',
    'route_key',
    'direction_id',
    'route_short_name',
    'trip_instance_key',
    'feed_key',
]

# Trips with a null shape_id are dropped.
trips = helpers.import_scheduled_trips(ANALYSIS_DATE, columns=trip_cols).dropna(subset=['shape_id'])

In [None]:
from tqdm import tqdm
tqdm.pandas(desc=f"TSI Segments Progress {ANALYSIS_DATE}")

In [None]:
import define_tsi_segments
importlib.reload(define_tsi_segments)

In [None]:
# tsi_segs = (shapes
#        .groupby('shape_array_key')
#        .progress_apply(define_tsi_segments.overlay_areas_borders, areas_gdf=uza, border_gdf=borders,
#                       id_col='uace20')
#        .reset_index(drop=True)
#       )

In [None]:
# Single call on the full shapes frame, rather than one call per shape_array_key as in the
# commented-out groupby/progress_apply cell above.
tsi_segs = define_tsi_segments.overlay_areas_borders(shape_gdf=shapes, areas_gdf=uza, border_gdf=borders, id_col='uace20')

In [None]:
# m = uza.explore()

In [None]:
# tsi_segs.explore(column='tsi_segment_id', m=m) # too big

In [None]:
# Relative path: written to the notebook's working directory. The `time_distance_in_segments`
# section below reads a uza_tsi_segs parquet back from that directory.
tsi_segs.to_parquet(f'uza_tsi_segs_{ANALYSIS_DATE}.parquet')

## `time_distance_in_segments`

In [None]:
import time_distance_in_segments
importlib.reload(time_distance_in_segments)

In [None]:
def read_tsi_segs(tsi_segs, shapes):
    """Compute where each TSI segment starts along its shape.

    Takes the segments frame directly rather than reading it from a parquet file.

    Parameters
    ----------
    tsi_segs
        Segments frame. Its `geometry` column is dropped before merging; the merged
        frame must provide `tsi_segment_id`, `tsi_segment_meters` and `start`.
    shapes
        Shapes frame with `shape_array_key` and `geometry` columns.

    Returns
    -------
    Frame sorted by `start_meters` with a fresh 0..n-1 index and the columns
    `shape_array_key`, `tsi_segment_id`, `start_meters`, `tsi_segment_meters`.
    """
    output_cols = ['shape_array_key', 'tsi_segment_id', 'start_meters', 'tsi_segment_meters']
    segments_no_geom = tsi_segs.drop(columns=['geometry'])

    ordered = (
        shapes
        .merge(segments_no_geom, on='shape_array_key')
        .rename(columns={'geometry': 'shape_geometry'})
        # project each segment's `start` onto its shape geometry
        # NOTE(review): the result is named *_meters; actual units follow the CRS of `shapes` — confirm
        .assign(start_meters=lambda merged: merged.shape_geometry.project(merged.start))
        .sort_values('start_meters')
        .reset_index(drop=True)
    )
    return ordered[output_cols]

In [None]:
# Re-import shapes with an explicit CRS (CA_NAD83Albers_m); this replaces the `shapes`
# loaded in earlier cells, which were imported without a crs argument.
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE, crs=CA_NAD83Albers_m)
# Output of attach_projected_stop_times for ANALYSIS_DATE; used below for its
# shape_array_key / trip_instance_key pairs and passed to the per-trip calculation.
st_proj = time_distance_in_segments.attach_projected_stop_times(ANALYSIS_DATE)

In [None]:
# Read back the local segments file. The name is built from ANALYSIS_DATE (not a hard-coded
# date) so it always matches the file written by `tsi_segs.to_parquet(...)` earlier in the notebook.
shape_merged = read_tsi_segs(gpd.read_parquet(f'./uza_tsi_segs_{ANALYSIS_DATE}.parquet'), shapes)
# Script equivalent, which reads the segments itself:
# shape_merged = time_distance_in_segments.read_tsi_segs(ANALYSIS_DATE, shapes)

# Attach every trip_instance_key that runs on each shape (one row per segment per trip).
tsi_segments_trips = shape_merged.merge(st_proj[['shape_array_key', 'trip_instance_key']].drop_duplicates(), on='shape_array_key')

In [None]:
from dask.diagnostics import ProgressBar
ProgressBar().register()

import dask.dataframe as dd
import dask_geopandas as dg

In [None]:
# Run the per-trip calculation on the first 10 rows only, then keep a zero-row slice of the
# result, which is passed to dask_calculate_batch as `meta`.
many_trip_test = (
    tsi_segments_trips
    .head(10)
    .groupby('trip_instance_key', group_keys=False)
    .apply(time_distance_in_segments.tract_border_time_by_trip, st_proj_df=st_proj)
)
meta = many_trip_test[:0]

# Batch run over all of tsi_segments_trips; the result is written to the working directory.
(
    time_distance_in_segments
    .dask_calculate_batch(tsi_segments_trips, st_proj, meta)
    .to_parquet(f'tsi_uza_{ANALYSIS_DATE}.parquet')
)

## results

In [None]:
# Results file in GCS for the configured analysis date (previously a hard-coded date).
# NOTE(review): 'urbanized_areas/' may be the same value as GEOM_SUBFOLDER — confirm in update_vars.
path = f'{GCS_PATH}urbanized_areas/tsi_uza_{ANALYSIS_DATE}.parquet'

In [None]:
df = gcs_pandas().read_parquet(path)

In [None]:
df

In [None]:
# Collapse to one row per tsi_segment_id by summing segment length and seconds over all rows.
# This rebinds `df`, so the per-row frame read from GCS is no longer available afterwards.
df = df.groupby('tsi_segment_id')[['tsi_segment_meters', 'segment_seconds']].sum().reset_index()

In [None]:
df

In [None]:
# Inner join (the default `how`): keeps only rows whose tsi_segment_id equals an urbanized
# area's uace20, so any segment ids that are not area codes drop out here.
draft = uza.merge(df, left_on='uace20', right_on = 'tsi_segment_id')

In [None]:
draft.explore(column = 'segment_seconds', scheme = 'FisherJenks', tiles='CartoDBPositron', k=8)