In [59]:
import pandas as pd
import geopandas as gpd

from update_vars import ANALYSIS_DATE, GCS_PATH, BORDER_BUFFER_METERS, GEOM_SUBFOLDER

from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from calitp_data_analysis.geography_utils import CA_NAD83Albers_m
from functools import cache
from segment_speed_utils import helpers
import importlib
import utils

@cache
def gcs_pandas():
    return GCSPandas()

@cache
def gcs_geopandas():
    return GCSGeoPandas()

# Refactor scripts to accept other geometries

## `prepare_tracts_borders`

### uza geoms

In [60]:
uza = utils.read_uzas()

In [61]:
uza.head(3)

Unnamed: 0,name,uace20,geometry
0,"Antioch, CA",2683,"MULTIPOLYGON (((-145644.487 -4256.439, -145802..."
1,"McKinleyville, CA",52672,"POLYGON ((-341313.118 329954.224, -341407.902 ..."
2,"Arroyo Grande--Grover Beach--Pismo Beach, CA",3196,"POLYGON ((-50334.547 -325887.744, -50347.534 -..."


In [62]:
import importlib
import prepare_tracts_borders
importlib.reload(prepare_tracts_borders)

<module 'prepare_tracts_borders' from '/home/jovyan/data-analyses/transit_service_intensity/prepare_tracts_borders.py'>

In [63]:
borders = prepare_tracts_borders.find_borders(uza, id_col='uace20')

In [64]:
borders.shape

(58, 7)

In [65]:
borders.to_parquet(f'uza_borders_{ANALYSIS_DATE}.parquet')

In [66]:
gcs_geopandas().geo_data_frame_to_parquet(borders, f"{GCS_PATH}{GEOM_SUBFOLDER}borders_{ANALYSIS_DATE}.parquet")

### with transit routes

In [219]:
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)
st = helpers.import_scheduled_stop_times(analysis_date=ANALYSIS_DATE,
                                         columns=['feed_key', 'trip_id', 'stop_id'],
                                         get_pandas=True)
trips = helpers.import_scheduled_trips(ANALYSIS_DATE, columns=['shape_array_key', 'trip_id', 'feed_key'])
stops = helpers.import_scheduled_stops(ANALYSIS_DATE, columns=['feed_key', 'stop_id', 'geometry'])

shape_stops = (stops.merge(st, on = ['feed_key', 'stop_id'])
 .merge(trips, on = ['feed_key', 'trip_id'])
 .drop_duplicates(subset=['feed_key', 'shape_array_key', 'stop_id'])
 .dropna()
)

In [100]:
shape_stops_areas_borders = prepare_tracts_borders.find_shapes_in_areas_borders(shape_stops, uza, borders, id_col='uace20')

In [101]:
keep_cols = ['shape_array_key', 'tsi_segment_id']

In [102]:
shape_stops_areas_borders = shape_stops_areas_borders[keep_cols].drop_duplicates()

In [103]:
shape_stops_areas_borders.to_parquet(f'shape_stops_areas_borders_{ANALYSIS_DATE}.parquet')

In [104]:
gcs_pandas().data_frame_to_parquet(shape_stops_areas_borders, f"{GCS_PATH}{GEOM_SUBFOLDER}shape_stops_areas_borders_{ANALYSIS_DATE}.parquet")

## `define_tsi_segments`

* need option to group or not by shape
* 

In [None]:
# tract_segments = gcs_geopandas().read_parquet(f'{GCS_PATH}tsi_segments_{ANALYSIS_DATE}.parquet')

In [None]:
# tract_segments.head(10000).explore(column='tsi_segment_id')

In [None]:
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)
trip_cols = ['gtfs_dataset_key', 'name', 'trip_id',
    'shape_id', 'shape_array_key', 'route_id',
    'route_key', 'direction_id', 'route_short_name',
    'trip_instance_key', 'feed_key']

trips = (helpers.import_scheduled_trips(ANALYSIS_DATE, columns=trip_cols)
    .dropna(subset=['shape_id'])
    )

In [None]:
from tqdm import tqdm
tqdm.pandas(desc=f"TSI Segments Progress {ANALYSIS_DATE}")

In [None]:
import define_tsi_segments
importlib.reload(define_tsi_segments)

In [None]:
# tsi_segs = (shapes
#        .groupby('shape_array_key')
#        .progress_apply(define_tsi_segments.overlay_areas_borders, areas_gdf=uza, border_gdf=borders,
#                       id_col='uace20')
#        .reset_index(drop=True)
#       )

In [None]:
tsi_segs = define_tsi_segments.overlay_areas_borders(shape_gdf=shapes, areas_gdf=uza, border_gdf=borders, id_col='uace20')

In [None]:
# m = uza.explore()

In [None]:
# tsi_segs.explore(column='tsi_segment_id', m=m) # too big

In [None]:
tsi_segs.to_parquet(f'uza_tsi_segs_{ANALYSIS_DATE}.parquet')

In [39]:
tsi_segs = gpd.read_parquet('uza_tsi_segs_2025-07-16.parquet')

In [40]:
path = f'{GCS_PATH}{GEOM_SUBFOLDER}tsi_segs_{ANALYSIS_DATE}.parquet'

In [41]:
gcs_geopandas().geo_data_frame_to_parquet(tsi_segs, path)

## `time_distance_in_segments`

In [None]:
import time_distance_in_segments
importlib.reload(time_distance_in_segments)

In [None]:
def read_tsi_segs(tsi_segs, shapes):
    # tsi_segs = gpd.read_parquet(f'tsi_segments_{analysis_date}.parquet')
    tsi_segs = tsi_segs.drop(columns=['geometry'])

    shape_merged = (shapes.merge(tsi_segs, on='shape_array_key')
                         .rename(columns={'geometry': 'shape_geometry'}))

    shape_merged = shape_merged.assign(
        start_meters = shape_merged.shape_geometry.project(shape_merged.start)
    )
    shape_merged = shape_merged.sort_values('start_meters').reset_index(drop=True)
    cols = ['shape_array_key', 'tsi_segment_id', 'start_meters', 'tsi_segment_meters']
    shape_merged = shape_merged[cols]
    return shape_merged

In [None]:
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE, crs=CA_NAD83Albers_m)
st_proj = time_distance_in_segments.attach_projected_stop_times(ANALYSIS_DATE)

In [None]:
shape_merged = read_tsi_segs(gpd.read_parquet('./uza_tsi_segs_2025-07-16.parquet'), shapes)
# shape_merged = time_distance_in_segments.read_tsi_segs(ANALYSIS_DATE, shapes)
tsi_segments_trips = shape_merged.merge(st_proj[['shape_array_key', 'trip_instance_key']].drop_duplicates(), on='shape_array_key')

In [None]:
from dask.diagnostics import ProgressBar
ProgressBar().register()

import dask.dataframe as dd
import dask_geopandas as dg

In [None]:
many_trip_test =(tsi_segments_trips.head(10)
            .groupby('trip_instance_key', group_keys=False)
            .apply(time_distance_in_segments.tract_border_time_by_trip, st_proj_df = st_proj))
meta = many_trip_test[:0]
time_distance_in_segments.dask_calculate_batch(tsi_segments_trips,
                     st_proj, meta).to_parquet(f'tsi_uza_{ANALYSIS_DATE}.parquet')

## results

In [None]:
path = f'{GCS_PATH}urbanized_areas/tsi_uza_2025-07-16.parquet'

In [None]:
df = gcs_pandas().read_parquet(path)

In [None]:
df

In [None]:
df = df.groupby('tsi_segment_id')[['tsi_segment_meters', 'segment_seconds']].sum().reset_index()

In [None]:
df

In [None]:
draft = uza.merge(df, left_on='uace20', right_on = 'tsi_segment_id')

In [None]:
draft.explore(column = 'segment_seconds', scheme = 'FisherJenks', tiles='CartoDBPositron', k=8)

## aggregations

In [164]:
import borders_stops_aggregation
importlib.reload(borders_stops_aggregation)

<module 'borders_stops_aggregation' from '/home/jovyan/data-analyses/transit_service_intensity/borders_stops_aggregation.py'>

In [165]:
sstb = borders_stops_aggregation.read_shapes_stopping_in_seg(ANALYSIS_DATE)

In [166]:
sstb.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
0,a66e6c6c1aaa8f579f48a7912d68a594,2683,True
0,41ed37b9a26ffe2ad724e4247c2d4a0a,2683,True
0,d2fe16b0b3ff388bfab976575f248026,2683,True


In [167]:
path = f'{GCS_PATH}{GEOM_SUBFOLDER}trip_tsi_segments_{ANALYSIS_DATE}.parquet'

In [168]:
trip_tsi_segments = gcs_pandas().read_parquet(path)

In [169]:
joined = borders_stops_aggregation.attach_stopping_info(trip_segment_df=trip_tsi_segments, shape_stopping_df=sstb)

In [170]:
joined.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds,has_stop
0,59b80ec04501eaaffeedf142ae468bac,3323a4c3-c718-4dfb-8d1f-b80711810763,68224.295195,2515.768806,2a591130c1edeac89e62db5c22cb2805,28424.575398,28519.424101,94.848704,False
1,9087d981ca93ccc12d566eaa0389e20a,51445,68277.486823,178.134082,12a1749348fae282c28dd007025868fd,26700.0,26700.0,0.0,True
2,fe80f6e976b9373b46360c006f305a2a,78661,68621.298056,163.178124,98064cfe0042bce07ee76615190399ba,25376.086859,25394.733445,18.646587,True


In [171]:
stopping_sequences_df = borders_stops_aggregation.assign_stopping_sequences(joined)

  return getattr(df, df_function)(wrapper, **kwargs)
Progress: 100%|██████████| 5764/5764 [00:18<00:00, 305.44it/s]


0 segments out of 26734 can not be matched to a stop


In [172]:
stopping_sequences_df.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds
0,59b80ec04501eaaffeedf142ae468bac,78904,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352
0,59b80ec04501eaaffeedf142ae468bac,79039,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352
1,9087d981ca93ccc12d566eaa0389e20a,51445,178.134082,12a1749348fae282c28dd007025868fd,0.0


In [116]:
path = f'{GCS_PATH}{GEOM_SUBFOLDER}tsi_segs_{ANALYSIS_DATE}.parquet'
tsi_segs = gcs_geopandas().read_parquet(path)

In [117]:
tsi_segs.head(3)

Unnamed: 0,shape_array_key,uace20,name_1,uace20_1,name_2,uace20_2,intersection_id,length,geometry,border,start,tsi_segment_id,tsi_segment_meters
0,5d23e7f673d20a805fc1e2375b8f0989,51445,,,,,,,"LINESTRING (143090.838 -438138.600, 143084.377...",False,POINT (143090.838 -438138.600),51445,5832.806871
1,3fbc2c25ca0c39e05c3d1999ae1dbdee,98020,,,,,,,"LINESTRING (502349.254 -580267.307, 502329.217...",False,POINT (502349.254 -580267.307),98020,721.023224
2,3fbc2c25ca0c39e05c3d1999ae1dbdee,98020,,,,,,,"LINESTRING (502294.081 -579459.977, 502293.858...",False,POINT (502294.081 -579459.977),98020,10452.301272


In [118]:
border_assigned_df = borders_stops_aggregation.assign_borders(stopping_sequences_df=stopping_sequences_df, border_df=tsi_segs, id_col='uace20')

  border_merged['border_divide'] = border_merged.border.replace({True: 2, False: 1})


In [119]:
border_assigned_df.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds,border,uace20
0,59b80ec04501eaaffeedf142ae468bac,78904,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,78904
1,59b80ec04501eaaffeedf142ae468bac,79039,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,79039
2,9087d981ca93ccc12d566eaa0389e20a,51445,178.134082,12a1749348fae282c28dd007025868fd,0.0,False,51445


In [120]:
trips = helpers.import_scheduled_trips(analysis_date=ANALYSIS_DATE, columns=['shape_array_key', 'gtfs_dataset_key'])

In [121]:
border_assigned_df = border_assigned_df.merge(trips, on='shape_array_key')

In [122]:
border_assigned_df.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds,border,uace20,schedule_gtfs_dataset_key
0,59b80ec04501eaaffeedf142ae468bac,78904,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,78904,8a1405af8da1379acc062e346187ac98
1,59b80ec04501eaaffeedf142ae468bac,79039,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,79039,8a1405af8da1379acc062e346187ac98
2,9087d981ca93ccc12d566eaa0389e20a,51445,178.134082,12a1749348fae282c28dd007025868fd,0.0,False,51445,2f1c266fc20f9875777fb752af32a66e


In [124]:
borders_stops_aggregation.aggregate_to_area?

[0;31mSignature:[0m
[0mborders_stops_aggregation[0m[0;34m.[0m[0maggregate_to_area[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mborder_assigned_df[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup_cols[0m[0;34m=[0m[0;34m[[0m[0;34m'tract'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
[0;31mFile:[0m      ~/data-analyses/transit_service_intensity/borders_stops_aggregation.py
[0;31mType:[0m      function

In [234]:
uza_tsi_agency = borders_stops_aggregation.aggregate_to_area(border_assigned_df=border_assigned_df, group_cols=['uace20', 'schedule_gtfs_dataset_key'])

In [174]:
uza = utils.read_uzas()

In [175]:
from calitp_data_analysis.sql import query_sql

## analysis_name

* better to use 

In [229]:
#  ideally add schedule_gtfs_dataset_key to bridge table, avoid the join and date query here

query = f'''

with t1 as (select * from cal-itp-data-infra.mart_transit_database.bridge_gtfs_analysis_name_x_ntd),
t2 as (select schedule_source_record_id, schedule_gtfs_dataset_key, organization_name
from cal-itp-data-infra.mart_transit_database.dim_provider_gtfs_data
where _valid_from <= "{ANALYSIS_DATE}" and _valid_to >= "{ANALYSIS_DATE}"
)
select distinct t1.schedule_gtfs_dataset_name,
t1.analysis_name, t2.schedule_gtfs_dataset_key
from t1
inner join t2 on t1.organization_name = t2.organization_name

'''

In [230]:
analysis_name_bridge = query_sql(query)

In [231]:
analysis_name_bridge

Unnamed: 0,schedule_gtfs_dataset_name,analysis_name,schedule_gtfs_dataset_key
0,Maywood Schedule,City of Maywood,
1,Maywood Schedule,City of Maywood,85bcef5ab0e4d4e108d0e54d559a0994
2,Mountain Transit Schedule,Mountain Area Regional Transit Authority,5ca5d244836397b178993c9bdc4dfb00
3,Mountain Transit Schedule,Mountain Area Regional Transit Authority,
4,Mountain Transit Schedule,Mountain Area Regional Transit Authority,0c092a514e4b9ad1427bdacdc67a0091
...,...,...,...
528,Calabasas Schedule,City of Calabasas,
529,Calabasas Schedule,City of Calabasas,907a502754757f897cef1409d297a397
530,TCRTA Schedule,Tulare County Regional Transit Agency,
531,TCRTA Schedule,Tulare County Regional Transit Agency,b16ecb67f0c7b702488e1ceb8966df4a


In [235]:
uza_tsi_agency = uza_tsi_agency.merge(analysis_name_bridge, on='schedule_gtfs_dataset_key', how='left')
uza_tsi_agency

Unnamed: 0,uace20,schedule_gtfs_dataset_key,daily_vrm_miles,daily_vrh_hours,schedule_gtfs_dataset_name,analysis_name
0,01701,3364ec074ca85001da3abd78be2ae521,190.2,9.1,San Diego International Airport Shuttles Schedule,San Diego International Airport
1,01701,3364ec074ca85001da3abd78be2ae521,190.2,9.1,San Diego Schedule,"San Diego Metropolitan Transit System, Airport..."
2,02683,55a01ef72af21906934ae8ffb4786e86,5383.0,251.9,Bay Area 511 Tri Delta Schedule,Eastern Contra Costa Transit Authority
3,02683,587e730fac4db21d54037e0f12b0dd5d,110.1,3.4,Bay Area 511 County Connection Schedule,Central Contra Costa Transit Authority
4,02683,8a1405af8da1379acc062e346187ac98,1793.3,57.4,Bay Area 511 BART Schedule,San Francisco Bay Area Rapid Transit District
...,...,...,...,...,...,...
576,97966,b0760015c9fcd0500c4fddd5b9bb115b,503.1,36.8,,
577,98020,17712ec68e3869e3c53525426e38cadd,1280.6,71.7,Yuma Schedule,Yuma County Intergovernmental Public Transport...
578,98020,48e137bc977da88970393f629c18432c,152.7,101.3,Amtrak Schedule,Amtrak
579,98020,48e137bc977da88970393f629c18432c,152.7,101.3,Amtrak San Joaquins Schedule,San Joaquin Joint Powers Authority


In [236]:
no_match = uza_tsi_agency[uza_tsi_agency.analysis_name.isna()]

In [237]:
no_match

Unnamed: 0,uace20,schedule_gtfs_dataset_key,daily_vrm_miles,daily_vrh_hours,schedule_gtfs_dataset_name,analysis_name
138,35839,711004f97e690bb62264e3456a87a4f0,170.7,17.7,,
151,41347,a7f5522d7690161fc2be75857d7e2f79,47.7,3.0,,
152,41347,b0760015c9fcd0500c4fddd5b9bb115b,52.4,2.5,,
331,74179,0d65d96d07115e28313f207d5ed0d3b2,33.2,0.8,,
389,78904,09ff99ecac281cec96e67a0890b17d39,57.4,7.2,,
465,79417,711004f97e690bb62264e3456a87a4f0,57.1,8.3,,
504,85870,0d65d96d07115e28313f207d5ed0d3b2,4.1,5.4,,
540,89191,a7f5522d7690161fc2be75857d7e2f79,639.7,30.8,,
541,89191,b0760015c9fcd0500c4fddd5b9bb115b,549.9,26.3,,
542,89218,a7f5522d7690161fc2be75857d7e2f79,150.8,2.5,,


In [238]:
query = f'''

select *
from cal-itp-data-infra.mart_transit_database.dim_provider_gtfs_data
where _valid_from <= "{ANALYSIS_DATE}"
and _valid_to >= "{ANALYSIS_DATE}"
and schedule_gtfs_dataset_key in {tuple(x for x in no_match.schedule_gtfs_dataset_key.to_list())}


'''

In [239]:
query

'\n\nselect *\nfrom cal-itp-data-infra.mart_transit_database.dim_provider_gtfs_data\nwhere _valid_from <= "2025-07-16"\nand _valid_to >= "2025-07-16"\nand schedule_gtfs_dataset_key in (\'711004f97e690bb62264e3456a87a4f0\', \'a7f5522d7690161fc2be75857d7e2f79\', \'b0760015c9fcd0500c4fddd5b9bb115b\', \'0d65d96d07115e28313f207d5ed0d3b2\', \'09ff99ecac281cec96e67a0890b17d39\', \'711004f97e690bb62264e3456a87a4f0\', \'0d65d96d07115e28313f207d5ed0d3b2\', \'a7f5522d7690161fc2be75857d7e2f79\', \'b0760015c9fcd0500c4fddd5b9bb115b\', \'a7f5522d7690161fc2be75857d7e2f79\', \'b0760015c9fcd0500c4fddd5b9bb115b\', \'a7f5522d7690161fc2be75857d7e2f79\', \'b0760015c9fcd0500c4fddd5b9bb115b\')\n\n\n'

In [240]:
query_sql(query) # no match -- likely either no longer operating or changed feeds

Unnamed: 0,key,public_customer_facing_fixed_route,public_customer_facing_or_regional_subfeed_fixed_route,organization_key,organization_name,organization_itp_id,organization_hubspot_company_record_id,organization_ntd_id,organization_source_record_id,service_key,...,vehicle_positions_source_record_id,trip_updates_gtfs_dataset_name,trip_updates_source_record_id,schedule_gtfs_dataset_key,service_alerts_gtfs_dataset_key,vehicle_positions_gtfs_dataset_key,trip_updates_gtfs_dataset_key,_valid_from,_valid_to,_is_current
0,d421d79501c46ce5dfd34fabdb9dd9d1,False,False,5b5950793dd6b992b08deee2233c86f9,Basin Transit,212.0,1880784470.0,9R02-91090,recniaSbsXcRSLsWe,cd5cff96c3d8037f90806049001fce98,...,,,,b0760015c9fcd0500c4fddd5b9bb115b,,,,2025-03-06 00:00:00+00:00,2025-08-25 23:59:59.999999+00:00,False
1,6b97439498124b86209a53d8c2cbd7ce,True,True,e9a4a3161951cd2a725151b644212e96,City of Guadalupe,129.0,9608934542.0,9R02-91043,recVdB3dpp3VhC1bd,ef5e069012d5e4a3c4fa43db6445d838,...,,,,711004f97e690bb62264e3456a87a4f0,,,,2025-03-06 00:00:00+00:00,2025-08-14 23:59:59.999999+00:00,False
2,5b1490684867f65b2f9256cc3c3ad35b,True,True,f790a1680764b204736e3ae5ecd86391,Modoc Transportation Agency,204.0,1880607709.0,9R02-91008,rec5Q3mEXcHPpQ8bn,50436431f03e5abfbb13b942c84494eb,...,,,,0d65d96d07115e28313f207d5ed0d3b2,,,,2025-03-06 00:00:00+00:00,2025-08-25 23:59:59.999999+00:00,False
3,b00d059e56dd545dadc8b63ccad20b9c,True,True,5b5950793dd6b992b08deee2233c86f9,Basin Transit,212.0,1880784470.0,9R02-91090,recniaSbsXcRSLsWe,cd5cff96c3d8037f90806049001fce98,...,recspVPAEPZITy5pm,Basin Transit GMV Trip Updates,recA1r48yGZ6Ad63C,a7f5522d7690161fc2be75857d7e2f79,d0d775cb927f76276d52cc16fa92ac32,acf2b9df4777f003385e794e8b542968,d7f5167a6e0adb73e147e57d229af5c4,2025-03-06 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True
4,4d65c7bbdda0e9920344a917c3669a31,False,False,52eeb25fc4be13024e9079d4d3fa879a,Quarterra Living Inc,,,,recMC4gsvf7C5bgx2,11cf5a6dee725cba65b904abf898754a,...,reczmCalmBJtOTnfk,Bay Area 511 Emery Express Trip Updates,recQjYOlty19nBNmq,09ff99ecac281cec96e67a0890b17d39,191299dc657257f66c71cfa9fc3d3549,b300d7c3cfbe527f3251429c3eacc5d7,e1b5d1020a70bb335aceebb3b96da3f1,2025-02-08 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True
