In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?

In [2]:
import geopandas as gpd
import pandas as pd
from siuba import *
import numpy as np

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from shared_utils import rt_utils

import sys
sys.path.append('./scripts/')

from calitp_data_analysis import geography_utils

In [3]:
from update_vars import (analysis_date, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD, SHARED_STOP_THRESHOLD,
                        HALF_MILE_BUFFER_METERS, INTERSECTION_BUFFER_METERS)

In [4]:
import prep_pairwise_intersections

In [5]:
# !pip install calitp-data-analysis==2025.6.24
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
gcsgp = GCSGeoPandas()

In [6]:
import pyproj
geodesic = pyproj.Geod(ellps="WGS84")

## Refine intersections with azimuth

In [7]:
# corridors = prep_pairwise_intersections.prep_bus_corridors(is_ms_precursor=True)

In [None]:
# corridors.columns

In [None]:
# segments = gcsgp.read_parquet(f'{GCS_FILE_PATH}hqta_segments.parquet')

In [None]:
# rail = gcsgp.read_parquet(f'{GCS_FILE_PATH}rail_brt_ferry.parquet')

In [None]:
# segments = rt_utils.add_origin_destination(segments.to_crs(geography_utils.WGS84))

In [None]:
def add_azimuth(row: pd.Series, geodesic: pyproj.Geod) -> pd.Series:
    '''
    Given a row of a GeoDataFrame with origin and destination points in WGS84,
    find forward and back azimuth. 
    pyproj method returns signed azimuth -180(south) - -90(west) - 0(north) - +90(east) - +180(south),
    convert this to degrees 0-360 and keep the higher one to enable consistent comparison later
    '''
    # print(geodesic)
    long1, lat1 = row.origin.coords[0]
    long2, lat2 = row.destination.coords[0]
    assert all([abs(x) <= 180 for x in [long1, lat1, long2, lat2]]), 'CRS must be WGS84'
    fwd_azimuth, back_azimuth, _distance = geodesic.inv(long1, lat1, long2, lat2, return_back_azimuth=True)
    # signed_azimuth_to_360_deg = lambda x: 360 + x if x <= 0 else x
    # fwd_azimuth = signed_azimuth_to_360_deg(fwd_azimuth)
    # back_azimuth = signed_azimuth_to_360_deg(back_azimuth)
    max_azimuth = max(fwd_azimuth, back_azimuth)
    row['fwd_azimuth'] = fwd_azimuth
    row['back_azimuth'] = back_azimuth
    row['max_azimuth'] = max_azimuth
    return row

In [None]:
# with_azi = segments.apply(add_azimuth, axis=1, geodesic=geodesic) #  added to segments script

In [None]:
signed_azimuth_to_360_deg = lambda x: 360 + x if x < 0 else x

In [None]:
signed_azimuth_to_360_deg(-179)

In [None]:
signed_azimuth_to_360_deg(179)

In [None]:
corridors.head(3)

In [None]:
import altair as alt
# alt.data_transformers.enable("vegafusion")
data = corridors[['fwd_azimuth_360', 'segment_direction']].sample(4999) #  sample to avoid needing vegafusion for this visual
alt.Chart(data, title = 'HQTA Segment Directions and Azimuth').mark_bar().encode(
    # x=alt.X('azimuth'),
    x=alt.X('fwd_azimuth_360', bin=alt.Bin(step=5)),
    y='count()',
    color='segment_direction'
)

In [None]:
# corridors.explore(column='fwd_azimuth_360')

In [None]:
back_azi = lambda x: x - 180 if x >= 180 else x + 180

def azimuth_360_compare(azi1, azi2) -> float:
    '''
    compare two 360-degree azimuths
    '''
    if azi1 >= azi2:
        return azi1 - azi2
    else:
        return azi2 - azi1

def find_intersections_azimuth(azi1, azi2, threshold_degrees = 45) -> bool:
    '''
    With two 360-degree azimuths, compare all combininations of forward
    and back azimuths to see if all are more than a specified degree threshold apart.
    
    find_intersections_azimuth(360, 45) should return True
    find_intersections_azimuth(40, 80) should return False
    '''
    back_azi_2 = back_azi(azi2)
    back_azi_1 = back_azi(azi1)
    to_compare = [(azi1, azi2), (azi1, back_azi_2), (back_azi_1, azi2), (back_azi_1, back_azi_2)]
    compare_all = [azimuth_360_compare(x, y) for x, y in to_compare]
    # print(compare_all)
    return not(any([x < threshold_degrees for x in compare_all]))

In [None]:
find_intersections_azimuth(179, 358)

In [None]:
find_intersections_azimuth(360, 45)

In [None]:
find_intersections_azimuth(10, 350)

In [None]:
find_intersections_azimuth(350, 0)

In [None]:
find_intersections_azimuth(60, 100)

In [None]:
find_intersections_azimuth(39, 12)

In [None]:
find_intersections_azimuth(45, 359)

## test new corridors function

In [None]:
def sjoin_against_other_operators(
    in_group_df: gpd.GeoDataFrame, 
    out_group_df: gpd.GeoDataFrame
) -> pd.DataFrame: 
    """
    Spatial join of the in group vs the out group. 
    This could be the operator vs other operators, 
    or a route vs other routes. This is currently
    east-west vs north-south segments, which requires
    the additional step of excluding intersections
    resulting from the same route changing direction.
    
    Create a crosswalk / pairwise table showing these links.
    
    Compile all of them, because finding intersections is 
    computationally expensive,
    so we want to do it on fewer rows. 
    """
    route_cols = ["hqta_segment_id", "segment_direction", "route_key", "fwd_azimuth_360"]
    
    s1 = gpd.sjoin(
        in_group_df[route_cols + ["geometry"]], 
        out_group_df[route_cols  + ["geometry"]],
        how = "inner",
        predicate = "intersects"
    ).drop(columns = ["index_right", "geometry"])
                
    route_pairs = (
        s1.rename(
            columns = {
                "hqta_segment_id_left": "hqta_segment_id",
                "hqta_segment_id_right": "intersect_hqta_segment_id",
                "fwd_azimuth_360_left": "fwd_azimuth_360",
                "fwd_azimuth_360_right": "intersect_fwd_azimuth_360"
            })
          [["hqta_segment_id", "intersect_hqta_segment_id", "fwd_azimuth_360", "intersect_fwd_azimuth_360"]]
          .drop_duplicates()
          .reset_index(drop=True)
    )
    route_pairs = route_pairs.assign(intersect = route_pairs.apply(
        lambda x: find_intersections_azimuth(x.fwd_azimuth_360, x.intersect_fwd_azimuth_360), axis=1)
        )
    # print(route_pairs.empty)
    return route_pairs.query('intersect')

In [None]:
find_intersections_azimuth(59, 21)

In [None]:
test_ecr = sjoin_against_other_operators(corridors.query('route_id == "Rapid 522"'), corridors.query('route_id != "Rapid 522"'))

In [None]:
def pairwise_intersections(
    corridors_gdf: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Do pairwise comparisons of hqta segments.
    Take all the north-south segments and compare to east-west
    and vice versa.
    """
    # Intersect each route with all others
    corridors_gdf = corridors_gdf[corridors_gdf['segment_direction'] != 'inconclusive']
    results = [
        sjoin_against_other_operators(corridors.query('route_key == @route_key'),
                                      corridors.query('route_key != @route_key'))
        for route_key in corridors_gdf.route_key.unique()
    ]
    
    pairs = pd.concat(results, axis=0, ignore_index=True)
    
    segments_p1 = pairs.hqta_segment_id.unique()
    segments_p2 = pairs.intersect_hqta_segment_id.unique()
    
    # Subset the hqta segments that do have hq_transit_corr == True 
    # down to the ones where routes have with sjoin intersections
    corridors2 = (
        corridors_gdf[
            (corridors_gdf.hqta_segment_id.isin(segments_p1)) | 
            (corridors_gdf.hqta_segment_id.isin(segments_p2))]
        .drop_duplicates()
        .sort_values(
            ["schedule_gtfs_dataset_key", "route_id", "hqta_segment_id"], 
            ascending = [True, True, True])
        .reset_index(drop=True)
    )
    
    return pairs, corridors2

In [None]:
pairs, corridors2 = pairwise_intersections(corridors)

In [None]:
import get_intersections

In [None]:
pairs_table = get_intersections.attach_geometry_to_pairs(corridors2, pairs)

In [None]:
results = get_intersections.find_intersections(pairs_table)

In [None]:
results

### create_bus_hqta_types

In [None]:
import create_bus_hqta_types

In [None]:
def buffer_around_intersections(gdf, buffer_size: int) -> gpd.GeoDataFrame: 
    """
    Draw 500 ft buffers around intersections to better catch stops
    that might fall within it.
    """
#     gdf = gpd.read_parquet(
#         f"{GCS_FILE_PATH}all_intersections.parquet",
#         storage_options={"token": credentials.token}
#     )
    
    gdf = gdf.assign(
        geometry = gdf.geometry.buffer(buffer_size)
    )

    return gdf 

In [None]:
bus_intersections = buffer_around_intersections(results, INTERSECTION_BUFFER_METERS)

In [None]:
# Grab point geom with all stops
trips_cols = ["feed_key", "gtfs_dataset_key"]
gtfs_keys = helpers.import_scheduled_trips(
    analysis_date,
    columns = trips_cols,
    get_pandas=True
)
published_operators_dict = lookback_wrappers.read_published_operators(analysis_date)
print(published_operators_dict)
lookback_trips = lookback_wrappers.get_lookback_trips(published_operators_dict, trips_cols + ['name'])
lookback_trips_ix = lookback_wrappers.lookback_trips_ix(lookback_trips)
gtfs_keys = pd.concat([gtfs_keys, lookback_trips.drop(columns=['name'])])

stops_cols = ["feed_key", "stop_id", "geometry"]
all_stops = helpers.import_scheduled_stops(
    analysis_date,
    get_pandas = True,
    columns = stops_cols,
    crs = PROJECT_CRS
)
lookback_stops = lookback_wrappers.get_lookback_stops(published_operators_dict, lookback_trips_ix, stops_cols,
                                                     crs=PROJECT_CRS)
all_stops = pd.concat([all_stops, lookback_stops])

all_stops = all_stops.merge(
    gtfs_keys,
    on = "feed_key",
).drop(columns = "feed_key")

In [None]:
major_stop_bus_testing = create_bus_hqta_types.create_major_stop_bus(all_stops, bus_intersections)

In [None]:
major_stop_bus_testing

### current pipeline result

In [None]:
major_stop_bus = gcsgp.read_parquet(f'{GCS_FILE_PATH}major_stop_bus.parquet')

In [None]:
major_stop_bus

In [None]:
from shared_utils import webmap_utils

In [None]:
import branca

In [None]:
cmap = branca.colormap.step.Spectral_04.scale(vmin=0, vmax=4)

In [None]:
cmap

In [None]:
major_stop_bus['iteration'] = 0
major_stop_bus_testing['iteration'] = 4

In [None]:
gdf = pd.concat([major_stop_bus, major_stop_bus_testing])

In [None]:
color_state = webmap_utils.set_state_export(gdf, filename='test_azimuth_mts_5', map_title='Azimuth for Major Transit Stops',
                                             cmap=cmap, color_col='iteration')

In [None]:
color_state

In [None]:
major_stop_bus.geometry = major_stop_bus.buffer(HALF_MILE_BUFFER_METERS)
major_stop_bus_testing.geometry = major_stop_bus_testing.buffer(HALF_MILE_BUFFER_METERS)

In [None]:
major_stop_bus.dissolve().geometry.area

In [None]:
major_stop_bus_testing.dissolve().geometry.area

In [None]:
m = major_stop_bus.dissolve().overlay(major_stop_bus_testing.dissolve(), how='difference').explore(color='red')

In [None]:
# major_stop_bus_testing.dissolve().overlay(major_stop_bus.dissolve(), how='difference').explore(m=m)

## Lookback

* `rail_ferry_brt_stops` trips, stops, st
* `create_hqta_segments` shapes
* `create_aggregate_stop_frequencies` trips, st
* `sjoin_stops_to_segments` stops
* `create_bus_hqta_types` trips, stops
* `assemble_hqta_points` 
* `assemble_hqta_polygons`
    - `assemble_hqta_points.get_agency_info` only

In [8]:
import datetime as dt
import yaml
from shared_utils import catalog_utils, publish_utils
import lookback_wrappers
import assemble_hqta_points
import importlib
importlib.reload(lookback_wrappers)

<module 'lookback_wrappers' from '/home/jovyan/data-analyses/high_quality_transit_areas/lookback_wrappers.py'>

In [9]:
published_operators_dict = lookback_wrappers.read_published_operators(analysis_date)

In [10]:
published_operators_dict

{'2025-03-12': ['Imperial Valley Transit Schedule',
  'Bay Area 511 Rio Vista Delta Breeze Schedule'],
 '2025-04-16': ['Glenn Schedule'],
 '2025-05-14': ['LAX FlyAway Schedule',
  'Morro Bay Cal-ITP Schedule',
  'Bay Area 511 Golden Gate Ferry Schedule',
  'Long Beach Schedule',
  'Arvin Schedule',
  'Bellflower Bus Schedule',
  'Sacramento Schedule',
  'SBMTD Schedule']}

In [11]:
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [12]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

'crosswalk/gtfs_key_organization'

In [13]:
GTFS_DATA_DICT.schedule_tables.dir

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [14]:
trips_cols = [
            "name", "feed_key", "trip_id",
            "route_id", "route_type", "route_desc",
            "gtfs_dataset_key",
        ]

In [15]:
lookback_trips = lookback_wrappers.get_lookback_trips(published_operators_dict, trips_cols)

In [16]:
lookback_trips.head(3)

Unnamed: 0,name,feed_key,trip_id,route_id,route_type,route_desc,schedule_gtfs_dataset_key,lookback_date
0,Imperial Valley Transit Schedule,cdd0c7f4c8f78f1805af16d34b8ed0c3,10314,1,3,,fe4aab1717eca5a2935c32c85a35a5bf,2025-03-12
1,Imperial Valley Transit Schedule,cdd0c7f4c8f78f1805af16d34b8ed0c3,10300,1,3,,fe4aab1717eca5a2935c32c85a35a5bf,2025-03-12
2,Imperial Valley Transit Schedule,cdd0c7f4c8f78f1805af16d34b8ed0c3,10316,1,3,,fe4aab1717eca5a2935c32c85a35a5bf,2025-03-12


In [17]:
lookback_trips_ix = lookback_wrappers.lookback_trips_ix(lookback_trips)

In [18]:
lookback_trips_ix.head(3)

Unnamed: 0,name,feed_key,schedule_gtfs_dataset_key,lookback_date
0,Imperial Valley Transit Schedule,cdd0c7f4c8f78f1805af16d34b8ed0c3,fe4aab1717eca5a2935c32c85a35a5bf,2025-03-12
64,Bay Area 511 Rio Vista Delta Breeze Schedule,b289d80491722393827621e2a0286179,a131b466de5793f858b5dec42fbbb4a9,2025-03-12
0,Glenn Schedule,c74eaf4ebabfbb26ac8c9712bbcbb01e,7f2ae5bcb719c5912462858471a28aa5,2025-04-16


In [19]:
st_cols = ["feed_key", "trip_id", "stop_id", "arrival_hour"]

In [20]:
lookback_st = lookback_wrappers.get_lookback_st(published_operators_dict, lookback_trips_ix, st_cols)

In [21]:
lookback_st.head(3)

Unnamed: 0,feed_key,trip_id,stop_id,arrival_hour
0,cdd0c7f4c8f78f1805af16d34b8ed0c3,10303,53,8.0
1,cdd0c7f4c8f78f1805af16d34b8ed0c3,200305,5,14.0
2,cdd0c7f4c8f78f1805af16d34b8ed0c3,10321,69,20.0


In [22]:
stops_cols = ["feed_key", "stop_id", "stop_name", "geometry"]

In [23]:
lookback_stops = lookback_wrappers.get_lookback_stops(published_operators_dict, lookback_trips_ix, stops_cols)

In [24]:
lookback_stops.head(3)

Unnamed: 0,feed_key,stop_id,stop_name,geometry
0,b289d80491722393827621e2a0286179,2329722,Walmart,POINT (-173771.189 26928.051)
1,b289d80491722393827621e2a0286179,27694,Main & 2nd,POINT (-148022.773 16896.812)
2,b289d80491722393827621e2a0286179,27695,Hillside Terrace & Flores,POINT (-148388.515 17466.601)


In [25]:
lookback_hqta_shapes = lookback_wrappers.get_lookback_hqta_shapes(published_operators_dict, lookback_trips_ix)

In [26]:
lookback_hqta_shapes.head(3)

Unnamed: 0,geometry,schedule_gtfs_dataset_key,route_id,direction_id,route_key
2697,"LINESTRING (-147965.518 16828.747, -148025.619...",a131b466de5793f858b5dec42fbbb4a9,50,0,bc606373dd152e0585a444b5c77fa689
2698,"LINESTRING (-180785.421 27827.886, -180782.894...",a131b466de5793f858b5dec42fbbb4a9,50,1,bc606373dd152e0585a444b5c77fa689
3803,"LINESTRING (422429.667 -582166.415, 422282.476...",fe4aab1717eca5a2935c32c85a35a5bf,1,0,2e7565e0f7520c62b8210f9ba512e33e


In [27]:
lookback_xwalk = assemble_hqta_points.get_lookback_agency_crosswalk(published_operators_dict, lookback_trips_ix)

## tweak `branching_derived_intersections`

In [28]:
import branching_derived_intersections

In [30]:
trips, _ix = branching_derived_intersections.get_trips_with_route_dir(analysis_date, published_operators_dict)

In [32]:
shapes = helpers.import_scheduled_shapes(analysis_date, columns=['shape_array_key', 'geometry'])

feeds = trips[['feed_key', 'schedule_gtfs_dataset_key']].drop_duplicates()
# stops = helpers.import_scheduled_stops(analysis_date, columns=['feed_key', 'stop_id', 'geometry'])

shapes = shapes.merge(trips, on='shape_array_key').assign(length = shapes.geometry.length)
shapes.geometry = shapes.buffer(branching_derived_intersections.BRANCHING_OVERLAY_BUFFER)
shapes = shapes.assign(area = shapes.geometry.map(lambda x: x.area))
max_by_route_dir = shapes.groupby(['schedule_gtfs_dataset_key', 'route_dir']).length.max().reset_index()
shapes = (shapes.merge(max_by_route_dir, on = ['schedule_gtfs_dataset_key', 'route_dir', 'length'])
      .drop_duplicates(subset = ['schedule_gtfs_dataset_key', 'route_dir', 'length'])
     )

In [34]:
shapes.head(3)

Unnamed: 0,shape_array_key,geometry,feed_key,schedule_gtfs_dataset_key,trip_id,route_id,direction_id,route_type,route_short_name,name,lookback_date,route_dir,length,area
0,b4410832dddd41b8ef04743cf996d0b7,"POLYGON ((-216199.773 -38727.908, -216199.378 ...",6340f51d88b74feaa040e4f3a6f7fbe1,b75dfb03ca9b316fdb18f98a6b97e419,110-8,110,0,3,110,South San Francisco Schedule,,110_0,74313.739804,559685.9
1,c42892248fa7d65f51c4661296611a37,"POLYGON ((91133.279 -40603.234, 91129.348 -406...",6a185d2b4606192831e84a125b4cd54c,cb8a465cffec67c8fd90f31b389ed4c3,t_146781_b_2645_tn_0,563,1,3,MMX,Eastern Sierra Schedule,,563_1,68744.282332,2919198.0
2,97bea6ce476801b6e456c0e88b3cfba6,"POLYGON ((-191210.743 -61531.747, -191209.248 ...",9e8eed3cc17c4fdde369dd07bcb45faa,3c275e5acf8974e1afd765bd3011424c,14602095|25712661:T4|16:07:00,14602095,1,3,RP PM,Stanford Schedule,,14602095_1,85356.973002,343233.8


In [57]:
outside_amtrak_shapes = gtfs_schedule_wrangling.amtrak_trips(
    analysis_date = analysis_date, inside_ca = False).shape_array_key.unique()

gdf = gtfs_schedule_wrangling.longest_shape_by_route_direction(
    analysis_date = analysis_date
).query(
    'shape_array_key not in @outside_amtrak_shapes'
).fillna({"direction_id": 0}).astype({"direction_id": "int"})

In [64]:
test_shapes = pd.concat([gdf, lookback_hqta_shapes])
test_shapes = test_shapes.assign(route_dir = test_shapes.apply(lambda x: str(x.route_id) + '_' + str(x.direction_id), axis=1),
                  )

In [59]:
test_shapes.geometry = test_shapes.buffer(branching_derived_intersections.BRANCHING_OVERLAY_BUFFER)

In [60]:
test_shapes = test_shapes.assign(area = test_shapes.geometry.map(lambda x: x.area))

In [62]:
test_shapes

Unnamed: 0,shape_array_key,geometry,feed_key,schedule_gtfs_dataset_key,route_id,direction_id,route_key,route_length,route_dir,area
0,d41e13ef05f70afbaa069ae674a85eba,"POLYGON ((-245338.263 62853.282, -245338.868 6...",04c77e525e367b2752a6c7372a5f741a,0881af3822466784992a49f1cc57d38f,SMART,0,2a33e33ac7acea7f4367fc787da6e831,77586.642251,SMART_0,3.103763e+06
1,a68a8c5063dd4613d26a02fc21a5fb46,"POLYGON ((-245304.574 62874.942, -245006.664 6...",04c77e525e367b2752a6c7372a5f741a,0881af3822466784992a49f1cc57d38f,SMART,1,2a33e33ac7acea7f4367fc787da6e831,77570.717886,SMART_1,3.103713e+06
2,21cb3a14414d64629ff623921e328d42,"POLYGON ((-63493.167 -305725.695, -63559.825 -...",058a3b43698f803c8686f7e72a9efd4c,f4c3ea214214ee0d96f7646b3e9d69dc,11789,1,61c01874dffadb6781fee402c49300d1,17866.964698,11789_1,6.233952e+05
3,0aa0a5f1d73a0a4e16bc7dc99068dd20,"POLYGON ((-63490.271 -305721.984, -63529.754 -...",058a3b43698f803c8686f7e72a9efd4c,f4c3ea214214ee0d96f7646b3e9d69dc,11790,1,ba951c4b56db19822a2ddc802552fb97,16817.401333,11790_1,6.003238e+05
4,101c7fd91b575d21beee7e978d017a7e,"POLYGON ((-61631.080 -302321.844, -61820.076 -...",058a3b43698f803c8686f7e72a9efd4c,f4c3ea214214ee0d96f7646b3e9d69dc,11791,1,5a612df5b615473d91ceb14f4a4e47b6,10519.095010,11791_1,3.959218e+05
...,...,...,...,...,...,...,...,...,...,...
3950,,"POLYGON ((105302.162 -311291.186, 105302.147 -...",,7b62eba740c115d06c0e6baec8d83c12,16776,0,2759ace38e60b004d5c73a45347be111,,16776_0,5.358778e+05
3951,,"POLYGON ((98664.585 -306203.856, 98664.406 -30...",,7b62eba740c115d06c0e6baec8d83c12,16778,0,352650a6063110174759c5bbfc9a38dd,,16778_0,9.139007e+05
3952,,"POLYGON ((98746.169 -311471.774, 98753.655 -31...",,7b62eba740c115d06c0e6baec8d83c12,16779,0,a43788023cffe46666bc6e533f937f22,,16779_0,1.402907e+06
3953,,"POLYGON ((95623.517 -336488.552, 95625.171 -33...",,7b62eba740c115d06c0e6baec8d83c12,16779,1,a43788023cffe46666bc6e533f937f22,,16779_1,1.407364e+06


In [63]:
test_shapes.to_parquet('test_shapes.parquet')