In [1]:
import os
# Set CALITP_BQ_MAX_BYTES to 10**12, i.e. 1 TB (decimal), as a string.
# NOTE(review): presumably the bytes-scanned cap read by the calitp tooling imported below - confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000)

In [28]:
import geopandas as gpd
import pandas as pd
from siuba import *
import numpy as np

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from shared_utils import rt_utils

import sys
sys.path.append('./scripts/')

from calitp_data_analysis import geography_utils

In [3]:
from update_vars import (analysis_date, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD, SHARED_STOP_THRESHOLD,
                        HALF_MILE_BUFFER_METERS, INTERSECTION_BUFFER_METERS)

In [4]:
import prep_pairwise_intersections

In [5]:
# !pip install calitp-data-analysis==2025.6.24
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
gcsgp = GCSGeoPandas()

In [6]:
import pyproj
geodesic = pyproj.Geod(ellps="WGS84")

## Refine intersections with azimuth

In [7]:
# NOTE(review): later cells read `corridors` (`corridors.head(3)`, the altair chart,
# `corridors.explore(...)`, the "Rapid 522" sjoin test and `pairwise_intersections(corridors)`),
# and the disabled line below is the only place in this notebook that defines it.
# Re-enable it before a fresh-kernel Restart & Run All, otherwise those cells raise NameError.
# corridors = prep_pairwise_intersections.prep_bus_corridors(is_ms_precursor=True)

In [8]:
# corridors.columns

In [9]:
# segments = gcsgp.read_parquet(f'{GCS_FILE_PATH}hqta_segments.parquet')

In [10]:
# segments = rt_utils.add_origin_destination(segments.to_crs(geography_utils.WGS84))

In [None]:
def add_azimuth(row: pd.Series, geodesic: pyproj.Geod) -> pd.Series:
    '''
    Attach forward and back azimuths to a single segment row.

    The first coordinate of `row.origin` and of `row.destination` is read as
    (longitude, latitude) in WGS84 and passed to `geodesic.inv`. Three values
    are then written onto the row, which is returned:

    * `fwd_azimuth`  - azimuth from origin to destination
    * `back_azimuth` - azimuth from destination back to origin
    * `max_azimuth`  - the larger of the two, for consistent comparison later

    The azimuths are stored exactly as `geodesic.inv` returns them (signed
    degrees in pyproj's convention); no 0-360 conversion happens here.

    Raises AssertionError ('CRS must be WGS84') when any coordinate has an
    absolute value above 180.
    '''
    origin_lon, origin_lat = row.origin.coords[0]
    dest_lon, dest_lat = row.destination.coords[0]
    # Sanity check on the CRS: every coordinate must lie within +/-180.
    coordinates = [origin_lon, origin_lat, dest_lon, dest_lat]
    assert all([abs(value) <= 180 for value in coordinates]), 'CRS must be WGS84'

    forward, backward, _ = geodesic.inv(
        origin_lon, origin_lat, dest_lon, dest_lat, return_back_azimuth=True
    )
    row['fwd_azimuth'] = forward
    row['back_azimuth'] = backward
    row['max_azimuth'] = max(forward, backward)
    return row

In [None]:
# with_azi = segments.apply(add_azimuth, axis=1, geodesic=geodesic) #  added to segments script

In [None]:
def signed_azimuth_to_360_deg(x):
    """Shift a signed azimuth onto the 0-360 scale: negative values get 360 added, others pass through."""
    if x < 0:
        return 360 + x
    return x

In [None]:
signed_azimuth_to_360_deg(-179)

In [None]:
signed_azimuth_to_360_deg(179)

In [None]:
corridors.head(3)

In [None]:
import altair as alt

# Plot a sample rather than the full frame so the chart renders without enabling the
# vegafusion data transformer (alt.data_transformers.enable("vegafusion")).
ALTAIR_SAMPLE_ROWS = 4999
azimuth_cols = corridors[['fwd_azimuth_360', 'segment_direction']]
data = azimuth_cols.sample(
    n=min(len(azimuth_cols), ALTAIR_SAMPLE_ROWS),  # never ask for more rows than exist
    random_state=0,  # fixed seed so the histogram is reproducible across runs
)
alt.Chart(data, title = 'HQTA Segment Directions and Azimuth').mark_bar().encode(
    x=alt.X('fwd_azimuth_360', bin=alt.Bin(step=5)),
    y='count()',
    color='segment_direction'
)

In [None]:
corridors.explore(column='fwd_azimuth_360')

In [None]:
def back_azi(x):
    """Return the opposite bearing of an azimuth: subtract 180 when x >= 180, otherwise add 180."""
    if x >= 180:
        return x - 180
    return x + 180

def azimuth_360_compare(azi1, azi2):
    """
    Return the non-negative difference between two azimuths (larger minus smaller).
    No wrap-around at 0/360 is applied: 350 vs 10 gives 340, not 20.
    """
    larger, smaller = (azi1, azi2) if azi1 >= azi2 else (azi2, azi1)
    return larger - smaller

In [None]:
def find_intersections_azimuth(azi1, azi2, threshold_degrees = 45):
    """
    Decide whether two segments meet at a wide enough angle to count as an intersection.

    Each azimuth describes a segment's direction of travel, but a segment and its
    reverse lie along the same street, so directions are compared modulo 180 degrees:
    the result is the smallest angle between the two undirected lines (0-90).

    Parameters
    ----------
    azi1, azi2 : azimuths in degrees. 0-360 values (e.g. `fwd_azimuth_360`) are the
        expected input; signed (-180..180) or otherwise unnormalised values are
        handled as well because only the difference modulo 180 is used.
    threshold_degrees : segments whose angle is at or below this value are treated
        as roughly parallel.

    Returns
    -------
    bool : True when the angle between the segments exceeds `threshold_degrees`.
    """
    separation = abs(azi1 - azi2) % 180
    # Fold onto 0-90: 170 degrees apart is the same crossing angle as 10 degrees apart.
    crossing_angle = min(separation, 180 - separation)
    # Written as `not (<=)` rather than `>` so that a NaN azimuth keeps yielding True,
    # matching the previous any(<= threshold) formulation.
    return not (crossing_angle <= threshold_degrees)

In [None]:
find_intersections_azimuth(179, 358)

In [None]:
find_intersections_azimuth(10, 350)

In [None]:
find_intersections_azimuth(350, 0)

In [None]:
find_intersections_azimuth(60, 100)

In [None]:
find_intersections_azimuth(39, 12)

In [None]:
find_intersections_azimuth(45, 359)

## test new corridors function

In [None]:
def sjoin_against_other_operators(
    in_group_df: gpd.GeoDataFrame, 
    out_group_df: gpd.GeoDataFrame
) -> pd.DataFrame: 
    """
    Spatial join of the in group vs the out group, refined by azimuth.

    The groups could be an operator vs other operators, or a route vs
    other routes. Segments whose `segment_direction` is "inconclusive" are
    dropped from both sides, the remaining segments are joined on
    `intersects`, and pairs that share a `route_key` are discarded.
    Each surviving pair is then passed to `find_intersections_azimuth`
    with the two `fwd_azimuth_360` values, and only pairs it accepts are kept.

    Doing the cheap sjoin first matters because finding the actual
    intersection geometry is computationally expensive, so we want to do it
    on fewer rows.

    Returns a crosswalk / pairwise table with columns `hqta_segment_id`,
    `intersect_hqta_segment_id`, `fwd_azimuth_360`,
    `intersect_fwd_azimuth_360` and `intersect` (always True in the output).
    An empty table with those columns is returned when nothing qualifies.
    """
    route_cols = ["hqta_segment_id", "segment_direction", "route_key", "fwd_azimuth_360"]
    pair_cols = [
        "hqta_segment_id", "intersect_hqta_segment_id",
        "fwd_azimuth_360", "intersect_fwd_azimuth_360",
    ]
    
    #  TODO replace with other dropping step
    in_group_df = in_group_df[in_group_df["segment_direction"] != "inconclusive"]
    out_group_df = out_group_df[out_group_df["segment_direction"] != "inconclusive"]
    
    s1 = gpd.sjoin(
        in_group_df[route_cols + ["geometry"]], 
        out_group_df[route_cols  + ["geometry"]],
        how = "inner",
        predicate = "intersects"
    ).drop(columns = ["index_right", "geometry"])
    
    # A route touching itself (e.g. where it changes direction) is not an intersection.
    s1 = s1[s1["route_key_left"] != s1["route_key_right"]]
            
    route_pairs = (
        s1.rename(
            columns = {
                "hqta_segment_id_left": "hqta_segment_id",
                "hqta_segment_id_right": "intersect_hqta_segment_id",
                "fwd_azimuth_360_left": "fwd_azimuth_360",
                "fwd_azimuth_360_right": "intersect_fwd_azimuth_360"
            })
          [pair_cols]
          .drop_duplicates()
          .reset_index(drop=True)
    )

    # Build the flag column-wise instead of with DataFrame.apply(axis=1):
    # on an empty frame a row-wise apply hands back a DataFrame rather than a
    # boolean Series, which breaks the assignment/filter below. An explicit
    # bool Series also works for zero rows and avoids per-row Series overhead.
    is_intersection = pd.Series(
        [
            find_intersections_azimuth(azimuth, other_azimuth)
            for azimuth, other_azimuth in zip(
                route_pairs["fwd_azimuth_360"],
                route_pairs["intersect_fwd_azimuth_360"],
            )
        ],
        index = route_pairs.index,
        dtype = bool,
    )
    route_pairs = route_pairs.assign(intersect = is_intersection)
    return route_pairs[route_pairs["intersect"]]

In [None]:
find_intersections_azimuth(59, 21)

In [None]:
test_ecr = sjoin_against_other_operators(corridors.query('route_id == "Rapid 522"'), corridors.query('route_id != "Rapid 522"'))

In [None]:
def pairwise_intersections(
    corridors_gdf: gpd.GeoDataFrame
) -> "tuple[pd.DataFrame, gpd.GeoDataFrame]":
    """
    Do pairwise comparisons of hqta segments, one route at a time.

    For every distinct `route_key` in `corridors_gdf`, that route's segments
    are compared against the segments of all other routes with
    `sjoin_against_other_operators`, and the per-route results are stacked.

    Returns
    -------
    pairs : the concatenated pairwise table.
    corridors2 : the rows of `corridors_gdf` whose `hqta_segment_id` appears on
        either side of a pair, de-duplicated and sorted by
        `schedule_gtfs_dataset_key`, `route_id`, `hqta_segment_id`.
    """
    # Work only from the argument: reading a notebook-level `corridors` here
    # would silently ignore whatever frame the caller passed in.
    results = []
    for route_key in corridors_gdf.route_key.unique():
        is_this_route = corridors_gdf.route_key == route_key
        results.append(
            sjoin_against_other_operators(
                corridors_gdf[is_this_route],
                corridors_gdf[~is_this_route],
            )
        )
    
    pairs = pd.concat(results, axis=0, ignore_index=True)
    
    segments_p1 = pairs.hqta_segment_id.unique()
    segments_p2 = pairs.intersect_hqta_segment_id.unique()
    
    # Subset the hqta segments down to the ones that take part in
    # at least one retained pair (on either side).
    corridors2 = (
        corridors_gdf[
            (corridors_gdf.hqta_segment_id.isin(segments_p1)) | 
            (corridors_gdf.hqta_segment_id.isin(segments_p2))]
        .drop_duplicates()
        .sort_values(
            ["schedule_gtfs_dataset_key", "route_id", "hqta_segment_id"], 
            ascending = [True, True, True])
        .reset_index(drop=True)
    )
    
    return pairs, corridors2

In [None]:
pairs, corridors2 = pairwise_intersections(corridors)

In [None]:
import get_intersections

In [None]:
pairs_table = get_intersections.attach_geometry_to_pairs(corridors2, pairs)

In [None]:
results = get_intersections.find_intersections(pairs_table)

In [None]:
results

### create_bus_hqta_types

In [None]:
import create_bus_hqta_types

In [None]:
def buffer_around_intersections(gdf, buffer_size: int) -> gpd.GeoDataFrame: 
    """
    Buffer every intersection geometry so that stops falling near
    (not exactly on) an intersection are more likely to be caught.

    `buffer_size` is passed straight to `GeoSeries.buffer`; the notebook
    calls this with INTERSECTION_BUFFER_METERS, so a metre-based CRS is
    presumably expected - confirm against the caller.

    Returns the result of `gdf.assign(...)` with the buffered geometry,
    leaving all other columns as they were.
    """
    buffered_geometry = gdf.geometry.buffer(buffer_size)
    return gdf.assign(geometry = buffered_geometry)

In [None]:
bus_intersections = buffer_around_intersections(results, INTERSECTION_BUFFER_METERS)

In [None]:
# Grab point geom with all stops
gtfs_keys = helpers.import_scheduled_trips(
    analysis_date,
    columns = ["feed_key", "gtfs_dataset_key"],
    get_pandas=True
)

all_stops = helpers.import_scheduled_stops(
    analysis_date,
    get_pandas = True,
    columns = ["feed_key", "stop_id", "geometry"],
    crs = PROJECT_CRS
).merge(
    gtfs_keys,
    on = "feed_key",
).drop(columns = "feed_key")

In [None]:
major_stop_bus_testing = create_bus_hqta_types.create_major_stop_bus(all_stops, bus_intersections)

In [None]:
major_stop_bus_testing

### current pipeline result

In [None]:
major_stop_bus = gcsgp.read_parquet(f'{GCS_FILE_PATH}major_stop_bus.parquet')

In [None]:
major_stop_bus

In [None]:
from shared_utils import webmap_utils

In [None]:
import branca

In [None]:
cmap = branca.colormap.step.Spectral_04.scale(vmin=0, vmax=4)

In [None]:
cmap

In [None]:
major_stop_bus['iteration'] = 0
major_stop_bus_testing['iteration'] = 4

In [None]:
gdf = pd.concat([major_stop_bus, major_stop_bus_testing])

In [None]:
color_state = webmap_utils.set_state_export(gdf, filename='test_azimuth_mts_2', map_title='Azimuth for Major Transit Stops',
                                             cmap=cmap, color_col='iteration')

In [None]:
color_state

In [None]:
# Replace each frame's geometry with a HALF_MILE_BUFFER_METERS buffer so the
# following cells can compare covered area.
# NOTE: this overwrites the geometry in place, so the cell is not idempotent -
# running it a second time buffers the already-buffered shapes again.
# Re-create both frames before re-running.
major_stop_bus.geometry = major_stop_bus.buffer(HALF_MILE_BUFFER_METERS)
major_stop_bus_testing.geometry = major_stop_bus_testing.buffer(HALF_MILE_BUFFER_METERS)

In [None]:
major_stop_bus.dissolve().geometry.area

In [None]:
major_stop_bus_testing.dissolve().geometry.area

In [None]:
m = major_stop_bus.dissolve().overlay(major_stop_bus_testing.dissolve(), how='difference').explore(color='red')

In [None]:
major_stop_bus_testing.dissolve().overlay(major_stop_bus.dissolve(), how='difference').explore(m=m)

## Lookback

* `rail_ferry_brt_stops`
* `create_hqta_segments`
* `create_aggregate_stop_frequencies`
* `sjoin_stops_to_segments`
* `create_bus_hqta_types`
* `assemble_hqta_points`
* `assemble_hqta_polygons`
    - `assemble_hqta_points.get_agency_info` only

In [7]:
import datetime as dt
import yaml
from shared_utils import catalog_utils, publish_utils

In [8]:
import lookback_wrappers
import importlib
importlib.reload(lookback_wrappers)

<module 'lookback_wrappers' from '/home/jovyan/data-analyses/high_quality_transit_areas/lookback_wrappers.py'>

In [9]:
published_operators_yaml = "../gtfs_funnel/published_operators.yml"
lookback_days = 90
    
    # Read in the published operators file
with open(published_operators_yaml) as f:
    published_operators_dict = yaml.safe_load(f)

In [10]:
published_operators_dict = lookback_wrappers.read_published_operators(analysis_date)

In [11]:
published_operators_dict

{'2025-03-12': ['Imperial Valley Transit Schedule',
  'Bay Area 511 Rio Vista Delta Breeze Schedule'],
 '2025-04-16': ['Glenn Schedule'],
 '2025-05-14': ['LAX FlyAway Schedule',
  'Morro Bay Cal-ITP Schedule',
  'Bay Area 511 Golden Gate Ferry Schedule',
  'Long Beach Schedule',
  'Arvin Schedule',
  'Bellflower Bus Schedule',
  'Sacramento Schedule',
  'SBMTD Schedule']}

In [12]:
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [13]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

'crosswalk/gtfs_key_organization'

In [14]:
GTFS_DATA_DICT.schedule_tables.dir

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [15]:
may_date = '2025-05-14'

In [16]:
trips_cols = [
            "name", "feed_key", "trip_id",
            "route_id", "route_type", "route_desc",
            "gtfs_dataset_key", "shape_array_key"
        ]

In [17]:
def get_lookback_trips(published_operators_dict, trips_cols):
    """
    Load scheduled trips for every lookback date and stack them.

    `published_operators_dict` maps a service date to the list of operator
    names to pull from that date; each date is read with
    `helpers.import_scheduled_trips`, filtered to those names and limited to
    `trips_cols`. The per-date frames are concatenated in dict order.
    """
    trips_by_date = [
        helpers.import_scheduled_trips(
            service_date,
            filters=[['name', 'in', operator_names]],
            columns=trips_cols,
        )
        for service_date, operator_names in published_operators_dict.items()
    ]
    return pd.concat(trips_by_date)

In [18]:
lookback_trips = get_lookback_trips(published_operators_dict, trips_cols)

In [19]:
lookback_trips.head(3)

Unnamed: 0,name,feed_key,trip_id,route_id,route_type,route_desc,schedule_gtfs_dataset_key,shape_array_key
0,Imperial Valley Transit Schedule,cdd0c7f4c8f78f1805af16d34b8ed0c3,10314,1,3,,fe4aab1717eca5a2935c32c85a35a5bf,2e7565e0f7520c62b8210f9ba512e33e
1,Imperial Valley Transit Schedule,cdd0c7f4c8f78f1805af16d34b8ed0c3,10300,1,3,,fe4aab1717eca5a2935c32c85a35a5bf,2e7565e0f7520c62b8210f9ba512e33e
2,Imperial Valley Transit Schedule,cdd0c7f4c8f78f1805af16d34b8ed0c3,10316,1,3,,fe4aab1717eca5a2935c32c85a35a5bf,2e7565e0f7520c62b8210f9ba512e33e


In [20]:
st_cols = ["feed_key", "trip_id", "stop_id"]

In [21]:
def get_lookback_st(published_operators_dict, feed_keys, st_cols = st_cols):
    """
    Load scheduled stop_times for every lookback date and stack them.

    One `helpers.import_scheduled_stop_times` call is made per date key in
    `published_operators_dict`, restricted to `feed_keys` and `st_cols`;
    the per-date frames are concatenated in dict order.

    NOTE(review): the same `feed_keys` filter is applied on every date, not
    just the date a feed was selected for - confirm a feed_key cannot match
    on more than one lookback date, otherwise rows would be duplicated.
    """
    stop_times_by_date = [
        helpers.import_scheduled_stop_times(
            service_date,
            columns = st_cols,
            filters = [["feed_key", "in", feed_keys]],
            with_direction = False, #  required to include rail/ferry/brt stops w/out shapes
            get_pandas = True
        )
        for service_date in published_operators_dict
    ]
    return pd.concat(stop_times_by_date)

In [22]:
lookback_st = get_lookback_st(published_operators_dict, lookback_trips.feed_key.unique())

In [23]:
lookback_st.head(3)

Unnamed: 0,feed_key,trip_id,stop_id
0,0f0cd1b91cfdb7c398ac6f028fbfd888,1643,1
1,0f0cd1b91cfdb7c398ac6f028fbfd888,1642,1
2,0f0cd1b91cfdb7c398ac6f028fbfd888,1630,1


In [24]:
stops_cols = ["feed_key", "stop_id", "stop_name", "geometry"]

In [25]:
def get_lookback_stops(published_operators_dict, feed_keys, stops_cols = stops_cols):
    """
    Load scheduled stops for every lookback date and stack them.

    One `helpers.import_scheduled_stops` call is made per date key in
    `published_operators_dict`, restricted to `feed_keys` and `stops_cols`;
    the per-date frames are concatenated in dict order.

    NOTE(review): the same `feed_keys` filter is applied on every date -
    confirm a feed_key cannot match on more than one lookback date.
    """
    stops_by_date = [
        helpers.import_scheduled_stops(
            service_date,
            columns = stops_cols,
            filters = [["feed_key", "in", feed_keys]],
            get_pandas = True
        )
        for service_date in published_operators_dict
    ]
    return pd.concat(stops_by_date)

In [26]:
lookback_stops = get_lookback_stops(published_operators_dict, lookback_trips.feed_key.unique())

In [27]:
lookback_stops.head(3)

Unnamed: 0,feed_key,stop_id,stop_name,geometry
0,5f59bf642bf4f8f7e1479f5992b9d595,2623823,Bellflower Blvd & Somerset Blvd,POINT (173416.007 -455884.654)
1,5f59bf642bf4f8f7e1479f5992b9d595,2623825,Bellflower Blvd and Rosencrans Ave,POINT (173403.915 -455343.436)
2,5f59bf642bf4f8f7e1479f5992b9d595,2623826,Bellflower Blvd-St. Johns High School,POINT (173392.908 -454852.335)


In [30]:
def get_lookback_hqta_shapes(published_operators_dict, feed_keys):
    """
    Build the longest-shape-per-route/direction table for every lookback date
    and stack the results.

    For each date key in `published_operators_dict`:
    * collect the shape keys of Amtrak trips flagged `inside_ca = False`,
    * take `longest_shape_by_route_direction` for that date, keeping rows whose
      shape is not in that Amtrak set and whose `feed_key` is in `feed_keys`,
    * drop `feed_key`, `shape_array_key` and `route_length`,
    * fill missing `direction_id` with 0 and cast it to int.
    """
    shapes_by_date = []
    for service_date in published_operators_dict:
        # Only include certain Amtrak routes
        amtrak_outside_ca = gtfs_schedule_wrangling.amtrak_trips(
            analysis_date = service_date, inside_ca = False)
        # This local name (and `feed_keys`) is referenced via `@` in the query string below.
        outside_amtrak_shapes = amtrak_outside_ca.shape_array_key.unique()

        longest_shapes = gtfs_schedule_wrangling.longest_shape_by_route_direction(
            analysis_date = service_date
        )
        kept_shapes = longest_shapes.query(
            'shape_array_key not in @outside_amtrak_shapes & feed_key.isin(@feed_keys)'
        )
        trimmed_shapes = kept_shapes.drop(
            columns = ["feed_key", "shape_array_key", "route_length"]
        )
        shapes_by_date.append(
            trimmed_shapes.fillna({"direction_id": 0}).astype({"direction_id": "int"})
        )
    return pd.concat(shapes_by_date)

In [31]:
lookback_hqta_shapes = get_lookback_hqta_shapes(published_operators_dict, lookback_trips.feed_key.unique())

In [32]:
lookback_hqta_shapes.head(3)

Unnamed: 0,geometry,schedule_gtfs_dataset_key,route_id,direction_id,route_key
164,"LINESTRING (139749.657 -421925.794, 139763.477...",4a3f5fe2e305cc696684d11226ba9878,1,0,47f2a750d318507484e3142abede5f61
165,"LINESTRING (163084.145 -438564.392, 163082.474...",4a3f5fe2e305cc696684d11226ba9878,2,0,17800def41fe8ad6f27479de223db0a3
166,"LINESTRING (147763.839 -450970.618, 147746.213...",4a3f5fe2e305cc696684d11226ba9878,5,0,ee67991f848a2d17547c400a219a945f
