# Check exported data

In [7]:
import geopandas as gpd
import intake
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from shared_utils import catalog_utils
from update_vars import analysis_date

# Open the intake catalog next to this notebook; each section below reads one
# of its entries via `catalog.<entry>.read()`.
catalog = intake.open_catalog("./catalog.yml")
# Reader helper from calitp_data_analysis; used below to read a parquet file
# from a gs:// path.
gcsgp = GCSGeoPandas()

In [4]:
# Look up the 'gtfs_analytics_data' catalog; its entries are read as attributes
# in the following cells (e.g. `.gcs_paths.SEGMENT_GCS`).
GTFS_DATA_DICT = catalog_utils.get_catalog('gtfs_analytics_data')

In [5]:
# Inspect the configured GCS path prefix that the parquet path below is built from.
GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [6]:
# Inspect the configured file stem (no date suffix, no extension) for the
# speedmap segments time-of-day rollup.
GTFS_DATA_DICT.speedmap_segments.segment_timeofday

'rollup_singleday/speeds_shape_timeofday_speedmap_segments'

In [11]:
# Full parquet path = GCS prefix + file stem + "_" + analysis_date + ".parquet".
path = f"{GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS}{GTFS_DATA_DICT.speedmap_segments.segment_timeofday}_{analysis_date}.parquet"

In [12]:
# Display the assembled path as a sanity check before reading from it.
path

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-11-05.parquet'

In [9]:
# NOTE(review): the error recorded for this cell names a path with no "_"
# before the date, unlike the `path` built above, and this cell's execution
# count is lower than the path cell's. Presumably the recorded output predates
# the path fix; re-run from a fresh kernel to confirm the file is found.
gcsgp.read_parquet(path)

FileNotFoundError: gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments2025-11-05.parquet

In [20]:
def print_stats(gdf):
    """
    Print a quick structural summary of a (Geo)DataFrame.

    Prints, in order: the EPSG code of the frame's CRS, the column index,
    the per-column dtypes, and the number of rows.

    Parameters
    ----------
    gdf
        Any object exposing `.columns`, `.dtypes` and `len()`; a `.crs`
        attribute with a `.to_epsg()` method is used when present.

    Notes
    -----
    When the frame has no CRS (the attribute is missing or is None), the
    first line is `CRS: None` instead of raising `AttributeError`, so the
    remaining statistics are still printed.
    """
    # `crs` can be absent (plain DataFrame) or None (no CRS assigned); calling
    # `.to_epsg()` unconditionally would abort the whole summary.
    crs = getattr(gdf, "crs", None)
    epsg = crs.to_epsg() if crs is not None else None

    print(f"CRS: {epsg}")
    print(f"{gdf.columns}")
    print(gdf.dtypes)
    print(f"# rows: {len(gdf)}")

## Routes

In [21]:
# Read the `ca_transit_routes` catalog entry.
# NOTE: `gdf` is rebound in every section of this notebook, so cells must be
# run top to bottom for `gdf` to refer to the dataset named in the header.
gdf = catalog.ca_transit_routes.read()

In [22]:
# CRS, columns, dtypes and row count for the routes export.
print_stats(gdf)

CRS: 4326
Index(['agency', 'route_id', 'route_type', 'route_name', 'route_length_feet',
       'shape_id', 'n_trips', 'base64_url', 'shn_route', 'on_shs',
       'shn_districts', 'pct_route_on_hwy_across_districts', 'geometry'],
      dtype='object')
agency                                 object
route_id                               object
route_type                             object
route_name                             object
route_length_feet                     float64
shape_id                               object
n_trips                                 int64
base64_url                             object
shn_route                              object
on_shs                                  int64
shn_districts                          object
pct_route_on_hwy_across_districts     float64
geometry                             geometry
dtype: object
# rows: 6927


In [23]:
# Peek at the first three rows.
gdf.head(3)

Unnamed: 0,agency,route_id,route_type,route_name,route_length_feet,shape_id,n_trips,base64_url,shn_route,on_shs,shn_districts,pct_route_on_hwy_across_districts,geometry
0,Marin County Transit District,17,3,17,71740.640202,1,24,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,"1, 101, 580",1,4,43.7,"LINESTRING (-122.52238 37.97087, -122.52235 37..."
1,Marin County Transit District,17,3,17,73956.836506,2,17,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,"1, 131, 101, 580",1,4,44.6,"LINESTRING (-122.48047 37.85714, -122.48041 37..."
2,Marin County Transit District,17,3,17,79841.885053,3,3,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,"1, 131, 101, 580",1,4,37.9,"LINESTRING (-122.48047 37.85714, -122.48041 37..."


In [None]:
# Count distinct (agency, route_id, shape_id) combinations in the routes export.
cols = [
    'agency',
    'route_id',
    'shape_id',
]
print(f"# unique combos: {gdf[cols].drop_duplicates().shape[0]}")

In [None]:
def make_map(gdf: gpd.GeoDataFrame, plot_col: str):
    """
    Show an interactive map of `gdf` colored by `plot_col`.

    Calls `gdf.explore` treating `plot_col` as categorical, on the
    "CartoDB Positron" basemap with the legend turned off, and hands the
    resulting map to `display`. Returns None.
    """
    explore_kwargs = {
        "categorical": True,
        "tiles": "CartoDB Positron",
        "legend": False,
    }
    display(gdf.explore(plot_col, **explore_kwargs))

In [None]:
# Static overview of the routes, colored by route_id.
# Interactive alternative: make_map(gdf, "route_id")
# The trailing ";" keeps the returned Axes repr out of the cell output.
gdf.plot("route_id");

## Stops

In [None]:
# Read the `ca_transit_stops` catalog entry (rebinds `gdf`).
gdf = catalog.ca_transit_stops.read()   

In [None]:
# CRS, columns, dtypes and row count for the stops export.
print_stats(gdf)

In [None]:
# Count distinct (agency, route_ids_served, stop_id) combinations in the stops export.
cols = [
    'agency',
    'route_ids_served',
    'stop_id',
]
print(f"# unique combos: {gdf[cols].drop_duplicates().shape[0]}")

In [None]:
# Static overview of the stops, colored by stop_id.
# Interactive alternative: make_map(gdf, "stop_id")
# The trailing ";" keeps the returned Axes repr out of the cell output.
gdf.plot("stop_id");

## HQTA Areas

In [None]:
# Read the `ca_hq_transit_areas` catalog entry (rebinds `gdf`).
gdf = catalog.ca_hq_transit_areas.read()

In [None]:
# CRS, columns, dtypes and row count for the HQTA areas export.
print_stats(gdf)

In [None]:
# Row counts per route_id among rows whose hqta_type is "major_stop_brt".
gdf.loc[gdf.hqta_type == "major_stop_brt", "route_id"].value_counts()

In [None]:
# Static overview of the HQTA areas, colored by org_id_primary.
# The trailing ";" keeps the returned Axes repr out of the cell output.
gdf.plot("org_id_primary");

## HQTA Stops

In [None]:
# Read the `ca_hq_transit_stops` catalog entry (rebinds `gdf`).
gdf = catalog.ca_hq_transit_stops.read()

In [None]:
# CRS, columns, dtypes and row count for the HQTA stops export.
print_stats(gdf)

In [None]:
# Static overview of the HQTA stops, colored by org_id_primary.
# The trailing ";" keeps the returned Axes repr out of the cell output.
gdf.plot("org_id_primary");

## Speeds by Stop Segments

In [None]:
# Read the `speeds_by_stop_segments` catalog entry (rebinds `gdf`).
gdf = catalog.speeds_by_stop_segments.read()

In [None]:
# CRS, columns, dtypes and row count for the stop-segment speeds export.
print_stats(gdf)

In [None]:
# Histogram of p50_mph with bin edges every 5 from 0 to 75; values outside
# that range fall in no bin.
# The trailing ";" keeps the returned Axes repr out of the cell output.
gdf.p50_mph.hist(bins=range(0, 80, 5));

In [None]:
# Summary statistics for each speed percentile column, printed one after another.
for col in ["p20_mph", "p50_mph", "p80_mph"]:
    print(gdf[col].describe())

In [None]:
# Static map of the segments colored by p50_mph, with a legend.
# The trailing ";" keeps the returned Axes repr out of the cell output.
gdf.plot("p50_mph", legend=True);

## Speeds by Route and Time-of-Day

In [None]:
# Read the `speeds_by_route_time_of_day` catalog entry (rebinds `gdf`).
gdf = catalog.speeds_by_route_time_of_day.read()

In [None]:
# CRS, columns, dtypes and row count for the route time-of-day speeds export.
print_stats(gdf)

In [None]:
# Static map of the routes colored by speed_mph, with a legend.
# The trailing ";" keeps the returned Axes repr out of the cell output.
gdf.plot("speed_mph", legend=True);