In [9]:
import json
import yaml

from pathlib import Path

from calitp_data_analysis import utils, geography_utils
from update_vars import analysis_date, ESRI_BASE_URL
from open_data_utils import RENAME_HQTA, RENAME_SPEED, STANDARDIZED_COLUMNS_DICT, standardize_column_names, remove_internal_keys

In [2]:
import geopandas as gpd
import intake
import pandas as pd

In [16]:
catalog = intake.open_catalog("./catalog.yml")

In [17]:
from update_vars import RUN_ME

In [18]:
RUN_ME

['ca_hq_transit_areas',
 'ca_hq_transit_stops',
 'ca_transit_routes',
 'ca_transit_stops',
 'speeds_by_stop_segments',
 'speeds_by_route_time_of_day']

In [22]:
for d in ['ca_transit_stops'] :
    stops = catalog[d].read().to_crs(geography_utils.WGS84).pipe(
        standardize_column_names
    ).pipe(
        remove_internal_keys
    )

drop: []


In [23]:
stops.columns

Index(['org_id', 'agency', 'stop_id', 'stop_name', 'n_routes',
       'route_ids_served', 'routetypes', 'n_arrivals', 'n_hours_in_service',
       'meters_to_ca_state_highway', 'base64_url', 'district_name',
       'geometry'],
      dtype='object')

In [24]:
for d in ['ca_hq_transit_areas'] :
    hqta = catalog[d].read().to_crs(geography_utils.WGS84).pipe(
        standardize_column_names
    ).pipe(
        remove_internal_keys
    )

drop: []


In [26]:
hqta.columns

Index(['agency_primary', 'agency_secondary', 'hqta_type', 'hqta_details',
       'route_id', 'base64_url_primary', 'base64_url_secondary',
       'org_id_primary', 'org_id_secondary', 'geometry'],
      dtype='object')

https://gdal.org/en/stable/drivers/vector/shapefile.html

* Attribute names can only be up to 10 characters long. The OGR Shapefile driver tries to generate unique field names. Successive duplicate field names, including those created by truncation to 10 characters, will be truncated to 8 characters and appended with a serial number from 1 to 99.

In [27]:
RENAME_HQTA

{'agency_pri': 'agency_primary',
 'agency_sec': 'agency_secondary',
 'hqta_detai': 'hqta_details',
 'base64_url': 'base64_url_primary',
 'base64_u_1': 'base64_url_secondary',
 'org_id_pri': 'org_id_primary',
 'org_id_sec': 'org_id_secondary',
 'avg_trips_': 'avg_trips_per_peak_hr'}

In [30]:
def shapefile_rename_from_df(df: pd.DataFrame | gpd.GeoDataFrame):
    return

In [32]:
[col[:10] for col in hqta.columns]

['agency_pri',
 'agency_sec',
 'hqta_type',
 'hqta_detai',
 'route_id',
 'base64_url',
 'base64_url',
 'org_id_pri',
 'org_id_sec',
 'geometry']

In [33]:
input_cols = hqta.columns

In [64]:
hqta.columns.to_list() * 2

['agency_primary',
 'agency_secondary',
 'hqta_type',
 'hqta_details',
 'route_id',
 'base64_url_primary',
 'base64_url_secondary',
 'org_id_primary',
 'org_id_secondary',
 'geometry',
 'agency_primary',
 'agency_secondary',
 'hqta_type',
 'hqta_details',
 'route_id',
 'base64_url_primary',
 'base64_url_secondary',
 'org_id_primary',
 'org_id_secondary',
 'geometry']

In [66]:
truncated_cols = []
# for col in hqta.columns:
for col in hqta.columns.to_list() * 2:
    if col[:10] not in truncated_cols:
        truncated_cols += [col[:10]]
    else:
        for i in range(1, 101):
            if i > 99: raise Exception("gdal does not support more than 99 truncated duplicates")
            suffix = str(i).rjust(2, "_") #  pad single digits with _ on left
            if col[:8] + suffix not in truncated_cols:
                truncated_cols += [col[:8] + suffix]
                break

In [67]:
truncated_cols

['agency_pri',
 'agency_sec',
 'hqta_type',
 'hqta_detai',
 'route_id',
 'base64_url',
 'base64_u_1',
 'org_id_pri',
 'org_id_sec',
 'geometry',
 'agency_p_1',
 'agency_s_1',
 'hqta_typ_1',
 'hqta_det_1',
 'route_id_1',
 'base64_u_2',
 'base64_u_3',
 'org_id_p_1',
 'org_id_s_1',
 'geometry_1']

In [36]:
dict(zip(truncated_cols, hqta.columns))

{'agency_pri': 'agency_primary',
 'agency_sec': 'agency_secondary',
 'hqta_type': 'hqta_type',
 'hqta_detai': 'hqta_details',
 'route_id': 'route_id',
 'base64_url': 'base64_url_primary',
 'base64_u_1': 'base64_url_secondary',
 'org_id_pri': 'org_id_primary',
 'org_id_sec': 'org_id_secondary',
 'geometry': 'geometry'}

In [37]:
RENAME_HQTA

{'agency_pri': 'agency_primary',
 'agency_sec': 'agency_secondary',
 'hqta_detai': 'hqta_details',
 'base64_url': 'base64_url_primary',
 'base64_u_1': 'base64_url_secondary',
 'org_id_pri': 'org_id_primary',
 'org_id_sec': 'org_id_secondary',
 'avg_trips_': 'avg_trips_per_peak_hr'}

In [39]:
for d in ['ca_hq_transit_stops'] :
    hq_stops = catalog[d].read().to_crs(geography_utils.WGS84).pipe(
        standardize_column_names
    ).pipe(
        remove_internal_keys
    )

drop: []


In [40]:
dict(zip(truncated_cols, hq_stops.columns))

{'agency_pri': 'agency_primary',
 'agency_sec': 'hqta_type',
 'hqta_type': 'stop_id',
 'hqta_detai': 'route_id',
 'route_id': 'hqta_details',
 'base64_url': 'agency_secondary',
 'base64_u_1': 'base64_url_primary',
 'org_id_pri': 'base64_url_secondary',
 'org_id_sec': 'org_id_primary',
 'geometry': 'org_id_secondary'}

In [41]:
RENAME_HQTA

{'agency_pri': 'agency_primary',
 'agency_sec': 'agency_secondary',
 'hqta_detai': 'hqta_details',
 'base64_url': 'base64_url_primary',
 'base64_u_1': 'base64_url_secondary',
 'org_id_pri': 'org_id_primary',
 'org_id_sec': 'org_id_secondary',
 'avg_trips_': 'avg_trips_per_peak_hr'}