In [1]:
import json
import yaml

from pathlib import Path

from calitp_data_analysis import utils, geography_utils
from update_vars import analysis_date, ESRI_BASE_URL
from open_data_utils import standardize_column_names, remove_internal_keys, esri_truncate_columns

In [2]:
import geopandas as gpd
import intake
import pandas as pd

In [3]:
# Open the intake catalog stored next to this notebook; the datasets
# compared below are read from it by name.
catalog = intake.open_catalog("./catalog.yml")

In [4]:
from update_vars import RUN_ME

# Explore generating shapefile-truncated column names programmatically

https://gdal.org/en/stable/drivers/vector/shapefile.html

> Attribute names can only be up to 10 characters long. The OGR Shapefile driver tries to generate unique field names. Successive duplicate field names, including those created by truncation to 10 characters, will be truncated to 8 characters and appended with a serial number from 1 to 99.

## current renaming dictionaries

* `STANDARDIZED_COLUMNS_DICT` is used to both standardize
column names from our various analyses (as in `create_stops_data.py`, `gcs_to_esri.py`), and
rename shapefile-truncated columns `"route_ids_"` and `"n_hours_in"` via `supplement_metadata.py` and `arcgis_pro_script.py`. 

*   When the desired name is longer than the shapefile limit, we would need keys for both the 
    original name and the shapefile-truncated desired name to point to the full
    desired name, for example:     
    ```
    "meters_to_shn": "meters_to_ca_state_highway"
    # "meters_to_": "meters_to_ca_state_highway" #  not present but would have to be added
    ```
    
* `RENAME_HQTA` and `RENAME_SPEED` exclusively handle shapefile-truncated names

## proposal

* new function `esri_truncate_columns` handles all truncation and can be called in `supplement_metadata.py`
* `STANDARDIZED_COLUMNS_DICT` remains in its standardization role, but doesn't need to rename truncated columns on top of that
* `RENAME_HQTA` and `RENAME_SPEED` are no longer necessary and can be deleted

In [14]:
# Snapshot of the renaming dictionaries currently in use, reproduced here so
# they can be compared with the output of esri_truncate_columns() below.

# Rename columns when shapefile truncates.
# Keys are the names left after the shapefile driver cuts a column name to
# 10 characters; values are the full column names to restore.
RENAME_HQTA = {
    "agency_pri": "agency_primary",
    "agency_sec": "agency_secondary",
    "hqta_detai": "hqta_details",
    "base64_url": "base64_url_primary",
    "base64_u_1": "base64_url_secondary",  # truncated duplicate of "base64_url": 8 chars + "_1"
    "org_id_pri": "org_id_primary",
    "org_id_sec": "org_id_secondary",
    "avg_trips_": "avg_trips_per_peak_hr"
}

# Same idea for the speeds datasets. Note this dict mixes two kinds of keys:
# 10-character truncated names, and two full-length names
# ("organization_source_record_id", "organization_name") that are
# standardization renames rather than truncation fixes.
RENAME_SPEED = {
    "stop_seque": "stop_sequence",
    "time_of_da": "time_of_day",
    "time_perio": "time_period",
    "district_n": "district_name",  # truncation of "district_name"
    "direction_": "direction_id",
    "common_sha": "common_shape_id",
    "avg_sched_": "avg_sched_trip_min", 
    "avg_rt_tri": "avg_rt_trip_min",
    "caltrans_d": "district_name",  # truncation of "caltrans_district"
    "organization_source_record_id": "org_id",
    "organization_name": "agency",
    "stop_pair_": "stop_pair_name"
}

# Maps analysis-specific column names to the standardized names, and also
# carries two truncated keys ("n_hours_in", "route_ids_") so that the same
# dict can undo shapefile truncation for those columns.
STANDARDIZED_COLUMNS_DICT = {
    "caltrans_district": "district_name",
    "organization_source_record_id": "org_id",
    "organization_name": "agency",
    "agency_name_primary": "agency_primary",
    "agency_name_secondary": "agency_secondary",
    "route_name_used": "route_name",
    "route_types_served": "routetypes",
    "n_hours_in": "n_hours_in_service",  # truncated key
    "route_ids_": "route_ids_served",  # truncated key
    "meters_to_shn": "meters_to_ca_state_highway"
    # The standardized names below exceed 10 characters, so their truncated
    # forms would also need entries for the round trip to work:
    # "meters_to_": "meters_to_ca_state_highway" #  not present but would have to be added
    # "district_n": "district_name" #  same as above
}

In [15]:
RUN_ME

['ca_hq_transit_areas',
 'ca_hq_transit_stops',
 'ca_transit_routes',
 'ca_transit_stops',
 'speeds_by_stop_segments',
 'speeds_by_route_time_of_day']

In [16]:
# Stop-segment speeds: read from the catalog, reproject to WGS84, then run
# the shared column standardization and internal-key removal helpers.
speeds_stop = (
    catalog['speeds_by_stop_segments']
    .read()
    .to_crs(geography_utils.WGS84)
    .pipe(standardize_column_names)
    .pipe(remove_internal_keys)
)

drop: ['schedule_gtfs_dataset_key', 'name']


In [17]:
# Route-level speeds by time of day: read from the catalog, reproject to
# WGS84, then run the shared column standardization and internal-key removal
# helpers.
speeds_time_of_day = (
    catalog['speeds_by_route_time_of_day']
    .read()
    .to_crs(geography_utils.WGS84)
    .pipe(standardize_column_names)
    .pipe(remove_internal_keys)
)

drop: ['schedule_gtfs_dataset_key', 'meters_elapsed', 'sec_elapsed', 'name']


In [18]:
# Transit routes: read from the catalog, reproject to WGS84, then run the
# shared column standardization and internal-key removal helpers.
routes = (
    catalog['ca_transit_routes']
    .read()
    .to_crs(geography_utils.WGS84)
    .pipe(standardize_column_names)
    .pipe(remove_internal_keys)
)

drop: []


In [19]:
# Transit stops: read from the catalog, reproject to WGS84, then run the
# shared column standardization and internal-key removal helpers.
stops = (
    catalog['ca_transit_stops']
    .read()
    .to_crs(geography_utils.WGS84)
    .pipe(standardize_column_names)
    .pipe(remove_internal_keys)
)

drop: []


In [20]:
# High-quality transit areas: read from the catalog, reproject to WGS84, then
# run the shared column standardization and internal-key removal helpers.
hqta = (
    catalog['ca_hq_transit_areas']
    .read()
    .to_crs(geography_utils.WGS84)
    .pipe(standardize_column_names)
    .pipe(remove_internal_keys)
)

drop: []


In [21]:
# High-quality transit stops: read from the catalog, reproject to WGS84, then
# run the shared column standardization and internal-key removal helpers.
hq_stops = (
    catalog['ca_hq_transit_stops']
    .read()
    .to_crs(geography_utils.WGS84)
    .pipe(standardize_column_names)
    .pipe(remove_internal_keys)
)

drop: []


In [22]:
# see open_data_utils.py

# def esri_truncate_columns(columns: list | pd.Index) -> dict:
#     '''
#     from a list of columns or df.columns, match gdal algorithm
#     to generate ESRI Shapefile truncated names. Includes handling
#     truncated duplicates.
    
#     https://gdal.org/en/stable/drivers/vector/shapefile.html
    
#     Intended for use after all other renaming and just before shapefile export
#     '''
#     truncated_cols = []
#     for col in columns:
#         if col[:10] not in truncated_cols:
#             truncated_cols += [col[:10]]
#         else: #  truncated duplicate present
#             for i in range(1, 101):
#                 if i > 99: raise Exception("gdal does not support more than 99 truncated duplicates")
#                 suffix = str(i).rjust(2, "_") #  pad single digits with _ on left
#                 if col[:8] + suffix not in truncated_cols:
#                     truncated_cols += [col[:8] + suffix]
#                     break
#     truncated_dict = dict(zip(truncated_cols, columns))
#     truncated_dict = {key: truncated_dict[key] for key in truncated_dict.keys() if key != truncated_dict[key]}
#     return truncated_dict

In [27]:
# esri_truncate_columns(['duplicate'] * 100) #  works but unlikely edge case!
# esri_truncate_columns(['duplicate'] * 101) #  fails and raises as it should.

## compare new function with current dicts

#### hqta

* Note that the function preserves the order of the truncated duplicates: `base64_url_primary` comes first and keeps `base64_url`, while `base64_url_secondary` becomes `base64_u_1`.

In [28]:
RENAME_HQTA

{'agency_pri': 'agency_primary',
 'agency_sec': 'agency_secondary',
 'hqta_detai': 'hqta_details',
 'base64_url': 'base64_url_primary',
 'base64_u_1': 'base64_url_secondary',
 'org_id_pri': 'org_id_primary',
 'org_id_sec': 'org_id_secondary',
 'avg_trips_': 'avg_trips_per_peak_hr'}

In [29]:
esri_truncate_columns(hqta.columns)  # keys/values match RENAME_HQTA; avg_trips_per_peak_hr is missing only because this dataset has no such column

{'agency_pri': 'agency_primary',
 'agency_sec': 'agency_secondary',
 'hqta_detai': 'hqta_details',
 'base64_url': 'base64_url_primary',
 'base64_u_1': 'base64_url_secondary',
 'org_id_pri': 'org_id_primary',
 'org_id_sec': 'org_id_secondary'}

In [30]:
esri_truncate_columns(hq_stops.columns) #  keys/values match

{'agency_pri': 'agency_primary',
 'hqta_detai': 'hqta_details',
 'agency_sec': 'agency_secondary',
 'base64_url': 'base64_url_primary',
 'base64_u_1': 'base64_url_secondary',
 'org_id_pri': 'org_id_primary',
 'org_id_sec': 'org_id_secondary',
 'avg_trips_': 'avg_trips_per_peak_hr'}

#### speeds

In [31]:
RENAME_SPEED

{'stop_seque': 'stop_sequence',
 'time_of_da': 'time_of_day',
 'time_perio': 'time_period',
 'district_n': 'district_name',
 'direction_': 'direction_id',
 'common_sha': 'common_shape_id',
 'avg_sched_': 'avg_sched_trip_min',
 'avg_rt_tri': 'avg_rt_trip_min',
 'caltrans_d': 'district_name',
 'organization_source_record_id': 'org_id',
 'organization_name': 'agency',
 'stop_pair_': 'stop_pair_name'}

Note: `caltrans_district` is now renamed to `district_name` via `STANDARDIZED_COLUMNS_DICT`, but that change isn't reflected in `RENAME_SPEED`.

In [32]:
speeds_stop.columns

Index(['route_id', 'direction_id', 'stop_pair', 'stop_pair_name',
       'time_period', 'p50_mph', 'n_trips', 'p20_mph', 'p80_mph',
       'district_name', 'org_id', 'agency', 'base64_url', 'geometry'],
      dtype='object')

In [33]:
esri_truncate_columns(speeds_stop.columns)

{'direction_': 'direction_id',
 'stop_pair_': 'stop_pair_name',
 'time_perio': 'time_period',
 'district_n': 'district_name'}

In [34]:
speeds_time_of_day.columns

Index(['route_id', 'direction_id', 'time_period', 'speed_mph', 'district_name',
       'org_id', 'agency', 'base64_url', 'route_name', 'geometry'],
      dtype='object')

In [35]:
esri_truncate_columns(speeds_time_of_day.columns)

{'direction_': 'direction_id',
 'time_perio': 'time_period',
 'district_n': 'district_name'}

#### routes and stops

In [36]:
STANDARDIZED_COLUMNS_DICT

{'caltrans_district': 'district_name',
 'organization_source_record_id': 'org_id',
 'organization_name': 'agency',
 'agency_name_primary': 'agency_primary',
 'agency_name_secondary': 'agency_secondary',
 'route_name_used': 'route_name',
 'route_types_served': 'routetypes',
 'n_hours_in': 'n_hours_in_service',
 'route_ids_': 'route_ids_served',
 'meters_to_shn': 'meters_to_ca_state_highway'}

In [37]:
esri_truncate_columns(routes.columns)

{}

In [38]:
routes.columns

Index(['org_id', 'agency', 'route_id', 'route_type', 'route_name', 'shape_id',
       'n_trips', 'base64_url', 'geometry'],
      dtype='object')

In [39]:
[col for col in routes.columns if len(col) > 10] #  all short enough already

[]

In [40]:
esri_truncate_columns(stops.columns)  # first two entries match existing STANDARDIZED_COLUMNS_DICT keys/values; the last two are the truncated names that dict is missing

{'route_ids_': 'route_ids_served',
 'n_hours_in': 'n_hours_in_service',
 'meters_to_': 'meters_to_ca_state_highway',
 'district_n': 'district_name'}