In [None]:
import itertools
import json
import pathlib
from enum import Enum

import gcsfs
import geopandas as gpd
import numpy as np
import pandas as pd

from calitp_data_analysis import get_fs

In [None]:
# Filesystem object used further down to list (fs.glob) and open (fs.open) the district source files.
# NOTE(review): presumably a GCS-backed filesystem, since the exports go to gs:// paths — confirm.
fs = get_fs()

Constant values (these should eventually move to a `constants.py` module)

In [None]:
# Category labels for the three infrastructure specs. Plain strings rather than
# an Enum, which would probably be more complexity than is needed here.
LINEAR, STOP, TSP = "linear", "stop", "tsp"

# Columns that are documented for every category
_SHARED_COLUMNS = ("infrastructure_id", "infrastructure_type", "owner", "source")

# Documented columns per category: the shared columns plus the category-specific ones
DOCUMENTED_COLUMNS = {
    LINEAR: {
        *_SHARED_COLUMNS,
        "direction",
        "hours_active",
        "days_active",
        "start_date",
        # "end_date" is intentionally left out: it doesn't appear in the DF
    },
    STOP: {
        *_SHARED_COLUMNS,
        "gtfs_stop_id",
        "direct_freeway_access",
        "amenities",
    },
    TSP: {
        *_SHARED_COLUMNS,
        "tsp_type",
    },
}

# Category (LINEAR / STOP / TSP) that each infrastructure type value belongs to
COLUMN_TYPE_MAP = {
    "transit signal priority": TSP,
    "bus only lane": LINEAR,
    "bus stop": STOP,
    "median bus stop": STOP,
    "freeway median bus stop": STOP,
    "bus hub": STOP,
    "bus and rail hub": STOP,
    "express lane": LINEAR,
    "bus hov lane": LINEAR,
    "transit center": STOP,
}

# Misspelled or inconsistently cased column names, grouped under the column
# name each one should be corrected to
_MISSPELLINGS_BY_COLUMN = {
    "infrastructure_type": [
        "infrastructructure_type",
        "infrastrcture_type",
    ],
    "infrastructure_id": [
        "infrastucture_id",
        "iinfrastructure_id",
        "infrascture_id",
        "infrastructrure_id",
    ],
    "amenities": ["amentities", "Amenities"],
    "grade_separated": ["grade_seperated"],
    "direct_access_ramp": ["Direct Access Ramp"],
    "direct_freeway_access": ["Direct Freeway Access"],
    "direction": ["direction_both"],
    "gtfs_stop_id": ["grfs_stop_id"],
    "owner": ["Owner"],
}

# Flat lookup of misspelled column name -> corrected column name
CORRECTION_COLUMN_NAMES = {
    misspelled: corrected
    for corrected, misspellings in _MISSPELLINGS_BY_COLUMN.items()
    for misspelled in misspellings
}

# Value corrections, keyed by column name: {column: {value as found: replacement}}.
# NOTE(review): the hours_active replacement contains a colon after "9am"
# ("7am-9am: and 4pm-7pm") — confirm that this is the intended spelling.
CORRECTION_VALUES = dict(
    direction={"one way west/northbound": "one way"},
    hours_active={"7am-9am 4pm-7pm": "7am-9am: and 4pm-7pm"},
    owner={
        "ladot": "city of los angeles",
        "lax": "city of los angeles (lawa)",
        "loma linda": "city of loma linda",
        "culver city": "city of culver city",
        "metro": "lacmta",
        "metrolink": "scrra",
    },
    infrastructure_type={
        "median bus staton": "median bus stop",
        "median bus station": "median bus stop",
        "freeway median bus station": "freeway median bus stop",
    },
)

In [None]:
# Every column used in any of the three specs, plus the geometry column. This doesn't need
# to move to another file, but it is useful for visualization here. The names are sorted
# because iterating a set of strings gives an order that can differ from run to run.
useful_columns = sorted(set(itertools.chain.from_iterable(DOCUMENTED_COLUMNS.values()))) + ["geometry"]

Data cleaning (this should eventually move to a separate Python file)

In [None]:
base_path = pathlib.Path("calitp-analytics-data/data-analyses/transit_priority_infrastructure")
output_path_inconsistent = base_path / "district_data_all_columns"
output_path_minimized = base_path / "district_data_consistent_columns"
paths = fs.glob(str(base_path / "*.geojson"))
gdfs = []
for path in paths:
    # Load the district's feature collection JSON
    with fs.open(path) as f:
        district_json = json.load(f)
    # Convert the feature collection to a GeoDataFrame, turning empty or
    # whitespace-only strings into missing values
    gdf_district = gpd.GeoDataFrame.from_features(
        district_json, crs=4326
    ).replace(
        to_replace=r"^\s*$", value=np.nan, regex=True
    )
    # Replace typo column names. Corrections are applied one at a time, in dict order,
    # so several misspellings of the same column end up merged into one column.
    for typo_name, correct_name in CORRECTION_COLUMN_NAMES.items():
        if typo_name not in gdf_district.columns:
            continue
        if correct_name in gdf_district.columns:
            # Both spellings exist: fill gaps in the correctly named column from the
            # misspelled one, then drop the misspelled one. The filled column is assigned
            # back explicitly; calling fillna(inplace=True) on a selected column is chained
            # assignment, which does not update the frame under pandas Copy-on-Write.
            gdf_district[correct_name] = gdf_district[correct_name].fillna(gdf_district[typo_name])
            gdf_district = gdf_district.drop(columns=typo_name)
        else:
            gdf_district = gdf_district.rename(columns={typo_name: correct_name})
    # Make all values lower case
    # NOTE(review): .str.lower() yields NaN for non-string values in an object column
    # (e.g. numbers or lists) — confirm that no such values need to be kept.
    for column in gdf_district.select_dtypes(include=[object]).columns:
        gdf_district[column] = gdf_district[column].str.lower()
    # Replace undocumented values
    gdf_district = gdf_district.replace(CORRECTION_VALUES)
    gdfs.append(gdf_district)
# ignore_index gives the combined frame unique row labels; without it every district
# restarts at 0 and label-based assignments further down operate on duplicate labels.
gdf_processed = pd.concat(gdfs, axis=0, ignore_index=True)
# Handle the one case where infrastructure_id does not uniquely identify a corridor or stop
is_canoga_bus_hub = (
    (gdf_processed["infrastructure_id"] == "canoga station")
    & (gdf_processed["infrastructure_type"] == "bus hub")
)
gdf_processed.loc[is_canoga_bus_hub, "infrastructure_id"] = "canoga station off-busway platform"
# Categorize for export; infrastructure types missing from COLUMN_TYPE_MAP get a NaN category
gdf_processed["category"] = gdf_processed["infrastructure_type"].map(COLUMN_TYPE_MAP)

In [None]:
# Build a dissolve key that is populated for every row.
# Rows that have an infrastructure_id use it (as a string). Rows without one each get
# their own generated key, so they are never merged with another feature. The generated
# keys carry a prefix: bare counters ("1", "2", ...) could equal a real infrastructure_id
# and cause unrelated features to be dissolved together.
na_id_points = gdf_processed["infrastructure_id"].isna()
gdf_processed["infrastructure_id_filled"] = gdf_processed["infrastructure_id"].astype(str)
gdf_processed.loc[na_id_points, "infrastructure_id_filled"] = [
    f"missing_infrastructure_id_{n}" for n in range(1, int(na_id_points.sum()) + 1)
]
# Dissolve: one row per key, with the geometries of rows sharing a key merged
gdf_dissolved = gdf_processed.dissolve(by=["infrastructure_id_filled"], as_index=False)

In [None]:
# Display-only cell: shows the category assigned to each infrastructure type
COLUMN_TYPE_MAP.values()

Export to GCS

In [None]:
def get_gdf_categorized(gdf, category, category_name="category", category_source=None):
    """Return the rows of one category, limited to that category's documented columns.

    Parameters
    ----------
    gdf
        Frame holding all categories. Its index is reset first, so index
        levels can be used as columns.
    category
        Category value to keep; also the key looked up in ``category_source``.
    category_name
        Name of the column that holds each row's category.
    category_source
        Mapping of category -> collection of column names to keep. Defaults to
        the module-level ``DOCUMENTED_COLUMNS``, looked up when the function is
        called so that re-running the constants cell takes effect without
        re-defining this function.

    Returns
    -------
    A copy containing the matching rows, with the documented columns in sorted
    order followed by the geometry column.

    Raises
    ------
    KeyError
        If ``category`` is not in ``category_source`` or a listed column is
        missing from ``gdf``.
    """
    if category_source is None:
        category_source = DOCUMENTED_COLUMNS
    gdf_reset = gdf.reset_index()
    # The documented columns are stored as sets; sorting them keeps the column
    # order of the result (and of files written from it) the same on every run.
    selected_columns = sorted(category_source[category]) + [gdf_reset.geometry.name]
    in_category = gdf_reset[category_name] == category
    return gdf_reset.loc[in_category, selected_columns].copy()

#TODO: handle NA category values
# Write one parquet file per category: first the non-dissolved features, then the dissolved ones
for category in set(COLUMN_TYPE_MAP.values()):
    output_file_name = f"{category.lower()}.parquet"
    for folder_name, gdf_source in (
        ("processed_infrastructure", gdf_processed),
        ("processed_dissolved_infrastructure", gdf_dissolved),
    ):
        destination = f"gs://{base_path / folder_name / output_file_name}"
        get_gdf_categorized(gdf_source, category).to_parquet(destination)

Visualizing the cleaned result with all infra categories

In [None]:
# Map of the cleaned (non-dissolved) features, limited to the documented columns and colored by infrastructure type
gdf_processed[useful_columns].explore(column="infrastructure_type")

In [None]:
# Read one of the dissolved exports back from GCS.
# NOTE(review): the variable is named gdf_linear but the file read is stop.parquet — confirm which one is intended.
gdf_linear = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/transit_priority_infrastructure/processed_dissolved_infrastructure/stop.parquet")

Check that the results saved to GCS look correct

In [None]:
# Map the frame that was read back from GCS, colored by infrastructure type
gdf_linear.explore(column="infrastructure_type")