## Caltrans District Refactor

In [3]:
import branca
import branca.colormap as cm
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis import geography_utils, utils
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from great_tables import GT
from shared_utils import catalog_utils, webmap_utils

In [4]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [5]:
import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, previous_month

In [6]:
import google.auth

# Resolve the default Google credentials and project for this environment.
# `credentials.token` is passed as a storage option to the GCS parquet reads
# inside load_buffered_shn_map and final_transit_route_shs_outputs.
credentials, project = google.auth.default()

In [7]:
from calitp_data_analysis.sql import get_engine
from calitp_data_analysis.tables import tbls

# NOTE(review): `db_engine` (and the imported `tbls`) are not referenced anywhere
# else in the visible part of this notebook — confirm they are still needed.
db_engine = get_engine()

In [8]:
# Notebook-wide pandas display settings: show up to 100 columns, format floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [9]:
# Folder of processed digest outputs; the route-map and operator-summary
# parquet URLs in the next cells are built from this prefix.
GCS_PATH = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/"

In [10]:
# Parquet URL for the route-map rollup of the current `file_name`.
# NOTE(review): not read anywhere in the visible cells — presumably consumed by
# the missing `prep_gdf`; confirm.
fct_monthly_routes_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}.parquet"

In [11]:
# Parquet URL for the operator-summary rollup; read into `operator_df` below.
operator_summary_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"

In [12]:
# District label to report on. It is compared against the "Caltrans District"
# column of the operator summary and its digits are parsed into `district_int`.
district = '03-Marysville / Sacramento'

In [13]:
# NOTE(review): `prep_gdf` is neither defined nor imported in the visible part of
# this notebook, so this cell raises NameError on a fresh kernel (see the recorded
# error below). Later cells read `fct_monthly_route_df.Number` and pass the frame
# to webmap_utils.set_state_export, so whatever replaces it must provide a
# `Number` column.
fct_monthly_route_df = prep_gdf(district)

NameError: name 'prep_gdf' is not defined

In [None]:
# Operator summary rows for the chosen district, the analysis month, and
# weekdays only. The three conditions are passed as `filters` to the parquet read.
operator_df = pd.read_parquet(
    operator_summary_url,
    filters=[
        ("Caltrans District", "==", district),
        ("Date", "==", pd.Timestamp(analysis_month)),
        ("Day Type", "==", "Weekday")
    ],
)

In [None]:
# Numeric district id: every digit character in the label is concatenated and
# parsed, e.g. '03-Marysville / Sacramento' -> 3.
# NOTE(review): a digit anywhere else in the label would be folded into the number.
district_int = int(''.join(filter(str.isdigit, district)))

In [None]:
# Parquet URL for the crosswalk rollup. The prefix is the same
# "<DIGEST_GCS>processed/" folder that `GCS_PATH` already holds.
FILEPATH_URL = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.crosswalk}_{file_name}.parquet"

In [None]:
# Crosswalk rows ordered by district; the index is reset after sorting and
# exact duplicate rows are then dropped.
df = (
    pd.read_parquet(FILEPATH_URL)
    .sort_values(["caltrans_district"])
    .reset_index(drop=True)
    .drop_duplicates()
)

In [None]:
df

## Summary 
* `arrivals` column is missing.

In [2]:
def transpose_summary_stats(
    df: pd.DataFrame,
    district_col: str = "Caltrans District"
) -> pd.DataFrame:
    """
    Turn a one-row district summary into a long, display-ready table.

    A single wide row renders poorly (columns get shrunk), so the frame is
    transposed into one row per statistic for great_tables.

    Args:
        df: summary frame with one row; every column other than
            `district_col` must be convertible to int.
        district_col: name of the district label column, which is dropped.

    Returns:
        DataFrame with two columns: `Category` (the relabelled statistic
        name) and `Value` (the integer formatted as a string with thousands
        separators).
    """
    def _display_label(col: str) -> str:
        # Only a leading "N " counter prefix becomes "# ". Replacing every
        # capital "N" would also rewrite letters inside a name and leave a
        # double space ("N Trips" -> "#  Trips").
        return f"# {col[2:]}" if col.startswith("N ") else col

    subset_df = df.drop(columns=district_col).reset_index(drop=True)

    display_names = {
        **{c: _display_label(c) for c in subset_df.columns},
        # Snake_case names kept from the original mapping; they are ignored
        # when the columns are absent.
        "n_operators": "# Operators",
        "arrivals_per_stop": "Arrivals per Stop",
        "trips_per_operator": "Trips per Operator",
    }

    # After the transpose the single data row becomes column 0.
    long_df = (
        subset_df.rename(columns=display_names)
        .T.reset_index()
        .rename(columns={0: "Value", "index": "Category"})
    )

    # Change to string for display
    long_df["Value"] = long_df["Value"].astype(int).map("{:,}".format)
    return long_df

NameError: name 'pd' is not defined

In [None]:
def create_summary_table(df:pd.DataFrame)->pd.DataFrame:
    """
    Roll operator rows up to district level and format them for display.

    Per "Caltrans District", counts distinct "Analysis Name" values (renamed
    to "N Operators") and sums "N Trips", "N Stops" and "N Routes". The
    result is passed through transpose_summary_stats, which returns the
    long Category / Value table.
    """
    additive_cols = ["N Trips", "N Stops", "N Routes"]

    aggregations = {"Analysis Name": "nunique"}
    aggregations.update({col: "sum" for col in additive_cols})

    district_totals = df.groupby(
        ["Caltrans District"], observed=True, group_keys=False
    ).agg(aggregations)
    district_totals = district_totals.reset_index()
    district_totals = district_totals.rename(
        columns={"Analysis Name": "N Operators"}
    )

    return transpose_summary_stats(district_totals)

In [None]:
# District-level totals in long Category / Value form, ready for great_tables.
operator_df3 = create_summary_table(operator_df)

In [None]:
# Render the district summary with a title that names the district.
(
    GT(operator_df3
        )
        .tab_header(title=f"District {district} GTFS summary stats")
    )

## Routes within the District

In [None]:
# Files for webmaps
boundary_file = f"district_{district_int}_boundary"
transit_routes_file = f"district_{district_int}_transit_routes"
shn_file = f"district_{district_int}_shn"
transit_shn_file = f"district_{district_int}_transit_routes_shn"

In [None]:
# Base Spectral palette; the next cell rebinds `color_map` to a colormap built
# from a slice of these colors.
color_map = cm.linear.Spectral_11.scale()

In [None]:
# Build the routes colormap from the tail of the Spectral_11 palette, scaled
# from 0 to the largest value in the `Number` column.
# The palette is read straight from branca instead of from the previous value
# of `color_map`: the old self-referential form sliced an already-sliced
# palette every time this cell was re-run.
color_map = branca.colormap.LinearColormap(
    colors=cm.linear.Spectral_11.scale().colors[7:],
    vmin=0,
    vmax=fct_monthly_route_df.Number.max(),
)

In [None]:
def load_ct_district(district:int)->gpd.GeoDataFrame:
    """
    Load the outline of one Caltrans district for the webmap.

    Reads the Caltrans district GeoJSON, reprojects it to
    geography_utils.CA_NAD83Albers_m, keeps the row(s) whose DISTRICT equals
    `district`, and replaces each polygon by its boundary buffered by 100
    units of that CRS (metres, judging by the constant's name), so only the
    outline is drawn.

    Args:
        district: district number compared against the DISTRICT column.

    Returns:
        GeoDataFrame with `geometry`, `color` and `description` columns.

    Raises:
        ValueError: if no row matches `district`.
    """
    caltrans_url = "https://gis.data.ca.gov/datasets/0144574f750f4ccc88749004aca6eb0c_0.geojson?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"
    ca_geojson = (gpd.read_file(caltrans_url)).to_crs(geography_utils.CA_NAD83Albers_m)

    # .copy() so the assignments below write to an independent frame rather
    # than to a slice of `ca_geojson` (avoids chained-assignment warnings).
    district_geojson = ca_geojson.loc[ca_geojson.DISTRICT == district][["geometry"]].copy()

    # Fail with a readable message instead of a length-mismatch error from
    # the column assignment below.
    if district_geojson.empty:
        raise ValueError(f"No Caltrans district geometry found for district {district!r}")

    # Add color column (one RGB tuple per row)
    district_geojson["color"] = [(58, 25, 79)] * len(district_geojson)
    district_geojson["description"] = f"geometry for district {district}"

    # Keep only the outline, then thicken it so it is visible on the map.
    district_geojson.geometry = district_geojson.geometry.boundary.buffer(100)
    return district_geojson

In [None]:
# Buffered outline of the selected district, used as the base webmap layer.
district_gdf = load_ct_district(district_int)

In [None]:
# Export the district outline. The returned state is passed as
# `existing_state` when the transit routes layer is exported below.
district_map = webmap_utils.set_state_export(
    district_gdf,
    subfolder = "caltrans_district_digest/",
    filename=boundary_file,
    map_title="District Map",
    overwrite = True
)

In [None]:
fct_monthly_route_df.columns

In [None]:
# Export the transit routes colored by their `Number` column, passing the
# district outline state as `existing_state`.
transit_routes = webmap_utils.set_state_export(
    fct_monthly_route_df,
    subfolder = "caltrans_district_digest/",
    filename=transit_routes_file,
    map_title="Transit Routes",
    cmap=color_map,
    color_col="Number",
    existing_state=district_map,
    overwrite = True
    
)

In [None]:
# Show a link to the exported routes map (the "spa_link" entry of the export result).
webmap_utils.render_spa_link(transit_routes["spa_link"], text="Open Routes for all Operators Map")

In [None]:
# webmap_utils.display_spa_map(transit_routes["spa_link"])

## Transit Routes on the State Highway Network
**Only transit routes with 15% or more of their length on one or more State Highway Network routes are included.**

In [None]:
# Raw column name -> display name, applied to both the map layer and the text
# table in final_transit_route_shs_outputs.
transit_shn_map_columns = {
    "analysis_name": "Analysis Name",
    "recent_combined_name": "Route",
    "shn_route": "State Highway Network Route",
    "pct_route_on_hwy_across_districts": "Percentage of Transit Route on SHN Across All Districts",
}

In [None]:
# Base RdYlBu palette; the next cell rebinds `color_map2` to a colormap built
# from a slice of these colors.
color_map2 = cm.linear.RdYlBu_11.scale()
# adjust the number of steps as needed

In [None]:
# Colormap for the percentage of a route on the SHN, fixed to the 0-100 range.
# The palette is read straight from branca instead of from the previous value
# of `color_map2`, so re-running this cell no longer slices an already-sliced
# palette.
color_map2 = branca.colormap.LinearColormap(
    colors=cm.linear.RdYlBu_11.scale().colors[7:], vmin=0, vmax=100
)

In [None]:
# Display names applied by load_buffered_shn_map.
# NOTE(review): the keys are lowercase (`shn_route`, `district`) while
# load_buffered_shn_map filters and dissolves on `Route` / `District` —
# confirm these keys match real columns, otherwise the rename is a no-op.
shn_map_readable_columns = {"shn_route": "State Highway Network Route",
                           "district":"District"}

In [None]:
def load_buffered_shn_map(district:int) -> gpd.GeoDataFrame:
    """
    Return the State Highway Network for one district, dissolved and
    buffered for display on the webmaps.

    The network is read from the `state_highway_network` entry of the shared
    data catalog, reprojected to geography_utils.CA_NAD83Albers_m, filtered
    on `District`, dissolved by Route / County / District / RouteType (the
    `Direction` column is dropped) and buffered by 100 CRS units.
    """
    shn_path = catalog_utils.get_catalog("shared_data_catalog").state_highway_network.urlpath

    statewide_shn = gpd.read_parquet(
        shn_path,
        storage_options={"token": credentials.token},
    ).to_crs(geography_utils.CA_NAD83Albers_m)

    # Only the requested district
    district_shn = statewide_shn.loc[statewide_shn.District == district]

    # One geometry per route / county / district / route type
    dissolve_keys = ["Route", "County", "District", "RouteType"]
    dissolved = district_shn.dissolve(by=dissolve_keys).reset_index()
    dissolved = dissolved.drop(columns=["Direction"])

    # Widen the lines so they are visible on the map
    dissolved.geometry = dissolved.geometry.buffer(100)

    # NOTE(review): shn_map_readable_columns is keyed by lowercase
    # `shn_route` / `district`, while this frame is handled via `Route` /
    # `District` — confirm the rename actually changes anything.
    return dissolved.rename(columns=shn_map_readable_columns)

In [None]:
# Buffered State Highway Network geometries for the selected district.
shn_gdf = load_buffered_shn_map(district_int)

In [None]:
# Export the highway layer; `shn_map` is later passed as `existing_state` for
# the transit-routes-on-SHN layer.
# NOTE(review): an identical export is repeated in a later cell.
shn_map = webmap_utils.set_state_export(
    shn_gdf,
    subfolder = "caltrans_district_digest/",
    filename=shn_file,
    map_title="State Highway Network Map",
    map_type='state_highway_network',
    overwrite = True
)

In [None]:
def group_route_district(df: pd.DataFrame, pct_route_on_hwy_agg: str) -> pd.DataFrame:
    """
    Collapse route x highway x district rows into one row per transit route.

    Rows are grouped by `analysis_name` and `recent_combined_name`. The
    distinct `shn_route` and `district` values of each group are joined into
    a comma-separated string, and `pct_route_on_hwy_across_districts` is
    aggregated with `pct_route_on_hwy_agg`, then rounded to 2 decimals.

    Args:
        df: long frame containing the five columns named above.
        pct_route_on_hwy_agg: aggregation name understood by pandas
            (e.g. "max" or "sum") for the percentage column.

    Returns:
        One row per (analysis_name, recent_combined_name).
    """
    def _label_sort_key(label: str):
        # Whole-number labels sort numerically and come first; anything else
        # follows in alphabetical order.
        return (0, int(label), "") if label.isdecimal() else (1, 0, label)

    def _join_unique(values: pd.Series) -> str:
        # Sorting makes the joined text reproducible: iterating a bare set of
        # strings can yield a different order from one run to the next.
        unique_labels = set(values.astype(str))
        return ", ".join(sorted(unique_labels, key=_label_sort_key))

    # Aggregate by adding all the districts and SHN to a single row, rather than
    # multiple and sum up the total % of SHN a transit route intersects with
    agg1 = (
        df.groupby(
            [
                "analysis_name",
                "recent_combined_name",
            ],
            as_index=False,
        )[["shn_route", "district", "pct_route_on_hwy_across_districts"]]
        .agg(
            {
                "shn_route": _join_unique,
                "district": _join_unique,
                "pct_route_on_hwy_across_districts": pct_route_on_hwy_agg,
            }
        )
        .reset_index(drop=True)
    )

    # Clean up
    agg1["pct_route_on_hwy_across_districts"] = (
        agg1["pct_route_on_hwy_across_districts"].astype(float).round(2)
    )
    return agg1

In [None]:
def final_transit_route_shs_outputs(
    pct_route_intersection: int,
    district: int,
):
    """
    Build the map layer and the text table of transit routes that run on the
    State Highway Network (SHN) for the GTFS Caltrans District Digest.

    Two parquet files are read from the `state_highway_network` GCS folder:
    an open-data-portal table and a route/SHN intersection geodataframe
    (presumably produced by prep_open_data_portal and routes_shn_intersection,
    per the original note — not verifiable from this notebook).

    Args:
        pct_route_intersection: minimum value of
            `pct_route_on_hwy_across_districts` for a route to be kept.
        district: Caltrans district number; intersection rows are filtered
            on their `district` column.

    Returns:
        (map_gdf, text_table)
        map_gdf: route geometries reprojected to
            geography_utils.CA_NAD83Albers_m and buffered by 35 CRS units,
            joined to the open-data columns, renamed for display, with a
            `Number` column holding the row index.
        text_table: one row per operator and route with the SHN routes in
            this district joined by commas and the percentage column.
    """
    # Local value; it does not change the notebook-level GCS_PATH.
    GCS_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
    open_data_df = pd.read_parquet(
    f"{GCS_PATH}transit_route_shn_open_data_portal_50.parquet")

    intersecting_gdf = gpd.read_parquet(
    f"{GCS_PATH}transit_route_intersect_shn_50_gtfs_digest.parquet",
    storage_options={"token": credentials.token})

    # Drop routes whose share on the SHN is below the cutoff. This filter is
    # applied to the open-data table only.
    open_data_df = open_data_df.loc[
        (open_data_df.pct_route_on_hwy_across_districts >= pct_route_intersection)
    ]
    
    # Keep only intersections located in the requested district.
    intersecting_gdf = intersecting_gdf.loc[
        intersecting_gdf.district == district
    ]

    # TEMP
    # Align the operator column name with the `analysis_name` key used in the
    # merges below. NOTE(review): presumably a stop-gap until the inputs are
    # regenerated with the new column name — confirm.
    intersecting_gdf = intersecting_gdf.rename(columns = {"portfolio_organization_name":"analysis_name"})
    open_data_df = open_data_df.rename(columns = {"portfolio_organization_name":"analysis_name"})
    # Join back to get the long gdf with the transit route geometries and the names of the
    # state highways these routes intersect with. This gdf will be used to
    # display a map.
    map_gdf = pd.merge(
        intersecting_gdf[
            ["analysis_name", "recent_combined_name", "geometry"]
        ].drop_duplicates(),
        open_data_df,
        on=["analysis_name", "recent_combined_name"],
    )
    
    # Buffer so we can see stuff and change the CRS
    map_gdf = map_gdf.to_crs(geography_utils.CA_NAD83Albers_m)
    map_gdf.geometry = map_gdf.geometry.buffer(35)
    
    # We want a text table to display.
    # Have to rejoin and to find only the SHN routes that are in the district
    # we are interested in.
    text_table_df = pd.merge(
        intersecting_gdf[
            [
                "analysis_name",
                "recent_combined_name",
                "shn_route",
               "district",
            ]
        ],
        open_data_df[
            [
                "analysis_name",
                "recent_combined_name",
                "pct_route_on_hwy_across_districts",
            ]
        ],
        on=["analysis_name", "recent_combined_name"],
    )

    # Now we have to aggregate again so each route will only have one row with the
    # district and SHN route info delineated by commas if there are multiple values.
    text_table = group_route_district(text_table_df, "max").drop(columns = ["district"])

    # Rename for clarity
    # `shn_route` is renamed here first, so the `shn_route` entry of
    # transit_shn_map_columns no longer applies to the text table.
    text_table = text_table.rename(
        columns={
            "shn_route": f"State Highway Network Routes in District {district}",
        }
    )

    text_table = text_table.rename(columns = transit_shn_map_columns)
    # The drop raises KeyError if the open-data table has no `on_shs` column.
    map_gdf = map_gdf.rename(columns = transit_shn_map_columns).drop(columns = ["on_shs"])
    # Expose the row index as a `Number` column.
    map_gdf = map_gdf.reset_index(drop=False)
    map_gdf = map_gdf.rename(columns={"index": "Number"})
    #map_gdf = map_gdf[['Analysis Name', 'Route', 'geometry',
    #   'State Highway Network Route', "Number"]]
    return map_gdf, text_table

In [None]:
# 15 is the percentage cutoff quoted in the section heading above.
transit_route_shs_gdf, transit_route_shs_table = final_transit_route_shs_outputs(15, district_int)

In [None]:
# NOTE(review): this repeats, with identical arguments, the State Highway
# Network export made in an earlier cell — confirm the second export is needed.
shn_map = webmap_utils.set_state_export(
    shn_gdf,
    subfolder = "caltrans_district_digest/",
    filename=shn_file,
    map_title="State Highway Network Map",
    map_type='state_highway_network',
    overwrite = True
)

In [None]:
# Export the transit routes that run on the SHN, colored by the percentage
# column, passing the highway layer state as `existing_state`.
transit_shn_map = webmap_utils.set_state_export(
    transit_route_shs_gdf,
    subfolder = "caltrans_district_digest/",
    filename=transit_shn_file,
    map_title="Transit Routes on the State Highway Network",
    cmap=color_map2,
    color_col="Percentage of Transit Route on SHN Across All Districts",
    existing_state=shn_map,
    legend_url="https://storage.googleapis.com/calitp-map-tiles/transit_route_pct.svg",
    overwrite = True
    
)

In [None]:
# Show a link to the exported routes-on-SHN map (the "spa_link" entry of the export result).
webmap_utils.render_spa_link(transit_shn_map["spa_link"], text="Open Routes on State Highway System Map")

In [None]:
# webmap_utils.display_spa_map(transit_shn_map["spa_link"])

In [None]:
# Text table of routes on the SHN: operators A-Z, and within each operator the
# routes with the highest percentage first.
GT(
    transit_route_shs_table.sort_values(
        by=[
            "Analysis Name",
            "Percentage of Transit Route on SHN Across All Districts",
        ],
        ascending=[True, False],
    )
)

## GTFS Stats by Operator

In [None]:
def create_operator_table(df:pd.DataFrame)->pd.DataFrame:
    """
    Select the per-operator schedule statistics and give them display names.

    Keeps "Analysis Name" (shown as "Operator"), "Daily Trips", "N Routes",
    "N Shapes" and "N Stops"; in the remaining names every "N" is written
    as "#". The input frame is left unchanged.
    """
    source_cols = ["Analysis Name", "Daily Trips", "N Routes", "N Shapes", "N Stops"]

    display_names = {"Analysis Name": "Operator"}
    display_names.update(
        {col: col.replace("N", "#") for col in source_cols if col not in display_names}
    )

    return df[source_cols].rename(columns=display_names)

In [None]:
# Per-operator schedule statistics with display-ready column names.
gtfs_table = create_operator_table(operator_df)

In [None]:
gtfs_table

In [None]:
# Operator table, busiest operator first: every column except "Operator" is
# formatted as an integer and all columns are center-aligned.
(
    GT(gtfs_table.sort_values("Daily Trips", ascending=False))
    .fmt_integer(
        columns=[
            c
            for c in gtfs_table.columns
            if c not in ["Operator"]
        ]
    )
    .tab_header(
        title=f"District {district}",
        subtitle="Daily GTFS schedule statistics for Weekday by operator",
    )
    .cols_align(
        columns=[
            c
            for c in gtfs_table.columns
        ],
        align="center",
    )
)

## Create YML

In [14]:
def generate_operator_grain_yaml() -> pd.DataFrame:
    """
    List the distinct Caltrans districts found in the digest crosswalk.

    Only the `caltrans_district` column is read. Rows are sorted and
    re-indexed before duplicates are dropped, so the surviving index labels
    are the positions of each district's first row in the sorted file, not
    0..n-1.

    NOTE(review): despite the function name, the returned frame is at
    district grain and no YAML is written here.
    """
    FILEPATH_URL = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.crosswalk}_{file_name}.parquet"

    districts = pd.read_parquet(FILEPATH_URL, columns=["caltrans_district"])
    districts = districts.sort_values(["caltrans_district"])
    districts = districts.reset_index(drop=True)

    return districts.drop_duplicates()

In [15]:
# One row per distinct `caltrans_district` label found in the crosswalk.
yml_df = generate_operator_grain_yaml()

In [16]:
yml_df

Unnamed: 0,caltrans_district
0,01-Eureka
6,02-Redding
15,03-Marysville / Sacramento
29,04-Bay Area / Oakland
55,05-San Luis Obispo / Santa Barbara
66,06-Fresno / Bakersfield
76,07-Los Angeles / Ventura
122,08-San Bernardino / Riverside
133,09-Bishop
135,10-Stockton
