In [None]:
%%capture

import warnings

warnings.filterwarnings("ignore")
import calitp_data_analysis.magics

import geopandas as gpd
import pandas as pd

from great_tables import GT

import _report_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS

In [None]:
# Read in all datasets here, using the table names stored in GTFS_DATA_DICT.
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles

# Disabled for now: nothing further down reads `operator_df`; the operator
# profiles are rebuilt via merge_operator_data in the "Edits" section instead.
#operator_df = pd.read_parquet(
#    f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet"
#)

Functions used in notebook

In [None]:
# Need this, but sometimes we'll subset columns for table
def replace_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` with its column labels passed through
    `_report_utils.replace_column_names`.

    The input frame is left untouched, so a frame displayed or reused in an
    earlier cell keeps its original column names and re-running a cell
    gives the same result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame (or GeoDataFrame) whose columns should be relabelled.

    Returns
    -------
    pd.DataFrame
        A new frame of the same type with the mapped column labels.
    """
    # Copy first: assigning to `.columns` would otherwise relabel the
    # caller's object in place.
    renamed = df.copy()
    renamed.columns = df.columns.map(_report_utils.replace_column_names)
    return renamed

Edits

Work the edits below into the relevant scripts.

In [None]:
## Adjustments to merge_operator_data.py, prototyped here.
# These notes stay in the notebook until the changes can be worked back into
# the script.
from shared_utils import rt_dates
import merge_operator_data
from segment_speed_utils import time_series_utils, gtfs_schedule_wrangling
from update_vars import SCHED_GCS

# Every analysis date from 2024 and 2023.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates 

# Table names looked up from the shared data dictionary.
# OPERATOR_ROUTE is used further down when the route geometries are loaded.
OPERATOR_PROFILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map
SCHED_RT_CATEGORY = GTFS_DATA_DICT.digest_tables.operator_sched_rt
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

# Concat operator profiles across all analysis dates
df = merge_operator_data.concatenate_operator_stats(analysis_date_list)

# Crosswalk columns to attach to each operator profile.
# NOTE: `ntd_cols` is reassigned later in the notebook with a different
# meaning (the NTD columns shown in the operator table).
ntd_cols = [
    "schedule_gtfs_dataset_key",
    # New column to add in merge_operator_data.py: the report filters
    # operators by caltrans_district further down.
    "caltrans_district",
    "counties_served",
    "service_area_sq_miles",
    "hq_city",
    "uza_name",
    "service_area_pop",
    "organization_type",
    "primary_uza",
    "reporter_type"
]

# Merge in NTD data: read the crosswalk for every analysis date, then order
# the rows by service_date (ascending).
# NOTE(review): `service_date` is not in `ntd_cols`, so presumably
# concatenate_datasets_across_dates adds it per date; the sort and the merge
# below both rely on it. Confirm against time_series_utils.
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        CROSSWALK,
        analysis_date_list,
        data_type="df",
        columns=ntd_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

# Left merge: keep every operator-profile row and attach the crosswalk
# attributes that match on feed key and service date.
merge_cols = ["schedule_gtfs_dataset_key", "service_date"]
op_profiles_df1 = pd.merge(
    df, 
    crosswalk_df, 
    on = merge_cols, 
    how = "left"
)

In [None]:
# Keep only the most recent row per operator.
# Deduplicating on `name` instead of schedule_gtfs_dataset_key means an
# operator whose key changed over time (e.g. LA Metro) keeps just its latest
# row rather than one extraneous row per historical key.
op_profiles_test = (
    op_profiles_df1
    .sort_values(by=["service_date", "name"], ascending=[False, True])
    .drop_duplicates(subset=["name"], keep="first")
    .reset_index(drop=True)
)

In [None]:
# Comment out and leave this cell right below pandas
#district = "07 - Los Angeles"

In [None]:
%%capture_parameters
district

# District {district} GTFS Digest

## Start report section

## District Stats placeholder

If we show tables that list individual operators, we may want subtotals or district-wide stats for some of those columns.

* Aggregate GTFS stats for the whole district.
* Get some NTD stats for the district, but maybe we'll add UZA breakdown?


In [None]:
route_typologies = [
    "downtown_local",
    "local",
    "coverage",
    "rapid",
    "express",
    "rail",
]


def district_stats(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Roll operator-level rows up to one row per `group_cols` combination.

    For each group this counts the distinct operator `name` values
    (returned as `n_operators`), sums the operator GTFS totals
    (`operator_n_routes`, `operator_n_trips`, `operator_n_shapes`,
    `operator_n_stops`, `operator_n_arrivals`,
    `operator_route_length_miles`) and the per-typology route counts
    (`n_<typology>_routes` for every entry of `route_typologies`), and
    finally adds `arrivals_per_stop` = summed arrivals / summed stops,
    rounded to 2 decimals.

    Parameters
    ----------
    df : pd.DataFrame
        Operator-level frame holding `name`, the grouping columns and all
        of the summed columns listed above.
    group_cols : list
        Column(s) to group by; passed straight to `DataFrame.groupby`.

    Returns
    -------
    pd.DataFrame
        One row per group, with the grouping columns as regular columns.
    """
    operator_metrics = (
        "n_routes",
        "n_trips",
        "n_shapes",
        "n_stops",
        "n_arrivals",
        "route_length_miles",
    )
    summed_cols = [f"operator_{metric}" for metric in operator_metrics]
    summed_cols.extend(f"n_{typology}_routes" for typology in route_typologies)

    # Operators are counted, everything else is totalled.
    aggregations = {"name": "nunique", **dict.fromkeys(summed_cols, "sum")}

    summary = (
        df.groupby(group_cols, observed=True, group_keys=False)
        .agg(aggregations)
        .reset_index()
        .rename(columns={"name": "n_operators"})
    )

    # Ratio of the group totals, not an average of per-operator ratios.
    summary["arrivals_per_stop"] = (
        summary["operator_n_arrivals"] / summary["operator_n_stops"]
    ).round(2)

    return summary
#op_profiles_test

In [None]:
district_df = district_stats(op_profiles_test, "caltrans_district")

# Show only the row for the district this report is parameterized on.
GT(district_df.loc[lambda stats: stats.caltrans_district == district])

In [None]:
def ntd_counts_pivot_wide(
    df: pd.DataFrame,
    group_cols: list = None,
    count_col: str = ""
) -> pd.DataFrame:
    """
    Count distinct operators per category of `count_col` and spread the
    categories into columns.

    Operators (distinct `name` values) are counted for every combination
    of `group_cols` + `count_col`; the result is then pivoted so each
    `group_cols` combination is one row and each category of `count_col`
    becomes a column labelled "# <category> Operators". Combinations with
    no operators are left as NaN by the pivot.

    Parameters
    ----------
    df : pd.DataFrame
        Operator-level frame with `name`, `group_cols` and `count_col`.
    group_cols : list, optional
        Row-identifying columns. Defaults to
        ["caltrans_district", "uza_name"]; a fresh list is built on every
        call so the default is never shared between calls.
    count_col : str
        Categorical column whose values become the output columns.
        Must be supplied by the caller; the empty default is not a usable
        column name.

    Returns
    -------
    pd.DataFrame
        Wide frame: `group_cols` followed by one count column per category.
    """
    if group_cols is None:
        group_cols = ["caltrans_district", "uza_name"]

    grouped_df = (
        df.groupby(group_cols + [count_col], observed=True, group_keys=False)
        .agg({"name": "nunique"})
        .reset_index()
        .rename(columns={"name": "n_operators"})
    )

    df2 = grouped_df.pivot(
        index=group_cols,
        columns=[count_col],
        values=["n_operators"],
    )

    # The pivot yields 2-level columns (value name, category); flatten them
    # into readable labels such as "# Full Reporter Operators".
    df2.columns = [
        f'# {b} {a.replace("n_operators", "Operators")}' for a, b in df2.columns
    ]
    df2 = df2.reset_index()

    return df2

def sample_ntd_table(df: pd.DataFrame, group_cols: list):
    """
    Render a wide NTD count frame as a great_tables table and display it.

    Every column not listed in `group_cols` is formatted as an integer,
    `uza_name` is relabelled "Urbanized Area", and `caltrans_district`
    is hidden. The table is shown with `display`; nothing is returned.
    """
    count_columns = [col for col in df.columns if col not in group_cols]

    table = GT(df).fmt_integer(columns=count_columns)
    table = table.cols_label(uza_name="Urbanized Area")
    table = table.cols_hide("caltrans_district")

    display(table)

    return None

In [None]:
# Operators per NTD reporter type, by urbanized area, for this district only.
reporter_counts = (
    op_profiles_test
    .loc[lambda profiles: profiles.caltrans_district == district]
    .pipe(ntd_counts_pivot_wide, count_col="reporter_type")
)

sample_ntd_table(reporter_counts, ["caltrans_district", "uza_name"])

In [None]:
# Operators per NTD organization type, by urbanized area, for this district only.
org_counts = (
    op_profiles_test
    .loc[lambda profiles: profiles.caltrans_district == district]
    .pipe(ntd_counts_pivot_wide, count_col="organization_type")
)

sample_ntd_table(org_counts, ["caltrans_district", "uza_name"])

## Existing Operator Section

Move everything from section 1 of the GTFS digest into this section, displayed for all operators in the district.

* GTFS transit operator stats (number of routes, trips, shapes, stops, arrivals, etc).
* NTD caption (these are currently written as captions; lift the underlying data and present it as a table)
* NACTO route typologies

In [None]:
# Load operator dataset, subset for district
district_operator_profiles = op_profiles_test[
    op_profiles_test.caltrans_district == district
].reset_index(drop=True)

In [None]:
# Feed keys of the operators that belong to this district.
operators_in_district = district_operator_profiles.schedule_gtfs_dataset_key.unique()

# Build one dissolved geometry per operator for the district map:
#   1. read only the district's operators (filter applied at read time),
#   2. sort newest service_date first so the de-duplication keeps the most
#      recent row for each operator + route,
#   3. drop the per-route columns, dissolve all routes by operator `name`,
#   4. relabel the columns for display.
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
    filters = [["schedule_gtfs_dataset_key", "in", operators_in_district]],
    columns = ["name", "service_date", 
               "route_combined_name", "geometry"]
).sort_values(
    ["service_date", "name", "route_combined_name"], 
    ascending=[False, True, True]
).drop_duplicates(
    subset = ["name", "route_combined_name"]
).drop(
    columns = ["service_date", "route_combined_name"]
    # drop route because after the dissolve, all operator routes are combined
    # so route would hold only the first row's value
).dissolve(by = "name").reset_index().pipe(replace_column_names)

In [None]:
# Interactive map colored by the "Transit Operator" column.
# NOTE(review): presumably this is the `name` column after replace_column_names; confirm.
operator_route_gdf.explore("Transit Operator", tiles = "CartoDB Positron")

In [None]:
# Identifier columns repeated in every operator table.
shared_cols = ["organization_name", "name"]

# Columns never shown in any table.
exclude_cols = [
    "schedule_gtfs_dataset_key",
    "caltrans_district",
    "organization_source_record_id",
    "service_date",
    "primary_uza",
]

profile_columns = district_operator_profiles.columns

# GTFS schedule stats: any column whose name contains "operator_".
gtfs_service_cols = [col for col in profile_columns if "operator_" in col]

# NACTO route typology counts: columns prefixed with "n_".
nacto_cols = [col for col in profile_columns if col.startswith("n_")]

# Everything not claimed above is treated as an NTD attribute.
claimed_cols = {*gtfs_service_cols, *nacto_cols, *shared_cols, *exclude_cols}
ntd_cols = [col for col in profile_columns if col not in claimed_cols]

In [None]:
# One display frame per table: the shared identifier columns plus that
# table's own columns, relabelled for display.
gtfs_table_df = replace_column_names(
    district_operator_profiles[shared_cols + gtfs_service_cols]
)

nacto_table_df = replace_column_names(
    district_operator_profiles[shared_cols + nacto_cols]
)

ntd_table_df = replace_column_names(
    district_operator_profiles[shared_cols + ntd_cols]
)

In [None]:
string_cols = gtfs_table_df.select_dtypes(include="object").columns.tolist()

# These two columns are shown with one decimal; every other non-text column
# is formatted as a whole number.
decimal_cols = ["Operator Service Miles", "Avg Arrivals per Stop"]
integer_cols = [
    col
    for col in gtfs_table_df.columns
    if col not in decimal_cols and col not in string_cols
]

# Plain white table with no body grid lines.
gtfs_table_options = dict(
    container_width="100%",
    table_background_color="white",
    table_body_hlines_style="none",
    table_body_vlines_style="none",
    heading_background_color="white",
    column_labels_background_color="white",
    row_group_background_color="white",
    stub_background_color="white",
    source_notes_background_color="white",
)

gtfs_table = (
    GT(gtfs_table_df.sort_values("# Trips", ascending=False))
    .fmt_integer(columns=integer_cols)
    .fmt_number(columns=decimal_cols, decimals=1)
    .data_color(
        columns=["# Trips", "Avg Arrivals per Stop"],
        palette=["white", "green"],
        na_color="lightgray",
    )
    .tab_header(
        title=f"District {district} Operator Overview",
        subtitle="GTFS schedule statistics",
    )
    .tab_options(**gtfs_table_options)
)

gtfs_table

In [None]:
number_cols = nacto_table_df.select_dtypes(include="number").columns.tolist()

# Highlight color per route-typology column (white -> color scale).
# TODO: fix these to match GTFS digest
typology_colors = {
    "# Downtown Local Route Types": "purple",
    "# Express Route Types": "orange",
    "# Rapid Route Types": "gold",
    "# Rail Route Types": "red",
}

# Plain white table with no body grid lines.
nacto_table_options = dict(
    container_width="100%",
    table_background_color="white",
    table_body_hlines_style="none",
    table_body_vlines_style="none",
    heading_background_color="white",
    column_labels_background_color="white",
    row_group_background_color="white",
    stub_background_color="white",
    source_notes_background_color="white",
)

nacto_table = GT(nacto_table_df).fmt_integer(columns=number_cols)

# Apply the color scales in the same order as the mapping above.
for typology_col, typology_color in typology_colors.items():
    nacto_table = nacto_table.data_color(
        columns=typology_col,
        palette=["white", typology_color],
        na_color="lightgray",
    )

nacto_table = nacto_table.tab_header(
    title=f"District {district} Operator Overview",
    subtitle="NACTO route typologies",
).tab_options(**nacto_table_options)

nacto_table

In [None]:
# Numeric NTD columns get integer formatting.
# (`number_cols` is reused here; it was last set for the NACTO table.)
number_cols = ntd_table_df.select_dtypes(include="number").columns.tolist()

# NTD attributes per operator, largest service-area population first.
# NOTE(review): ntd_table_df has already been passed through
# replace_column_names, yet this cell refers to the raw column names
# (service_area_pop, reporter_type, organization_type, ...). That only works
# if the name mapping leaves these columns unchanged; confirm against
# _report_utils.
ntd_table = (
    GT(ntd_table_df.sort_values("service_area_pop", ascending=False))
    .fmt_integer(
        columns = number_cols
     ).data_color(
        # NOTE(review): reporter_type looks categorical; confirm the
        # three-color palette maps onto its values as intended.
        columns = "reporter_type",
        palette=["white", "red", "green"],
    ).data_color(
        # Same palette applied to organization_type.
        columns = "organization_type",
        palette=["white", "red", "green"],
    ).tab_header(
         title = f"District {district} Operator Overview",
         subtitle = "NTD stats"
     ).cols_label(
        # Readable headers for the raw NTD column names.
        counties_served = "Counties",
        service_area_sq_miles = "Service Area (sq mi)",
        hq_city = "HQ City",
        uza_name = "Urbanized Area",
        service_area_pop = "Service Area Population",
        organization_type = "Organization Type",
        reporter_type = "Reporter Type"
    )
    # Plain white table with no body grid lines, matching the other tables.
    .tab_options(
        container_width = "100%",
        table_background_color="white",
        table_body_hlines_style="none",
        table_body_vlines_style="none",
        heading_background_color="white",
        column_labels_background_color="white",
        row_group_background_color="white",
        stub_background_color="white",
        source_notes_background_color="white"
     )
    )

ntd_table