# Refactoring `legislative_district_report.ipynb`

In [1]:
import _legislative_district_prep
import _report_utils
import gcsfs
import geopandas as gpd
import google.auth
import pandas as pd
from great_tables import GT
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS

# GCS path prefix from the project data dictionary; used below to locate the
# operator / legislative-district crosswalk parquet.
SHARED_GCS = GTFS_DATA_DICT.gcs_paths.SHARED_GCS

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# `google.auth` and `gcsfs` are imported in the first cell with the other imports.

# Application-default Google credentials; `credentials.token` is passed to
# geopandas as a storage option when route geometries are read from GCS.
credentials, project = google.auth.default()

# GCS filesystem handle.
fs = gcsfs.GCSFileSystem()

In [4]:
# Comment out and leave this cell right below pandas
# NOTE(review): presumably the district is supplied externally when the report is
# generated per district, and this hard-coded value is for interactive runs — confirm.
district = "AD 03"

In [5]:
# Readable district name (e.g. 'Assembly District 03'); used in the table titles below.
district_full_name = _legislative_district_prep.readable_district_name(district)

In [6]:
# Preview the readable district name.
district_full_name

'Assembly District 03'

In [65]:
def load_district_stats(district_name: "str | None" = None) -> pd.DataFrame:
    """
    Load the most recent operator profile row for each organization in a district.

    Steps:
      1. Read the operator / legislative-district crosswalk, filtered to the
         requested district.
      2. Read the operator profiles table.
      3. Inner-merge the two on ``name``.
      4. Sort by ``service_date`` (newest first) and keep the first row per
         ``portfolio_organization_name``.

    Parameters
    ----------
    district_name : str, optional
        Value matched against the crosswalk's ``legislative_district`` column
        (e.g. ``"AD 03"``). When omitted, the notebook-level ``district``
        variable is used, which is how existing calls behave.

    Returns
    -------
    pd.DataFrame
        One row per ``portfolio_organization_name`` with the columns of both
        inputs.
    """
    if district_name is None:
        # Backward-compatible default: fall back to the notebook parameter.
        district_name = district

    OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles

    legislative_crosswalk = pd.read_parquet(
        f"{SHARED_GCS}crosswalk_transit_operators_legislative_districts.parquet",
        filters=[[("legislative_district", "==", district_name)]],
    )

    operator_df = pd.read_parquet(
        f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet",
    )
    m1 = operator_df.merge(legislative_crosswalk, on="name", how="inner")

    # Newest service_date first, so drop_duplicates (which keeps the first
    # occurrence) retains the most recent row for each organization.
    m1 = m1.sort_values(
        ["service_date", "name"], ascending=[False, True]
    ).drop_duplicates(subset=["portfolio_organization_name"])

    return m1

In [70]:
# Operator profiles for the selected district, one row per portfolio organization.
operator_df = load_district_stats()

In [71]:
# Sanity check: number of organizations kept for the district.
len(operator_df)

8

In [10]:
# District-level summary built from the operator rows (see the transposed preview below).
# NOTE(review): the saved output of this section (e.g. 61,776 trips) does not match the
# per-operator table further down (which sums to 2,438 trips). This cell ran as In [10],
# before `load_district_stats` was last redefined and re-run (In [65]/[70]) — restart the
# kernel and run all cells to refresh.
district_summary = _report_utils.district_stats(operator_df, "legislative_district")

In [11]:
# Transposed so each summary metric is shown on its own row.
district_summary.T

Unnamed: 0,0
legislative_district,AD 03
n_operators,8
operator_n_routes,8094
operator_n_trips,61776
operator_n_stops,71648
operator_n_arrivals,863067
arrivals_per_stop,12.05
trips_per_operator,7722.00


In [12]:
# Ratio metrics go in their own table so they can be formatted with a decimal
# place; the remaining metrics are formatted as integers.
summary_ratio_cols = ["arrivals_per_stop", "trips_per_operator"]

summary_counts_long = _report_utils.transpose_summary_stats(
    district_summary.drop(columns=summary_ratio_cols),
    district_col="legislative_district",
)
summary_ratios_long = _report_utils.transpose_summary_stats(
    district_summary[["legislative_district"] + summary_ratio_cols],
    district_col="legislative_district",
)

# Titled table of integer-formatted metrics.
summary_table1 = (
    GT(summary_counts_long)
    .fmt_integer(columns="value")
    .cols_label(index="")
    .tab_header(title=f"{district_full_name} GTFS summary stats")
)

# Untitled companion table of ratio metrics, one decimal place.
summary_table2 = (
    GT(summary_ratios_long).fmt_number("value", decimals=1).cols_label(index="")
)

In [13]:
# Render the integer-formatted summary table.
summary_table1

Assembly District 03 GTFS summary stats,Assembly District 03 GTFS summary stats
Unnamed: 0_level_1,Value
# Operators,8
# routes,8094
# trips,61776
# stops,71648
# arrivals,863067


In [14]:
# Render the ratio summary table.
summary_table2

Unnamed: 0,Value
Arrivals per Stop,12.05
Trips per Operator,7722.0


In [48]:
def load_gtfs_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Load one dissolved route geometry per transit operator in the district.

    Parameters
    ----------
    df : pd.DataFrame
        Operator rows for the district; only ``schedule_gtfs_dataset_key`` is
        read, to decide which operators' routes to keep.

    Returns
    -------
    pd.DataFrame
        GeoDataFrame with two columns: ``Transit Operator`` (renamed from
        ``portfolio_organization_name``) and ``geometry`` (all of that
        operator's most recent route geometries dissolved together).
    """
    # Load the relevant operators in the district
    operators_in_district = df.schedule_gtfs_dataset_key.unique()

    OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

    # NOTE(review): `credentials.token` is typically only populated after the
    # credentials have been refreshed — confirm this authenticates as intended.
    operator_route_gdf = gpd.read_parquet(
        f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
        storage_options={"token": credentials.token},
    )
    route_cols = [
        "portfolio_organization_name",
        "service_date",
        "recent_combined_name",
        "geometry",
    ]
    operator_route_gdf = operator_route_gdf.loc[
        operator_route_gdf.schedule_gtfs_dataset_key.isin(operators_in_district),
        route_cols,
    ]

    # Only keep the most recent transit route geographies.
    # drop_duplicates keeps the FIRST occurrence, so the rows must be sorted
    # newest-first beforehand; otherwise the row kept depends on file order.
    operator_route_gdf2 = operator_route_gdf.sort_values(
        "service_date", ascending=False
    ).drop_duplicates(subset=["portfolio_organization_name", "recent_combined_name"])

    # Merge each operator's routes into a single geometry.
    operator_route_gdf2 = operator_route_gdf2.dissolve(
        by=["portfolio_organization_name"]
    ).reset_index()[["portfolio_organization_name", "geometry"]]

    operator_route_gdf2 = operator_route_gdf2.rename(
        columns={"portfolio_organization_name": "Transit Operator"}
    )
    return operator_route_gdf2

In [49]:
# One dissolved route geometry per operator in the district.
operator_route_gdf = load_gtfs_data(operator_df)

In [50]:
# Confirm the returned frame only has the operator label and geometry.
operator_route_gdf.columns

Index(['Transit Operator', 'geometry'], dtype='object')

In [96]:
def create_gtfs_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the per-organization GTFS statistics table with display-ready headers.

    Keeps ``organization_name`` plus every column whose name contains
    ``"operator_"``, renames the known columns to their display labels
    (unrecognized ones keep their original name), and resets the index to
    0..n-1. The input frame is not modified.
    """
    display_labels = {
        "organization_name": "Organization",
        "operator_n_routes": "# Routes",
        "operator_n_trips": "# Trips",
        "operator_n_shapes": "# Shapes",
        "operator_n_stops": "# Stops",
        "operator_n_arrivals": "# Arrivals",
        "operator_route_length_miles": "Operator Service Miles",
        "operator_arrivals_per_stop": "Avg Arrivals per Stop",
    }

    # Organization label first, then the operator statistics in source order.
    selected = ["organization_name"]
    selected.extend(col for col in df.columns if "operator_" in col)

    return df[selected].rename(columns=display_labels).reset_index(drop=True)

In [97]:
# Display-ready per-organization statistics (renamed columns, fresh 0..n-1 index).
gtfs_table_df = create_gtfs_table(operator_df)

In [98]:
# Shape of the source frame, for comparison with the trimmed table.
operator_df.shape

(8, 32)

In [99]:
# Object-dtype columns; these are excluded from integer formatting below.
string_cols = gtfs_table_df.select_dtypes(include="object").columns.tolist()

In [100]:
# Columns to format as integers: everything that is neither one of the two
# decimal-valued metrics nor an object-dtype column.
non_integer_cols = {"Operator Service Miles", "Avg Arrivals per Stop", *string_cols}

int_columns = [
    column for column in gtfs_table_df.columns if column not in non_integer_cols
]

In [101]:
# Preview the table data before styling.
gtfs_table_df

Unnamed: 0,Organization,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop
0,Amtrak,59,587,131,509,4117,35028.54,8.09
1,Butte County Association of Governments,19,360,41,502,8480,243.13,16.89
2,Greyhound,222,871,759,888,5665,92107.85,6.38
3,Placer County,6,108,18,132,1984,168.61,15.03
4,Tehama County,9,71,10,72,934,284.97,12.97
5,Yuba-Sutter Transit Authority,14,231,44,287,6477,442.78,22.57
6,Redding Area Bus Authority,12,198,27,343,5123,278.49,14.94
7,Glenn County,1,12,4,62,414,46.18,6.68


In [102]:
# Every column except the organization label is centered.
gtfs_centered_cols = [
    column
    for column in gtfs_table_df.columns
    if column not in ["Organization", "Transit Operator"]
]

# Operators with the most trips are listed first.
gtfs_table_rows = gtfs_table_df.sort_values("# Trips", ascending=False)

gtfs_table = (
    GT(gtfs_table_rows)
    .fmt_integer(columns=int_columns)
    # Shade the two headline metrics from white (low) to green (high).
    .data_color(
        columns=["# Trips", "Avg Arrivals per Stop"],
        palette=["white", "green"],
        na_color="lightgray",
    )
    .tab_header(
        title=f"{district_full_name}",
        subtitle="Daily GTFS schedule statistics by operator",
    )
    .cols_align(columns=gtfs_centered_cols, align="center")
)

In [103]:
# Apply the shared report styling, then render the table.
# NOTE(review): this reassigns `gtfs_table` from itself, so re-running the cell applies
# the formatting helper again to an already formatted table — confirm that is harmless.
gtfs_table = _report_utils.great_table_formatting(gtfs_table)
gtfs_table

Assembly District 03,Assembly District 03,Assembly District 03,Assembly District 03,Assembly District 03,Assembly District 03,Assembly District 03,Assembly District 03
Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator
Organization,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop
Greyhound,222,871,759,888,5665,92107.85,6.38
Amtrak,59,587,131,509,4117,35028.54,8.09
Butte County Association of Governments,19,360,41,502,8480,243.13,16.89
Yuba-Sutter Transit Authority,14,231,44,287,6477,442.78,22.57
Redding Area Bus Authority,12,198,27,343,5123,278.49,14.94
Placer County,6,108,18,132,1984,168.61,15.03
Tehama County,9,71,10,72,934,284.97,12.97
Glenn County,1,12,4,62,414,46.18,6.68


In [104]:
# Same display as the earlier preview: the underlying frame keeps its original
# (unsorted) order; only the rendered table is sorted by trips.
gtfs_table_df

Unnamed: 0,Organization,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop
0,Amtrak,59,587,131,509,4117,35028.54,8.09
1,Butte County Association of Governments,19,360,41,502,8480,243.13,16.89
2,Greyhound,222,871,759,888,5665,92107.85,6.38
3,Placer County,6,108,18,132,1984,168.61,15.03
4,Tehama County,9,71,10,72,934,284.97,12.97
5,Yuba-Sutter Transit Authority,14,231,44,287,6477,442.78,22.57
6,Redding Area Bus Authority,12,198,27,343,5123,278.49,14.94
7,Glenn County,1,12,4,62,414,46.18,6.68
