In [1]:
%%capture

import warnings

warnings.filterwarnings("ignore")
import _ct_district_visuals as _ct_district_utils
import _report_utils
import calitp_data_analysis.magics
import geopandas as gpd
import pandas as pd
from great_tables import GT
from IPython.display import HTML, Image, Markdown, display, display_html
from slugify import slugify
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS

In [2]:
import gcsfs
import google.auth

# Resolve the default Google credentials (and project) once for the notebook.
credentials, project = google.auth.default()

# Filesystem handle for Google Cloud Storage.
fs = gcsfs.GCSFileSystem()

In [3]:
# Parameter cell: the Caltrans district this report is rendered for.
# Original note: comment out and leave this cell right below pandas.
# NOTE(review): presumably the value is injected per district by the report
# build (see the commented-out `%%capture_parameters` cell further down) — confirm.
district = "08 - San Bernardino / Riverside"

In [4]:
# Link to this district's page on the GTFS Digest site; the slugified district
# label is appended to the fixed "district_" URL prefix.
DISTRICT_DIGEST_URL = (
    "https://gtfs-digest--cal-itp-data-analyses.netlify.app/district_"
    + slugify(district)
)

In [5]:
# %%capture_parameters
# district, DISTRICT_DIGEST_URL

In [6]:
# Pull the district number out of the district label: split on whitespace,
# keep the all-digit tokens, convert them to int and take the first one.
# The integer is what gets used when doing an sjoin between CT districts & routes.
digit_tokens = filter(str.isdigit, district.split())
district_int = list(map(int, digit_tokens))[0]

In [7]:
# Dataset names are looked up in GTFS_DATA_DICT.
# (merge_operator_data.py was rerun to test this.)
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

# Load the operator profiles, filtering to this district while reading.
operator_profiles_path = f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet"
district_filter = [[("caltrans_district", "==", district)]]
operator_df = pd.read_parquet(operator_profiles_path, filters=district_filter)

# Keep only the most recent row per operator `name`.
# De-duplicating on name instead of schedule_gtfs_dataset_key keeps just the
# latest LA Metro rows, without extraneous rows from when its keys changed.
operator_df = operator_df.sort_values(
    by=["service_date", "name"], ascending=[False, True]
)
operator_df = operator_df.drop_duplicates(subset=["name"], keep="first")
operator_df = operator_df.reset_index(drop=True)

In [8]:
# De-duplicate, step 1: find the organizations that are linked to two or more
# distinct operator names (nunique > 1) within the district.
orgs_agg = (
    operator_df.groupby(["caltrans_district", "organization_name"])["name"]
    .nunique()
    .reset_index()
)
orgs_agg2 = orgs_agg[orgs_agg["name"] > 1]
orgs_with_2_names = orgs_agg2["organization_name"].unique().tolist()

In [9]:
# Set the multi-name organizations aside so they can be handled separately;
# operator_df2 keeps every other row of operator_df.
is_multi_name_org = operator_df["organization_name"].isin(orgs_with_2_names)
operator_df2 = operator_df[~is_multi_name_org].reset_index(drop=True)

In [10]:
# The organizations with two or more operator names get their own frame.
belongs_to_multi_name_org = operator_df["organization_name"].isin(orgs_with_2_names)
orgs_with_2_names_df = operator_df.loc[belongs_to_multi_name_org].reset_index(
    drop=True
)

In [11]:
# Cutoff date: three months before the latest service_date found in operator_df2.
latest_service_date = operator_df2.service_date.max()
three_month_reference = latest_service_date - pd.DateOffset(months=3)

In [12]:
# Keep only the multi-name rows dated on or after the three-month cutoff.
is_recent = orgs_with_2_names_df["service_date"].ge(three_month_reference)
orgs_with_2_names_df = orgs_with_2_names_df.loc[is_recent]

In [13]:
# Count the distinct operator names per organization in the current
# orgs_with_2_names_df, stored as `n_names`. The count feeds the later filter
# that keeps rows where `vp_per_min_agency` and `spatial_accuracy_agency` are
# greater than 0 and the organization still has 2+ names.
orgs_agg = (
    orgs_with_2_names_df.groupby("organization_name")
    .agg(n_names=("name", "nunique"))
    .reset_index()
)

In [14]:
# Attach each organization's distinct-name count (`n_names`) to its rows.
# Any `n_names` column left from an earlier run of this cell is dropped first:
# merging again on top of it would produce `n_names_x` / `n_names_y`, and the
# following filter on `n_names` would then fail. On a first run the drop is a no-op.
orgs_with_2_names_df = orgs_with_2_names_df.drop(
    columns=["n_names"], errors="ignore"
).merge(orgs_agg, on="organization_name", how="left")

In [15]:
# Keep rows where vp_per_min_agency and spatial_accuracy_agency are both above
# zero and whose organization still has more than one operator name.
has_vp = orgs_with_2_names_df["vp_per_min_agency"] > 0
has_spatial_accuracy = orgs_with_2_names_df["spatial_accuracy_agency"] > 0
still_multi_name = orgs_with_2_names_df["n_names"] > 1
orgs_with_2_names_df2 = orgs_with_2_names_df.loc[
    has_vp & has_spatial_accuracy & still_multi_name
].reset_index(drop=True)

In [16]:
# Of those rows, keep the ones dated exactly on the latest service_date in
# operator_df2, and record which organizations they belong to.
service_date = operator_df2["service_date"].max()
on_latest_date = orgs_with_2_names_df2["service_date"].eq(service_date)
orgs_with_2_names_df3 = orgs_with_2_names_df2.loc[on_latest_date]
final_names = orgs_with_2_names_df3["organization_name"].unique().tolist()

In [17]:
# Before concatenating back: remove the organizations listed in final_names
# from the multi-name frame, so only their rows in orgs_with_2_names_df3 remain.
is_resolved_org = orgs_with_2_names_df["organization_name"].isin(final_names)
orgs_with_2_names_df = orgs_with_2_names_df.loc[~is_resolved_org]

In [18]:
# Recombine the remaining multi-name rows with the latest-date rows.
multi_name_parts = [orgs_with_2_names_df, orgs_with_2_names_df3]
orgs_with_2_names_df_final = pd.concat(multi_name_parts)

In [19]:
# Final operator table: the organizations that needed no de-duplication plus
# the cleaned-up multi-name organizations.
# The first part is re-derived from operator_df (same filter that originally
# produced operator_df2) instead of appending onto operator_df2 itself, so
# re-running this cell cannot stack extra copies of the multi-name rows and
# inflate the district statistics. ignore_index gives the result a unique
# 0..n-1 index rather than repeated labels from the two inputs.
single_name_orgs_df = operator_df.loc[
    ~operator_df.organization_name.isin(orgs_with_2_names)
]
operator_df2 = pd.concat(
    [single_name_orgs_df, orgs_with_2_names_df_final], ignore_index=True
)

# District {district}

These are district summaries for [GTFS Digest](https://gtfs-digest--cal-itp-data-analyses.netlify.app/). 

Individual transit operators have their pages at: **[{DISTRICT_DIGEST_URL}]({DISTRICT_DIGEST_URL})**

In [20]:
# Summarize operator_df2 with _report_utils.district_stats, keyed on caltrans_district.
district_summary = operator_df2.pipe(_report_utils.district_stats, "caltrans_district")

In [21]:
# The summary is rendered as two tables: every statistic except the two ratio
# columns is formatted as an integer, and the ratios get one decimal place.
ratio_cols = ["arrivals_per_stop", "trips_per_operator"]

count_stats = _report_utils.transpose_summary_stats(
    district_summary.drop(columns=ratio_cols),
    district_col="caltrans_district",
)
summary_table1 = GT(count_stats)
summary_table1 = summary_table1.fmt_integer(columns="value")
summary_table1 = summary_table1.cols_label(index="")
summary_table1 = summary_table1.tab_header(
    title=f"District {district} GTFS summary stats"
)

ratio_stats = _report_utils.transpose_summary_stats(
    district_summary[["caltrans_district"] + ratio_cols],
    district_col="caltrans_district",
)
summary_table2 = GT(ratio_stats)
summary_table2 = summary_table2.fmt_number("value", decimals=1)
summary_table2 = summary_table2.cols_label(index="")

In [22]:
# Render both summary tables, in order.
display(summary_table1, summary_table2)

District 08 - San Bernardino / Riverside GTFS summary stats,District 08 - San Bernardino / Riverside GTFS summary stats
Unnamed: 0_level_1,Value
# Operators,12
# routes,149
# trips,4771
# stops,6738
# arrivals,180294


Unnamed: 0,Value
Arrivals per Stop,26.76
Trips per Operator,397.58


## GTFS Stats by Operator

In [23]:
# Distinct operator names that remain after de-duplication.
operators_in_district = operator_df2["name"].unique()

In [24]:
# Load the operator route geometries from GCS.
# The Credentials object itself is passed as the gcsfs `token`, not
# `credentials.token`: that attribute is normally unset (None) until the
# credentials have been refreshed, so the previous call handed gcsfs no usable
# token and relied on gcsfs falling back to its own default-credential lookup.
# gcsfs accepts a google.auth Credentials object for `token`.
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
    storage_options={"token": credentials},
)

In [25]:
# Keep only the routes of this district's operators, and only the columns
# needed to build the map.
route_cols = ["name", "service_date", "recent_combined_name", "geometry"]
in_district = operator_route_gdf["name"].isin(operators_in_district)
operator_route_gdf = operator_route_gdf.loc[in_district, route_cols]

In [26]:
# Sort newest first so drop_duplicates keeps the most recent row for each
# (operator, route) pair.
latest_routes = operator_route_gdf.sort_values(
    by=["service_date", "name", "recent_combined_name"],
    ascending=[False, True, True],
).drop_duplicates(subset=["name", "recent_combined_name"])

# service_date and the route name are dropped before dissolving: the dissolve
# combines all of an operator's routes into one row, so those columns would
# hold only the first row's value.
operator_route_gdf = (
    latest_routes.drop(columns=["service_date", "recent_combined_name"])
    .dissolve(by="name")
    .reset_index()
    .pipe(_report_utils.replace_column_names)
)


In [27]:
# Remove the literal text " Schedule" from the operator names used on the map.
operator_col = "Transit Operator"
operator_route_gdf[operator_col] = operator_route_gdf[operator_col].str.replace(
    " Schedule", "", regex=False
)

In [28]:
# Look up this district by its integer number using the local district helper module.
# NOTE(review): presumably returns a GeoDataFrame of the district boundary
# (it is mapped with .explore() next) — confirm against _ct_district_visuals.
district_gdf = _ct_district_utils.ct_district(district_int)

In [29]:
# Start the interactive map with the district as a layer named "District".
m = district_gdf.explore(name="District")

In [30]:
# Add the operator routes to the existing map `m`, colored categorically by
# "Transit Operator", with a legend.
route_layer_kwargs = dict(
    column="Transit Operator",
    tiles="CartoDB Positron",
    categorical=True,
    legend=True,
    legend_kwds={"width": 200},
)
m = operator_route_gdf.explore(m=m, **route_layer_kwargs)

In [31]:
# Render the combined district + operator routes map.
display(m)

In [32]:
# Identifier columns placed in front of the statistics in the operator table.
shared_cols = ["organization_name", "name"]

# NOTE(review): exclude_cols is not referenced in the cells visible here —
# confirm whether it is still needed.
exclude_cols = [
    "schedule_gtfs_dataset_key",
    "caltrans_district",
    "organization_source_record_id",
    "service_date",
    "primary_uza",
]

# Every column whose name contains "operator_".
gtfs_service_cols = list(filter(lambda col: "operator_" in col, operator_df.columns))

In [33]:
# Select the table columns, rename them through
# _report_utils.replace_column_names, and reset the index.
table_cols = shared_cols + gtfs_service_cols
gtfs_table_df = _report_utils.replace_column_names(
    operator_df2[table_cols]
).reset_index(drop=True)

In [35]:
# Inspection only: show the renamed column labels of gtfs_table_df.
gtfs_table_df.columns

Index(['Organization', 'Transit Operator', '# Routes', '# Trips', '# Shapes',
       '# Stops', '# Arrivals', 'Operator Service Miles',
       'Avg Arrivals per Stop'],
      dtype='object')

In [34]:
# Column groups that drive the formatting below.
string_cols = gtfs_table_df.select_dtypes(include="object").columns.tolist()
decimal_cols = ["Operator Service Miles", "Avg Arrivals per Stop"]
label_cols = ["Organization", "Transit Operator"]

# Integer formatting applies to everything that is neither a decimal column
# nor an object-dtype column.
integer_cols = [
    col
    for col in gtfs_table_df.columns
    if col not in decimal_cols and col not in string_cols
]
# Every column except the two label columns is centered.
centered_cols = [col for col in gtfs_table_df.columns if col not in label_cols]

# Operators are listed by "# Trips", largest first.
gtfs_table = GT(gtfs_table_df.sort_values("# Trips", ascending=False))
gtfs_table = gtfs_table.fmt_integer(columns=integer_cols)
gtfs_table = gtfs_table.fmt_number(columns=decimal_cols, decimals=1)
gtfs_table = gtfs_table.data_color(
    columns=["# Trips", "Avg Arrivals per Stop"],
    palette=["white", "green"],
    na_color="lightgray",
)
gtfs_table = gtfs_table.tab_header(
    title=f"District {district}",
    subtitle="Daily GTFS schedule statistics by operator",
)
gtfs_table = gtfs_table.cols_align(columns=centered_cols, align="center")

# Apply the shared report styling, then display the table.
gtfs_table = _report_utils.great_table_formatting(gtfs_table)
gtfs_table

District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside,District 08 - San Bernardino / Riverside
Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator,Daily GTFS schedule statistics by operator
Organization,Transit Operator,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop
OmniTrans,OmniTrans Schedule,29,1682,86,2285,79380,476.8,34.7
Riverside Transit Agency,Riverside Schedule,35,1193,107,2135,53626,713.8,25.1
Victor Valley Transit Authority,Victor Valley Schedule,33,866,75,1057,21141,850.0,20.0
SunLine Transit Agency,SunLine Avail Schedule,19,476,35,566,15339,426.7,27.1
Mountain Area Regional Transit Authority,Mountain Transit Schedule,7,138,27,136,3121,150.8,22.9
City of Beaumont,Beaumont Transit Schedule,8,123,29,128,1548,126.5,12.1
Basin Transit,Morongo Basin Schedule,7,89,24,169,2457,181.5,14.5
Palo Verde Valley Transit Agency,Desert Roadrunner GMV Schedule,5,54,9,39,472,220.1,12.1
City of Banning,Banning Pass Schedule,2,53,8,22,419,48.8,19.1
City of Corona,Corona Schedule,2,51,8,167,2339,26.3,14.0
