In [None]:
%%capture

import warnings

warnings.filterwarnings("ignore")
import calitp_data_analysis.magics

import geopandas as gpd
import pandas as pd

from great_tables import GT
from slugify import slugify

import _report_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS
import _ct_district_utils

from IPython.display import HTML, Image, Markdown, display, display_html

In [None]:
# Report parameter: the Caltrans district label summarized by this notebook.
# Comment out and leave this cell right below pandas
# NOTE(review): presumably this is commented out so the value can be injected
# when the notebook is parameterized (see the `capture_parameters` cell) - confirm.
district = "08 - San Bernardino"

In [None]:
# Link to this district's page on the GTFS Digest site; the slugified district
# label is appended to the fixed URL prefix.
_digest_url_prefix = "https://gtfs-digest--cal-itp-data-analyses.netlify.app/district_"
DISTRICT_DIGEST_URL = f"{_digest_url_prefix}{slugify(district)}"

In [None]:
# %%capture_parameters
# district, DISTRICT_DIGEST_URL

In [None]:
# The spatial join between CT districts and routes needs the district number,
# so pull the all-digit tokens out of the district label and keep the first one
# (e.g. "08 - San Bernardino" -> 8).
district_numbers = [int(token) for token in district.split() if token.isdigit()]
district_int = district_numbers[0]

In [None]:
# Dataset locations are looked up in GTFS_DATA_DICT.
# (merge_operator_data.py was rerun to test this.)
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

# Load only the operator profile rows that belong to this district.
operator_profiles_path = f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet"
district_filter = [[("caltrans_district", "==", district)]]
operator_df = pd.read_parquet(operator_profiles_path, filters=district_filter)

# Keep the most recent row per operator `name`. De-duplicating on `name`
# rather than schedule_gtfs_dataset_key lets us keep the latest rows for
# LA Metro without retaining extraneous rows from when its keys changed.
operator_df = (
    operator_df
    .sort_values(by=["service_date", "name"], ascending=[False, True])
    .drop_duplicates(subset=["name"])
    .reset_index(drop=True)
)

In [None]:
# De-duplicate, step 1: count the distinct operator names attached to each
# organization, then list the organizations that have more than one name.
orgs_agg = (
    operator_df
    .groupby(["caltrans_district", "organization_name"])["name"]
    .nunique()
    .reset_index()
)
orgs_agg2 = orgs_agg[orgs_agg["name"] > 1]
orgs_with_2_names = orgs_agg2["organization_name"].unique().tolist()

In [None]:
# Set the multi-name organizations aside so they can be handled separately;
# `operator_df2` keeps every other row.
in_multi_name_org = operator_df["organization_name"].isin(orgs_with_2_names)
operator_df2 = operator_df[~in_multi_name_org].reset_index(drop=True)

In [None]:
# Rows belonging to organizations with more than one operator name get their
# own frame for the clean-up steps that follow.
belongs_to_multi_name_org = operator_df["organization_name"].isin(orgs_with_2_names)
orgs_with_2_names_df = operator_df[belongs_to_multi_name_org].reset_index(drop=True)

In [None]:
# Cut-off date: three months before the latest service_date among the
# single-name organizations.
latest_single_name_date = operator_df2["service_date"].max()
three_month_reference = latest_single_name_date - pd.DateOffset(months=3)

In [None]:
# Drop multi-name rows whose service_date is older than the three-month cut-off.
is_recent = orgs_with_2_names_df["service_date"] >= three_month_reference
orgs_with_2_names_df = orgs_with_2_names_df.loc[is_recent]

In [None]:
# Recount distinct operator names per organization among the rows that are
# still under review, and store the count as `n_names`.
orgs_agg = (
    orgs_with_2_names_df
    .groupby(["organization_name"])["name"]
    .nunique()
    .reset_index(name="n_names")
)

In [None]:
# Attach each organization's distinct-name count (`n_names`) to its rows.
# Any `n_names` column left over from an earlier run of this cell is dropped
# first; without that, re-running the cell would make the merge emit
# `n_names_x` / `n_names_y` and the later `.n_names` lookups would fail.
# On a fresh run the drop is a no-op.
orgs_with_2_names_df = pd.merge(
    orgs_with_2_names_df.drop(columns="n_names", errors="ignore"),
    orgs_agg,
    on="organization_name",
    how="left",
)

In [None]:
# Keep rows with a positive `vp_per_min_agency` and a positive
# `spatial_accuracy_agency` whose organization still has more than one name.
vp_per_min_positive = orgs_with_2_names_df["vp_per_min_agency"] > 0
spatial_accuracy_positive = orgs_with_2_names_df["spatial_accuracy_agency"] > 0
still_multi_name = orgs_with_2_names_df["n_names"] > 1

orgs_with_2_names_df2 = orgs_with_2_names_df[
    vp_per_min_positive & spatial_accuracy_positive & still_multi_name
].reset_index(drop=True)

In [None]:
# Of those rows, keep only the ones dated on the latest service_date seen
# among the single-name organizations, and note which organizations they cover.
service_date = operator_df2["service_date"].max()
on_latest_date = orgs_with_2_names_df2["service_date"] == service_date
orgs_with_2_names_df3 = orgs_with_2_names_df2.loc[on_latest_date]
final_names = orgs_with_2_names_df3["organization_name"].unique().tolist()

In [None]:
## Concat back, step 1: drop the organizations already resolved in
## `final_names` so their rows are not carried over twice.
already_resolved = orgs_with_2_names_df["organization_name"].isin(final_names)
orgs_with_2_names_df = orgs_with_2_names_df.loc[~already_resolved]

In [None]:
# Stack the unresolved multi-name rows on top of the resolved latest-date rows.
orgs_with_2_names_df_final = pd.concat(
    [
        orgs_with_2_names_df,
        orgs_with_2_names_df3,
    ]
)

In [None]:
# Recombine the single-name organizations with the cleaned-up multi-name ones.
# Rows for multi-name organizations are stripped from `operator_df2` first.
# On a fresh run that is a no-op (they were excluded when `operator_df2` was
# built), but it keeps this cell idempotent: re-running it no longer appends
# the same rows a second time.
# `ignore_index=True` gives the combined frame a clean RangeIndex instead of
# repeated index labels from the two inputs.
single_name_rows = operator_df2.loc[
    ~operator_df2.organization_name.isin(orgs_with_2_names)
]
operator_df2 = pd.concat(
    [single_name_rows, orgs_with_2_names_df_final],
    ignore_index=True,
)

# District {district}

These are district summaries for [GTFS Digest](https://gtfs-digest--cal-itp-data-analyses.netlify.app/). 

Individual transit operators have their pages at: **[{DISTRICT_DIGEST_URL}]({DISTRICT_DIGEST_URL})**

In [None]:
# District-level summary built by the project helper from the de-duplicated
# operator rows, keyed on the `caltrans_district` column.
district_summary = _report_utils.district_stats(operator_df2, "caltrans_district")

In [None]:
# These two stats are shown with one decimal place in their own table;
# every other stat is formatted as an integer.
one_decimal_cols = ["arrivals_per_stop", "trips_per_operator"]

# Table 1: all summary stats except the one-decimal columns.
integer_stats = _report_utils.transpose_summary_stats(
    district_summary.drop(columns=one_decimal_cols),
    district_col="caltrans_district",
)
summary_table1 = (
    GT(integer_stats)
    .fmt_integer(columns="value")
    .cols_label(index="")
    .tab_header(title=f"District {district} GTFS summary stats")
)

# Table 2: only the one-decimal columns (no header of its own).
decimal_stats = _report_utils.transpose_summary_stats(
    district_summary[["caltrans_district"] + one_decimal_cols],
    district_col="caltrans_district",
)
summary_table2 = (
    GT(decimal_stats)
    .fmt_number("value", decimals=1)
    .cols_label(index="")
)

In [None]:
# Render both summary tables, integer stats first.
display(summary_table1, summary_table2)

## GTFS Stats by Operator

In [None]:
# Distinct operator names in this district; used to filter the route geometries.
operators_in_district = operator_df2["name"].unique()

In [None]:
# Load route geometries for this district's operators only.
operator_routes_raw = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
    filters=[["name", "in", operators_in_district]],
    columns=["name", "service_date", "route_combined_name", "geometry"],
)

# Keep the most recent row for each (operator, route) pair.
latest_operator_routes = operator_routes_raw.sort_values(
    by=["service_date", "name", "route_combined_name"],
    ascending=[False, True, True],
).drop_duplicates(subset=["name", "route_combined_name"])

# Merge each operator's routes into a single geometry. The date and route
# columns are dropped beforehand because, after the dissolve, all of an
# operator's routes are combined and those columns would only hold the first
# row's value.
operator_route_gdf = (
    latest_operator_routes
    .drop(columns=["service_date", "route_combined_name"])
    .dissolve(by="name")
    .reset_index()
    .pipe(_report_utils.replace_column_names)
)

# Strip the " Schedule" text from the operator labels.
operator_route_gdf["Transit Operator"] = (
    operator_route_gdf["Transit Operator"].str.replace(" Schedule", "")
)

In [None]:
# Show the coordinate reference system of the dissolved operator routes.
operator_route_gdf.crs

In [None]:
# Fetch the geometry for this district number from the project helper.
# NOTE(review): presumably a GeoDataFrame of the district boundary, since it is
# passed to `gpd.sjoin` in a later cell - confirm against `_ct_district_utils`.
district_gdf = _ct_district_utils.ct_district(district_int)

In [None]:
# Keep the operators whose dissolved route geometry lies within the district
# geometry.
#
# This notebook silences all warnings in its first cell, so the warning
# geopandas emits when the two layers of a spatial join use different CRSs
# would never be seen. To avoid a silently wrong join, the district layer is
# reprojected to the routes' CRS whenever both layers declare a CRS and they
# differ; otherwise it is used unchanged.
#
# NOTE(review): predicate="within" drops an operator entirely if any part of
# its dissolved geometry falls outside the district. It does not clip the
# routes, although the caption shown with the map says "clipped". Confirm which
# behavior is intended.
district_for_join = district_gdf
if (
    operator_route_gdf.crs is not None
    and district_gdf.crs is not None
    and operator_route_gdf.crs != district_gdf.crs
):
    district_for_join = district_gdf.to_crs(operator_route_gdf.crs)

shapes_within_dist = gpd.sjoin(
    operator_route_gdf,
    district_for_join,
    how="inner",
    predicate="within",
).drop(columns="index_right")
    

In [None]:
# Caption rendered with the district route map.
boundary_note = f"""Please note, the routes are clipped to the boundaries of <b>District {district_int}</b>
        """
display(Markdown(boundary_note))

In [None]:
# Interactive map of the retained routes, colored by operator as a categorical
# legend.
legend_options = {"width": 200}
shapes_within_dist.explore(
    column="Transit Operator",
    tiles="CartoDB Positron",
    categorical=True,
    legend=True,
    legend_kwds=legend_options,
)

In [None]:
# Identifier columns shown in the per-operator table.
shared_cols = ["organization_name", "name"]

# Columns listed for exclusion (not referenced by the cells visible here).
exclude_cols = [
    "schedule_gtfs_dataset_key",
    "caltrans_district",
    "organization_source_record_id",
    "service_date",
    "primary_uza",
]

# Every column whose name contains "operator_" is treated as a GTFS service stat.
gtfs_service_cols = [col for col in operator_df.columns if "operator_" in col]

In [None]:
# Per-operator table data: identifier columns plus the service stats, passed
# through the project's column-renaming helper and re-indexed from zero.
gtfs_table_df = (
    operator_df2[shared_cols + gtfs_service_cols]
    .pipe(_report_utils.replace_column_names)
    .reset_index(drop=True)
)

In [None]:
# Work out which columns get which formatting.
string_cols = gtfs_table_df.select_dtypes(include="object").columns.tolist()
one_decimal_cols = ["Operator Service Miles", "Avg Arrivals per Stop"]
label_cols = ["Organization", "Transit Operator"]

# Integer formatting applies to everything that is neither a one-decimal
# column nor an object-dtype (text) column.
integer_cols = [
    col
    for col in gtfs_table_df.columns
    if col not in one_decimal_cols and col not in string_cols
]
# Every column except the two label columns is centered.
centered_cols = [col for col in gtfs_table_df.columns if col not in label_cols]

# Operators are listed from most to fewest trips.
gtfs_table = (
    GT(gtfs_table_df.sort_values("# Trips", ascending=False))
    .fmt_integer(columns=integer_cols)
    .fmt_number(columns=one_decimal_cols, decimals=1)
    .data_color(
        columns=["# Trips", "Avg Arrivals per Stop"],
        palette=["white", "green"],
        na_color="lightgray",
    )
    .tab_header(
        title=f"District {district}",
        subtitle="Daily GTFS schedule statistics by operator",
    )
    .cols_align(columns=centered_cols, align="center")
)

# Apply the project's shared table styling, then display the table.
gtfs_table = _report_utils.great_table_formatting(gtfs_table)
gtfs_table