## Adjusting `service_hours` and `operator_profiles` with NTD data for publication on the public GCS page

In [1]:
import geopandas as gpd
import pandas as pd

from pathlib import Path
from typing import Literal

from calitp_data_analysis import utils
from shared_utils import publish_utils
from update_vars import GTFS_DATA_DICT, SCHED_GCS
from segment_speed_utils import helpers, time_series_utils
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [2]:
from shared_utils import catalog_utils, rt_dates

In [3]:
# Notebook display settings: show up to 100 columns, every row, untruncated
# cell contents, and floats rounded to two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
import _gtfs_digest_dataset

### Understand how `gtfs_digest/publish_public_data` works

#### `grab_filepaths()`

In [8]:
# Base GCS directory under which every digest table is stored.
GCS = GTFS_DATA_DICT.digest_tables.dir

In [13]:
GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [6]:
# Keys of the `digest_tables` catalog entries whose files are examined below.
digest_df_keys = [
    "route_schedule_vp",
    "operator_profile_portfolio_view",
    "operator_sched_rt",
    "scheduled_service_hours",
]

In [14]:
GTFS_DATA_DICT["digest_tables"]

{'dir': '${gcs_paths.RT_SCHED_GCS}', 'route_schedule_vp': 'digest/schedule_vp_metrics', 'route_segment_speeds': 'digest/segment_speeds', 'operator_profiles': 'digest/operator_profiles', 'operator_routes_map': 'digest/operator_routes', 'operator_sched_rt': 'digest/operator_schedule_rt_category', 'scheduled_service_hours': 'digest/total_scheduled_service_hours', 'operator_profile_portfolio_view': 'digest/operator_profile_portfolio_view'}

In [15]:
GTFS_DATA_DICT["digest_tables"]["route_schedule_vp"]

'digest/schedule_vp_metrics'

In [10]:
# Look up the relative path (no extension) stored in the catalog for each key.
filepaths = [
    GTFS_DATA_DICT["digest_tables"][table_key] for table_key in digest_df_keys
]

In [11]:
filepaths

['digest/schedule_vp_metrics',
 'digest/operator_profile_portfolio_view',
 'digest/operator_schedule_rt_category',
 'digest/total_scheduled_service_hours']

In [17]:
# Prefix each relative path with the digest directory and add the parquet
# extension to get the full GCS URL.
df_file_paths = [
    f"{GCS}{relative_path}.parquet" for relative_path in filepaths
]

In [18]:
df_file_paths

['gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_profile_portfolio_view.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_schedule_rt_category.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/total_scheduled_service_hours.parquet']

#### `export_parquet_as_csv_or_geojson`

In [16]:
from pathlib import Path

In [23]:
# Wrapping a gs:// URL in Path collapses the double slash to `gs:/` (visible
# in the repr below), so the Path is only useful here for name parts such as
# `.stem`; the original string should still be used as the URL.
Path(df_file_paths[0])

PosixPath('gs:/calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics.parquet')

In [24]:
Path(df_file_paths[0]).stem

'schedule_vp_metrics'

In [21]:
f"{Path(df_file_paths[0]).stem}.csv"

'schedule_vp_metrics.csv'

In [25]:
# Same assignment as in the imports cell at the top of the notebook; repeated
# here next to the export-path example that uses it.
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [26]:
# Build the public-bucket path for the CSV counterpart of the first parquet
# file: <PUBLIC_GCS>gtfs_digest/<parquet file stem>.csv
f"{PUBLIC_GCS}gtfs_digest/{Path(df_file_paths[0]).stem}.csv"

'gs://calitp-publish-data-analysis/gtfs_digest/schedule_vp_metrics.csv'

### Check out Crosswalk `gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization`

In [None]:
import sys

# Put the sibling `gtfs_funnel` directory on the import path so the crosswalk
# script can be imported as a module.
# NOTE(review): the relative path presumably assumes the working directory is
# this notebook's folder — confirm before running from elsewhere.
sys.path.append("../gtfs_funnel")
import crosswalk_gtfs_dataset_key_to_organization

In [None]:
# Check out crosswalk file
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [None]:
# Read the crosswalk file for a single date (2024-05-22) directly from GCS.
may_crosswalk = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-05-22.parquet"
)

In [None]:
may_crosswalk.shape

In [None]:
may_crosswalk.columns

### Stack all of the crosswalk files together and then join onto `operator_profiles`

In [None]:
# Full GCS URL of the operator profiles table: digest directory + catalog
# path + parquet extension.
op_profiles_url = (
    f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
)

In [None]:
op_profiles_df = pd.read_parquet(op_profiles_url)

In [None]:
len(op_profiles_df)

In [None]:
op_profiles_df.info()

#### Attempt to Stack

In [None]:
time_series_utils.concatenate_datasets_across_dates??

In [None]:
# Combine the 2024 and 2023 date lists with the Oct 2023, Apr 2023 and
# Apr 2024 week lists into the full set of dates to load.
# NOTE(review): the week lists may overlap with the yearly lists; if a date
# appears twice, its crosswalk rows are loaded twice and the merges further
# down will fan out — confirm and de-duplicate if needed.
all_dates = (rt_dates.y2024_dates + rt_dates.y2023_dates + 
             rt_dates.oct2023_week + rt_dates.apr2023_week + 
             rt_dates.apr2024_week
            )

In [None]:
SCHED_GCS

In [None]:
# Catalog entry for the GTFS dataset-key crosswalk (the same entry displayed
# in an earlier cell); passed as the file argument when stacking dates.
FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [None]:
# Columns to read from each crosswalk file: the join key followed by the
# NTD attributes to attach to the operator profiles.
ntd_cols = [
    "schedule_gtfs_dataset_key",
    "counties_served",
    "service_area_sq_miles",
    "hq_city",
    "uza_name",
    "service_area_pop",
    "organization_type",
    "primary_uza",
    "reporter_type",
]

In [None]:
# Load the crosswalk for every date in `all_dates`, restricted to the NTD
# columns, and stack the results into one DataFrame.
# NOTE(review): `crossalk` is a typo for `crosswalk`; the name is kept because
# later cells refer to it.
crossalk_all_dates = time_series_utils.concatenate_datasets_across_dates(
    SCHED_GCS,
    FILE,
    all_dates,
    data_type="df",
    columns=ntd_cols,
)

# Order chronologically and discard the old index.
crossalk_all_dates = crossalk_all_dates.sort_values(
    ["service_date"]
).reset_index(drop=True)

In [None]:
crossalk_all_dates.shape

In [None]:
crossalk_all_dates.head(1)

In [None]:
crossalk_all_dates.service_date.nunique()

In [None]:
crossalk_all_dates.service_date.unique()

### Merge
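* Why do the merged DataFrames become so large? Likely cause: joining on `schedule_gtfs_dataset_key` alone matches each `op_profiles_df` row to every crosswalk row with that key across all service dates. The cells below test this by adding `service_date` to the join keys and checking for duplicates.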

In [None]:
# Left-join keyed on the dataset key only. Both frames also carry
# `service_date`, which is not a join key here, so every operator row is
# paired with each crosswalk row sharing its key (one per loaded date) and
# the two `service_date` columns come back suffixed.
op_profiles_df1 = op_profiles_df.merge(
    crossalk_all_dates,
    how="left",
    on=["schedule_gtfs_dataset_key"],
)

In [None]:
op_profiles_df1.shape

In [None]:
# Left-join on dataset key and service date, so an operator row is matched
# only to crosswalk rows for the same date.
op_profiles_df2 = op_profiles_df.merge(
    crossalk_all_dates,
    how="left",
    on=["schedule_gtfs_dataset_key", "service_date"],
)

In [None]:
op_profiles_df2.shape

In [None]:
op_profiles_df2.schedule_gtfs_dataset_key.value_counts().head(10)

In [None]:
op_profiles_df.schedule_gtfs_dataset_key.value_counts().head()

In [None]:
op_profiles_df.schedule_gtfs_dataset_key.value_counts().describe()

In [None]:
# op_profiles_df2.loc[op_profiles_df2.schedule_gtfs_dataset_key == "e359e3617344263ad00858db2149a288"]

In [None]:
# op_profiles_df2.loc[op_profiles_df2.schedule_gtfs_dataset_key == "e8d0fd2f1c4b13707a24909a0f206271"]

In [None]:
len(op_profiles_df2.drop_duplicates(subset = ["schedule_gtfs_dataset_key", "service_date"]))

In [None]:
# Number of rows that are distinct across every column (with no `subset`,
# drop_duplicates compares all columns).
len(op_profiles_df2.drop_duplicates())

In [None]:
# Keep one row per (dataset key, service date); drop_duplicates retains the
# first occurrence by default.
op_profiles_df3 = op_profiles_df2.drop_duplicates(
    subset=["schedule_gtfs_dataset_key", "service_date"],
)

In [None]:
# op_profiles_df3.loc[op_profiles_df3.schedule_gtfs_dataset_key == "e8d0fd2f1c4b13707a24909a0f206271"]

In [None]:
op_profiles_df.shape

In [None]:
op_profiles_df.service_date.nunique()

In [None]:
op_profiles_df.head(1)

In [None]:
crossalk_all_dates.head(1)

#### Check out export file in `gtfs_digest_dataset`

In [None]:
# Full GCS URL of the operator profile portfolio-view table: digest directory
# + catalog path + parquet extension.
OP_PROFILE_EXPORT = (
    f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profile_portfolio_view}.parquet"
)

In [None]:
OP_PROFILE_EXPORT

In [None]:
operator_file = pd.read_parquet(OP_PROFILE_EXPORT)

In [None]:
operator_file.shape

In [None]:
operator_file.head(2)

In [None]:
operator_file.info()

#### Make sure `_section1_utils.load_operator_ntd_profile()` works as expected

In [None]:
import _section1_utils

In [None]:
organization_name = "City and County of San Francisco"

In [None]:
sf = _section1_utils.load_operator_ntd_profile(organization_name)

In [None]:
sf

In [None]:
placer = _section1_utils.load_operator_ntd_profile("Placer County")

In [None]:
placer