## Adjusting `service_hours` and `operator_profiles` with NTD data to be published on the Public GCS Page 

In [1]:
import geopandas as gpd
import pandas as pd

from pathlib import Path
from typing import Literal

from calitp_data_analysis import utils
from shared_utils import publish_utils
from update_vars import GTFS_DATA_DICT, SCHED_GCS
from segment_speed_utils import helpers, time_series_utils
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [2]:
from shared_utils import catalog_utils, rt_dates

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
import _gtfs_digest_dataset

### Check out Crosswalk `gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization`

In [5]:
import sys

sys.path.append("../gtfs_funnel")
import crosswalk_gtfs_dataset_key_to_organization

In [6]:
# Check out crosswalk file
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [7]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

'crosswalk/gtfs_key_organization'

In [8]:
may_crosswalk = pd.read_parquet("gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-05-22.parquet")

In [9]:
may_crosswalk.shape

(168, 32)

In [10]:
may_crosswalk.columns

Index(['schedule_gtfs_dataset_key', 'name', 'schedule_source_record_id',
       'base64_url', 'organization_source_record_id', 'organization_name',
       'itp_id', 'caltrans_district', 'ntd_id_2022', 'counties_served',
       'hq_city', 'hq_county', 'is_public_entity', 'is_publicly_operating',
       'funding_sources', 'on_demand_vehicles_at_max_service',
       'vehicles_at_max_service', 'number_of_state_counties', 'uza_name',
       'density', 'number_of_counties_with_service',
       'state_admin_funds_expended', 'service_area_sq_miles', 'population',
       'service_area_pop', 'subrecipient_type', 'primary_uza', 'reporter_type',
       'organization_type', 'voms_pt', 'voms_do', 'year'],
      dtype='object')

### Stack all of the crosswalk files together and then join onto `operator_profiles`

In [11]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [12]:
op_profiles_df = pd.read_parquet(op_profiles_url)

In [13]:
len(op_profiles_df)

1947

In [14]:
op_profiles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1947 entries, 0 to 1946
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   schedule_gtfs_dataset_key      1947 non-null   object        
 1   operator_n_routes              1947 non-null   int64         
 2   operator_n_trips               1947 non-null   int64         
 3   operator_n_shapes              1947 non-null   int64         
 4   operator_n_stops               1947 non-null   int64         
 5   operator_n_arrivals            1947 non-null   int64         
 6   operator_route_length_miles    1947 non-null   float64       
 7   operator_arrivals_per_stop     1947 non-null   float64       
 8   n_downtown_local_routes        1947 non-null   int64         
 9   n_local_routes                 1947 non-null   int64         
 10  n_coverage_routes              1947 non-null   int64         
 11  n_rapid_routes   

#### Attempt to Stack

In [15]:
time_series_utils.concatenate_datasets_across_dates??

[0;31mSignature:[0m
[0mtime_series_utils[0m[0;34m.[0m[0mconcatenate_datasets_across_dates[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mgcs_bucket[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdataset_name[0m[0;34m:[0m [0mLiteral[0m[0;34m[[0m[0;34m'speeds_route_dir_segments'[0m[0;34m,[0m [0;34m'speeds_route_dir'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdate_list[0m[0;34m:[0m [0mlist[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_type[0m[0;34m:[0m [0mLiteral[0m[0;34m[[0m[0;34m'df'[0m[0;34m,[0m [0;34m'gdf'[0m[0;34m][0m [0;34m=[0m [0;34m'gdf'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mget_pandas[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m[0m[0;34m[0m[0m
[0;31m

In [16]:
all_dates = (rt_dates.y2024_dates + rt_dates.y2023_dates + 
             rt_dates.oct2023_week + rt_dates.apr2023_week + 
             rt_dates.apr2024_week
            )

In [17]:
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [18]:
FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [19]:
ntd_cols = [
        "schedule_gtfs_dataset_key",
        "counties_served",
        "service_area_sq_miles",
        "hq_city",
        "uza_name",
        "service_area_pop",
        "organization_type",
        "primary_uza",
        "reporter_type"
    ]

In [20]:
crossalk_all_dates = (
        time_series_utils.concatenate_datasets_across_dates(
            SCHED_GCS,
            FILE,
            all_dates,
            data_type="df",
            columns=ntd_cols
        )
        .sort_values(["service_date"])
        .reset_index(drop=True)
    )

In [21]:
crossalk_all_dates.shape

(6090, 10)

In [22]:
crossalk_all_dates.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type,service_date
0,9b4c9cf681dfcc10dffe9523b43598bf,San Mateo,425.0,San Carlos,"San Francisco--Oakland, CA",3632769.0,Public Agency or Authority of Transit Service,,Full Reporter,2023-03-15


In [23]:
crossalk_all_dates.service_date.nunique()

34

In [24]:
crossalk_all_dates.service_date.unique()

array(['2023-03-15T00:00:00.000000000', '2023-04-10T00:00:00.000000000',
       '2023-04-11T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-04-13T00:00:00.000000000', '2023-04-14T00:00:00.000000000',
       '2023-04-15T00:00:00.000000000', '2023-04-16T00:00:00.000000000',
       '2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-09T00:00:00.000000000',
       '2023-10-10T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-10-12T00:00:00.000000000', '2023-10-13T00:00:00.000000000',
       '2023-10-14T00:00:00.000000000', '2023-10-15T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-15T00:00:00.000000000',
       '2024-04-16T00:00:00.000000000', '2024-04-17

### Merge
* Why do the merged dfs become so huge??

In [25]:
op_profiles_df1 = pd.merge(op_profiles_df, crossalk_all_dates, on = ["schedule_gtfs_dataset_key"], how = "left")

In [26]:
op_profiles_df1.shape

(53633, 27)

In [27]:
op_profiles_df2 = pd.merge(op_profiles_df, crossalk_all_dates, on = ["schedule_gtfs_dataset_key", "service_date"], how = "left")

In [28]:
op_profiles_df2.shape

(2363, 26)

In [29]:
op_profiles_df2.schedule_gtfs_dataset_key.value_counts().head(10)

62b3c362d08fa3a771a7b7b2002c8a16    17
eaabdf2b0bb899b7953ea81047fdd00d    17
50c7a90344fdafe80e22b628e9739eba    17
e359e3617344263ad00858db2149a288    17
e524db270831632bdcf71df1d7e74d25    17
4e2936d8f27a9bca79289ec062a1691a    17
4be4d02062a1a82dae5c039092ee8541    17
4b59b468244e0d5139d91fc698acc9d6    17
e681c3a8dafa2c80e5b8e2cdd01f917a    17
e8d0fd2f1c4b13707a24909a0f206271    17
Name: schedule_gtfs_dataset_key, dtype: int64

In [30]:
op_profiles_df.schedule_gtfs_dataset_key.value_counts().head()

63029a23cb0e73f2a5d98a345c5e2e40    14
e681c3a8dafa2c80e5b8e2cdd01f917a    14
55a01ef72af21906934ae8ffb4786e86    14
5456c80d420043e15c8eb7368a8a4d89    14
50c7a90344fdafe80e22b628e9739eba    14
Name: schedule_gtfs_dataset_key, dtype: int64

In [31]:
op_profiles_df.schedule_gtfs_dataset_key.value_counts().describe()

count   260.00
mean      7.49
std       5.03
min       1.00
25%       2.00
50%       7.00
75%      13.00
max      14.00
Name: schedule_gtfs_dataset_key, dtype: float64

In [44]:
# op_profiles_df2.loc[op_profiles_df2.schedule_gtfs_dataset_key == "e359e3617344263ad00858db2149a288"]

In [43]:
# op_profiles_df2.loc[op_profiles_df2.schedule_gtfs_dataset_key == "e8d0fd2f1c4b13707a24909a0f206271"]

In [34]:
len(op_profiles_df2.drop_duplicates(subset = ["schedule_gtfs_dataset_key", "service_date"]))

1947

In [35]:
len(op_profiles_df2.drop_duplicates(subset = list(op_profiles_df2.columns)))

1947

In [36]:
op_profiles_df3 = op_profiles_df2.drop_duplicates(subset = ["schedule_gtfs_dataset_key", "service_date"])

In [42]:
# op_profiles_df3.loc[op_profiles_df3.schedule_gtfs_dataset_key == "e8d0fd2f1c4b13707a24909a0f206271"]

In [38]:
op_profiles_df.shape

(1947, 18)

In [39]:
op_profiles_df.service_date.nunique()

14

In [40]:
op_profiles_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date
0,014d0998350083249a9eb310635548c2,8,137,8,159,3771,79.5,23.72,2,0,6,8,0,0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-10-11


In [41]:
crossalk_all_dates.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type,service_date
0,9b4c9cf681dfcc10dffe9523b43598bf,San Mateo,425.0,San Carlos,"San Francisco--Oakland, CA",3632769.0,Public Agency or Authority of Transit Service,,Full Reporter,2023-03-15


#### Check out export file in `gtfs_digest_dataset`

In [46]:
OP_PROFILE_EXPORT = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profile_portfolio_view}.parquet"

In [47]:
OP_PROFILE_EXPORT

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_profile_portfolio_view.parquet'

In [48]:
operator_file = pd.read_parquet(OP_PROFILE_EXPORT)

In [49]:
operator_file.shape

(1947, 26)

In [50]:
operator_file.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
0,014d0998350083249a9eb310635548c2,8,137,8,159,3771,79.5,23.72,2,0,6,8,0,0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-10-11,San Luis Obispo,130.0,San Luis Obispo,"San Luis Obispo, CA",206008.0,Public Agency or Authority of Transit Service,,Full Reporter
1,014d0998350083249a9eb310635548c2,9,139,10,159,3803,85.88,23.92,2,0,8,10,0,0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-11-15,San Luis Obispo,130.0,San Luis Obispo,"San Luis Obispo, CA",206008.0,Public Agency or Authority of Transit Service,,Full Reporter


In [51]:
operator_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1947 entries, 0 to 1946
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   schedule_gtfs_dataset_key      1947 non-null   object        
 1   operator_n_routes              1947 non-null   int64         
 2   operator_n_trips               1947 non-null   int64         
 3   operator_n_shapes              1947 non-null   int64         
 4   operator_n_stops               1947 non-null   int64         
 5   operator_n_arrivals            1947 non-null   int64         
 6   operator_route_length_miles    1947 non-null   float64       
 7   operator_arrivals_per_stop     1947 non-null   float64       
 8   n_downtown_local_routes        1947 non-null   int64         
 9   n_local_routes                 1947 non-null   int64         
 10  n_coverage_routes              1947 non-null   int64         
 11  n_rapid_routes   

#### Make sure `section_1.load_operator_ntd_profile()` works perfectly

In [52]:
import _section1_utils

In [53]:
organization_name = "City and County of San Francisco"

In [54]:
sf = _section1_utils.load_operator_ntd_profile(organization_name)

In [55]:
sf

Unnamed: 0,schedule_gtfs_dataset_key,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop,# Downtown Local Route Types,# Local Route Types,# Coverage Route Types,# Rapid Route Types,# Express Route Types,# Rail Route Types,Transit Operator,Organization ID,Organization,Date,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
11,7cc0cb1871dfd558f11a2885c145d144,68,9766,282,3270,360701,504.99,110.31,126,2,0,20,6,18,Bay Area 511 Muni Schedule,rechaapWbeffO33OX,City and County of San Francisco,2024-04-17,San Francisco,49.0,San Francisco,"San Francisco--Oakland, CA",842754.0,County or Local Government Unit or Department of Transportation,,Full Reporter


In [56]:
placer = _section1_utils.load_operator_ntd_profile("Placer County")

In [57]:
placer

Unnamed: 0,schedule_gtfs_dataset_key,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop,# Downtown Local Route Types,# Local Route Types,# Coverage Route Types,# Rapid Route Types,# Express Route Types,# Rail Route Types,Transit Operator,Organization ID,Organization,Date,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
13,8de1f1a3b9ae172c6b8255b1c82c340f,8,115,24,148,2002,213.3,13.53,0,0,11,7,2,0,Placer Schedule,recDD2rnkl2m7IV8u,Placer County,2024-04-17,Placer,169.0,Auburn,"Sacramento, CA",392258.0,County or Local Government Unit or Department of Transportation,,Full Reporter
