## Double checking `routes` in `operator profile`
* [File that I'm checking](https://github.com/cal-itp/data-analyses/blob/56e8bc08327e44b591ce21e95a331fad462ad828/gtfs_digest/merge_operator_data.py#L109)
* Looking at BART, the operator profile reports 12 unique routes for April 2024. However, BART only seems to have 6 routes.

In [10]:
import geopandas as gpd
import pandas as pd

from calitp_data_analysis import utils
from segment_speed_utils import time_series_utils,  gtfs_schedule_wrangling, helpers

from update_vars import GTFS_DATA_DICT, SCHED_GCS, RT_SCHED_GCS

In [2]:
from merge_data import merge_in_standardized_route_names

In [3]:
from shared_utils import rt_dates
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates 

In [5]:
analysis_date_list = analysis_date_list[0:3]

In [7]:
analysis_date = analysis_date_list[0]

In [26]:
analysis_date

'2024-01-17'

### `concatenate_operator_stats`
* This takes a list of dates, reads the operator scheduled stats file for each date, and concatenates them into one DataFrame.

In [6]:
def concatenate_operator_stats(
    date_list: list,
    sort_cols: list = None,
) -> pd.DataFrame:
    """
    Concatenate the operator scheduled stats tables across several dates.

    The file name comes from
    `GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats` and is read
    from `SCHED_GCS` by
    `time_series_utils.concatenate_datasets_across_dates`. The combined
    result is sorted and given a fresh 0..n-1 index.

    Parameters
    ----------
    date_list : list
        Dates to combine; passed through unchanged to
        `concatenate_datasets_across_dates`.
    sort_cols : list, optional
        Columns to sort the combined table by. Previously this function
        read a global `sort_cols` that is not defined anywhere in this
        notebook, so it raised NameError on a fresh kernel.

    Returns
    -------
    pd.DataFrame
        The concatenated, sorted table.
    """
    if sort_cols is None:
        # NOTE(review): default presumably mirrors the module-level
        # `sort_cols` in gtfs_digest/merge_operator_data.py, and assumes the
        # concatenated table has both columns -- TODO confirm.
        sort_cols = ["schedule_gtfs_dataset_key", "service_date"]

    FILE = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

    df = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        FILE,
        date_list,
        data_type = "df",
    ).sort_values(sort_cols).reset_index(drop=True)

    return df

### `operator_scheduled_stats`
[Here](https://github.com/cal-itp/data-analyses/blob/56e8bc08327e44b591ce21e95a331fad462ad828/gtfs_funnel/operator_scheduled_stats.py)

#### `schedule_stats_by_operator`

In [16]:
bart = "San Francisco Bay Area Rapid Transit District"

In [18]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [19]:
# Read only BART's rows from the operator profiles parquet.
bart_filter = [[("organization_name", "==", bart)]]
bart_df = pd.read_parquet(op_profiles_url, filters=bart_filter)

In [21]:
bart_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date
0,8a1405af8da1379acc062e346187ac98,12,1025,22,50,15475,492.84,309.5,10,0,0,0,0,10,Bay Area 511 BART Schedule,recoQLeNRISCKF8I0,San Francisco Bay Area Rapid Transit District,2023-03-15


In [22]:
bart_df.schedule_gtfs_dataset_key.nunique()

1

In [28]:
bart_df.service_date.unique()

array(['2023-03-15T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [29]:
# Filter the profile to `analysis_date` (the date used for the rest of this
# check) instead of a hard-coded timestamp, so the comparison stays in sync
# if the analysis date changes.
bart_df.loc[bart_df.service_date == pd.Timestamp(analysis_date)]

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date
10,8a1405af8da1379acc062e346187ac98,12,1005,26,50,15257,489.44,305.14,2,0,0,0,0,2,Bay Area 511 BART Schedule,recoQLeNRISCKF8I0,San Francisco Bay Area Rapid Transit District,2024-01-17


In [11]:
# Columns requested from the scheduled trips table for `analysis_date`.
# Note: the returned frame exposes the feed key as
# `schedule_gtfs_dataset_key` (that is the column the cells further down
# filter on), even though it is requested here as `gtfs_dataset_key`.
trip_cols = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
]
trips = helpers.import_scheduled_trips(
    analysis_date, columns=trip_cols, get_pandas=True
)

In [23]:
# Take BART's feed key(s) from the operator profile rather than hard-coding
# the hash, mirroring how the SF Muni trips are selected in this notebook.
bart_gtfs_keys = list(bart_df.schedule_gtfs_dataset_key.unique())
bart_trips = trips.loc[trips.schedule_gtfs_dataset_key.isin(bart_gtfs_keys)]

In [25]:
bart_trips.shape

(1005, 4)

In [30]:
bart_trips.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key
30777,8a1405af8da1379acc062e346187ac98,Yellow-N,7dcd8655dfa527fe26d045030f555456,
30778,8a1405af8da1379acc062e346187ac98,Beige-N,1c6a3fa1fcda41ed77e9ca005c73f114,b33198f862352e303d8a184eead0bb42
30779,8a1405af8da1379acc062e346187ac98,Beige-N,eba8b76d0acad35ad8e3ffeab9eb9b4c,b33198f862352e303d8a184eead0bb42
30780,8a1405af8da1379acc062e346187ac98,Beige-N,003473cabda532b854f1992ada65ed9f,b33198f862352e303d8a184eead0bb42
30781,8a1405af8da1379acc062e346187ac98,Beige-N,f9a44134cc015e560eb16590bb5f4d17,b33198f862352e303d8a184eead0bb42


#### BART: it looks like each line is counted as "two routes" because northbound and southbound trips get separate `route_id`s (e.g. `Yellow-N` and `Yellow-S`).

In [31]:
bart_trips.route_id.unique()

array(['Yellow-N', 'Beige-N', 'Beige-S', 'Yellow-S', 'Orange-N',
       'Orange-S', 'Green-S', 'Green-N', 'Red-S', 'Red-N', 'Blue-S',
       'Blue-N'], dtype=object)

In [33]:
group_cols = ["schedule_gtfs_dataset_key"]

In [34]:
# Count distinct routes, trips and shapes per feed for BART's trips.
nunique_cols = ["route_id", "trip_instance_key", "shape_array_key"]

# Output names follow the operator profile's column names.
operator_col_names = {
    "route_id": "operator_n_routes",
    "trip_instance_key": "operator_n_trips",
    "shape_array_key": "operator_n_shapes",
}

bart_grouped = bart_trips.groupby(
    group_cols, observed=True, group_keys=False
)
trip_stats = (
    bart_grouped
    .agg(dict.fromkeys(nunique_cols, "nunique"))
    .reset_index()
    .rename(columns=operator_col_names)
)

In [35]:
trip_stats

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes
0,8a1405af8da1379acc062e346187ac98,12,1005,26


#### SF Muni
* There are about 70 routes, excluding ones that are suspended.
* https://www.sfmta.com/getting-around/muni/routes-stops

In [36]:
# Read only the City and County of San Francisco rows from the operator
# profiles parquet.
sf_filter = [[("organization_name", "==", "City and County of San Francisco")]]
sf_df = pd.read_parquet(op_profiles_url, filters=sf_filter)

In [39]:
sf_gtfs_keys = list(sf_df.schedule_gtfs_dataset_key.unique())

In [40]:
sf_trips = trips.loc[trips.schedule_gtfs_dataset_key.isin(sf_gtfs_keys)]

In [41]:
sf_trips.route_id.unique()

array(['5R', '8AX', '8BX', 'LOWL', 'NOWL', 'S', 'PH', '1', '714', '2',
       '9R', 'KBUS', 'NBUS', '15', '55', 'LBUS', 'TBUS', 'T', '1X', '5',
       '6', '7', '8', '9', '12', '14', '18', '19', '21', '22', '23', '24',
       '25', '27', '28', '29', '30', '31', '33', '35', '36', '37', '38',
       '39', '43', '44', '45', '48', '49', '52', '54', '56', '57', '58',
       'PM', 'CA', '66', '67', '90', '91', 'J', 'K', 'M', 'N', '14R',
       '28R', '38R', 'F'], dtype=object)

In [42]:
sf_trips.route_id.nunique()

68