# How to use `gtfs_utils_v2`

In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(400_000_000_000)

import datetime
import geopandas as gpd
import pandas as pd

from siuba import *

from shared_utils import gtfs_utils_v2
analysis_date = datetime.date(2023, 1, 17)



## Step 1: Airtable's `dim_gtfs_datasets` with `fct_daily_scheduled_feeds`

* Keep only the datasets that are not deprecated (use `data_quality_pipeline`)
* Allow any other custom filtering to be done, such as the default of getting scheduled data only
* Analysts now should look at output and decide if there's additional filtering needed. 
* Once filtering is done, input the df in to merge with `fct_daily_feeds` to get `feed_keys`
* Use `feed_key` to traverse all the other tables

## `feed_options`
* "customer_facing"
* "use_subfeeds"
* "current_feeds"
* "include_precursor"
* "include_precursor_and_future"

In [2]:
# Pull every schedule feed for `analysis_date` with all columns (`keep_cols = None`).
# NOTE(review): "" is not one of the `feed_options` listed above; judging by the
# 227-row shape reported below it seems to apply no feed-option filter — confirm
# against `gtfs_utils_v2.schedule_daily_feed_to_organization`.
full_df = gtfs_utils_v2.schedule_daily_feed_to_organization(
    selected_date = analysis_date,
    keep_cols = None,
    get_df = True,
    feed_option = "")

In [3]:
full_df.shape

(227, 9)

In [4]:
def num_rows_and_other_stats(df: pd.DataFrame, feed_option: str):
    """
    Print sanity-check stats for `df` after a feed option is applied.

    `df` is piped through `gtfs_utils_v2.filter_feed_options(feed_option)`.
    The row count of the result is printed, followed by the value counts
    of its `regional_feed_type` and `is_future` columns.

    Nothing is returned; the function only prints.
    """
    divider = "---------------"
    feed_filter = gtfs_utils_v2.filter_feed_options(feed_option)
    filtered = df >> feed_filter

    n_rows = len(filtered)
    print(f"# rows: {n_rows}")
    print(divider)
    print(f"regional_feed_type: {filtered.regional_feed_type.value_counts()}")
    print(divider)
    print(f"is_future: {filtered.is_future.value_counts()}")

In [5]:
num_rows_and_other_stats(full_df, "customer_facing")

# rows: 152
---------------
regional_feed_type: Combined Regional Feed    2
Name: regional_feed_type, dtype: int64
---------------
is_future: False    152
Name: is_future, dtype: int64


In [6]:
num_rows_and_other_stats(full_df, "use_subfeeds")

# rows: 186
---------------
regional_feed_type: Regional Subfeed          35
Combined Regional Feed     1
Name: regional_feed_type, dtype: int64
---------------
is_future: False    186
Name: is_future, dtype: int64


In [7]:
num_rows_and_other_stats(full_df, "current_feeds")

# rows: 187
---------------
regional_feed_type: Regional Subfeed          35
Combined Regional Feed     2
Name: regional_feed_type, dtype: int64
---------------
is_future: False    187
Name: is_future, dtype: int64


In [8]:
num_rows_and_other_stats(full_df, "include_precursor") 

# rows: 227
---------------
regional_feed_type: Regional Precursor Feed    40
Regional Subfeed           35
Combined Regional Feed      2
Name: regional_feed_type, dtype: int64
---------------
is_future: False    227
Name: is_future, dtype: int64


In [9]:
num_rows_and_other_stats(full_df, "include_precursor_and_future") 

# rows: 227
---------------
regional_feed_type: Regional Precursor Feed    40
Regional Subfeed           35
Combined Regional Feed      2
Name: regional_feed_type, dtype: int64
---------------
is_future: False    227
Name: is_future, dtype: int64


In [10]:
def display_outputs(df):
    """
    Show a quick preview of a (Geo)DataFrame.

    Displays the first rows via `display`, then prints the shape and the
    column index. When `df` is a `gpd.GeoDataFrame`, its CRS is printed too.
    """
    preview = df.head()
    display(preview)

    print(f"shape: {df.shape}")
    print(f"columns: {df.columns}")

    # Only geodataframes carry a CRS worth reporting.
    has_geometry = isinstance(df, gpd.GeoDataFrame)
    if has_geometry:
        print(f"CRS: {df.crs}")

## Step 1: get feeds and orgs for the day

In [11]:
schedule_datasets = gtfs_utils_v2.schedule_daily_feed_to_organization(
    selected_date = analysis_date,
    keep_cols = ["date", "feed_key", "type", 
                 "regional_feed_type", "name"],
    get_df = True,
    feed_option = "use_subfeeds",
) 

display_outputs(schedule_datasets)
print(schedule_datasets.regional_feed_type.value_counts())


Unnamed: 0,date,feed_key,type,regional_feed_type,name
0,2023-01-17,1523384dc06c3d8147eeffadbcfad049,schedule,Regional Subfeed,Bay Area 511 Vacaville City Coach Schedule
1,2023-01-17,1c78afa6c7ce96a5f5aefca2e0a07d78,schedule,Regional Subfeed,Bay Area 511 SamTrans Schedule
2,2023-01-17,21214a8a56b33b94a05cfe2ec211410a,schedule,Regional Subfeed,Bay Area 511 Commute.org Schedule
3,2023-01-17,227c21ddaeb14e27fc2e6ba1076d9d5e,schedule,Regional Subfeed,Bay Area 511 Tri-Valley Wheels Schedule
4,2023-01-17,2e0478675d2b2bcd6c93da9354d92755,schedule,Regional Subfeed,Bay Area 511 AC Transit Schedule


shape: (36, 5)
columns: Index(['date', 'feed_key', 'type', 'regional_feed_type', 'name'], dtype='object')
Regional Subfeed          35
Combined Regional Feed     1
Name: regional_feed_type, dtype: int64


In [12]:
schedule_datasets.regional_feed_type.value_counts()

Regional Subfeed          35
Combined Regional Feed     1
Name: regional_feed_type, dtype: int64

### Set test cases for filtering

In [13]:
test_cases = [
    "Big Blue Bus Schedule", 
    "Metrolink Schedule"
]

test_feed_keys = [
    "008d5112a7e531d0562d26e34d77869d", # Sacramento Schedule
    "f8d3bfd9e780aa3b3ce1340b2116513f" # Long Beach Schedule
]

In [14]:
# Filter by dataset name: `test_cases` holds names, so `include_name = True`.
# `get_df = False` keeps the query un-collected so the siuba pipe can apply
# `filter_operator` before `collect()` materializes the result
# (presumably a lazy table at that point — confirm in `gtfs_utils_v2`).
df_filter_by_name = (
    gtfs_utils_v2.schedule_daily_feed_to_organization(
        selected_date = analysis_date,
        keep_cols = None,
        get_df = False,
        feed_option = "use_subfeeds"
    ) >> gtfs_utils_v2.filter_operator(test_cases, include_name = True)
    >> collect()
)

df_filter_by_name

Unnamed: 0,key,date,feed_key,base64_url,gtfs_dataset_key,is_future,type,name,regional_feed_type
0,7ab7457165b694acaa4e0e7114fe386c,2023-01-17,90e78003416c5b09f77a9de8f266c2be,aHR0cHM6Ly93d3cubWV0cm9saW5rdHJhaW5zLmNvbS9nbG...,recR28oQlTW8GMJue,False,schedule,Metrolink Schedule,
1,c908315e18a11ca09e5ead11595ee15c,2023-01-17,9d4387dc55091d50c717582348508bae,aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC...,recpN1dPaxhZvZQV0,False,schedule,Big Blue Bus Schedule,


In [15]:
# Filter by feed_key: `test_feed_keys` holds feed_keys, so `include_name = False`.
# `get_df = False` keeps the query un-collected so the siuba pipe can apply
# `filter_operator` before `collect()` materializes the result
# (presumably a lazy table at that point — confirm in `gtfs_utils_v2`).
df_filter_by_feed_key = (
    gtfs_utils_v2.schedule_daily_feed_to_organization(
        selected_date = analysis_date,
        keep_cols = None,
        get_df = False,
        feed_option = "use_subfeeds"
    ) >> gtfs_utils_v2.filter_operator(test_feed_keys, include_name = False)
    >> collect()
)

df_filter_by_feed_key

Unnamed: 0,key,date,feed_key,base64_url,gtfs_dataset_key,is_future,type,name,regional_feed_type
0,58d22b05010728b166883376ae9763b2,2023-01-17,008d5112a7e531d0562d26e34d77869d,aHR0cHM6Ly9pcG9ydGFsLnNhY3J0LmNvbS9HVEZTL1NSVE...,recbzZQUIdMmFvm1r,False,schedule,Sacramento Schedule,
1,8e384ef2a07eeb0e0a30884acf7f7409,2023-01-17,f8d3bfd9e780aa3b3ce1340b2116513f,aHR0cHM6Ly9sYnRyYW5zaXQuYm94LmNvbS9zaGFyZWQvc3...,recCv3CF4elAx0dUg,False,schedule,Long Beach Schedule,


## Step 2: trips

In [16]:
trips = gtfs_utils_v2.get_trips(
    selected_date = analysis_date,
    operator_feeds = test_feed_keys,
    trip_cols = ["feed_key", "trip_id", "trip_key", 
                 "route_id", "route_key", 
                 "shape_array_key", "direction_id",
                 "service_hours", "trip_first_departure_sec", 
                 "trip_last_arrival_sec"
                ],
    get_df = True,
)

In [17]:
display_outputs(trips)

Unnamed: 0,feed_key,trip_id,trip_key,route_id,route_key,shape_array_key,direction_id,service_hours,trip_first_departure_sec,trip_last_arrival_sec
0,f8d3bfd9e780aa3b3ce1340b2116513f,9646473,b578166ecbdb0aa3882bb506e30a5149,21,6784353a6261afbdf38872fd8153e229,5baab14baacdde9bac9eee94ef2d8999,0,0.8,22020,24900
1,f8d3bfd9e780aa3b3ce1340b2116513f,9646121,8f58aa7959e9375f8fd22d31d9838d06,46,07f6f7fac93f37eede3c6a2be52ae649,66cd904df7d5ec775c2ce3afa1545bae,1,0.583333,74100,76200
2,f8d3bfd9e780aa3b3ce1340b2116513f,9645953,27c37a1bf0e7ada069d2d95847ff7e21,46,07f6f7fac93f37eede3c6a2be52ae649,939fc9637f6c1bc24ac92fb846f0fdd4,0,0.45,28080,29700
3,f8d3bfd9e780aa3b3ce1340b2116513f,9645514,599b3be786dc4bdf4fe2e8206cdb73c3,111,841ef90324f0def61a0ff91e53a0bae7,0399356a52aa382aa95f00fd47c6d28f,0,1.033333,49980,53700
4,008d5112a7e531d0562d26e34d77869d,1083910,c584eef754ab0427874f57a3f4d8948e,103,120f08a3de93a86678e9092bb3da5668,6d11e595080e640abf6c5cb7c77bd46f,1,0.683333,59640,62100


shape: (4245, 10)
columns: Index(['feed_key', 'trip_id', 'trip_key', 'route_id', 'route_key',
       'shape_array_key', 'direction_id', 'service_hours',
       'trip_first_departure_sec', 'trip_last_arrival_sec'],
      dtype='object')


In [18]:
trips2 = gtfs_utils_v2.get_trips(
    selected_date = analysis_date,
    operator_feeds = test_cases,
    trip_cols = ["feed_key", "name", "trip_id",  
                 "route_id", 
                 "shape_id", "shape_array_key", 
                 "direction_id",
                ],
    get_df = True,
)

In [19]:
display_outputs(trips2)

Unnamed: 0,feed_key,name,trip_id,route_id,shape_id,shape_array_key,direction_id
0,9d4387dc55091d50c717582348508bae,Big Blue Bus Schedule,893177,3554,26164,c6100a0effdfa54bdbed6bffdd594fe5,1
1,9d4387dc55091d50c717582348508bae,Big Blue Bus Schedule,893708,3556,26171,80e0378df44a749a23d291f5404bf481,0
2,9d4387dc55091d50c717582348508bae,Big Blue Bus Schedule,894198,3560,26176,390a22dccf019ec6254bbecf01748f17,0
3,9d4387dc55091d50c717582348508bae,Big Blue Bus Schedule,894423,3561,26183,fd89b9cc6b5bedd99d9fabffaf3dc7c7,0
4,9d4387dc55091d50c717582348508bae,Big Blue Bus Schedule,895086,3565,26200,07bd7a677d359d367968ebb6e2da3d93,0


shape: (1530, 7)
columns: Index(['feed_key', 'name', 'trip_id', 'route_id', 'shape_id',
       'shape_array_key', 'direction_id'],
      dtype='object')


## Step 2: stops

In [20]:
# Stops for the two test feed_keys, projected to the requested CRS.
# The output below includes a `geometry` column in addition to `stop_cols`.
stops = gtfs_utils_v2.get_stops(
    selected_date = analysis_date,
    operator_feeds = test_feed_keys,
    stop_cols = ["feed_key", "stop_id", "stop_name", 
                 "route_type_3", ],
    get_df = True,
    crs = "EPSG:2229",  # reported in the output as "NAD83 / California zone 5 (ftUS)"
)

In [21]:
display_outputs(stops)

Unnamed: 0,feed_key,stop_id,stop_name,route_type_3,geometry
0,f8d3bfd9e780aa3b3ce1340b2116513f,1014,7TH & REDONDO SE,77.0,POINT (6515419.340 1740634.671)
1,f8d3bfd9e780aa3b3ce1340b2116513f,1631,STUDEBAKER & AUTO SQUARE NE,83.0,POINT (6531454.224 1772274.779)
2,f8d3bfd9e780aa3b3ce1340b2116513f,1772,ATHERTON & XIMENO SE,104.0,POINT (6519565.316 1745900.909)
3,008d5112a7e531d0562d26e34d77869d,11202,NATOMA ST & WALES DR (WB),9.0,POINT (5656698.628 3542050.189)
4,f8d3bfd9e780aa3b3ce1340b2116513f,540,BELLFLOWER & ANAHEIM RD NW,48.0,POINT (6524426.217 1742169.788)


shape: (4718, 5)
columns: Index(['feed_key', 'stop_id', 'stop_name', 'route_type_3', 'geometry'], dtype='object')
CRS: {"$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", "type": "ProjectedCRS", "name": "NAD83 / California zone 5 (ftUS)", "base_crs": {"name": "NAD83", "datum": {"type": "GeodeticReferenceFrame", "name": "North American Datum 1983", "ellipsoid": {"name": "GRS 1980", "semi_major_axis": 6378137, "inverse_flattening": 298.257222101}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"name": "Geodetic latitude", "abbreviation": "Lat", "direction": "north", "unit": "degree"}, {"name": "Geodetic longitude", "abbreviation": "Lon", "direction": "east", "unit": "degree"}]}, "id": {"authority": "EPSG", "code": 4269}}, "conversion": {"name": "SPCS83 California zone 5 (US Survey feet)", "method": {"name": "Lambert Conic Conformal (2SP)", "id": {"authority": "EPSG", "code": 9802}}, "parameters": [{"name": "Latitude of false origin", "value": 33.5, "unit": "degr

## Step 3: shapes

In [24]:
# Shapes for the first test feed_key only (the one commented as Sacramento Schedule).
# The output below includes a `geometry` column in addition to `shape_cols`.
shapes = gtfs_utils_v2.get_shapes(
    selected_date = analysis_date,
    operator_feeds = [test_feed_keys[0]],
    shape_cols = ["feed_key", "shape_id", "shape_array_key",
                 "n_trips"],
    get_df = True,
    crs = "EPSG:3310",  # reported in the output as "NAD83 / California Albers"
)

In [25]:
display_outputs(shapes)

Unnamed: 0,feed_key,shape_id,shape_array_key,n_trips,geometry
0,008d5112a7e531d0562d26e34d77869d,45077,9addaaaaf4f08ca6e5074b3c3f2413b3,1,"LINESTRING (-116557.927 59531.520, -116593.664..."
1,008d5112a7e531d0562d26e34d77869d,45059,0a7e09807dc6920b21634f6fe7ad9768,1,"LINESTRING (-123256.003 56118.691, -123255.778..."
2,008d5112a7e531d0562d26e34d77869d,45082,b62192d1e8356ce4370997b39d968507,1,"LINESTRING (-100598.776 75674.582, -100599.672..."
3,008d5112a7e531d0562d26e34d77869d,44923,a8cc37ec5556191157afe820e4db37ac,1,"LINESTRING (-129506.802 64236.820, -129558.723..."
4,008d5112a7e531d0562d26e34d77869d,45075,592927d68b4755462ee870d2ebce53af,1,"LINESTRING (-129688.400 60095.833, -129694.646..."


shape: (164, 5)
columns: Index(['feed_key', 'shape_id', 'shape_array_key', 'n_trips', 'geometry'], dtype='object')
CRS: {"$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", "type": "ProjectedCRS", "name": "NAD83 / California Albers", "base_crs": {"name": "NAD83", "datum": {"type": "GeodeticReferenceFrame", "name": "North American Datum 1983", "ellipsoid": {"name": "GRS 1980", "semi_major_axis": 6378137, "inverse_flattening": 298.257222101}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"name": "Geodetic latitude", "abbreviation": "Lat", "direction": "north", "unit": "degree"}, {"name": "Geodetic longitude", "abbreviation": "Lon", "direction": "east", "unit": "degree"}]}, "id": {"authority": "EPSG", "code": 4269}}, "conversion": {"name": "California Albers", "method": {"name": "Albers Equal Area", "id": {"authority": "EPSG", "code": 9822}}, "parameters": [{"name": "Latitude of false origin", "value": 0, "unit": "degree", "id": {"authority": "EPSG", "code": 8821

## Step 4: stop_times

In [26]:
# Select from the 2 test cases, the first 5 trip_ids
test_trips = trips[trips.name.isin(test_cases)
                  ].trip_id.unique().tolist()[:5]

In [27]:
# Input this as our trip_df
sample_trips = trips[trips.trip_id.isin(test_trips)]

In [28]:
# Grab feed key, or else can't subset...
feed_key_for_sample_trips = sample_trips.feed_key.unique().tolist()
feed_key_for_sample_trips

['9d4387dc55091d50c717582348508bae']

In [29]:
# Stop times restricted to the sampled trips: `trip_df` supplies the trips,
# `operator_feeds` the feed_key(s) those trips belong to.
# The output below also contains `arrival_hour` / `departure_hour`, which are
# not in `stop_time_cols` — presumably derived inside `get_stop_times`; confirm.
stop_times = gtfs_utils_v2.get_stop_times(
    selected_date = analysis_date,
    operator_feeds = feed_key_for_sample_trips,
    stop_time_cols = ["feed_key", "trip_id", "stop_id", 
                      "stop_sequence", 
                      "arrival_sec", "departure_sec"
                ],
    get_df = True,
    trip_df = sample_trips
)

In [30]:
display_outputs(stop_times)

Unnamed: 0,feed_key,trip_id,stop_id,stop_sequence,arrival_sec,departure_sec,arrival_hour,departure_hour
0,9d4387dc55091d50c717582348508bae,893177,34,32,51076,51076,14,14
1,9d4387dc55091d50c717582348508bae,894198,355,16,69036,69036,19,19
2,9d4387dc55091d50c717582348508bae,893708,541,12,78027,78027,21,21
3,9d4387dc55091d50c717582348508bae,895086,1097,20,22500,22500,6,6
4,9d4387dc55091d50c717582348508bae,894423,214,29,80154,80154,22,22


shape: (217, 8)
columns: Index(['feed_key', 'trip_id', 'stop_id', 'stop_sequence', 'arrival_sec',
       'departure_sec', 'arrival_hour', 'departure_hour'],
      dtype='object')


In [31]:
print(stop_times.arrival_hour.value_counts())
print(stop_times.departure_hour.value_counts())

19    49
22    46
21    35
6     32
14    23
13    22
5      6
18     4
Name: arrival_hour, dtype: int64
19    49
22    46
21    35
6     32
14    23
13    22
5      6
18     4
Name: departure_hour, dtype: int64
