# Route Identification Over Time, Approach 2
See `05_route_identification_over_time.ipynb` for previous work.
<br>Determined that more data is needed than what `fct_monthly_routes` provides. Exploring data from `helpers.import_scheduled_trips` to get `feed_key`, route short/long name, route id, route desc, and other fields.
<br>Cannot rely on `feed_key` because it is unstable over time. The next best option is the gtfs_schedule key, but the absolute best is `source_record_id`.
<br>Start by running `make install_env` to install everything needed: `helpers`, `rt_dates`, `sched_gcs`.
<br>Query data from warehouse, then use this snippet from `make_routes_gdf` from `_shared_utils/shared_utils/geography_utils.py`.
* ddf["geometry"] = ddf.pt_array.apply(make_linestring)
    
<br> then save out as geo parquet to the `gtfs_schedule` folder in GCS (so versioning and history stays) using 
* utils.geoparquet_gcs_export(
                vp_gdf,
                SEGMENT_GCS,
                f"vp_{analysis_date}"
            )

<br>Also add `route_long_name` to the query, then see whether any changes in routes show up via:
* feed key
* name
* route id
* route short name
* route long name
* route desc
    

In [30]:
import pandas as pd
#rt_segment_speeds/segment_speed_utils
from segment_speed_utils import helpers

#rt_segment_speeds/segment_speed_utils/project_vars.py
from segment_speed_utils.project_vars import SCHED_GCS, COMPILED_CACHED_VIEWS

#_shared_utils/shared_utils/rt_dates.py
from shared_utils import rt_dates

import geopandas as gpd


In [2]:
# Analysis dates for April through September 2023, looked up from the
# shared `rt_dates.DATES` mapping and listed in calendar order.
april_date = rt_dates.DATES["apr2023"]
may_date = rt_dates.DATES["may2023"]
june_date = rt_dates.DATES["jun2023"]
july_date = rt_dates.DATES["jul2023"]
aug_date = rt_dates.DATES["aug2023"]
sep_date = rt_dates.DATES["sep2023"]

In [20]:
may_date

'2023-05-17'

In [3]:
# List of the six analysis dates, one per month.
# Note the order is not chronological (Sep, Aug, then Apr through Jul).
# The per-month frames further down are looked up by date key, so the order
# here only affects iteration and display order.
months_list = [sep_date, aug_date, april_date, may_date, june_date, july_date]

In [17]:
months_list

['2023-09-13',
 '2023-08-15',
 '2023-04-12',
 '2023-05-17',
 '2023-06-14',
 '2023-07-12']

In [4]:
# Pull Sacramento's scheduled trips for every analysis date and keep one
# DataFrame per date in a dict keyed by the date string.
#
# A dict comprehension is used so no loop temporaries are left behind in the
# notebook namespace.
#
# NOTE(review): `gtfs_dataset_key` is requested here, but the frames used
# further down expose `schedule_gtfs_dataset_key` -- presumably the helper
# renames the column; confirm.
month_trips = {
    analysis_date: helpers.import_scheduled_trips(
        analysis_date,
        columns=[
            "feed_key",
            "name",
            "gtfs_dataset_key",
            "route_id",
            "route_short_name",
            "route_long_name",
            "route_desc",
        ],
        filters=[[("name", "==", "Sacramento Schedule")]],
        get_pandas=True,
    )
    for analysis_date in months_list
}

In [22]:
COMPILED_CACHED_VIEWS

'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'

Index(['feed_key', 'gtfs_dataset_key', 'name', 'regional_feed_type',
       'service_date', 'trip_start_date_pacific', 'trip_id',
       'trip_instance_key', 'route_key', 'route_id', 'route_type',
       'route_short_name', 'route_long_name', 'route_desc', 'direction_id',
       'shape_array_key', 'shape_id', 'trip_first_departure_datetime_pacific',
       'trip_last_arrival_datetime_pacific', 'service_hours',
       'trip_start_date_local_tz', 'trip_first_departure_datetime_local_tz',
       'trip_last_arrival_datetime_local_tz'],
      dtype='object')

In [5]:
month_trips.keys()

dict_keys(['2023-09-13', '2023-08-15', '2023-04-12', '2023-05-17', '2023-06-14', '2023-07-12'])

In [6]:
# Tag each month's trips with a short month label so the frames can still be
# told apart after they are stacked together.
# The dict is indexed with the date variables defined earlier instead of
# re-typed date strings, so these lookups cannot drift out of sync with
# `rt_dates.DATES`.
april = month_trips[april_date].assign(month="april")
may = month_trips[may_date].assign(month="may")
june = month_trips[june_date].assign(month="june")
july = month_trips[july_date].assign(month="july")
aug = month_trips[aug_date].assign(month="aug")
sep = month_trips[sep_date].assign(month="sep")

In [7]:
# use this pseudo code to concat all the different months data
# pd.concat can stack DFs on top of others.
# pd.concat([df1, df2], axis=0)

In [8]:
# Stack the six monthly frames row-wise (the default axis), in calendar order.
# Row labels are kept as they are, so index values repeat across months.
monthly_frames = [april, may, june, july, aug, sep]
all_trips = pd.concat(monthly_frames)

In [9]:
all_trips.shape

(326, 8)

In [18]:
all_trips.head()

Unnamed: 0,feed_key,name,schedule_gtfs_dataset_key,route_id,route_short_name,route_long_name,route_desc,month
0,55c36112e9a6a849c0ddc65eb54de7f1,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,1,1,GREENBACK,,april
1,55c36112e9a6a849c0ddc65eb54de7f1,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,11,11,NATOMAS/LAND PARK,,april
2,55c36112e9a6a849c0ddc65eb54de7f1,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,13,13,NATOMAS/ARDEN,,april
3,55c36112e9a6a849c0ddc65eb54de7f1,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,15,15,DEL PASO HEIGHTS,,april
4,55c36112e9a6a849c0ddc65eb54de7f1,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,19,19,RIO LINDA,,april


In [10]:
# 3 unique feed keys
all_trips.feed_key.value_counts()

55c36112e9a6a849c0ddc65eb54de7f1    130
e91cfff58ee7410589d8bf9940ed1c41    130
294252b3b4b42fbb31a31ce184fcb3f5     66
Name: feed_key, dtype: int64

In [11]:
# 2 unique gtfs dataset keys, as opposed to 3 unique feed keys
all_trips.schedule_gtfs_dataset_key.value_counts()

43a1e46d592a1ee647bce8422c68460c    260
cb3074eb8b423dfc5acfeeb0de95eb82     66
Name: schedule_gtfs_dataset_key, dtype: int64

In [12]:
# Expect 5 instances of each route_id: one per month, except June.
# Some routes appear only once, though: route 10 and route 137.
all_trips.route_id.value_counts()

001    5
011    5
138    5
142    5
161    5
      ..
078    5
075    5
F10    4
10     1
137    1
Name: route_id, Length: 67, dtype: int64

In [13]:
#similar results with route short name
all_trips.route_short_name.value_counts()

30     10
1       5
228     5
138     5
142     5
       ..
78      5
75      5
F10     4
10      1
137     1
Name: route_short_name, Length: 66, dtype: int64

In [14]:
# Look closer at routes 10 and 137: each has exactly one row, and both rows
# come from September.
for single_month_route in ("10", "137"):
    display(all_trips[all_trips["route_id"] == single_month_route])

Unnamed: 0,feed_key,name,schedule_gtfs_dataset_key,route_id,route_short_name,route_long_name,route_desc,month
54,294252b3b4b42fbb31a31ce184fcb3f5,Sacramento Schedule,cb3074eb8b423dfc5acfeeb0de95eb82,10,10,FSL Route 10,,sep


Unnamed: 0,feed_key,name,schedule_gtfs_dataset_key,route_id,route_short_name,route_long_name,route_desc,month
56,294252b3b4b42fbb31a31ce184fcb3f5,Sacramento Schedule,cb3074eb8b423dfc5acfeeb0de95eb82,137,137,UCDMC,,sep


In [15]:
# Other routes return multiple rows as expected.
# F10 shows up for April, May, July and August but not September, while
# route 10 (September only) carries the same long name, "FSL Route 10".
# That suggests the route_id changed from F10 to 10 rather than a new route
# being added.
all_trips[all_trips['route_id']=='F10']

Unnamed: 0,feed_key,name,schedule_gtfs_dataset_key,route_id,route_short_name,route_long_name,route_desc,month
57,55c36112e9a6a849c0ddc65eb54de7f1,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,F10,F10,FSL Route 10,,april
60,55c36112e9a6a849c0ddc65eb54de7f1,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,F10,F10,FSL Route 10,,may
58,e91cfff58ee7410589d8bf9940ed1c41,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,F10,F10,FSL Route 10,,july
57,e91cfff58ee7410589d8bf9940ed1c41,Sacramento Schedule,43a1e46d592a1ee647bce8422c68460c,F10,F10,FSL Route 10,,aug


In [16]:
#would like shape Id and pt array from previous approach, but need a join to make this work
#seek out preliminary joins via metabase


## Amanda's Stuff

In [42]:
from calitp_data_analysis.tables import tbls
from calitp_data_analysis import utils
from siuba import *


In [87]:
all_trips.columns

Index(['feed_key', 'name', 'schedule_gtfs_dataset_key', 'route_id',
       'route_short_name', 'route_long_name', 'route_desc', 'month'],
      dtype='object')

In [31]:
# Load the saved route-identification GeoDataFrame from the `gtfs_schedule`
# folder in GCS (months 04-09 of 2023, judging by the file name).
# NOTE(review): `SCHED_GCS` is imported at the top but the path is spelled out
# here -- confirm whether the constant points at this same folder.
aprl_sept_2023_routes = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/gtfs_schedule/route_identification_2023_m04_m09.parquet")

In [33]:
# Keep only the rows that belong to the Sacramento Schedule feed.
is_sacramento = aprl_sept_2023_routes["name"] == "Sacramento Schedule"
sac = aprl_sept_2023_routes[is_sacramento]

In [34]:
sac.sample()

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,geometry
618,dc9448ee3e4039db4fa5a482b7f69aef,recbzZQUIdMmFvm1r,Sacramento Schedule,11,45888,8,2023,


In [76]:
sac.shape

(393, 8)

In [35]:
sac.source_record_id.value_counts()

recbzZQUIdMmFvm1r    393
Name: source_record_id, dtype: int64

* https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.dim_organizations

In [77]:
# Query the provider <-> GTFS dataset table from the warehouse, keeping only
# the columns needed for joining and only the rows flagged as current.
# NOTE(review): the shape displayed right after this cell, (1269, 3), has
# fewer columns than the four selected here, so it looks like output from an
# earlier version of this cell -- re-run to refresh.
provider_gtfs_tbl = tbls.mart_transit_database.dim_provider_gtfs_data()

dim_provider_gtfs_data = (
    provider_gtfs_tbl
    >> select(
        _.organization_source_record_id,
        _.schedule_gtfs_dataset_key,
        _.service_name,
        _._is_current,
    )
    # `== True` builds a siuba filter expression; it is not a Python truth test.
    >> filter(_._is_current == True)
    >> collect()
)

In [58]:
dim_provider_gtfs_data.shape

(1269, 3)

In [80]:
dim_provider_gtfs_data2 = dim_provider_gtfs_data.dropna(subset = ['schedule_gtfs_dataset_key','service_name'])

In [81]:
dim_provider_gtfs_data2.shape

(307, 4)

In [84]:
# dim_provider_gtfs_data2.service_name.unique()

In [85]:
dim_provider_gtfs_data2.schedule_gtfs_dataset_key.nunique()

237

In [88]:
all_trips.shape

(326, 8)

In [86]:
# Outer-merge check on schedule_gtfs_dataset_key to see how well the trips
# line up with the current provider records.
# Reading the counts: the 260 `left_only` rows equal the number of trip rows
# carrying dataset key 43a1e46d..., and `both` is 198 = 66 x 3, i.e. only the
# 66 rows with key cb3074eb... match, each against three provider rows.
pd.merge(all_trips, dim_provider_gtfs_data2, on = "schedule_gtfs_dataset_key", how = "outer", indicator = True)[['_merge']].value_counts()

_merge    
right_only    304
left_only     260
both          198
dtype: int64

In [91]:
# Attach organization / service info to the trips.
# Uses the null-filtered provider table (`dim_provider_gtfs_data2`), the same
# frame the outer-merge check was run against, so provider rows without a
# dataset key or service name cannot slip into the result.
# The join fans out: a trip row is repeated once per matching provider row.
all_trips2 = pd.merge(
    all_trips,
    dim_provider_gtfs_data2,
    on="schedule_gtfs_dataset_key",
    how="inner",
)

In [90]:
all_trips2.head()

Unnamed: 0,feed_key,name,schedule_gtfs_dataset_key,route_id,route_short_name,route_long_name,route_desc,month,organization_source_record_id,service_name,_is_current
0,294252b3b4b42fbb31a31ce184fcb3f5,Sacramento Schedule,cb3074eb8b423dfc5acfeeb0de95eb82,1,1,GREENBACK,,sep,rec43oyrfhtPDdRHj,Rancho CordoVan,True
1,294252b3b4b42fbb31a31ce184fcb3f5,Sacramento Schedule,cb3074eb8b423dfc5acfeeb0de95eb82,1,1,GREENBACK,,sep,recX9lccSE1jmjsmG,SacRT Light Rail,True
2,294252b3b4b42fbb31a31ce184fcb3f5,Sacramento Schedule,cb3074eb8b423dfc5acfeeb0de95eb82,1,1,GREENBACK,,sep,recX9lccSE1jmjsmG,Sacramento Regional Transit District Bus,True
3,294252b3b4b42fbb31a31ce184fcb3f5,Sacramento Schedule,cb3074eb8b423dfc5acfeeb0de95eb82,11,11,NATOMAS/LAND PARK,,sep,rec43oyrfhtPDdRHj,Rancho CordoVan,True
4,294252b3b4b42fbb31a31ce184fcb3f5,Sacramento Schedule,cb3074eb8b423dfc5acfeeb0de95eb82,11,11,NATOMAS/LAND PARK,,sep,recX9lccSE1jmjsmG,SacRT Light Rail,True


In [69]:
sac.columns

Index(['key', 'source_record_id', 'name', 'route_id', 'shape_id', 'month',
       'year', 'geometry'],
      dtype='object')

In [92]:
# Try joining trips to the route geometries on the record ids.
# Nothing matches (`both` is 0): sac's only source_record_id is
# recbzZQUIdMmFvm1r, which is not among the organization ids in all_trips2.
# NOTE(review): presumably sac's source_record_id identifies the dataset
# rather than the organization -- confirm before relying on this join.
pd.merge(all_trips2, sac, left_on = "organization_source_record_id", right_on = "source_record_id", how = "outer", indicator = True)[['_merge']].value_counts()

_merge    
right_only    393
left_only     198
both            0
dtype: int64

In [93]:
# Joining on `name` alone is not usable: every row on both sides has the same
# name, so the result is the full cross product (198 x 393 = 77,814 rows).
pd.merge(all_trips2, sac, on = "name", how = "outer", indicator = True)[['_merge']].value_counts()

_merge    
both          77814
left_only         0
right_only        0
dtype: int64

In [94]:
# Joining on route_id alone leaves no all_trips2 row unmatched, but the join
# ignores month, so rows pair up across months and the result fans out
# (1,164 matched rows). 5 sac rows have a route_id that all_trips2 lacks.
pd.merge(all_trips2, sac, on = "route_id", how = "outer", indicator = True)[['_merge']].value_counts()

_merge    
both          1164
right_only       5
left_only        0
dtype: int64

In [96]:
sac.columns

Index(['key', 'source_record_id', 'name', 'route_id', 'shape_id', 'month',
       'year', 'geometry'],
      dtype='object')

In [73]:
# Join trips to the route geometries on the dataset key.
# The original line had an unterminated string for `right_on`, which made the
# cell a syntax error.
# NOTE(review): `key` is assumed to be the matching column in `sac` because it
# is the only hash-style key column there -- confirm it really corresponds to
# `schedule_gtfs_dataset_key` before trusting the result.
m1 = pd.merge(
    all_trips2,
    sac,
    left_on="schedule_gtfs_dataset_key",
    right_on="key",
    how="inner",
    indicator=True,
)