# Route Identification Over Time

Recent observations shows small chages in routes over time. Specifically in the following fields:
* route ID
* route short name
* route long name
* route desc

Need to observe these route changes in order to account for these changes in future analyses.

## Objective
1. Query data from `fct_monthly_routes` to help identify variences in Routes. Query for 2023, a couple of months. 
2. Save data to GCS `gtfs_schedule` bucket
3. Filter down data to `Sacramento Regional Transit`, identify and observe routes for any variences


## function from `open_data/download_vehicle_position.py`
    
    import datetime
    import gcsfs
    import geopandas as gpd
    import pandas as pd
    import shapely
    import sys

    from calitp_data_analysis.tables import tbls
    from calitp_data_analysis import utils
    from loguru import logger
    from siuba import *

    from shared_utils import schedule_rt_utils
    
    def download_vehicle_positions(
        date: str,
        operator_names: list
    ) -> pd.DataFrame:    
    
        df = (tbls.mart_gtfs.fct_vehicle_locations()
              >> filter(_.service_date == date)
              >> filter(_.gtfs_dataset_name.isin(operator_names))
              >> select(_.gtfs_dataset_key, _.gtfs_dataset_name,
                        _.schedule_gtfs_dataset_key,
                        _.trip_id, _.trip_instance_key,
                        _.location_timestamp,
                        _.location)
                  >> collect()
             )

In [1]:
#imports copied from download_vehicle_position.py script

import datetime
import gcsfs
import geopandas as gpd
import pandas as pd
import shapely
import sys

from calitp_data_analysis.tables import tbls
from calitp_data_analysis import utils
from loguru import logger
from siuba import *

from shared_utils import schedule_rt_utils



In [2]:
# test to query fct_monthly_routes
#do i need to use gpd.dataframe?
def get_monthly_routes(
        year: str,
        months: list
    ) -> pd.DataFrame:    
    
        df = (tbls.mart_gtfs.fct_monthly_routes()
              >> filter(_.year == year)
              >> filter(_.month.isin(months))
              >> select(_.key, _.source_record_id,
                        _.name,
                        _.route_id, _.shape_id,
                        _.month,
                        _.year,
                       _.pt_array)
                  >> collect()
             )
        return df

In [3]:
df = get_monthly_routes(2023, [3, 4, 5])

  sqlalchemy.util.warn(


In [4]:
#make gpd of df 

In [5]:
#testing export to GCS > csuyat_folder

# 'gs://calitp-analytics-data/data-analyses/csuyat_folder/##FILENAME##.parquet'
# df.to_parquet()

#sucsessfully written to GCS, to csuyat_folder. need to export to gtfs_schedule folder 

#df.to_parquet('gs://calitp-analytics-data/data-analyses/csuyat_folder/route_identification_2023_m03_m05.parquet')

---

In [6]:
#peaking into df to make sure everything looks good

#shape shows 11,927 rows and 8 columns
display(df.shape)

#type shows data is in df
display(type(df))

#columns return all the columns we listed in the function
display(list(df.columns))

#value_counts confirm df only has rows from 2023 March to May
display(df.value_counts(subset=['year','month']))

(11927, 8)

pandas.core.frame.DataFrame

['key',
 'source_record_id',
 'name',
 'route_id',
 'shape_id',
 'month',
 'year',
 'pt_array']

year  month
2023  5        4180
      3        3899
      4        3848
dtype: int64

In [7]:
#creating sub-df for 'Sacramento Schedule'
#195 rows, 8 columns
sac = df[df['name'] == 'Sacramento Schedule']

In [8]:
display(sac.shape)
display(sac.head(3))

(195, 8)

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
25,2bf17a56167a52716e84bb22ab15d3a4,recbzZQUIdMmFvm1r,Sacramento Schedule,102,45355,5,2023,[]
78,7f76b61c556dfb64a39d044b67982527,recbzZQUIdMmFvm1r,Sacramento Schedule,86,45346,5,2023,[]
128,7aab855c833e7042d5a55daeb91acfb1,recbzZQUIdMmFvm1r,Sacramento Schedule,206,45053,3,2023,[]


In [9]:
sac_routes = sac['route_id'].unique()


array(['102', '086', '206', '124', '030', '088', '227', '213', 'F20',
       '082', '109', '519', '228', '252', '106', '247', '023', '093',
       '533', '176', '019', '210', '129', '105', '214', '215', '021',
       '001', '067', '205', '025', '226', '248', '30', '026', '078',
       '246', '211', '084', '113', '011', 'F10', '081', '038', '051',
       '068', '033', '062', '087', '142', '175', '015', '507', '255',
       '177', '075', '134', '138', '072', '061', '103', '161', '013',
       '212', '056'], dtype=object)

In [19]:
len(sac_routes)

65

## Next Steps

for every `name` and `route_id` in routes, need to see if each row is the same or not. Need to identify any variation in the routes. 



---

Trying to use a loop that will create a df for every route_id in sac_routes. but would need to do this for every `name` in the `fct_monthly_routes` df eventually?


In [10]:
#empty dictionary
sac_sub_route_ids = {}

#each element in sac_routes will be called route.
#for each route in sac_routes, query each row related to that route.(where ever you use the variable route, go 1-by-1 the differnet
#then, create a dataframe for each route and place it into the dictionary sub_dataframes
for route in sac_routes:
    sub_df = sac[sac['route_id'] == route]
    sac_sub_route_ids[route] = sub_df

In [18]:
len(sac_sub_route_ids)

65

In [11]:
#testing dictionary with route 23 and 88
sac_sub_route_ids['023']

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
1463,754ea57f3eca954838e87899b729e7ba,recbzZQUIdMmFvm1r,Sacramento Schedule,23,44912,3,2023,[]
2769,1127c9acaecd57fd9543132998a72e3e,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,4,2023,"[POINT(-121.26715 38.67912), POINT(-121.26712 ..."
3874,d80d73174d1d318341a7af73a6fdc452,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,5,2023,[]


---

In [12]:
#list of unique route names from initial df
route_names = df['name'].unique()


In [13]:
#new loop that creates a dictionary of each unique schedule name with all its routes.
sub_route_name = {}

for name in route_names:
    sub_df2 = df[df['name'] == name]
    sub_route_name[name] = sub_df2
    

In [14]:
#test to see if new dictionary works
sub_route_name['Auburn Schedule']

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
3086,d8f952c0d2e81564c71be5846e408212,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,4,2023,[]
8942,d4a1fd712ec226a51120b701119508cf,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,4,2023,"[POINT(-121.082834 38.903344), POINT(-121.0828..."
11834,7f2990f969db2d363954feefb54a3134,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,5,2023,"[POINT(-121.082834 38.903344), POINT(-121.0828..."


## Now I have `sac_sub_route_ids` and `sub_route_name` dictionaries

Examples of some noteable routes with slight variations over time.

In [15]:
#General observations for Sacramento Schedule: 
#shape_id changes every month. 
#pt_array changes every month, however, did get a warning upon initial query of data so may need to review query to account for geodata
#Month 4 has the point geom data

display(sac_sub_route_ids['088'])
display(sac_sub_route_ids['023'])
display(sac_sub_route_ids['105'])
display(sac_sub_route_ids['F20'])
display(sac_sub_route_ids['215'])

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
367,fc4aecdbb2e90697aaca21929881da0c,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,4,2023,"[POINT(-121.491718 38.579866), POINT(-121.4915..."
10054,79872e3f47d4f8c9579aeb7c141649ad,recbzZQUIdMmFvm1r,Sacramento Schedule,88,44974,3,2023,[]
10721,559586f993036a5d2aba23bbddb5645f,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,5,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
1463,754ea57f3eca954838e87899b729e7ba,recbzZQUIdMmFvm1r,Sacramento Schedule,23,44912,3,2023,[]
2769,1127c9acaecd57fd9543132998a72e3e,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,4,2023,"[POINT(-121.26715 38.67912), POINT(-121.26712 ..."
3874,d80d73174d1d318341a7af73a6fdc452,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,5,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
2255,4cbfc3213874266d76ce4a2699894f0c,recbzZQUIdMmFvm1r,Sacramento Schedule,105,45360,5,2023,[]
8105,6a310f55a2c009eb82bd463b1da2a233,recbzZQUIdMmFvm1r,Sacramento Schedule,105,44983,3,2023,[]
11578,59ef04d42732bd19260312593f2c7a92,recbzZQUIdMmFvm1r,Sacramento Schedule,105,45359,4,2023,"[POINT(-121.390273 38.468759), POINT(-121.3902..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
544,87a3f535201f3e670f4c0d362c28d1f0,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45548,5,2023,[]
4922,2ed9e87018bcd053ef2ae87d3ebd7211,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45548,4,2023,"[POINT(-121.103892 38.656366), POINT(-121.1040..."
11339,e679982fb613ed9752b114e9187d2bab,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45166,3,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
2379,0f363ffa5be5b8eb4f2e64a04541cc9a,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45443,5,2023,[]
4531,20edead480e1353c64ea919d72b81d51,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45065,3,2023,[]
8307,391282f0af7e612a0574c68ca695dac0,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45443,4,2023,"[POINT(-121.441354 38.560567), POINT(-121.4419..."


In [16]:
#test to see other route names
#other route names have more point geometry than Sacramento.
display(sub_route_name['Santa Cruz Schedule'])
display(sub_route_name['Merced Schedule'])
display(sub_route_name['San Diego Schedule'])
display(sub_route_name['Roseville Schedule'])

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
126,d4ed292a00561394dded3329bdfd32f7,recpLgqMr7Y888HmN,Santa Cruz Schedule,74S,shp-74S-03,5,2023,"[POINT(-121.760223 36.909776), POINT(-121.7600..."
168,769e5860f6a82ed40d46ab45970918ff,recpLgqMr7Y888HmN,Santa Cruz Schedule,10,shp-10-02,4,2023,"[POINT(-122.024574 36.971077), POINT(-122.0246..."
294,23a9f4169c53cd48d623ed691ddf680e,recpLgqMr7Y888HmN,Santa Cruz Schedule,20,shp-20-08,5,2023,"[POINT(-122.024574 36.971077), POINT(-122.0246..."
410,a07fb556d29c8d8c9909f97f1600efc4,recpLgqMr7Y888HmN,Santa Cruz Schedule,10,shp-10-02,3,2023,"[POINT(-122.024574 36.971077), POINT(-122.0246..."
508,01047c62c8b1adf840a493ab5bab437e,recpLgqMr7Y888HmN,Santa Cruz Schedule,69A,shp-69A-01,4,2023,"[POINT(-122.024475 36.970616), POINT(-122.0244..."
...,...,...,...,...,...,...,...,...
11150,07e6063587f383c5688f0caa77a62c75,recpLgqMr7Y888HmN,Santa Cruz Schedule,75,shp-75-01,3,2023,"[POINT(-121.760223 36.909776), POINT(-121.7600..."
11206,8753a4f8765a0bdf57ba480a9bd6526c,recpLgqMr7Y888HmN,Santa Cruz Schedule,74S,shp-74S-03,4,2023,"[POINT(-121.760223 36.909776), POINT(-121.7600..."
11237,d514fedf271b839eb6347a4554b68111,recpLgqMr7Y888HmN,Santa Cruz Schedule,41,shp-41-01,5,2023,"[POINT(-122.024475 36.970796), POINT(-122.0242..."
11270,5bd3acf8e37dc0a377deba30d879cc52,recpLgqMr7Y888HmN,Santa Cruz Schedule,WC,shp-WC-01,3,2023,"[POINT(-121.760223 36.909776), POINT(-121.7600..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
116,917593ed76150bb7fcd4ed1d5d097d09,reck9IsndFO0GtbQD,Merced Schedule,1201,p_111317,4,2023,"[POINT(-120.488108344 37.301915396), POINT(-12..."
245,28d3ef5f3e74db3c913e1b1cf71969d4,reck9IsndFO0GtbQD,Merced Schedule,1212,p_3398,3,2023,"[POINT(-120.488200486 37.301777334), POINT(-12..."
648,32af8f6fd952af9948fd3c6a6d7ce1d6,reck9IsndFO0GtbQD,Merced Schedule,1212,p_3398,5,2023,"[POINT(-120.488200486 37.301777334), POINT(-12..."
698,07a6637ad0c2ff61a73338c9f056fc06,reck9IsndFO0GtbQD,Merced Schedule,1207,p_3428,3,2023,"[POINT(-120.42700784 37.365634899), POINT(-120..."
875,49a58177bf7803a9e1890348594a687e,reck9IsndFO0GtbQD,Merced Schedule,1212,p_3398,4,2023,"[POINT(-120.488200486 37.301777334), POINT(-12..."
1097,a3fa07beadbfc32d9d58dc01d753626c,reck9IsndFO0GtbQD,Merced Schedule,1196,p_111309,4,2023,"[POINT(-120.610352 37.341354), POINT(-120.6101..."
1365,cdaa955a4459a8eb2329fa5460051c0b,reck9IsndFO0GtbQD,Merced Schedule,1205,p_3410,5,2023,"[POINT(-120.488224626 37.301779467), POINT(-12..."
1898,c4b550d3071779c512a82656ec6bd80f,reck9IsndFO0GtbQD,Merced Schedule,1881,p_1303791,4,2023,"[POINT(-120.875639 37.054647), POINT(-120.8757..."
1923,6eec64c460aaf2b12ca1a09a06a8cdb3,reck9IsndFO0GtbQD,Merced Schedule,1210,p_3424,5,2023,"[POINT(-120.488174 37.301979), POINT(-120.4878..."
2088,971a90a5ec3cc04da6214ab3445ad93e,reck9IsndFO0GtbQD,Merced Schedule,1204,p_1425522,5,2023,"[POINT(-120.47724 37.357109), POINT(-120.47723..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
232,c1247a4cbecb56b2803e24d05e4b956c,recfZ9iWkptccoONX,San Diego Schedule,709,709_3_122,4,2023,"[POINT(-116.965605 32.627924), POINT(-116.9657..."
236,03cc1f6f1b95b051f69c1c8300d2d2de,recfZ9iWkptccoONX,San Diego Schedule,815,815_3_20,3,2023,"[POINT(-116.92939 32.801228), POINT(-116.92850..."
299,11fa72d6fcdc1a236daf68bd2b993bfc,recfZ9iWkptccoONX,San Diego Schedule,856,856_1_130,3,2023,"[POINT(-117.071622 32.77309), POINT(-117.07159..."
308,9193292efc53a90d91edc3f4e9b2bde0,recfZ9iWkptccoONX,San Diego Schedule,974,974_8_4,4,2023,"[POINT(-117.225251 32.902155), POINT(-117.2252..."
322,9e4b89dc53a7a65c7fd3580b77c4d3f4,recfZ9iWkptccoONX,San Diego Schedule,921,921_2_65,5,2023,"[POINT(-117.21383667 32.86932755), POINT(-117...."
...,...,...,...,...,...,...,...,...
11857,f35b0a49641b28b308537ff23630f997,recfZ9iWkptccoONX,San Diego Schedule,964,964_3_41,4,2023,"[POINT(-117.095897 32.895395), POINT(-117.0958..."
11875,725cf8a0760267eba1ec442aedcc6413,recfZ9iWkptccoONX,San Diego Schedule,204,204_8_7,5,2023,"[POINT(-117.21379852 32.86833954), POINT(-117...."
11879,6ebb5534ce901421048683259ceff7c7,recfZ9iWkptccoONX,San Diego Schedule,520,520_3_265,3,2023,"[POINT(-116.975713 32.805203), POINT(-116.9757..."
11907,3d7e7921a0f8ab1f75476aa52b927a6c,recfZ9iWkptccoONX,San Diego Schedule,2,2_0_81,4,2023,"[POINT(-117.165795 32.71882), POINT(-117.1656 ..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
188,b39e777f5907e9335ddfcc7ff444e957,rec90jC43naXJz9lr,Roseville Schedule,2_AM,38257,5,2023,"[POINT(-121.309928 38.75917), POINT(-121.30992..."
301,fb3a327482baba9629433776beaa2ff3,rec90jC43naXJz9lr,Roseville Schedule,C,38283,3,2023,"[POINT(-121.264939 38.748362), POINT(-121.2646..."
467,49248bddf141b6e85d63fcd4d31081dc,rec90jC43naXJz9lr,Roseville Schedule,7_PM,38269,4,2023,"[POINT(-121.264939 38.748362), POINT(-121.2646..."
796,abd2cbff64d1c209a835798d23789acc,rec90jC43naXJz9lr,Roseville Schedule,F,38288,5,2023,"[POINT(-121.264939 38.748362), POINT(-121.2646..."
846,bb7ae03d2dfd3834f869599ec08c6c0c,rec90jC43naXJz9lr,Roseville Schedule,B,38282,5,2023,"[POINT(-121.284084 38.74894), POINT(-121.28519..."
...,...,...,...,...,...,...,...,...
11297,e2a062df19a670b8d7d5c7249b389ac3,rec90jC43naXJz9lr,Roseville Schedule,F,38288,3,2023,"[POINT(-121.264939 38.748362), POINT(-121.2646..."
11503,dc5135aa666b0e67ee17b89b0a889cc6,rec90jC43naXJz9lr,Roseville Schedule,9_PM,38275,3,2023,"[POINT(-121.259103 38.759223), POINT(-121.2584..."
11533,8d4e6db62f2f6c8500e40bae3bccafa0,rec90jC43naXJz9lr,Roseville Schedule,L,38291,3,2023,"[POINT(-121.282849 38.749896), POINT(-121.2840..."
11819,7878558ff14c481f34f792a498382256,rec90jC43naXJz9lr,Roseville Schedule,1_PM,38254,3,2023,"[POINT(-121.486544 38.578468), POINT(-121.4867..."


In [17]:
import importlib

importlib.reload(segment_speed_utils)
from segment_speed_utils.project_vars import SCHED_GCS, analysis_date

NameError: name 'segment_speed_utils' is not defined