# Route Identification Over Time

Recent observations shows small chages in routes over time. Specifically in the following fields:
* route ID
* route short name
* route long name
* route desc

Need to observe these route changes in order to account for these changes in future analyses.

## Objective
1. Query data from `fct_monthly_routes` to help identify variences in Routes. Query for 2023, a couple of months. 
2. Save data to GCS `gtfs_schedule` bucket
3. Filter down data to `Sacramento Regional Transit`, identify and observe routes for any variences


## function from `open_data/download_vehicle_position.py`
    
    import datetime
    import gcsfs
    import geopandas as gpd
    import pandas as pd
    import shapely
    import sys

    from calitp_data_analysis.tables import tbls
    from calitp_data_analysis import utils
    from loguru import logger
    from siuba import *

    from shared_utils import schedule_rt_utils
    
    def download_vehicle_positions(
        date: str,
        operator_names: list
    ) -> pd.DataFrame:    
    
        df = (tbls.mart_gtfs.fct_vehicle_locations()
              >> filter(_.service_date == date)
              >> filter(_.gtfs_dataset_name.isin(operator_names))
              >> select(_.gtfs_dataset_key, _.gtfs_dataset_name,
                        _.schedule_gtfs_dataset_key,
                        _.trip_id, _.trip_instance_key,
                        _.location_timestamp,
                        _.location)
                  >> collect()
             )

In [1]:
#imports copied from download_vehicle_position.py script

import datetime
import gcsfs
import geopandas as gpd
import pandas as pd
import shapely
import sys

from calitp_data_analysis.tables import tbls
from calitp_data_analysis import utils
from loguru import logger
from siuba import *

from shared_utils import schedule_rt_utils



ModuleNotFoundError: No module named 'shared_utils'

In [2]:
# test to query fct_monthly_routes
#do i need to use gpd.dataframe?
def get_monthly_routes(
        year: str,
        months: list
    ) -> pd.DataFrame:    
    
        df = (tbls.mart_gtfs.fct_monthly_routes()
              >> filter(_.year == year)
              >> filter(_.month.isin(months))
              >> select(_.key, _.source_record_id,
                        _.name,
                        _.route_id, _.shape_id,
                        _.month,
                        _.year,
                       _.pt_array)
                  >> collect()
             )
        return df

In [3]:
df = get_monthly_routes(2023, [3, 4, 5])

  sqlalchemy.util.warn(


In [None]:
#make gpd of df 

In [None]:
#testing export to GCS > csuyat_folder

# 'gs://calitp-analytics-data/data-analyses/csuyat_folder/##FILENAME##.parquet'
# df.to_parquet()

#sucsessfully written to GCS, to csuyat_folder. need to export to gtfs_schedule folder 

#df.to_parquet('gs://calitp-analytics-data/data-analyses/csuyat_folder/route_identification_2023_m03_m05.parquet')

---

In [None]:
#peaking into df to make sure everything looks good

#shape shows 11,927 rows and 8 columns
display(df.shape)

#type shows data is in df
display(type(df))

#columns return all the columns we listed in the function
display(list(df.columns))

#value_counts confirm df only has rows from 2023 March to May
display(df.value_counts(subset=['year','month']))

In [4]:
#creating sub-df for 'Sacramento Schedule'
#195 rows, 8 columns
sac = df[df['name'] == 'Sacramento Schedule']

In [6]:
display(sac.shape)
display(sac.head(3))

(195, 8)

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
124,8f9cdae3394a95b1c82c1e293b1cd29c,recbzZQUIdMmFvm1r,Sacramento Schedule,176,45408,5,2023,[]
127,e89fb44e20a0f2f99e18b66c007109d1,recbzZQUIdMmFvm1r,Sacramento Schedule,84,45343,5,2023,[]
151,20a544c78a4d54b42c8b1127684d2c1d,recbzZQUIdMmFvm1r,Sacramento Schedule,86,45346,4,2023,"[POINT(-121.4921 38.57997), POINT(-121.49168 3..."


In [5]:
sac_routes = sac['route_id'].unique()
sac_routes

array(['176', '084', '086', '214', '134', '103', '011', 'F20', '30',
       '088', '129', '210', 'F10', '072', '030', '023', '033', '138',
       '507', '215', '105', '093', '109', '038', '205', '142', '067',
       '051', '227', '161', '113', '252', '206', '081', '078', '211',
       '025', '106', '021', '062', '019', '213', '519', '087', '228',
       '001', '075', '177', '248', '247', '056', '255', '533', '175',
       '026', '061', '013', '102', '068', '015', '226', '246', '124',
       '082', '212'], dtype=object)

## Next Steps

for every `name` and `route_id` in routes, need to see if each row is the same or not. Need to identify any variation in the routes. 



---

Trying to use a loop that will create a df for every route_id in sac_routes. but would need to do this for every `name` in the `fct_monthly_routes` df eventually?


In [7]:
#empty dictionary
sac_sub_route_ids = {}

#each element in sac_routes will be called route.
#for each route in sac_routes, query each row related to that route.(where ever you use the variable route, go 1-by-1 the differnet
#then, create a dataframe for each route and place it into the dictionary sub_dataframes
for route in sac_routes:
    sub_df = sac[sac['route_id'] == route]
    sac_sub_route_ids[route] = sub_df

In [15]:
#testing dictionary with route 23 and 88
sac_sub_route_ids['023']

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
944,d80d73174d1d318341a7af73a6fdc452,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,5,2023,[]
6322,1127c9acaecd57fd9543132998a72e3e,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,4,2023,"[POINT(-121.26715 38.67912), POINT(-121.26712 ..."
7176,754ea57f3eca954838e87899b729e7ba,recbzZQUIdMmFvm1r,Sacramento Schedule,23,44912,3,2023,[]


---

In [9]:
#list of unique route names from initial df
route_names = df['name'].unique()


In [10]:
#new loop that creates a dictionary of each unique schedule name with all its routes.
sub_route_name = {}

for name in route_names:
    sub_df2 = df[df['name'] == name]
    sub_route_name[name] = sub_df2
    

In [11]:
#test to see if new dictionary works
sub_route_name['Auburn Schedule']

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
98,7f2990f969db2d363954feefb54a3134,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,5,2023,"[POINT(-121.082834 38.903344), POINT(-121.0828..."
5195,d4a1fd712ec226a51120b701119508cf,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,4,2023,"[POINT(-121.082834 38.903344), POINT(-121.0828..."
9762,d8f952c0d2e81564c71be5846e408212,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,4,2023,[]


## Now I have `sac_sub_route_ids` and `sub_route_name` dictionaries

Examples of some noteable routes with slight variations over time.

In [12]:
#General observations for Sacramento Schedule: 
#shape_id changes every month. 
#pt_array changes every month, however, did get a warning upon initial query of data so may need to review query to account for geodata
#Month 4 has the point geom data

display(sac_sub_route_ids['088'])
display(sac_sub_route_ids['023'])
display(sac_sub_route_ids['105'])
display(sac_sub_route_ids['F20'])
display(sac_sub_route_ids['215'])

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
611,79872e3f47d4f8c9579aeb7c141649ad,recbzZQUIdMmFvm1r,Sacramento Schedule,88,44974,3,2023,[]
3425,fc4aecdbb2e90697aaca21929881da0c,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,4,2023,"[POINT(-121.491718 38.579866), POINT(-121.4915..."
7972,559586f993036a5d2aba23bbddb5645f,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,5,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
944,d80d73174d1d318341a7af73a6fdc452,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,5,2023,[]
6322,1127c9acaecd57fd9543132998a72e3e,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,4,2023,"[POINT(-121.26715 38.67912), POINT(-121.26712 ..."
7176,754ea57f3eca954838e87899b729e7ba,recbzZQUIdMmFvm1r,Sacramento Schedule,23,44912,3,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
1522,6a310f55a2c009eb82bd463b1da2a233,recbzZQUIdMmFvm1r,Sacramento Schedule,105,44984,3,2023,[]
4798,4cbfc3213874266d76ce4a2699894f0c,recbzZQUIdMmFvm1r,Sacramento Schedule,105,45360,5,2023,[]
5524,59ef04d42732bd19260312593f2c7a92,recbzZQUIdMmFvm1r,Sacramento Schedule,105,45359,4,2023,"[POINT(-121.390273 38.468759), POINT(-121.3902..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
460,2ed9e87018bcd053ef2ae87d3ebd7211,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45548,4,2023,"[POINT(-121.103892 38.656366), POINT(-121.1040..."
1857,87a3f535201f3e670f4c0d362c28d1f0,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45548,5,2023,[]
2238,e679982fb613ed9752b114e9187d2bab,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45166,3,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
1122,20edead480e1353c64ea919d72b81d51,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45065,3,2023,[]
6264,391282f0af7e612a0574c68ca695dac0,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45443,4,2023,"[POINT(-121.441354 38.560567), POINT(-121.4419..."
11269,0f363ffa5be5b8eb4f2e64a04541cc9a,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45443,5,2023,[]


In [13]:
#test to see other route names
#other route names have more point geometry than Sacramento.
display(sub_route_name['Santa Cruz Schedule'])
display(sub_route_name['Merced Schedule'])
display(sub_route_name['San Diego Schedule'])
display(sub_route_name['Roseville Schedule'])

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
248,cdca257b9ea92e5c290b30cfd77c5510,recpLgqMr7Y888HmN,Santa Cruz Schedule,68,shp-68-51,3,2023,"[POINT(-121.966217 36.975434), POINT(-121.9661..."
386,d514fedf271b839eb6347a4554b68111,recpLgqMr7Y888HmN,Santa Cruz Schedule,41,shp-41-51,5,2023,"[POINT(-122.150681 37.041912), POINT(-122.1505..."
804,b4af55ee91577d1f1bd32acf75e6a424,recpLgqMr7Y888HmN,Santa Cruz Schedule,72W,shp-72W-01,3,2023,"[POINT(-121.760071 36.909864), POINT(-121.7601..."
842,201ea89b2c29f55e5ce502a7996b8ed8,recpLgqMr7Y888HmN,Santa Cruz Schedule,42,shp-42-02,3,2023,"[POINT(-122.024475 36.970796), POINT(-122.0242..."
1107,d8e645e9fad457cbe5beab50c8a25625,recpLgqMr7Y888HmN,Santa Cruz Schedule,17,shp-17-51,4,2023,"[POINT(-121.902687 37.330288), POINT(-121.9026..."
...,...,...,...,...,...,...,...,...
11358,3f9f659217f9889f4543ea34f0999b1f,recpLgqMr7Y888HmN,Santa Cruz Schedule,42,shp-42-02,4,2023,"[POINT(-122.024475 36.970796), POINT(-122.0242..."
11399,98c9501609a3e22fc5ee8b53b0a0d357,recpLgqMr7Y888HmN,Santa Cruz Schedule,69A,shp-69A-01,5,2023,"[POINT(-122.024475 36.970616), POINT(-122.0244..."
11453,23a9f4169c53cd48d623ed691ddf680e,recpLgqMr7Y888HmN,Santa Cruz Schedule,20,shp-20-08,5,2023,"[POINT(-122.024574 36.971077), POINT(-122.0246..."
11510,475a6a0d8738c49e27334cd558041978,recpLgqMr7Y888HmN,Santa Cruz Schedule,17,shp-17-51,5,2023,"[POINT(-121.902687 37.330288), POINT(-121.9026..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
52,d8b6a849a3c0e00123bb98d0fff5d89c,reck9IsndFO0GtbQD,Merced Schedule,1196,p_111309,3,2023,"[POINT(-120.610352 37.341354), POINT(-120.6101..."
93,2c18f0ba8492cf8dc06a5257a42a7167,reck9IsndFO0GtbQD,Merced Schedule,1195,p_3370,3,2023,"[POINT(-120.610352 37.341354), POINT(-120.6101..."
299,6eec64c460aaf2b12ca1a09a06a8cdb3,reck9IsndFO0GtbQD,Merced Schedule,1210,p_3424,5,2023,"[POINT(-120.488174 37.301979), POINT(-120.4878..."
635,0099cf7f82bfbfdef7b8555e773976e9,reck9IsndFO0GtbQD,Merced Schedule,1211,p_3395,3,2023,"[POINT(-120.4873462 37.30157144), POINT(-120.4..."
1207,cdaa955a4459a8eb2329fa5460051c0b,reck9IsndFO0GtbQD,Merced Schedule,1205,p_3410,5,2023,"[POINT(-120.488224626 37.301779467), POINT(-12..."
1447,ef13d3231db6aeda35707fcdd74fd5e4,reck9IsndFO0GtbQD,Merced Schedule,1203,p_3413,5,2023,"[POINT(-120.488082 37.301944), POINT(-120.4878..."
2047,8517f4f9e6bce7fd1ed21e6fe8609267,reck9IsndFO0GtbQD,Merced Schedule,1207,p_3428,5,2023,"[POINT(-120.42700784 37.365634899), POINT(-120..."
2362,7dae88686d10055f33a2254b173ac457,reck9IsndFO0GtbQD,Merced Schedule,1210,p_3424,4,2023,"[POINT(-120.488174 37.301979), POINT(-120.4878..."
2823,3bec3db529df20ead5d5db8b1828b97b,reck9IsndFO0GtbQD,Merced Schedule,1201,p_111317,3,2023,"[POINT(-120.488108344 37.301915396), POINT(-12..."
3158,e6d9897907cfb8eafe22e75a660addab,reck9IsndFO0GtbQD,Merced Schedule,1208,p_1425548,3,2023,"[POINT(-120.73366 37.385471), POINT(-120.73366..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
21,773e951f8f141210b880ce4020afa282,recfZ9iWkptccoONX,San Diego Schedule,888,888_3_26,3,2023,"[POINT(-116.184481 32.617817), POINT(-116.1844..."
89,50ec5b704e452ceea2b333bb625ccd30,recfZ9iWkptccoONX,San Diego Schedule,202,202_8_33,4,2023,"[POINT(-117.213401 32.86879), POINT(-117.21339..."
103,352bd3d297d7619636ebdd19f04569f4,recfZ9iWkptccoONX,San Diego Schedule,891,891_2_11,5,2023,"[POINT(-116.97585297 32.79213333), POINT(-116...."
156,fef9a743793034e1f8881df37f26873d,recfZ9iWkptccoONX,San Diego Schedule,904,904_0_27,3,2023,"[POINT(-117.172515 32.67699), POINT(-117.17425..."
170,911d6233c41120aae8bbdd01920dae62,recfZ9iWkptccoONX,San Diego Schedule,972,972_8_17,3,2023,"[POINT(-117.224907 32.902712), POINT(-117.2246..."
...,...,...,...,...,...,...,...,...
11778,731d8510b183e6277d1fbd22a8ab286c,recfZ9iWkptccoONX,San Diego Schedule,891,891_3_12,4,2023,"[POINT(-116.375646 33.256837), POINT(-116.3756..."
11781,c5878c92afa55f849b8943a5791e3ab3,recfZ9iWkptccoONX,San Diego Schedule,967,967_9_51,3,2023,"[POINT(-117.107392 32.661632), POINT(-117.1070..."
11800,aca46669d1deaff4bc4e67e899fe78fe,recfZ9iWkptccoONX,San Diego Schedule,202,202_8_33,5,2023,"[POINT(-117.21340179 32.86878967), POINT(-117...."
11862,229659f4e2409007f0b112011a31f675,recfZ9iWkptccoONX,San Diego Schedule,945,945_1_14,4,2023,"[POINT(-117.080459 33.01678), POINT(-117.08056..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
61,524ce19358119f5d29ab59ad21cb031c,rec90jC43naXJz9lr,Roseville Schedule,9_AM,38273,3,2023,"[POINT(-121.335359 38.767776), POINT(-121.3354..."
388,0eb095a5ed520520db2f5d508be77849,rec90jC43naXJz9lr,Roseville Schedule,5_PM,38265,3,2023,"[POINT(-121.486544 38.578468), POINT(-121.4867..."
447,057d53bc5c378325001c04f19b87bf38,rec90jC43naXJz9lr,Roseville Schedule,8_AM,38270,5,2023,"[POINT(-121.335486 38.767616), POINT(-121.3354..."
572,fb3a327482baba9629433776beaa2ff3,rec90jC43naXJz9lr,Roseville Schedule,C,38283,3,2023,"[POINT(-121.264939 38.748362), POINT(-121.2646..."
718,9984090461487c525cb1b39c41acdced,rec90jC43naXJz9lr,Roseville Schedule,2_PM,38258,3,2023,"[POINT(-121.486544 38.578468), POINT(-121.4867..."
...,...,...,...,...,...,...,...,...
11135,181e285128289835571368cd87153072,rec90jC43naXJz9lr,Roseville Schedule,10_PM,38256,4,2023,"[POINT(-121.258986 38.759161), POINT(-121.2583..."
11251,a1f3ff483b324839cc9b4310f1386d6d,rec90jC43naXJz9lr,Roseville Schedule,9_AM,38273,5,2023,"[POINT(-121.335359 38.767776), POINT(-121.3354..."
11276,2757d2ab4d2efd28a79ad9bad26ed014,rec90jC43naXJz9lr,Roseville Schedule,9_AM,38273,4,2023,"[POINT(-121.335359 38.767776), POINT(-121.3354..."
11650,8b59539ef0af6882d063f8833f8efa62,rec90jC43naXJz9lr,Roseville Schedule,7_PM,38269,3,2023,"[POINT(-121.264939 38.748362), POINT(-121.2646..."


In [18]:
import importlib

importlib.reload(segment_speed_utils)
from segment_speed_utils.project_vars import SCHED_GCS, analysis_date

NameError: name 'segment_speed_utils' is not defined