# Route Identification Over Time

Recent observations show small changes in routes over time, specifically in the following fields:
* route ID
* route short name
* route long name
* route desc

We need to track these route changes so that future analyses can account for them.

## Objective
1. Query data from `fct_monthly_routes` to help identify variations in routes. Query a couple of months of 2023.
2. Save data to GCS `gtfs_schedule` bucket
3. Filter down data to `Sacramento Regional Transit`, then identify and observe routes for any variations


## function from `open_data/download_vehicle_position.py`
    
    import datetime
    import gcsfs
    import geopandas as gpd
    import pandas as pd
    import shapely
    import sys

    from calitp_data_analysis.tables import tbls
    from calitp_data_analysis import utils
    from loguru import logger
    from siuba import *

    from shared_utils import schedule_rt_utils
    
    def download_vehicle_positions(
        date: str,
        operator_names: list
    ) -> pd.DataFrame:    
        """
        Query `mart_gtfs.fct_vehicle_locations` for one service date and a
        set of operators.

        Rows are kept where `service_date` equals `date` and
        `gtfs_dataset_name` is in `operator_names`; only the dataset keys,
        dataset name, trip identifiers, location timestamp and location
        columns are selected before `collect()` is called.

        NOTE(review): this excerpt ends once `df` is built; no `return`
        statement is shown here, so check the original file for the rest
        of the function.
        """
    
        df = (tbls.mart_gtfs.fct_vehicle_locations()
              >> filter(_.service_date == date)
              >> filter(_.gtfs_dataset_name.isin(operator_names))
              >> select(_.gtfs_dataset_key, _.gtfs_dataset_name,
                        _.schedule_gtfs_dataset_key,
                        _.trip_id, _.trip_instance_key,
                        _.location_timestamp,
                        _.location)
                  >> collect()
             )

In [1]:
#imports

import datetime
import gcsfs
import geopandas as gpd
import pandas as pd
import shapely
import sys

from calitp_data_analysis.tables import tbls
from calitp_data_analysis import utils
from loguru import logger
from siuba import *

from shared_utils import schedule_rt_utils



ModuleNotFoundError: No module named 'shared_utils'

In [2]:
# test to query fct_monthly_routes
def get_monthly_routes(
    year: int,
    months: list
) -> pd.DataFrame:
    """
    Query `mart_gtfs.fct_monthly_routes` for a single year and a set of months.

    Args:
        year: value compared against the table's `year` column,
            passed as an integer (e.g. 2023).
        months: values matched against the table's `month` column
            with `isin` (e.g. [3, 4, 5]).

    Returns:
        The collected query result, limited to the columns key,
        source_record_id, name, route_id, shape_id, month, year and
        pt_array.
    """
    df = (
        tbls.mart_gtfs.fct_monthly_routes()
        >> filter(_.year == year)
        >> filter(_.month.isin(months))
        >> select(
            _.key, _.source_record_id,
            _.name,
            _.route_id, _.shape_id,
            _.month,
            _.year,
            _.pt_array,
        )
        >> collect()
    )
    return df

In [3]:
#pull March through May of 2023 from fct_monthly_routes
df = get_monthly_routes(year=2023, months=[3, 4, 5])

  sqlalchemy.util.warn(


In [None]:
#testing export to GCS > csuyat_folder

# 'gs://calitp-analytics-data/data-analyses/csuyat_folder/##FILENAME##.parquet'
# df.to_parquet()

#successfully written to GCS, to csuyat_folder. still need to export to the `gtfs_schedule` bucket (see Objective 2)

#df.to_parquet('gs://calitp-analytics-data/data-analyses/csuyat_folder/route_identification_2023_m03_m05.parquet')

---

In [None]:
#peeking into df to make sure everything looks good; each check is displayed in order
df_checks = [
    #shape shows 11,927 rows and 8 columns
    df.shape,
    #type shows data is in df
    type(df),
    #columns return all the columns we listed in the function
    list(df.columns),
    #value_counts confirm df only has rows from 2023 March to May
    df.value_counts(subset=['year','month']),
]

for check in df_checks:
    display(check)

In [6]:
#sub-df holding only the rows whose name is 'Sacramento Schedule'
#(195 rows, 8 columns when this was run)
is_sacramento = df['name'] == 'Sacramento Schedule'
sac = df[is_sacramento]

In [7]:
#size of the Sacramento sub-df, followed by its first three rows
display(sac.shape)
display(sac.iloc[:3])

(195, 8)

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
13,7d5bc4199721d12c326e68b76ed4e40b,recbzZQUIdMmFvm1r,Sacramento Schedule,62,45316,4,2023,"[POINT(-121.53569 38.48545), POINT(-121.535756..."
20,d63a9754f51a8834d558e84020e10f34,recbzZQUIdMmFvm1r,Sacramento Schedule,138,45389,4,2023,"[POINT(-121.753874 38.539257), POINT(-121.7536..."
268,274a0e075046066ef749de889c204785,recbzZQUIdMmFvm1r,Sacramento Schedule,248,45451,5,2023,[]


In [59]:
#distinct route_id values present in the Sacramento sub-df
sac_route_id_col = sac['route_id']
sac_routes = sac_route_id_col.unique()
sac_routes

array(['062', '138', '248', 'F10', '227', '013', '142', '124', '226',
       '033', '001', '067', '088', '211', '113', '093', '011', '023',
       '210', '177', '084', '026', '081', '087', '206', 'F20', '025',
       '056', '175', '102', '213', '015', '019', '30', '533', '075',
       '252', '030', '205', '109', '228', '068', '105', '176', '021',
       '507', '255', '082', '247', '061', '106', '051', '519', '134',
       '214', '086', '103', '161', '246', '072', '215', '129', '038',
       '212', '078'], dtype=object)

## Next Steps

For every `name` and `route_id` in routes, we need to check whether the rows are the same from month to month, and identify any variation in the routes.



---

Trying to use a loop that will create a df for every `route_id` in `sac_routes`. Eventually this would need to be done for every `name` in the `fct_monthly_routes` df?


In [45]:
#build a dictionary keyed by route_id:
#for each route_id in sac_routes, keep only the rows of sac that belong to that route
#and store that dataframe under the route_id,
#e.g. sac_sub_route_ids['088'] holds every row of sac whose route_id is '088'
sac_sub_route_ids = {
    route_id: sac[sac['route_id'] == route_id]
    for route_id in sac_routes
}

In [46]:
#spot-check the dictionary using route 088
sac_sub_route_ids['088']

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
1173,fc4aecdbb2e90697aaca21929881da0c,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,4,2023,"[POINT(-121.491718 38.579866), POINT(-121.4915..."
4271,79872e3f47d4f8c9579aeb7c141649ad,recbzZQUIdMmFvm1r,Sacramento Schedule,88,44974,3,2023,[]
10792,559586f993036a5d2aba23bbddb5645f,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,5,2023,[]


---

In [40]:
#unique values of the `name` column (the schedule names) from the initial df
schedule_name_col = df['name']
route_names = schedule_name_col.unique()


In [41]:
#dictionary keyed by schedule name:
#each entry is the slice of df holding every route row for that schedule name
sub_route_name = {
    schedule_name: df[df['name'] == schedule_name]
    for schedule_name in route_names
}
    

In [60]:
#spot-check that the new dictionary works, using the 'Auburn Schedule' entry
sub_route_name['Auburn Schedule']

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
5931,7f2990f969db2d363954feefb54a3134,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,5,2023,"[POINT(-121.082834 38.903344), POINT(-121.0828..."
8874,d8f952c0d2e81564c71be5846e408212,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,4,2023,[]
11383,d4a1fd712ec226a51120b701119508cf,recPN6fJseWncWhDZ,Auburn Schedule,32372,p_1303321,4,2023,"[POINT(-121.082834 38.903344), POINT(-121.0828..."


## Now I have `sac_sub_route_ids` and `sub_route_name` dictionaries

Examples of some notable routes with slight variations over time.

In [51]:
#General observations for Sacramento Schedule (based on the routes displayed below):
#shape_id differs between month 3 and month 4, but is the same for months 4 and 5.
#pt_array is only populated for month 4; months 3 and 5 come back as [].
#a warning was raised on the initial query, so the query may need to be reviewed to account for geodata.
for route_id in ['088', '023', '105', 'F20', '215']:
    display(sac_sub_route_ids[route_id])

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
1173,fc4aecdbb2e90697aaca21929881da0c,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,4,2023,"[POINT(-121.491718 38.579866), POINT(-121.4915..."
4271,79872e3f47d4f8c9579aeb7c141649ad,recbzZQUIdMmFvm1r,Sacramento Schedule,88,44974,3,2023,[]
10792,559586f993036a5d2aba23bbddb5645f,recbzZQUIdMmFvm1r,Sacramento Schedule,88,45350,5,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
1582,754ea57f3eca954838e87899b729e7ba,recbzZQUIdMmFvm1r,Sacramento Schedule,23,44912,3,2023,[]
11028,1127c9acaecd57fd9543132998a72e3e,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,4,2023,"[POINT(-121.26715 38.67912), POINT(-121.26712 ..."
11331,d80d73174d1d318341a7af73a6fdc452,recbzZQUIdMmFvm1r,Sacramento Schedule,23,45286,5,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
3687,59ef04d42732bd19260312593f2c7a92,recbzZQUIdMmFvm1r,Sacramento Schedule,105,45360,4,2023,"[POINT(-121.467411 38.483331), POINT(-121.4674..."
9173,6a310f55a2c009eb82bd463b1da2a233,recbzZQUIdMmFvm1r,Sacramento Schedule,105,44984,3,2023,[]
10411,4cbfc3213874266d76ce4a2699894f0c,recbzZQUIdMmFvm1r,Sacramento Schedule,105,45360,5,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
2153,e679982fb613ed9752b114e9187d2bab,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45166,3,2023,[]
5073,2ed9e87018bcd053ef2ae87d3ebd7211,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45548,4,2023,"[POINT(-121.103892 38.656366), POINT(-121.1040..."
6702,87a3f535201f3e670f4c0d362c28d1f0,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45548,5,2023,[]


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
7571,20edead480e1353c64ea919d72b81d51,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45065,3,2023,[]
10307,0f363ffa5be5b8eb4f2e64a04541cc9a,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45443,5,2023,[]
11016,391282f0af7e612a0574c68ca695dac0,recbzZQUIdMmFvm1r,Sacramento Schedule,215,45443,4,2023,"[POINT(-121.441354 38.560567), POINT(-121.4419..."


In [58]:
#look at a few other schedule names for comparison
#these schedules have more point geometry (pt_array) than Sacramento.
other_schedule_names = [
    'Santa Cruz Schedule',
    'Merced Schedule',
    'San Diego Schedule',
    'Roseville Schedule',
]

for schedule_name in other_schedule_names:
    display(sub_route_name[schedule_name])

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
48,bffb6dac22447af1f6bed8fc40babcb4,recpLgqMr7Y888HmN,Santa Cruz Schedule,42,shp-42-02,5,2023,"[POINT(-122.024475 36.970796), POINT(-122.0242..."
50,6bca3879727f4e45dfde8523e6c03600,recpLgqMr7Y888HmN,Santa Cruz Schedule,69W,shp-69W-04,5,2023,"[POINT(-121.760071 36.909864), POINT(-121.7601..."
531,5bd3acf8e37dc0a377deba30d879cc52,recpLgqMr7Y888HmN,Santa Cruz Schedule,WC,shp-WC-01,3,2023,"[POINT(-121.760223 36.909776), POINT(-121.7600..."
640,e656afe5bcc78f76963841e361e2f3d2,recpLgqMr7Y888HmN,Santa Cruz Schedule,18,shp-18-02,5,2023,"[POINT(-122.024574 36.971077), POINT(-122.0246..."
643,567edc958c5e6424f45e44e473601297,recpLgqMr7Y888HmN,Santa Cruz Schedule,15,shp-15-04,3,2023,"[POINT(-122.024574 36.971077), POINT(-122.0245..."
...,...,...,...,...,...,...,...,...
11504,7a46b99ecc96fb8451a76c1eab94fe86,recpLgqMr7Y888HmN,Santa Cruz Schedule,79,shp-79-03,4,2023,"[POINT(-121.760223 36.909776), POINT(-121.7600..."
11570,298ebedceb9926fe0e2b9aaa9f57cece,recpLgqMr7Y888HmN,Santa Cruz Schedule,69W,shp-69W-01,3,2023,[]
11581,de2a8d5c5abdf6253a4cc9eb3ae777a0,recpLgqMr7Y888HmN,Santa Cruz Schedule,69W,shp-69W-04,4,2023,"[POINT(-121.760071 36.909864), POINT(-121.7601..."
11764,3b3232ded18276c66c564612841d2945,recpLgqMr7Y888HmN,Santa Cruz Schedule,17,shp-17-51,3,2023,"[POINT(-121.902687 37.330288), POINT(-121.9026..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
54,07a6637ad0c2ff61a73338c9f056fc06,reck9IsndFO0GtbQD,Merced Schedule,1207,p_3428,3,2023,"[POINT(-120.42700784 37.365634899), POINT(-120..."
217,6eec64c460aaf2b12ca1a09a06a8cdb3,reck9IsndFO0GtbQD,Merced Schedule,1210,p_3424,5,2023,"[POINT(-120.488174 37.301979), POINT(-120.4878..."
223,0099cf7f82bfbfdef7b8555e773976e9,reck9IsndFO0GtbQD,Merced Schedule,1211,p_3395,3,2023,"[POINT(-120.4873462 37.30157144), POINT(-120.4..."
442,37cf18a1b5ae721c510800678bf69fcb,reck9IsndFO0GtbQD,Merced Schedule,1211,p_3395,4,2023,"[POINT(-120.4873462 37.30157144), POINT(-120.4..."
784,5cb9d3f7eb829e7e6892f15237094e25,reck9IsndFO0GtbQD,Merced Schedule,1208,p_1425548,4,2023,"[POINT(-120.73366 37.385471), POINT(-120.73366..."
964,ef13d3231db6aeda35707fcdd74fd5e4,reck9IsndFO0GtbQD,Merced Schedule,1203,p_3413,5,2023,"[POINT(-120.488082 37.301944), POINT(-120.4878..."
989,3bec3db529df20ead5d5db8b1828b97b,reck9IsndFO0GtbQD,Merced Schedule,1201,p_111317,3,2023,"[POINT(-120.488108344 37.301915396), POINT(-12..."
1843,818e8b7bc1b22a38487d748b654a73e2,reck9IsndFO0GtbQD,Merced Schedule,1881,p_1303791,5,2023,"[POINT(-120.875639 37.054647), POINT(-120.8757..."
1911,2854a583f0638b47919cd6059afcebae,reck9IsndFO0GtbQD,Merced Schedule,1206,p_3388,5,2023,"[POINT(-120.4881804 37.30196936), POINT(-120.4..."
2248,2c18f0ba8492cf8dc06a5257a42a7167,reck9IsndFO0GtbQD,Merced Schedule,1195,p_3370,3,2023,"[POINT(-120.610352 37.341354), POINT(-120.6101..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
93,74e177fcaf362d9657fcf68b3cfed554,recfZ9iWkptccoONX,San Diego Schedule,945,945_1_14,5,2023,"[POINT(-117.08045959 33.01678085), POINT(-117...."
115,6a88e700bc09a6b6e020a44e472c807d,recfZ9iWkptccoONX,San Diego Schedule,917,917_9_22,3,2023,"[POINT(-117.08576 32.709909), POINT(-117.08583..."
211,5905cdb0d2fbf99b8e5a1089f2ff5f5d,recfZ9iWkptccoONX,San Diego Schedule,904,904_1_30,4,2023,"[POINT(-117.166815 32.69444), POINT(-117.16680..."
250,cebe1e13a810d4312aa70f92e626c5bc,recfZ9iWkptccoONX,San Diego Schedule,20,20_1_229,3,2023,"[POINT(-117.080415 33.017061), POINT(-117.0802..."
297,8b4a07b5b12823684994183ba592180e,recfZ9iWkptccoONX,San Diego Schedule,872,872_9_22,4,2023,"[POINT(-116.975519 32.791553), POINT(-116.9755..."
...,...,...,...,...,...,...,...,...
11745,1d2d016b1952866f9003c45ae9463ede,recfZ9iWkptccoONX,San Diego Schedule,923,923_2_72,5,2023,"[POINT(-117.25450134 32.73608398), POINT(-117...."
11848,e081da9ef6d25a2ec0e56dc09f06a6a0,recfZ9iWkptccoONX,San Diego Schedule,225,225_0_17,3,2023,"[POINT(-116.939763 32.553277), POINT(-116.9398..."
11859,b7a82364488c1ee3e07fb7a2474d3c59,recfZ9iWkptccoONX,San Diego Schedule,712,712_2_83,3,2023,"[POINT(-117.084711 32.602847), POINT(-117.0847..."
11867,217ba6c2cb835855b8127f77d5f1843f,recfZ9iWkptccoONX,San Diego Schedule,892,892_2_12,3,2023,"[POINT(-116.975853 32.792134), POINT(-116.9758..."


Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
6,386c7e74022bb83f7bdf6a9a5c084339,rec90jC43naXJz9lr,Roseville Schedule,L,38291,4,2023,"[POINT(-121.282849 38.749896), POINT(-121.2840..."
113,8d6f07add3a0012e58c2c284c7dc3210,rec90jC43naXJz9lr,Roseville Schedule,M,38292,4,2023,"[POINT(-121.271435 38.772103), POINT(-121.2713..."
385,94f1e6d03d580c9cfe3dea0fd39e7fa0,rec90jC43naXJz9lr,Roseville Schedule,2_PM,38258,5,2023,"[POINT(-121.486544 38.578468), POINT(-121.4867..."
397,6a83c63e750643ab9ead65e60121db37,rec90jC43naXJz9lr,Roseville Schedule,D,38286,4,2023,"[POINT(-121.285198 38.748078), POINT(-121.2840..."
455,813a000f92acbaa797503f132ceee4ea,rec90jC43naXJz9lr,Roseville Schedule,9_PM,38275,5,2023,"[POINT(-121.259103 38.759223), POINT(-121.2584..."
...,...,...,...,...,...,...,...,...
10992,09c1b03517cf23f7696c8e930102de26,rec90jC43naXJz9lr,Roseville Schedule,3_AM,38259,5,2023,"[POINT(-121.309928 38.75917), POINT(-121.30992..."
11248,d0ba982cb191a431174852c2996e11b5,rec90jC43naXJz9lr,Roseville Schedule,A,38278,4,2023,"[POINT(-121.289661 38.722875), POINT(-121.2896..."
11250,c2a3488d5aa3377645c5b40f345845dc,rec90jC43naXJz9lr,Roseville Schedule,10_PM,38256,5,2023,"[POINT(-121.258986 38.759161), POINT(-121.2583..."
11275,82c6ce9ecd42327fb1aa6a42a56eb442,rec90jC43naXJz9lr,Roseville Schedule,7_PM,38269,5,2023,"[POINT(-121.264939 38.748362), POINT(-121.2646..."
