# Route Identification Over Time

Recent observations show small changes in routes over time, specifically in the following fields:
* route ID
* route short name
* route long name
* route desc

Need to observe these route changes in order to account for these changes in future analyses.

## Objective
1. Query data from `fct_monthly_routes` to help identify variances in routes. Query a couple of months from 2023.
2. Save data to GCS `gtfs_schedule` bucket
3. Filter down data to `Sacramento Regional Transit`, identify and observe routes for any variances


## function from `open_data/download_vehicle_position.py`
    
    import datetime
    import gcsfs
    import geopandas as gpd
    import pandas as pd
    import shapely
    import sys

    from calitp_data_analysis.tables import tbls
    from calitp_data_analysis import utils
    from loguru import logger
    from siuba import *

    from shared_utils import schedule_rt_utils
    
    def download_vehicle_positions(
        date: str,
        operator_names: list
    ) -> pd.DataFrame:    
        """
        Query `mart_gtfs.fct_vehicle_locations` for one service date and a
        set of operators.

        Rows are kept where `service_date` equals `date` and
        `gtfs_dataset_name` is one of `operator_names`; only the dataset
        keys/names, trip identifiers, location timestamp and location
        columns are selected before the query is collected.

        NOTE(review): as excerpted here the function has no `return`
        statement even though it is annotated `-> pd.DataFrame` --
        presumably the excerpt is truncated; confirm against
        `open_data/download_vehicle_position.py`.
        """
    
        df = (tbls.mart_gtfs.fct_vehicle_locations()
              >> filter(_.service_date == date)
              >> filter(_.gtfs_dataset_name.isin(operator_names))
              >> select(_.gtfs_dataset_key, _.gtfs_dataset_name,
                        _.schedule_gtfs_dataset_key,
                        _.trip_id, _.trip_instance_key,
                        _.location_timestamp,
                        _.location)
                  >> collect()
             )

In [3]:
#imports

import datetime
import gcsfs
import geopandas as gpd
import pandas as pd
import shapely
import sys

from calitp_data_analysis.tables import tbls
from calitp_data_analysis import utils
from loguru import logger
from siuba import *

from shared_utils import schedule_rt_utils

ModuleNotFoundError: No module named 'shared_utils'

In [4]:
# test to query fct_monthly_routes
def get_monthly_routes(
    year: int,
    months: list
) -> pd.DataFrame:
    """
    Query `mart_gtfs.fct_monthly_routes` for selected months of one year.

    Parameters
    ----------
    year
        Year to keep, compared for equality against the table's `year`
        column (e.g. `2023`). Annotated as `int` because that is how the
        notebook calls it; the previous `str` hint did not match usage.
    months
        Month values to keep, matched against the table's `month` column
        (e.g. `[3, 4, 5]`).

    Returns
    -------
    pd.DataFrame
        One row per record matching the filters, with the columns
        `key`, `source_record_id`, `name`, `route_id`, `shape_id`,
        `month`, `year` and `pt_array`.
    """
    monthly_routes = (
        tbls.mart_gtfs.fct_monthly_routes()
        >> filter(_.year == year)
        >> filter(_.month.isin(months))
        >> select(
            _.key,
            _.source_record_id,
            _.name,
            _.route_id,
            _.shape_id,
            _.month,
            _.year,
            _.pt_array,
        )
        # Run the query and pull the result into memory.
        >> collect()
    )
    return monthly_routes

In [5]:
df = get_monthly_routes(2023, [3, 4, 5])

  sqlalchemy.util.warn(


In [None]:
#testing export to GCS > csuyat_folder
# 'gs://calitp-analytics-data/data-analyses/csuyat_folder/##FILENAME##.parquet'
# df.to_parquet()

#successfully written to GCS

df.to_parquet('gs://calitp-analytics-data/data-analyses/csuyat_folder/route_identification_2023_m03_m05.parquet')

---

In [6]:
#peeking into df to make sure everything looks good

#shape shows 11,927 rows and 8 columns
display(df.shape)

#type shows data is in df
display(type(df))

#columns return all the columns we listed in the function
display(list(df.columns))

#value_counts confirm df only has rows from 2023 March to May
display(df.value_counts(subset=['year','month']))

(11927, 8)

pandas.core.frame.DataFrame

['key',
 'source_record_id',
 'name',
 'route_id',
 'shape_id',
 'month',
 'year',
 'pt_array']

year  month
2023  5        4180
      3        3899
      4        3848
dtype: int64

In [8]:
#creating sub-df for 'Sacramento Schedule'
#195 rows, 8 columns
sac = df[df['name'] == 'Sacramento Schedule']

In [10]:
display(sac.shape)
display(sac.head(3))
display(sac.route_id.unique())

(195, 8)

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
30,0e71513e4352259016b51891442e272d,recbzZQUIdMmFvm1r,Sacramento Schedule,051,45303,4,2023,"[POINT(-121.434585 38.50019), POINT(-121.43444..."
71,d63a9754f51a8834d558e84020e10f34,recbzZQUIdMmFvm1r,Sacramento Schedule,138,45389,4,2023,"[POINT(-121.753874 38.539257), POINT(-121.7536..."
168,87a3f535201f3e670f4c0d362c28d1f0,recbzZQUIdMmFvm1r,Sacramento Schedule,F20,45548,5,2023,[]


array(['051', '138', 'F20', '081', '105', '019', '106', '030', '212',
       '026', '206', '082', '175', '033', '088', '093', '062', '247',
       '213', '124', '067', 'F10', '252', '078', '214', '176', '246',
       '102', '161', '072', '086', '013', '001', '075', '177', '507',
       '068', '015', '056', '038', '248', '023', '109', '533', '255',
       '30', '025', '205', '211', '228', '129', '113', '519', '061',
       '103', '227', '142', '226', '084', '021', '210', '011', '134',
       '087', '215'], dtype=object)

In [54]:
sac_routes = pd.Series(sac['route_id'].unique())

In [55]:
sac_routes

0     051
1     138
2     F20
3     081
4     105
     ... 
60    210
61    011
62    134
63    087
64    215
Length: 65, dtype: object

In [63]:
#test to see if i can select certain rows 
sac[sac['route_id']==sac_routes.iloc[63]]

Unnamed: 0,key,source_record_id,name,route_id,shape_id,month,year,pt_array
8495,09d85c24a2d1149571fd5838c9e4fb7c,recbzZQUIdMmFvm1r,Sacramento Schedule,87,45347,5,2023,[]
9855,49d3ba651367edca3da47d9bf021dc8f,recbzZQUIdMmFvm1r,Sacramento Schedule,87,45347,4,2023,"[POINT(-121.427541 38.552818), POINT(-121.4273..."
10948,97fc1ffd6d98c985a5119adc039c7b40,recbzZQUIdMmFvm1r,Sacramento Schedule,87,44971,3,2023,[]


In [None]:
#function that pulls a list of unique route IDs from the df
#then checks each route to see if all columns match for each month

def route_checker(df):
    #get list of unique route_id
    df_routes = pd.Series(df['route_id'].unique())
    
    #for every unique route, pull all the rows for that route, then check to see if all rows in that sub set are equal.
    route_check = df_routes[
    