## Tiger Census
* https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2019/TGRSHP2019_TechDoc.pdf
* S1200 - secondary road
* S1100 - primary road
* S1400 - local roads


In [1]:
# import dask.dataframe as dd
# import dask_geopandas as dg

import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
import datetime


from segment_speed_utils.project_vars import analysis_date
from segment_speed_utils import helpers
from shared_utils import  dask_utils, geography_utils, utils

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
SHARED_GCS = f"{GCS_FILE_PATH}shared_data/"


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### GTFS Shapes

In [3]:
# test.plot()

In [4]:
def gtfs_shapes_operators(date):
    """
    Load and merge gtfs_shapes 
    with trips to get operator and 
    feed key information.
    
    Args:
        date: date wanted for the datasets to be drawn from
        
    Returns:
        GeoDataFrame
    """
    gtfs_shapes = helpers.import_scheduled_shapes(date).compute()
    
    trips = helpers.import_scheduled_trips(date,(),['name','shape_array_key']).compute().drop_duplicates()
    
    m1 = pd.merge(gtfs_shapes, trips, how="outer", on="shape_array_key")
    return m1

In [5]:
def order_operators(date) -> list:
    """
    Re order a list of operators 
    so some of the largest ones will be at the top of 
    the list.
     
    Args:
        date: date wanted for the datasets to be drawn from   
    """
    operator_list = helpers.import_scheduled_trips(analysis_date,(),['name']).compute().sort_values('name')
    operator_list = operator_list.name.unique().tolist()
    
    # Reorder list so the biggest operators are at the beginning
    # based on NTD services data 
    big_operators = ['LA DOT Schedule',
     'LA Metro Bus Schedule',
     'LA Metro Rail Schedule',
     'Bay Area 511 Muni Schedule',
     'Bay Area 511 AC Transit Schedule',
     'Bay Area 511 Santa Clara Transit Schedule',
     'Bay Area 511 BART Schedule',
     'San Diego Schedule','OCTA Schedule','Sacramento Schedule',
    ]
    i = 0
    for operator in big_operators:
        operator_list.remove(operator)
        operator_list.insert(i, operator)
        ++i
    return operator_list

### Tiger - Load Roads

In [6]:
def load_roads(road_type_wanted:list, buffer_or_not: bool = False):
    """
    Load some or all of the roads based on what you filter.
    Can also buffer the roads or not. 
    
    Args:
        road_type_wanted (list): the type of roads you want.
        https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2019/TGRSHP2019_TechDoc.pdf
        
        buffer_or_not (bool): add a buffer of 200 or not
    
    Returns: 
        GeoDataFrame
    """
    df = gpd.read_parquet(
         f"{SHARED_GCS}all_roads_2020_state06.parquet", 
         filters =[('MTFCC', 'in', road_type_wanted)],
         columns = ["LINEARID","geometry", "FULLNAME"]
    ).to_crs(geography_utils.CA_NAD83Albers)
    
    if buffer_or_not:
        df = df.assign(
            geometry = df.geometry.buffer(200)
        )
    else:
        df
    df = to_snakecase(df)
    
    return df

In [7]:
all_roads_no_buffer = load_roads(['S1100','S1200','S1400'])

In [8]:
all_roads_no_buffer.sample()

Unnamed: 0,linearid,geometry,fullname
30847,110173506431,"LINESTRING (-182411.409 -68991.959, -182392.508 -68997.262, -182363.936 -69005.222, -182336.085 -69013.834, -182317.371 -69019.578, -182300.868 -69025.385, -182256.805 -69040.909, -182251.360 -69042.695, -182234.848 -69048.058)",Moraga Dr


In [9]:
len(all_roads_no_buffer), all_roads_no_buffer.linearid.nunique()

(954665, 953914)

In [10]:
test = gpd.read_parquet('gs://calitp-analytics-data/data-analyses/shared_data/all_roads_gtfs_shapes.parquet')

In [11]:
test.linearid.nunique(), len(test)

(317793, 317946)

In [12]:
len(test.drop_duplicates())

317793

In [13]:
test.sample()

Unnamed: 0,linearid,geometry,fullname
249210,110413811260,"LINESTRING (-200859.901 -48683.627, -200837.093 -48711.307)",Commons Ln


### Tiger Local Roads

#### Test with one operator first

In [14]:
#one_op = 'Bear Schedule'

In [15]:
#shapes_filtered = shapes.loc[shapes.name == one_op].reset_index(drop = True)

In [16]:
#sjoin1 = gpd.sjoin(
#        local_roads,
#       shapes_filtered,
 #       how = "inner",
 #     predicate = "intersects").drop_duplicates().reset_index(drop=True)

In [17]:
#sjoin1.shape

In [18]:
#sjoin1.linearid.nunique()

In [19]:
#sjoin1.columns

In [20]:
#linearid_del = sjoin1.linearid.unique().tolist()

In [21]:
#localroads_filtered = local_roads[~local_roads.linearid.isin(linearid_del)].reset_index(drop = True)

In [22]:
#local_roads.linearid.nunique() - localroads_filtered.linearid.nunique()

#### A few operators

In [23]:
# shapes = gtfs_shapes_operators(analysis_date)

In [24]:
# local_roads.linearid.nunique(), local_roads.linearid.count()

In [25]:
# local_roads.linearid.count()-local_roads.linearid.nunique()

In [26]:
# shapes_filtered.name.unique()

In [27]:
# sjoin_full_results = pd.DataFrame()

In [28]:
"""
for operator in test_operators:
        try:
            linearid_to_delete = sjoin_full_results.linearid.unique().tolist()
            local_roads = local_roads[~local_roads.linearid.isin(linearid_to_delete)].reset_index(drop = True)
        except:
            pass
        
        shapes_filtered = shapes.loc[shapes.name == operator].reset_index(drop = True)
    
        sjoin1 = gpd.sjoin(
        local_roads,
        shapes_filtered,
        how = "inner",
        predicate = "intersects").drop_duplicates().reset_index(drop=True)
        
        sjoin_full_results = pd.concat([sjoin_full_results, sjoin1], axis=0)"""

'\nfor operator in test_operators:\n        try:\n            linearid_to_delete = sjoin_full_results.linearid.unique().tolist()\n            local_roads = local_roads[~local_roads.linearid.isin(linearid_to_delete)].reset_index(drop = True)\n        except:\n            pass\n        \n        shapes_filtered = shapes.loc[shapes.name == operator].reset_index(drop = True)\n    \n        sjoin1 = gpd.sjoin(\n        local_roads,\n        shapes_filtered,\n        how = "inner",\n        predicate = "intersects").drop_duplicates().reset_index(drop=True)\n        \n        sjoin_full_results = pd.concat([sjoin_full_results, sjoin1], axis=0)'

In [29]:
#sjoin_full_results.shape

In [30]:
#sjoin_full_results.linearid.nunique()

In [31]:
#sjoin_full_results.linearid.count()

In [32]:
#sjoin_full_results.drop(columns = ['geometry']).sample(5)

In [33]:
# test_operators = ['LA DOT Schedule','Bell Gardens Schedule','Nevada County Schedule','San Diego Schedule','OCTA Schedule','Sacramento Schedule']

In [34]:
# test_localroads = join_local_roads(analysis_date)

In [35]:
# test_localroads.shape

In [36]:
# f"{GCS_FILE_PATH}shared_data/"

In [37]:
# f'{SHARED_GCS}local_roads_gtfs_shapes.parquet'

In [38]:
# test_localroads.linearid.nunique()

In [39]:
# local_roads = load_roads(['S1400'])

In [40]:
# local_roads.linearid.nunique()

In [41]:
# test_localroads.linearid.nunique()

### Concat local roads and primary/secondary ones

In [42]:
def join_local_roads(date):
    
    # Load Shapes
    gtfs_shape_gdf = gtfs_shapes_operators(date)
    
    # Load local roads - buffered
    local_roads_buffered = load_roads(['S1400'], True)

    # Load local roads - not buffered
    local_roads_og = load_roads(['S1400'], False)
    
    # Find all unique operators, ordered by
    # largest operators first
    operators_list = order_operators(date)
    
    # Empty dataframe
    sjoin_full_results = pd.DataFrame()
    
    # Loop through and join by operator
    for operator in operators_list:
        
        shapes_filtered = gtfs_shape_gdf.loc[gtfs_shape_gdf.name == operator].reset_index(drop = True)
        
        # Delete any local road linearids that have already been found by an operator
        try:
            linearid_to_delete = sjoin_full_results.linearid.unique().tolist()
            local_roads_buffered = local_roads_buffered[~local_roads_buffered.linearid.isin(linearid_to_delete)].reset_index(drop = True)
        except:
            pass
        
        sjoin1 = (gpd.sjoin(
        local_roads_buffered,
        shapes_filtered,
        how = "inner",
        predicate = "intersects")
                  [['linearid']]
                  .drop_duplicates()
                  .reset_index(drop=True)
                 )
        
        sjoin_full_results = pd.concat([sjoin_full_results, sjoin1], axis=0)
    
    sjoin_full_results = sjoin_full_results.drop_duplicates()
    sjoin_full_results.to_parquet(f'{SHARED_GCS}local_roads_gtfs_shapes.parquet')
    
    # Merge back to original local roads -> a GDF
    merge1 = pd.merge(
        local_roads_og,
        sjoin_full_results,
        on = "linearid",
        how = "inner")
    
    # Save
    merge1.to_parquet(f'{SHARED_GCS}local_roads_gtfs_shapes_m1.parquet')
    print('Done with local roads')
    
    return merge1

In [43]:
# local_roads = join_local_roads(analysis_date) 

In [44]:
# local_roads_gdf = gpd.read_parquet('gs://calitp-analytics-data/data-analyses/shared_data/local_roads_gtfs_shapes_m1.parquet')

In [45]:
# type(local_roads_gdf)

In [46]:
# local_roads_gdf.geom_type.value_counts()

In [47]:
#primary_secondary_mtfcc = ['S1100','S1200']
#primary_secondary_roads = load_roads(primary_secondary_mtfcc)

In [48]:
#all_roads = pd.concat([primary_secondary_roads, local_roads_gdf], axis=0)
    
# Save
#all_roads.to_parquet(f'{SHARED_GCS}all_roads_gtfs_shapes.parquet')

In [49]:
# all_roads.geom_type.value_counts()

In [50]:
# all_roads.shape

In [51]:
# all_roads.linearid.nunique()

In [52]:
"""
segments = geography_utils.cut_segments(
        all_roads,
        ["linearid", "fullname"],
        1_000 # 1 km segments
    )
"""

'\nsegments = geography_utils.cut_segments(\n        all_roads,\n        ["linearid", "fullname"],\n        1_000 # 1 km segments\n    )\n'

In [53]:
def gtfs_shapes_all_roads(date):
    
    start = datetime.datetime.now()
    print(start)
    
    # Find all primary and secondary roads
    # regardless of intersection
    primary_secondary_mtfcc = ['S1100','S1200']
    primary_secondary_roads = load_roads(primary_secondary_mtfcc)
    
    # Find local roads that intersect  with GTFS shapes
    local_roads_gdf = join_local_roads(date)
    
    # Concat 
    all_roads = pd.concat([primary_secondary_roads, local_roads_gdf], axis=0)
    
    # Save
    all_roads.to_parquet(f'{SHARED_GCS}intersected_local_all_primary_sec_roads.parquet')
    
    """
    # Find segments
    segments = geography_utils.cut_segments(
        all_roads,
        ["linearid", "fullname"],
        1_000 # 1 km segments
    )
    
    segments.to_parquet(f"{SHARED_GCS}census_road_segments.parquet")
    """
    end = datetime.datetime.now()
    print(f"time lapsed: {end-start}")
    return all_roads

In [54]:
test = gtfs_shapes_all_roads(analysis_date)

2023-04-06 16:12:18.624442
Done with local roads
time lapsed: 0:18:14.259149


In [55]:
#test.sample()

In [56]:
#type(test)

In [57]:
#test.shape

In [58]:
#test.geom_type.value_counts()

### Draft

In [59]:
"""
def join_primary_secondary(gtfs_shape_gdf):
    
    # Load secondary-primary roads
    primary_secondary_mtfcc = ['S1100','S1200']
    primary_secondary_roads = load_roads(primary_secondary_mtfcc)
    
    sjoin1 = gpd.sjoin(
        primary_secondary_roads,
        gtfs_shape_gdf,
        how = "inner",
        predicate = "intersects"
    ).drop_duplicates().reset_index(drop=True)
    
    # Save to GCS
    sjoin1.to_parquet(f'{SHARED_GCS}primary_secondary_roads_gtfs_shapes.parquet')
    print('Done with primary and secondary roads')
    return sjoin1
    """

'\ndef join_primary_secondary(gtfs_shape_gdf):\n    \n    # Load secondary-primary roads\n    primary_secondary_mtfcc = [\'S1100\',\'S1200\']\n    primary_secondary_roads = load_roads(primary_secondary_mtfcc)\n    \n    sjoin1 = gpd.sjoin(\n        primary_secondary_roads,\n        gtfs_shape_gdf,\n        how = "inner",\n        predicate = "intersects"\n    ).drop_duplicates().reset_index(drop=True)\n    \n    # Save to GCS\n    sjoin1.to_parquet(f\'{SHARED_GCS}primary_secondary_roads_gtfs_shapes.parquet\')\n    print(\'Done with primary and secondary roads\')\n    return sjoin1\n    '