## Tiger Census
* https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2019/TGRSHP2019_TechDoc.pdf
* MTFCC S1200 - secondary roads
* MTFCC S1100 - primary roads
* MTFCC S1400 - local roads


In [None]:
import dask.dataframe as dd
import dask_geopandas as dg

import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
import datetime


from segment_speed_utils.project_vars import analysis_date
from segment_speed_utils import helpers
from shared_utils import  dask_utils, geography_utils, utils

# Base Google Cloud Storage location; SHARED_GCS is built from it.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
# Folder the roads parquet is read from and the join results are written to.
SHARED_GCS = f"{GCS_FILE_PATH}shared_data/"

In [None]:
# Notebook display settings: show up to 100 columns, two-decimal floats,
# and never truncate rows or column contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### GTFS Shapes

In [None]:
def gtfs_shapes_operators(date):
    """
    Attach operator name and feed key to the scheduled GTFS shapes.

    Scheduled shapes and the de-duplicated (feed_key, name,
    shape_array_key) rows of the scheduled trips are loaded for `date`
    and combined with an outer merge on shape_array_key, so keys that
    appear in only one of the two tables are kept.

    Returns the merged dataframe (shapes on the left, so presumably a
    GeoDataFrame -- confirm against helpers.import_scheduled_shapes).
    """
    shape_geometries = helpers.import_scheduled_shapes(date).compute()

    trip_columns = ['feed_key', 'name', 'shape_array_key']
    operator_lookup = helpers.import_scheduled_trips(date, (), trip_columns)
    operator_lookup = operator_lookup.compute().drop_duplicates()

    return pd.merge(
        shape_geometries,
        operator_lookup,
        how="outer",
        on="shape_array_key",
    )

In [None]:
# merge1 = gtfs_shapes_operators(analysis_date)

In [None]:
# merge1.shape

In [None]:
# type(merge1)

In [None]:
# merge1.plot()

* https://www.energy.ca.gov/data-reports/energy-almanac/transportation-energy/public-transit-california

In [None]:
def order_operators(date):
    """
    List the operator names found in the scheduled trips for `date`,
    with the largest operators first.

    The hard-coded `big_operators` come first, in the order they are
    written below; every other operator follows in alphabetical order.
    A big operator that is absent from the trips for `date` is skipped
    instead of raising.

    Returns a list of operator names.
    """
    trips = helpers.import_scheduled_trips(date, (), ['name']).compute()
    operator_list = trips.sort_values('name').name.unique().tolist()

    # Reorder list so the biggest operators are at the beginning
    # based on NTD services data
    big_operators = [
        'LA DOT Schedule',
        'LA Metro Bus Schedule',
        'LA Metro Rail Schedule',
        'Bay Area 511 Muni Schedule',
        'Bay Area 511 AC Transit Schedule',
        'Bay Area 511 Santa Clara Transit Schedule',
        'Bay Area 511 BART Schedule',
        'San Diego Schedule',
        'OCTA Schedule',
        'Sacramento Schedule',
    ]

    # Build the result in two passes rather than remove/insert with a
    # manual counter: Python has no `++` operator (`++i` is a no-op),
    # which previously left the big operators in reverse order.
    available = set(operator_list)
    prioritized = [name for name in big_operators if name in available]

    already_placed = set(prioritized)
    remaining = [name for name in operator_list if name not in already_placed]

    return prioritized + remaining

In [None]:
# test_list = order_operators(analysis_date)

### Tiger Primary Secondary Roads

In [None]:
#roads = to_snakecase(gpd.read_parquet(
  #       f"{SHARED_GCS}all_roads_2020_state06.parquet")).to_crs(geography_utils.CA_NAD83Albers)

In [None]:
# MTFCC codes for primary (S1100) and secondary (S1200) roads.
primary_secondary_mtfcc = ['S1100','S1200']

In [None]:
def load_roads(road_type_wanted:list):
    """
    Load the roads parquet from SHARED_GCS, restricted to the given
    MTFCC codes, and return the roads as buffered geometries.

    road_type_wanted: list of MTFCC codes to keep, e.g. ['S1400'].

    Returns a GeoDataFrame projected to geography_utils.CA_NAD83Albers
    with snakecased column names.
    """
    roads_path = f"{SHARED_GCS}all_roads_2020_state06.parquet"
    mtfcc_filter = [('MTFCC', 'in', road_type_wanted)]
    keep_columns = ["LINEARID", "MTFCC", "geometry"]

    roads = gpd.read_parquet(
        roads_path,
        filters=mtfcc_filter,
        columns=keep_columns,
    )
    roads = roads.to_crs(geography_utils.CA_NAD83Albers)

    # Widen each road line by 200 units of the projected CRS
    # (presumably meters -- confirm against CA_NAD83Albers).
    buffered_geometry = roads.geometry.buffer(200)
    roads = roads.assign(geometry=buffered_geometry)

    return to_snakecase(roads)

In [None]:
# primary_secondary_roads = load_roads(primary_secondary_mtfcc)

In [None]:
# primary_secondary_roads.mtfcc.value_counts()

In [None]:
# primary_secondary_roads.shape

In [None]:
# primary_secondary_roads.plot()

In [None]:
def join_primary_secondary_v1():
    """
    Spatially join buffered primary/secondary roads to the GTFS shapes
    for the module-level analysis_date.

    Prints the elapsed time and returns the de-duplicated inner join
    (roads on the left, shapes on the right, "intersects" predicate).
    """
    started_at = datetime.datetime.now()

    # GTFS shapes with operator info
    gtfs_shapes = gtfs_shapes_operators(analysis_date)

    # Primary (S1100) and secondary (S1200) roads
    major_roads = load_roads(['S1100', 'S1200'])

    matches = gpd.sjoin(
        major_roads,
        gtfs_shapes,
        how="inner",
        predicate="intersects",
    )
    matches = matches.drop_duplicates()
    matches = matches.reset_index(drop=True)

    finished_at = datetime.datetime.now()
    print(f"time lapsed: {finished_at-started_at}")

    return matches

In [None]:
#test = join_primary_secondary()

In [None]:
#test.plot()

### Tiger Local Roads

#### Test with one operator first

In [None]:
#one_op = 'Bear Schedule'

In [None]:
#shapes_filtered = shapes.loc[shapes.name == one_op].reset_index(drop = True)

In [None]:
#sjoin1 = gpd.sjoin(
#        local_roads,
#       shapes_filtered,
 #       how = "inner",
 #     predicate = "intersects").drop_duplicates().reset_index(drop=True)

In [None]:
#sjoin1.shape

In [None]:
#sjoin1.linearid.nunique()

In [None]:
#sjoin1.columns

In [None]:
#linearid_del = sjoin1.linearid.unique().tolist()

In [None]:
#localroads_filtered = local_roads[~local_roads.linearid.isin(linearid_del)].reset_index(drop = True)

In [None]:
#local_roads.linearid.nunique() - localroads_filtered.linearid.nunique()

#### A few operators

In [None]:
# shapes = gtfs_shapes_operators(analysis_date)

In [None]:
# local_roads.linearid.nunique(), local_roads.linearid.count()

In [None]:
# local_roads.linearid.count()-local_roads.linearid.nunique()

In [None]:
# shapes_filtered.name.unique()

In [None]:
# sjoin_full_results = pd.DataFrame()

In [None]:
"""
for operator in test_operators:
        try:
            linearid_to_delete = sjoin_full_results.linearid.unique().tolist()
            local_roads = local_roads[~local_roads.linearid.isin(linearid_to_delete)].reset_index(drop = True)
        except:
            pass
        
        shapes_filtered = shapes.loc[shapes.name == operator].reset_index(drop = True)
    
        sjoin1 = gpd.sjoin(
        local_roads,
        shapes_filtered,
        how = "inner",
        predicate = "intersects").drop_duplicates().reset_index(drop=True)
        
        sjoin_full_results = pd.concat([sjoin_full_results, sjoin1], axis=0)"""

In [None]:
#sjoin_full_results.shape

In [None]:
#sjoin_full_results.linearid.nunique()

In [None]:
#sjoin_full_results.linearid.count()

In [None]:
#sjoin_full_results.drop(columns = ['geometry']).sample(5)

In [None]:
def join_local_roads_v2(date):
    """
    Spatially join buffered local roads (MTFCC S1400) to the GTFS
    shapes for `date`, one operator at a time.

    Operators are processed in the order given by order_operators.
    Once a road's linearid has matched an operator it is removed from
    the pool, so later operators are only joined against roads that
    have not been matched yet.

    The combined result is written to
    `{SHARED_GCS}local_roads_gtfs_shapes.parquet`, the start time and
    elapsed time are printed, and the combined result is returned.
    """
    start = datetime.datetime.now()
    print(start)

    # Load GTFS Shapes
    shapes = gtfs_shapes_operators(date)

    # Load local roads
    local_roads = load_roads(['S1400'])

    # Loop through operators
    operators_list = order_operators(date)

    # Per-operator results are collected and concatenated once at the
    # end instead of growing a dataframe inside the loop.
    joined_parts = []

    for operator in operators_list:
        shapes_filtered = shapes.loc[shapes.name == operator].reset_index(drop=True)

        sjoin1 = gpd.sjoin(
            local_roads,
            shapes_filtered,
            how="inner",
            predicate="intersects",
        ).drop_duplicates().reset_index(drop=True)

        joined_parts.append(sjoin1)

        # Drop the roads this operator just matched so that the next
        # operator only sees unmatched roads. Roads matched earlier are
        # already gone, so only the newest ids need removing.
        matched_ids = sjoin1.linearid.unique()
        local_roads = local_roads[
            ~local_roads.linearid.isin(matched_ids)
        ].reset_index(drop=True)

    if joined_parts:
        sjoin_full_results = pd.concat(joined_parts, axis=0)
    else:
        # No operators at all: keep the original empty-dataframe result.
        sjoin_full_results = pd.DataFrame()

    # Save
    sjoin_full_results.to_parquet(f'{SHARED_GCS}local_roads_gtfs_shapes.parquet')

    end = datetime.datetime.now()

    print(f"time lapsed: {end-start}")
    return sjoin_full_results

In [None]:
# test_operators = ['LA DOT Schedule','Bell Gardens Schedule','Nevada County Schedule','San Diego Schedule','OCTA Schedule','Sacramento Schedule']

In [None]:
# join_local_roads_v2 is the variant that takes a date and is already
# defined at this point; join_local_roads expects a GeoDataFrame of shapes.
test_localroads = join_local_roads_v2(analysis_date)

In [None]:
# test_localroads.shape

In [None]:
# f"{GCS_FILE_PATH}shared_data/"

In [None]:
# f'{SHARED_GCS}local_roads_gtfs_shapes.parquet'

In [None]:
# Number of distinct road linearids in the local-roads join result.
test_localroads.linearid.nunique()

In [None]:
# local_roads = load_roads(['S1400'])

In [None]:
# local_roads.linearid.nunique()

In [None]:
# test_localroads.linearid.nunique()

### Concat local roads and primary/secondary ones

In [None]:
def join_primary_secondary(gtfs_shape_gdf):
    """
    Spatially join buffered primary/secondary roads to the supplied
    GTFS shapes.

    gtfs_shape_gdf: GTFS shapes to use as the right side of the join.

    The de-duplicated inner join ("intersects" predicate) is written to
    `{SHARED_GCS}primary_secondary_roads_gtfs_shapes.parquet` and
    returned.
    """
    # Primary (S1100) and secondary (S1200) roads
    major_roads = load_roads(['S1100', 'S1200'])

    matches = gpd.sjoin(
        major_roads,
        gtfs_shape_gdf,
        how="inner",
        predicate="intersects",
    )
    matches = matches.drop_duplicates()
    matches = matches.reset_index(drop=True)

    # Persist to GCS before reporting completion
    output_path = f'{SHARED_GCS}primary_secondary_roads_gtfs_shapes.parquet'
    matches.to_parquet(output_path)
    print('Done with primary and secondary roads')

    return matches

In [None]:
def join_local_roads(gtfs_shape_gdf, date=None):
    """
    Spatially join buffered local roads (MTFCC S1400) to the supplied
    GTFS shapes, one operator at a time.

    gtfs_shape_gdf: GTFS shapes with a `name` column holding the
        operator name; used as the right side of each join.
    date: date passed to order_operators to build the operator list.
        Defaults to the module-level analysis_date when omitted.

    Operators are processed in the order given by order_operators.
    Once a road's linearid has matched an operator it is removed from
    the pool, so later operators are only joined against roads that
    have not been matched yet.

    The combined result is written to
    `{SHARED_GCS}local_roads_gtfs_shapes.parquet` and returned.
    """
    # Load local roads
    local_roads = load_roads(['S1400'])

    # Find all unique operators, ordered by
    # largest operators first
    operators_list = order_operators(analysis_date if date is None else date)

    # Per-operator results are collected and concatenated once at the
    # end instead of growing a dataframe inside the loop.
    joined_parts = []

    # Loop through and join by operator
    for operator in operators_list:
        shapes_filtered = gtfs_shape_gdf.loc[
            gtfs_shape_gdf.name == operator
        ].reset_index(drop=True)

        sjoin1 = gpd.sjoin(
            local_roads,
            shapes_filtered,
            how="inner",
            predicate="intersects",
        ).drop_duplicates().reset_index(drop=True)

        joined_parts.append(sjoin1)

        # Drop the roads this operator just matched so that the next
        # operator only sees unmatched roads. Roads matched earlier are
        # already gone, so only the newest ids need removing.
        matched_ids = sjoin1.linearid.unique()
        local_roads = local_roads[
            ~local_roads.linearid.isin(matched_ids)
        ].reset_index(drop=True)

    if joined_parts:
        sjoin_full_results = pd.concat(joined_parts, axis=0)
    else:
        # No operators at all: fall back to an empty dataframe.
        sjoin_full_results = pd.DataFrame()

    # Save
    sjoin_full_results.to_parquet(f'{SHARED_GCS}local_roads_gtfs_shapes.parquet')
    print('Done with local roads')

    return sjoin_full_results

In [None]:
def gtfs_shapes_all_roads(date):
    """
    Join the GTFS shapes for `date` to primary/secondary roads and to
    local roads, then stack the two results.

    The stacked result is written to
    `{SHARED_GCS}all_roads_gtfs_shapes.parquet`, the elapsed time is
    printed, and the stacked dataframe is returned.
    """
    started_at = datetime.datetime.now()

    # GTFS shapes with operator info
    gtfs_shapes = gtfs_shapes_operators(date)

    # Primary & secondary roads that intersect the shapes
    major_roads_gdf = join_primary_secondary(gtfs_shapes)

    # Local roads that intersect the shapes
    minor_roads_gdf = join_local_roads(gtfs_shapes)

    # Stack both road classes and persist the combined table
    combined = pd.concat([major_roads_gdf, minor_roads_gdf], axis=0)
    combined.to_parquet(f'{SHARED_GCS}all_roads_gtfs_shapes.parquet')

    finished_at = datetime.datetime.now()
    print(f"time lapsed: {finished_at-started_at}")

    return combined