## Tiger Census
* https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2019/TGRSHP2019_TechDoc.pdf
* S1200 - secondary road
* S1100 - primary road
* S1400 - local roads
* Build off scripts/cut_road_segments.py


In [2]:
import datetime

import dask.dataframe as dd
import dask_geopandas as dg
import geopandas
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from dask import compute, delayed
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import analysis_date
from shared_utils import dask_utils, geography_utils, utils

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
SHARED_GCS = f"{GCS_FILE_PATH}shared_data/"


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [3]:
# Notebook display settings: up to 100 columns, floats with two decimals,
# and no limit on the number of rows or on column width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### Tiger - Load Roads
* TO DO: dissolve by linearid because there are multiple rows associated with one id -> the same road

In [4]:
def load_roads(road_type_wanted: list, buffer_or_not: bool = False) -> gpd.GeoDataFrame:
    """
    Read the California TIGER roads file, keeping only the
    requested road types, with one row per linear ID.

    Args:
        road_type_wanted (list): MTFCC codes to keep, e.g. ["S1400"].
            The codes are described in
            https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2019/TGRSHP2019_TechDoc.pdf
        buffer_or_not (bool): if True, replace each geometry with a
            buffer of 200 (in the units of geography_utils.CA_NAD83Albers).

    Returns:
        GDF with snakecased column names.
        Original author's note: as of 4/18/23, returns 953914 nunique linearid
    """
    roads = gpd.read_parquet(
        f"{SHARED_GCS}all_roads_2020_state06.parquet",
        filters=[("MTFCC", "in", road_type_wanted)],
        columns=["LINEARID", "geometry", "FULLNAME"],
    )
    roads = roads.to_crs(geography_utils.CA_NAD83Albers)

    # Several rows can share one LINEARID (the same road),
    # so collapse them into a single row per ID.
    unique_rows = roads.drop_duplicates()
    dissolved = unique_rows.dissolve(by=["LINEARID"]).reset_index()
    roads = dissolved.drop_duplicates().reset_index(drop=True)

    if buffer_or_not:
        buffered_geometry = roads.geometry.buffer(200)
        roads = roads.assign(geometry=buffered_geometry)

    return to_snakecase(roads)

In [6]:
# len(og_tiger)

In [7]:
# og_tiger.linearid.nunique()

In [8]:
# len(og_tiger.drop_duplicates())

In [9]:
# more_than_1 = og_tiger.linearid.value_counts().loc[lambda x: x>1].reset_index()['index']

In [10]:
# more_than_1 = list(more_than_1)

In [11]:
# len(more_than_1)

#### Cesar Chavez Test

In [12]:
# cesar_chavez = og_tiger[og_tiger.fullname == "Cesar Chavez"].reset_index()

In [13]:
# cesar_chavez

### GTFS Shapes

In [14]:
def gtfs_stops_operators(date) -> gpd.GeoDataFrame:
    """
    Load scheduled stops, buffer them, and attach the
    operator name that shares each stop's feed key.

    Args:
        date: date wanted for the datasets to be drawn from.
            Used for both the stops and the trips, so the
            two tables always describe the same day.

    Returns:
        GDF with feed_key, stop_id, stop_key, the original point
        geometry, a `buffered_geometry` column set as the active
        geometry, and the operator `name` ("None" when no trip
        shares the stop's feed_key).
    """
    stops = (
        helpers.import_scheduled_stops(
            date, (), ["feed_key", "stop_id", "stop_key", "geometry"]
        )
        .compute()
        .drop_duplicates()
    )

    stops = stops.set_crs(geography_utils.CA_NAD83Albers)

    # Buffer each stop by 50 CRS units. The original note says "feet".
    # NOTE(review): confirm the units of CA_NAD83Albers.
    stops = stops.assign(buffered_geometry=stops.geometry.buffer(50))

    # Spatial joins should use the buffered polygon, not the point
    stops = stops.set_geometry("buffered_geometry")

    # Operator names come from trips. Load them for the same `date`
    # as the stops rather than the module-level analysis_date.
    trips = (
        helpers.import_scheduled_trips(date, (), ["name", "feed_key"])
        .compute()
        .drop_duplicates()
    )

    m1 = pd.merge(stops, trips, on=["feed_key"], how="left")

    # Stops whose feed has no trips get a placeholder operator name.
    # Bracket access avoids relying on attribute-style column access.
    m1["name"] = m1["name"].fillna("None")

    return m1

In [15]:
# stops = gtfs_stops_operators(analysis_date)

In [16]:
def gtfs_routes_operators(date) -> gpd.GeoDataFrame:
    """
    Load scheduled shapes (routes) and attach the operator
    name that shares each shape's shape_array_key.

    Args:
        date: date wanted for the datasets to be drawn from

    Returns:
        The shapes left-merged with trips on shape_array_key.
    """
    shapes = helpers.import_scheduled_shapes(date).compute()
    shapes = shapes.drop_duplicates().set_crs(geography_utils.CA_NAD83Albers)

    # Trips supply the operator name for each shape
    trips_lazy = helpers.import_scheduled_trips(date, (), ["name", "shape_array_key"])
    operator_lookup = trips_lazy.compute().drop_duplicates()

    return pd.merge(shapes, operator_lookup, how="left", on="shape_array_key")

In [17]:
def order_operators(date) -> list:
    """
    Reorder the list of operators so the largest
    ones are at the top of the list.

    Args:
        date: date wanted for the datasets to be drawn from

    Returns:
        List of operator names without repeats: the hard-coded big
        operators first (in the order listed below, whether or not
        they have trips on `date`), then every other operator found
        in the scheduled trips, sorted by name.
    """
    all_operators = (
        helpers.import_scheduled_trips(date, (), ["name"])
        .compute()
        .sort_values("name")["name"]
        .unique()
        .tolist()
    )

    # Biggest operators go at the beginning,
    # based on NTD services data.
    # Each operator appears once so callers do not process it twice.
    big_operators = [
        "LA DOT Schedule",
        "LA Metro Bus Schedule",
        "LA Metro Rail Schedule",
        "Bay Area 511 Muni Schedule",
        "Bay Area 511 AC Transit Schedule",
        "Bay Area 511 Santa Clara Transit Schedule",
        "Bay Area 511 BART Schedule",
        "San Diego Schedule",
        "OCTA Schedule",
        "Sacramento Schedule",
        "Bay Area 511 Sonoma-Marin Area Rail Transit Schedule",
        "Bay Area 511 SFO AirTrain Schedule",
        "Bay Area 511 South San Francisco Shuttle Schedule",
        "Bay Area 511 Marin Schedule",
        "Bay Area 511 County Connection Schedule",
        "Bay Area 511 MVGO Schedule",
        "Bay Area 511 Commute.org Schedule",
        "Bay Area 511 Union City Transit Schedule",
        "Bay Area 511 Caltrain Schedule",
        "Bay Area 511 Fairfield and Suisun Transit Schedule",
        "Bay Area 511 Dumbarton Express Schedule",
        "Bay Area 511 SamTrans Schedule",
        "Bay Area 511 Vine Transit Schedule",
        "Bay Area 511 Tri-Valley Wheels Schedule",
        "Bay Area 511 Sonoma County Transit Schedule",
        "Bay Area 511 Santa Rosa CityBus Schedule",
        "Bay Area 511 Golden Gate Transit Schedule",
        "Bay Area 511 Golden Gate Ferry Schedule",
        "Bay Area 511 San Francisco Bay Ferry Schedule",
        "Bay Area 511 SolTrans Schedule",
        "Bay Area 511 ACE Schedule",
        "Bay Area 511 Emery Go-Round Schedule",
        "Bay Area 511 Tri Delta Schedule",
        "Bay Area 511 Petaluma Schedule",
        "Bay Area 511 Capitol Corridor Schedule",
    ]

    # Keep the remaining operators in sorted order. A list filter
    # (instead of a set difference) makes the result deterministic.
    big_operator_names = set(big_operators)
    remaining_operators = [
        operator for operator in all_operators if operator not in big_operator_names
    ]

    return big_operators + remaining_operators

### Tiger Local Roads

#### Cut all roads - stops 1st then routes 
* Use some small operators to test.

In [18]:
def loop_sjoin(date, local_roads_gdf, gdf_routes_stops) -> pd.DataFrame:
    """
    By operator, sjoin its routes/stops to the local roads gdf.
    Once a road has matched one operator it is removed, so later
    operators are only joined against roads not yet found.

    Args:
        date: date wanted for the datasets to be drawn from
        local_roads_gdf: local roads gdf (should be buffered roads).
            Must have a `linearid` column.
        gdf_routes_stops: stops or routes gdf. Must have a `name`
            column holding the operator.

    Returns:
        DataFrame with a single `linearid` column listing every road
        that intersects at least one operator's routes/stops, without
        repeats. Empty (but with the `linearid` column) when nothing
        intersects.
    """
    # Find all unique operators, ordered by largest operators first
    operators_list = order_operators(date)

    # One frame of newly matched linearids per operator
    matched_frames = []

    for operator in operators_list:
        shapes_filtered = gdf_routes_stops.loc[
            gdf_routes_stops["name"] == operator
        ].reset_index(drop=True)

        # An operator with no routes/stops cannot match any road
        if shapes_filtered.empty:
            continue

        # Do a sjoin but keep the linearid as the only column
        sjoin1 = (
            gpd.sjoin(
                local_roads_gdf,
                shapes_filtered,
                how="inner",
                predicate="intersects",
            )[["linearid"]]
            .drop_duplicates()
            .reset_index(drop=True)
        )

        if sjoin1.empty:
            continue

        matched_frames.append(sjoin1)

        # Remove the roads just found so the next operators do not
        # join against them again. Filtering after each join is
        # equivalent to re-filtering by all found ids before each
        # join, without needing a try/except on the first pass.
        local_roads_gdf = local_roads_gdf[
            ~local_roads_gdf.linearid.isin(sjoin1.linearid)
        ].reset_index(drop=True)

    if not matched_frames:
        # Keep the column callers merge on even when nothing matched
        return pd.DataFrame(columns=["linearid"])

    return pd.concat(matched_frames, axis=0).drop_duplicates()

In [19]:
def sjoin_stops(buffered_roads, original_roads, date):
    """
    Find the roads that intersect a GTFS stop.

    Args:
        buffered_roads: roads gdf used for the spatial join.
        original_roads: the same roads with their original
        (non buffered) geometry.
        date: date wanted for the datasets to be drawn from

    Returns:
        A GDF of the rows of original_roads whose linearid was
        matched to a stop, and a list of those linear IDs.
        The GDF is also saved to local_roads_stops_sjoin.parquet.
    """
    start = datetime.datetime.now()

    # Stops with operator info, joined to the roads operator by operator
    matched_ids = loop_sjoin(date, buffered_roads, gtfs_stops_operators(date))

    # Inner merge onto the original roads to recover
    # the non buffered geometry.
    roads_with_stops = pd.merge(
        original_roads, matched_ids, on="linearid", how="inner"
    )

    # Roads without a name get a placeholder
    roads_with_stops["fullname"] = roads_with_stops["fullname"].fillna("None")

    # These IDs are done and can be skipped by later joins
    linearid_to_delete = roads_with_stops.linearid.unique().tolist()

    roads_with_stops.to_parquet(f"{SHARED_GCS}local_roads_stops_sjoin.parquet")

    end = datetime.datetime.now()
    print(f"Done with sjoin with stops with local roads. Time lapsed: {end-start}")

    return roads_with_stops, linearid_to_delete

In [20]:
def sjoin_routes(buffered_roads, original_roads, date, linearid_to_delete: list):
    """
    Find the roads that intersect a GTFS route, skipping
    roads that were already found.

    Args:
        buffered_roads: roads gdf used for the spatial join.
        original_roads: the same roads with their original
        (non buffered) geometry.
        date: date wanted for the datasets to be drawn from
        linearid_to_delete: linear IDs to leave out of the join.

    Returns:
        A GDF of the remaining rows of original_roads whose linearid
        was matched to a route. Also saved to
        local_roads_routes_sjoin.parquet.
    """
    start = datetime.datetime.now()

    # Routes with operator info
    gtfs_routes = gtfs_routes_operators(date)

    # Keep only the roads that have not been found yet
    buffered_is_new = ~buffered_roads.linearid.isin(linearid_to_delete)
    original_is_new = ~original_roads.linearid.isin(linearid_to_delete)
    remaining_buffered = buffered_roads[buffered_is_new].reset_index(drop=True)
    remaining_original = original_roads[original_is_new].reset_index(drop=True)

    matched_ids = loop_sjoin(date, remaining_buffered, gtfs_routes)

    # Inner merge onto the original roads to recover
    # the non buffered geometry.
    roads_with_routes = pd.merge(
        remaining_original, matched_ids, on="linearid", how="inner"
    )

    # Roads without a name get a placeholder
    roads_with_routes["fullname"] = roads_with_routes["fullname"].fillna("None")

    roads_with_routes.to_parquet(f"{SHARED_GCS}local_roads_routes_sjoin.parquet")

    end = datetime.datetime.now()
    print(f"Done with sjoin with routes and local roads. Time lapsed: {end-start}")

    return roads_with_routes

In [21]:
def sjoin_local_roads(date):
    """
    Find the local roads (MTFCC S1400) that intersect GTFS
    stops or routes.

    Stops are joined first. Roads matched to a stop are left
    out of the routes join. Both results are stacked, saved to
    local_roads_all_routes_stops_sjoin.parquet and returned.

    Args:
        date: date wanted for the datasets to be drawn from
    """
    start = datetime.datetime.now()
    print(start)

    # Same roads twice: buffered for joining,
    # unbuffered to keep the original geometry.
    local_road_types = ["S1400"]
    local_roads_buffered = load_roads(local_road_types, True)
    local_roads_og = load_roads(local_road_types, False)

    # Stops go first and report which linear ids they matched
    stops_sjoin, linear_id_stops = sjoin_stops(
        local_roads_buffered, local_roads_og, date
    )

    # Routes only look at the roads the stops did not match
    routes_sjoin = sjoin_routes(
        local_roads_buffered, local_roads_og, date, linear_id_stops
    )

    all_local_roads = pd.concat([stops_sjoin, routes_sjoin], axis=0)
    output_path = f"{SHARED_GCS}local_roads_all_routes_stops_sjoin.parquet"
    all_local_roads.to_parquet(output_path)

    end = datetime.datetime.now()
    print(f"Done with doing an sjoin for all local roads. Time lapsed: {end-start}")

    return all_local_roads

In [22]:
# all_ops = cut_local_roads(analysis_date)

In [23]:
# all_ops.shape

In [24]:
#  all_ops.sample()

In [25]:
# all_ops.plot()

In [26]:
def chunk_dask_df(gdf, chunk_row_size: int):
    """
    Slice a dataframe into consecutive pieces of at most
    chunk_row_size rows and wrap each piece in a
    single-partition dask dataframe.

    Args:
        gdf: the local roads that intersect w/ stops and routes
        chunk_row_size(int): maximum number of rows per piece.
        The last piece holds whatever rows are left over.

    Returns:
        The list of dask dataframes and the length of that list.
    """
    total_rows = gdf.shape[0]

    my_ddfs = [
        dd.from_pandas(gdf[offset : offset + chunk_row_size], npartitions=1)
        for offset in range(0, total_rows, chunk_row_size)
    ]

    return my_ddfs, len(my_ddfs)

In [27]:
# ddfs, length = chunk_dask_df(all_ops, 10000)

In [28]:
# len(ddfs)

In [29]:
# type(ddfs)

In [30]:
# type(ddfs[0]), type(ddfs[15])

In [31]:
def dask_segment(ddf_list: list, ddfs_range: list) -> gpd.GeoDataFrame:
    """
    Cut the roads in the selected dask dataframes into segments
    and combine the computed results.

    Args:
        ddf_list: dask dataframes stored in a list.
        ddf_list[0] will yield a ddf.

        ddfs_range: positions in ddf_list of the
        dataframes to cut, e.g. [0, 1].

    Returns:
        The concatenated results of geography_utils.cut_segments.
    """
    cut_lazily = delayed(geography_utils.cut_segments)

    # Queue one lazy cut per selected dataframe.
    # Nothing is computed in this loop.
    pending_cuts = []
    for i in ddfs_range:
        pending_cuts.append(cut_lazily(ddf_list[i], ["linearid", "fullname"], 1_000))
        print(f"done with {i}")

    # Compute the queued cuts one after another
    computed_cuts = []
    for pending in pending_cuts:
        computed_cuts.append(compute(pending)[0])

    return pd.concat(computed_cuts)

In [32]:
# test = dask_segment(ddfs, [0,1])

In [33]:
# test.shape

In [34]:
def cut_local_roads(date, chunk_row_size: int) -> gpd.GeoDataFrame:
    """
    Find the local roads that intersect stops/routes and cut
    them into segments, working in two halves so each half is
    saved as soon as it is done.

    Args:
        date: date wanted for the datasets to be drawn from
        chunk_row_size: number of rows per chunk handed to dask.

    Returns:
        GDF of all segmented local roads. Also saved to
        segmented_local_rds.parquet. Each non-empty half is saved to
        segmented_local_rds_first_pt.parquet /
        segmented_local_rds_second_pt.parquet.

    Raises:
        ValueError: if there are no local roads to cut.
    """
    start = datetime.datetime.now()
    print(f"Cut: local roads {start}")

    # Find all local roads that intersect with
    # stops and routes.
    local_roads_unsegmented = sjoin_local_roads(date)

    # Divide the gdf into equal sized chunks (roughly)
    # and turn them into dask gdfs
    ddfs, length = chunk_dask_df(local_roads_unsegmented, chunk_row_size)

    if length == 0:
        raise ValueError("No local roads intersect stops or routes; nothing to cut.")

    # Split the chunk positions into two halves
    chunk_positions = list(range(length))
    midpoint = length // 2
    halves = [
        ("part1", chunk_positions[:midpoint], "segmented_local_rds_first_pt.parquet"),
        ("part2", chunk_positions[midpoint:], "segmented_local_rds_second_pt.parquet"),
    ]

    # Cut geometry. With a single chunk the first half is empty.
    # Skip it, because dask_segment cannot concatenate zero results.
    # The first-half file is then not rewritten.
    parts = []
    for label, positions, file_name in halves:
        if not positions:
            continue
        part = dask_segment(ddfs, positions)
        part.to_parquet(f"{SHARED_GCS}{file_name}")
        print(f"Done with cutting {label}")
        parts.append(part)

    segmented_local_roads = pd.concat(parts)
    segmented_local_roads.to_parquet(f"{SHARED_GCS}segmented_local_rds.parquet")

    end = datetime.datetime.now()
    # end-start is a timedelta (h:mm:ss), not a number of minutes
    print(f"Done cutting local roads. Time lapsed: {end-start}")
    return segmented_local_roads

In [35]:
# test = cut_local_roads(analysis_date, 10000)

### Concat local roads and primary/secondary ones

In [36]:
def cut_primary_secondary_roads():
    """
    Cut every primary (S1100) and secondary (S1200) road into
    segments, regardless of intersection w/ GTFS shapes.

    Returns:
        The segments returned by geography_utils.cut_segments.
        Also saved to segmented_primary_secondary_roads.parquet.
    """
    start = datetime.datetime.now()
    print(f"Cutting primary/secondary roads {start}")

    # Unbuffered primary and secondary roads
    roads = load_roads(["S1100", "S1200"], False)

    # 1_000 -> 1 km segments (per the original note)
    segments = geography_utils.cut_segments(roads, ["linearid", "fullname"], 1_000)
    segments.to_parquet(f"{SHARED_GCS}segmented_primary_secondary_roads.parquet")

    end = datetime.datetime.now()
    print(f"Done cutting primary & secondary roads: {end-start}")

    return segments

In [37]:
# primary_secondary = cut_primary_secondary_roads()

In [38]:
def cut_all_roads(date, chunk_row_size) -> gpd.GeoDataFrame:
    """
    Segment the local roads that intersect GTFS stops/routes plus
    all primary and secondary roads, and stack the results.

    Original author's note: takes about 1.5 hours.

    Args:
        date: date wanted for the datasets to be drawn from
        chunk_row_size: number of rows per chunk when cutting
        the local roads.

    Returns:
        GDF of primary/secondary segments followed by local road
        segments. Also saved to segmented_all_roads.parquet.
    """
    start = datetime.datetime.now()
    print(f"Cutting all local roads/primary/secondary roads {start}")

    # Find local roads that intersect  with GTFS shapes, then
    # segment them
    local_roads_gdf = cut_local_roads(date, chunk_row_size)

    # Segment primary and secondary roads
    segmented_primary_secondary_rds = cut_primary_secondary_roads()

    # Concat
    all_roads = pd.concat([segmented_primary_secondary_rds, local_roads_gdf], axis=0)
    all_roads.to_parquet(f"{SHARED_GCS}segmented_all_roads.parquet")

    end = datetime.datetime.now()
    print(f"time lapsed for cutting all roads: {end-start}")

    # Hand the result back like the other cut_* functions do,
    # so callers do not have to re-read the parquet.
    return all_roads

In [39]:
# all_roads = cut_all_roads(analysis_date, 10_000)

### Dask 2

In [40]:
# Load the local roads that intersect stops or routes
# (the file saved by sjoin_local_roads).
local_roads_unsegmented = gpd.read_parquet(
    f"{SHARED_GCS}local_roads_all_routes_stops_sjoin.parquet"
)

In [41]:
local_roads_unsegmented.shape

(322720, 3)

In [42]:
# Turn sjoin local roads to dask
ddf1 = dd.from_pandas(local_roads_unsegmented, npartitions=1)

In [43]:
# Partition the sjoin stuff automatically
ddf1_partitioned = ddf1.repartition(partition_size="1MB")

In [44]:
#Save
ddf1_partitioned.to_parquet(f"{SHARED_GCS}daskutilstest")

In [45]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [46]:
# Find all the parquets again
def find_files(
    phrase_to_find: str,
    folder: str = "gs://calitp-analytics-data/data-analyses/shared_data/daskutilstest",
) -> list:
    """
    Grab a list of files in a GCS folder whose path
    contains the phrase inputted. E.g. "part"

    Args:
        phrase_to_find: text to look for. It is matched against the
        whole path returned by the listing, not just the file name.
        folder: GCS folder to list. Defaults to the folder the
        partitioned sjoin results were saved to.

    Returns:
        List of matching paths, each prefixed with "gs://".
    """
    # The listing is prefixed with "gs://" below so the paths can be
    # read directly. Presumably fs.ls returns paths without the scheme.
    return [f"gs://{path}" for path in fs.ls(folder) if phrase_to_find in path]

In [48]:
def extract_number(phrase_to_find: str) -> list:
    """
    Pull the digits out of the name of every file found
    by find_files, e.g. ".../part.12.parquet" -> "12".

    Args:
        phrase_to_find: text passed on to find_files.

    Returns:
        List of strings, one per file, holding the digits
        of the file name in the order they appear.
    """
    all_file_numbers = []
    for file in find_files(phrase_to_find):
        # Only look at the file name. Digits in the bucket or folder
        # names would otherwise be glued onto the file number.
        file_name = file.rsplit("/", 1)[-1]
        # https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python
        file_number = "".join(char for char in file_name if char.isdigit())
        all_file_numbers.append(file_number)
    return all_file_numbers

In [50]:
file_names_dask = extract_number("part")

In [48]:
# https://www.geeksforgeeks.org/read-multiple-csv-files-into-separate-dataframes-in-python/
# Read every saved partition back in as its own dask geodataframe.
# The partition numbers come from file_names_dask (built with
# extract_number). The name `file_names` is never defined in this notebook.
gcs_file_path = "gs://calitp-analytics-data/data-analyses/shared_data/daskutilstest/part."
dataframes_list = [
    dg.read_parquet(f"{gcs_file_path}{file_number}.parquet")
    for file_number in file_names_dask
]

In [49]:
len(dataframes_list)

50

In [50]:
dataframes_list[0]

Unnamed: 0_level_0,linearid,geometry,fullname
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,geometry,object
,...,...,...


In [51]:
# Empty dataframe
my_results = []

In [52]:
# Queue one lazy cut_segments call per dask dataframe in the list
# and add it to my_results. Nothing is computed here.
my_results.extend(
    delayed(geography_utils.cut_segments)(ddf, ["linearid", "fullname"], 1_000)
    for ddf in dataframes_list
)

In [53]:
my_results

[Delayed('cut_segments-7c0c5d9f-48d9-4eb7-9e1d-9245a7b3843d'),
 Delayed('cut_segments-3499a829-6eff-4498-8212-75374c3456c1'),
 Delayed('cut_segments-27bd3f38-9042-44c3-a13c-94b6033067eb'),
 Delayed('cut_segments-47b5d1db-f477-421f-b6d3-bc2bbad8371a'),
 Delayed('cut_segments-0606cfde-16db-4eeb-91d3-b2c8111b3cbc'),
 Delayed('cut_segments-43706995-e5a8-4ccd-9ec6-4344d3ff3b76'),
 Delayed('cut_segments-4a1bedcf-80b1-407e-bf7f-012991357f53'),
 Delayed('cut_segments-407a7c89-313e-41db-a8a3-48eed5098449'),
 Delayed('cut_segments-69f5069e-9843-47ac-904c-a3bac14ebb95'),
 Delayed('cut_segments-6739f55d-7336-434c-85a8-26f7971733f0'),
 Delayed('cut_segments-583118e6-d2c0-4d64-8040-b739bec0f891'),
 Delayed('cut_segments-3ebecc85-f1bc-45b8-a7ea-cfdb11ddc840'),
 Delayed('cut_segments-199fc757-bb39-440c-8d49-674097b1a257'),
 Delayed('cut_segments-3f6bda16-574f-4482-b6fd-fd51f2c4e27b'),
 Delayed('cut_segments-2caeaf7e-5d8c-4af8-92d4-bd76d829d50a'),
 Delayed('cut_segments-71af55ec-4a5c-41ec-a252-79ed77bf

In [54]:
# Took 1:06:37.299317
#start = datetime.datetime.now()
#print(start)
#results2 = [compute(i)[0] for i in my_results]  # 9;12
#end = datetime.datetime.now()
#print(end)

2023-05-01 17:19:40.544129
2023-05-01 18:26:17.843446


In [63]:
# print(end-start)

1:06:37.299317


In [55]:
# type(results2)

list

In [56]:
# testpd = pd.concat(results2, axis=0).reset_index(drop=True)

In [57]:
# type(testpd)

geopandas.geodataframe.GeoDataFrame

In [59]:
# testpd.to_parquet(f"{SHARED_GCS}dask_test2.parquet")

In [60]:
# testpd.shape

(406743, 4)

In [52]:
dask2 = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/shared_data/dask_test2.parquet")

In [53]:
dask2.linearid.nunique()

322649

In [68]:
dask2.linearid.value_counts().head()

11019653760031    203
1106092764328     193
110411099535       73
1106073054809      65
1106092765807      65
Name: linearid, dtype: int64

In [69]:
# Hand-picked linear ids used to compare the original roads with their
# segmented versions. Includes several of the ids with the most rows
# in dask2 (per the value_counts output above).
test_linearids = [ '11019653760031',
 '1106092764328',
 '11011135052903',
 '110411099535',
 '11011135055229',
 '11011135055553',
 '11011135056214',
 '11012028306122',
 '11012812027422',
 '11012812027505',
 '11012812035943',
 '11012812038881',
 '1106073054809']

In [76]:
og_tiger = load_roads(['S1400'], False)

In [77]:
og_tiger_filtered = og_tiger[og_tiger.linearid.isin(test_linearids)].reset_index()

In [81]:
# og_tiger_filtered.explore('linearid',cmap = "tab20c", style_kwds = {'weight':5})

In [70]:
dask2_filtered = dask2[dask2.linearid.isin(test_linearids)].reset_index()

In [71]:
dask2_filtered.shape

(547, 5)

In [79]:
dask2_filtered.sample()

Unnamed: 0,index,geometry,linearid,fullname,segment_sequence
145,183819,"LINESTRING (-165233.490 -163207.729, -165242.476 -163205.465, -165254.914 -163201.221, -165327.228 -163176.357, -165348.637 -163168.936, -165418.820 -163149.448, -165425.168 -163149.324, -165481.666 -163148.110, -165616.292 -163158.708, -165711.452 -163167.964, -165818.614 -163164.423, -165878.438 -163150.358, -165937.295 -163123.418, -166005.487 -163070.839, -166056.518 -163009.702, -166130.363 -162930.778, -166133.494 -162927.035)",1106073054809,Co Rd G16,59


In [80]:
# dask2_filtered.explore('segment_sequence',cmap = "tab20c", style_kwds = {'weight':5})