## Tiger Census
* https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2019/TGRSHP2019_TechDoc.pdf
* S1200 - secondary road
* S1100 - primary road
* S1400 - local roads
* Build off scripts/cut_road_segments.py


In [1]:
import datetime
import gcsfs
fs = gcsfs.GCSFileSystem()
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from dask import compute, delayed
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import analysis_date
from shared_utils import dask_utils, geography_utils, utils

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
SHARED_GCS = f"{GCS_FILE_PATH}shared_data/"


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
# Notebook display settings: show up to 100 columns, every row, untruncated
# cell contents, and floats formatted with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### Tiger - Load Roads
* TO DO: remove the buffer from this step and apply it in a separate step; buffering here forces two dissolves, which takes too long.

In [3]:
def load_roads(road_type_wanted: list) -> gpd.GeoDataFrame:
    """
    Read the statewide TIGER roads parquet and keep one row per linear ID.

    Args:
        road_type_wanted (list): MTFCC codes to keep, e.g. ["S1400"] for
        local roads or ["S1100", "S1200"] for primary/secondary roads.
        See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2019/TGRSHP2019_TechDoc.pdf

    Returns:
        GeoDataFrame projected to geography_utils.CA_NAD83Albers whose
        column names have been passed through to_snakecase.
        As of 4/18/23, returns 953914 nunique linearid.
    """
    raw_roads = gpd.read_parquet(
        f"{SHARED_GCS}all_roads_2020_state06.parquet",
        filters=[("MTFCC", "in", road_type_wanted)],
        columns=["LINEARID", "geometry", "FULLNAME"],
    )
    projected = raw_roads.to_crs(geography_utils.CA_NAD83Albers)

    # A road may appear on multiple rows with the same linear ID;
    # dissolve so that each linear ID ends up on a single row.
    deduped = projected.drop_duplicates()
    one_row_per_id = deduped.dissolve(by=["LINEARID"]).reset_index()
    one_row_per_id = one_row_per_id.drop_duplicates().reset_index(drop=True)

    return to_snakecase(one_row_per_id)

In [4]:
# len(og_tiger)

In [5]:
# og_tiger.linearid.nunique()

In [6]:
# len(og_tiger.drop_duplicates())

In [7]:
# more_than_1 = og_tiger.linearid.value_counts().loc[lambda x: x>1].reset_index()['index']

In [8]:
# more_than_1 = list(more_than_1)

In [9]:
# len(more_than_1)

#### Cesar Chavez Test

In [10]:
# cesar_chavez = og_tiger[og_tiger.fullname == "Cesar Chavez"].reset_index()

In [11]:
# cesar_chavez

### GTFS Shapes

In [12]:
def gtfs_stops_operators(date: str) -> gpd.GeoDataFrame:
    """
    Load stops with operator and feed key information.

    Args:
        date: analysis date. Used for both the stops and the trips
        lookup so the two tables always describe the same day.

    Returns:
        GeoDataFrame of stops whose active geometry is the buffered
        stop ("buffered_geometry"), left-merged with the operator
        "name" on feed_key. Missing operator names are filled with
        the string "None".
    """
    stops = (
        helpers.import_scheduled_stops(
            date, (), ["feed_key", "stop_id", "stop_key", "geometry"]
        )
        .compute()
        .drop_duplicates()
    )

    stops = stops.set_crs(geography_utils.CA_NAD83Albers)

    # Buffer each stop by 50 CRS units.
    # NOTE(review): the original comment said "50 feet"; confirm the
    # units of CA_NAD83Albers before relying on that.
    stops = stops.assign(buffered_geometry=stops.geometry.buffer(50))

    # Use the buffered polygons for the spatial join downstream
    stops = stops.set_geometry("buffered_geometry")

    # Merge for operator information. This previously read the global
    # analysis_date instead of the `date` argument, so calling the
    # function with any other date paired stops with the wrong trips.
    trips = (
        helpers.import_scheduled_trips(date, (), ["name", "feed_key"])
        .compute()
        .drop_duplicates()
    )

    m1 = pd.merge(stops, trips, on=["feed_key"], how="left")

    # Fill in na
    m1["name"] = m1["name"].fillna("None")

    return m1

In [13]:
# stops = gtfs_stops_operators(analysis_date)

In [14]:
def gtfs_routes_operators(date:str) -> gpd.GeoDataFrame:
    """
    Load route shapes and attach the operator name to each one.

    Args:
        date: analysis date

    Returns:
        Scheduled shapes left-merged with the operator "name"
        on shape_array_key.
    """
    shapes = helpers.import_scheduled_shapes(date).compute()
    shapes = shapes.drop_duplicates().set_crs(geography_utils.CA_NAD83Albers)

    # One row per (operator name, shape) pair
    operator_lookup = helpers.import_scheduled_trips(
        date, (), ["name", "shape_array_key"]
    ).compute()
    operator_lookup = operator_lookup.drop_duplicates()

    return pd.merge(shapes, operator_lookup, how="left", on="shape_array_key")

In [15]:
def order_operators(date: str) -> list:
    """
    Reorder a list of operators in which the largest
    ones will be at the top of the list.

    Args:
        date: analysis date

    Returns:
        List of operator names with no repeats: the hard-coded big
        operators first (in the order listed below), followed by every
        other operator found in the scheduled trips for `date`, sorted
        by name.
    """
    trips = helpers.import_scheduled_trips(date, (), ["name"]).compute()

    # unique() keeps first-appearance order, so this stays sorted by name
    operator_list = trips.sort_values("name").name.unique().tolist()

    # Reorder list so the biggest operators are at the beginning
    # based on NTD services data.
    # Each operator must appear only once here: a repeated entry makes
    # every caller that loops over this list process it twice.
    big_operators = [
        "LA DOT Schedule",
        "LA Metro Bus Schedule",
        "LA Metro Rail Schedule",
        "Bay Area 511 Muni Schedule",
        "Bay Area 511 AC Transit Schedule",
        "Bay Area 511 Santa Clara Transit Schedule",
        "Bay Area 511 BART Schedule",
        "San Diego Schedule",
        "OCTA Schedule",
        "Sacramento Schedule",
        "Bay Area 511 Sonoma-Marin Area Rail Transit Schedule",
        "Bay Area 511 SFO AirTrain Schedule",
        "Bay Area 511 South San Francisco Shuttle Schedule",
        "Bay Area 511 Marin Schedule",
        "Bay Area 511 County Connection Schedule",
        "Bay Area 511 MVGO Schedule",
        "Bay Area 511 Commute.org Schedule",
        "Bay Area 511 Union City Transit Schedule",
        "Bay Area 511 Caltrain Schedule",
        "Bay Area 511 Fairfield and Suisun Transit Schedule",
        "Bay Area 511 Dumbarton Express Schedule",
        "Bay Area 511 SamTrans Schedule",
        "Bay Area 511 Vine Transit Schedule",
        "Bay Area 511 Tri-Valley Wheels Schedule",
        "Bay Area 511 Sonoma County Transit Schedule",
        "Bay Area 511 Santa Rosa CityBus Schedule",
        "Bay Area 511 Golden Gate Transit Schedule",
        "Bay Area 511 Golden Gate Ferry Schedule",
        "Bay Area 511 San Francisco Bay Ferry Schedule",
        "Bay Area 511 SolTrans Schedule",
        "Bay Area 511 ACE Schedule",
        "Bay Area 511 Emery Go-Round Schedule",
        "Bay Area 511 Tri Delta Schedule",
        "Bay Area 511 Petaluma Schedule",
        "Bay Area 511 Capitol Corridor Schedule",
    ]

    # Delete off the big operators while keeping the sorted order.
    # (A set difference would scramble the order between runs.)
    prioritized = set(big_operators)
    remaining_operators = [
        name for name in operator_list if name not in prioritized
    ]

    # Add back in the operators
    return big_operators + remaining_operators

### Tiger Local Roads

#### Cut all roads - stops 1st then routes 
* Use some small operators to test.

In [16]:
def loop_sjoin(date:str, local_roads_gdf:gpd.GeoDataFrame, gdf_routes_stops:gpd.GeoDataFrame) -> pd.DataFrame:
    """
    By operator, sjoin either routes or stops to the tiger gdf.
    Once a linear id has been matched by one operator it is removed
    from the roads still to be searched, so later operators only
    join against unmatched roads.

    Args:
        date: analysis date
        local_roads_gdf: local roads gdf, use the buffered version of Tiger
        gdf_routes_stops: stops or routes gdf; must have a "name"
        column holding the operator name

    Returns:
        DataFrame with a single "linearid" column listing every road
        that intersects at least one stop/route. The frame is empty,
        but still has the "linearid" column, when nothing matches.
    """
    # Find all unique operators, ordered by largest operators first
    operators_list = order_operators(date)

    # Roads not yet matched by any operator
    remaining_roads = local_roads_gdf

    # Per-operator results, concatenated once at the end
    matches = []

    # Loop through and sjoin by operator
    for operator in operators_list:
        shapes_filtered = gdf_routes_stops.loc[
            gdf_routes_stops["name"] == operator
        ].reset_index(drop=True)

        # Operator has nothing on this date: nothing to join
        if shapes_filtered.empty:
            continue

        # Do a sjoin but keep the linearid as the only column
        sjoin1 = (
            gpd.sjoin(
                remaining_roads,
                shapes_filtered,
                how="inner",
                predicate="intersects",
            )[["linearid"]]
            .drop_duplicates()
            .reset_index(drop=True)
        )

        if sjoin1.empty:
            continue

        matches.append(sjoin1)

        # Delete any local road linear ids that have already been found,
        # so the next operator only searches the unmatched roads.
        remaining_roads = remaining_roads[
            ~remaining_roads.linearid.isin(sjoin1.linearid)
        ].reset_index(drop=True)

    if not matches:
        # Keep the column so callers can still merge on "linearid"
        return pd.DataFrame(columns=["linearid"])

    return pd.concat(matches, axis=0).drop_duplicates()

In [17]:
def sjoin_stops(buffered_roads:gpd.GeoDataFrame, original_roads:gpd.GeoDataFrame, date:str) -> tuple:
    """
    Sjoin stops to local roads.

    Args:
        buffered_roads: local TIGER roads gdf that are buffered
        original_roads: original local TIGER roads gdf
        date: analysis date

    Returns:
        A 2-tuple ``(gdf, linearids)``:
        * gdf: rows of original_roads (unbuffered geometry) whose
          linear id intersected at least one stop. Null "fullname"
          values are filled with the string "None".
        * linearids: list of the linear ids in that gdf, so callers
          can exclude them from later joins.

    Side effects:
        Writes the gdf to local_roads_stops_sjoin.parquet in SHARED_GCS.
    """
    start = datetime.datetime.now()

    # Load stops
    gtfs_stops = gtfs_stops_operators(date)

    # Loop through and sjoin by operator
    stops_sjoin = loop_sjoin(date, buffered_roads, gtfs_stops)

    # Merge back to original local roads gdf, so we have the
    # non buffered geometry.
    m1 = pd.merge(original_roads, stops_sjoin, on="linearid", how="inner")

    # Fill in null values for fullname
    m1["fullname"] = m1["fullname"].fillna("None")

    # Find linear ids to delete
    linearid_to_delete = m1.linearid.unique().tolist()

    # Save
    m1.to_parquet(f"{SHARED_GCS}local_roads_stops_sjoin.parquet")

    end = datetime.datetime.now()
    print(f"Done with sjoin with stops with local roads. Time lapsed: {end-start}")

    return m1, linearid_to_delete

In [18]:
def sjoin_routes(buffered_roads: gpd.GeoDataFrame, original_roads: gpd.GeoDataFrame, date:str, linearid_to_delete: list) -> gpd.GeoDataFrame:
    """
    Sjoin routes to the local roads that stops did not already match.

    Args:
        buffered_roads: local TIGER roads that are buffered
        original_roads: original local Tiger roads
        date: analysis date
        linearid_to_delete: linear ids already found while applying
        the sjoin to stops; these are excluded before joining.

    Returns:
        Rows of original_roads (unbuffered geometry) that intersect a
        route, also written to local_roads_routes_sjoin.parquet.
    """
    started = datetime.datetime.now()

    # Load routes
    routes = gtfs_routes_operators(date)

    # Drop roads that the stops join has already claimed
    buffered_keep = ~buffered_roads.linearid.isin(linearid_to_delete)
    original_keep = ~original_roads.linearid.isin(linearid_to_delete)
    unmatched_buffered = buffered_roads[buffered_keep].reset_index(drop=True)
    unmatched_original = original_roads[original_keep].reset_index(drop=True)

    # Sjoin
    matched_ids = loop_sjoin(date, unmatched_buffered, routes)

    # Merge back to the unbuffered roads to recover the original geometry
    result = pd.merge(unmatched_original, matched_ids, on="linearid", how="inner")

    # Fill in null values for fullname
    result.fullname = result.fullname.fillna("None")

    # Save
    result.to_parquet(f"{SHARED_GCS}local_roads_routes_sjoin.parquet")

    finished = datetime.datetime.now()
    print(f"Done with sjoin with routes and local roads. Time lapsed: {finished-started}")

    return result

In [19]:
def sjoin_local_roads(date:str) -> gpd.GeoDataFrame:
    """
    Find the local roads touched by transit: join stops first,
    then routes against whatever roads are left.

    Args:
        date: analysis date

    Returns:
        Stacked stop and route matches (unbuffered geometry), also
        written to a date-stamped parquet in SHARED_GCS.
    """
    started = datetime.datetime.now()
    print("Begin sjoin")

    # Local roads with their original geometry
    unbuffered = load_roads(["S1400"])

    # Same roads with each geometry buffered by 200 CRS units
    buffered = unbuffered.assign(
        geometry=unbuffered.geometry.buffer(200)
    ).set_geometry("geometry")

    print("Done buffering")

    # Stops go first; the ids they match are excluded from the routes join
    stops_matches, stop_linearids = sjoin_stops(buffered, unbuffered, date)
    route_matches = sjoin_routes(buffered, unbuffered, date, stop_linearids)

    # Stack
    combined = pd.concat([stops_matches, route_matches], axis=0)

    file_date = date.replace("-", "_")
    combined.to_parquet(
        f"{SHARED_GCS}local_roads_all_routes_stops_sjoin_{file_date}.parquet"
    )

    finished = datetime.datetime.now()
    print(f"Done with doing an sjoin for all local roads. Time lapsed: {finished-started}")
    return combined

In [20]:
# analysis_date

In [21]:
# all_ops = sjoin_local_roads(analysis_date)

#### Dask Redo

In [22]:
# Find all the parquets again
def find_files(phrase_to_find: str) -> list:
    """
    List the entries of the partitioned_tiger folder whose path
    contains the given phrase.

    Returns:
        Matching paths, each prefixed with "gs://" so they can be
        read directly.
    """
    folder = f"{SHARED_GCS}partitioned_tiger"

    # fs.ls returns paths without the scheme, so add it back
    return ["gs://" + path for path in fs.ls(folder) if phrase_to_find in path]

In [23]:
def extract_number(phrase_to_find: str) -> list:
    """
    Extract the numeric portion of each matching file's name.

    Only the final path component is inspected, so digits that appear
    in the bucket or folder names can never leak into the result.
    Entries whose name contains no digits are skipped: an empty number
    would make callers build a non-existent "part..parquet" path.

    Args:
        phrase_to_find: passed to find_files to select the files.

    Returns:
        List of digit strings, one per matching file that has digits
        in its name, in the order find_files returned them.
    """
    all_file_numbers = []
    for file in find_files(phrase_to_find):
        # Last path component, e.g. "part.12.parquet"
        file_name = file.rstrip("/").rsplit("/", 1)[-1]

        # https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python
        file_number = "".join(char for char in file_name if char.isdigit())

        if file_number:
            all_file_numbers.append(file_number)
    return all_file_numbers

In [24]:
def chunk_dask_df(gdf) -> list:
    """
    Split a dataframe into roughly 1MB partitions by round-tripping
    it through parquet files in GCS.

    Args:
        gdf: the local roads that intersect w/ stops and routes

    Returns:
        List of dask-geopandas dataframes, one per partition file
        written to the partitioned_tiger folder.
    """
    # Start from a single-partition dask dataframe, then let dask
    # re-split it by size.
    single_partition = dd.from_pandas(gdf, npartitions=1)
    repartitioned = single_partition.repartition(partition_size="1MB")

    # Save out to GCS, replacing whatever was there before
    repartitioned.to_parquet(f"{SHARED_GCS}partitioned_tiger", overwrite = True)

    # Files are named part.0.parquet, part.1.parquet, ...;
    # read each one back as its own dataframe.
    part_prefix = f"{SHARED_GCS}partitioned_tiger/part."
    return [
        dg.read_parquet(f"{part_prefix}{file_number}.parquet")
        for file_number in extract_number("part")
    ]

In [25]:
def cut_geometry_compute(dask_dataframe_list:list) -> gpd.GeoDataFrame:
    """
    Cut every dataframe in the list into segments and stack the results.

    Args:
        dask_dataframe_list: dask dataframes of roads to cut.

    Returns:
        One dataframe with the segments from every input, re-indexed
        from zero.
    """
    print("Cut geometry")
    # Queue one lazy cut_segments call per dataframe
    pending = [
        delayed(geography_utils.cut_segments)(ddf, ["linearid", "fullname"], 1_000)
        for ddf in dask_dataframe_list
    ]

    print("Begin computing")
    # Materialise the tasks one at a time, in order
    computed = [compute(task)[0] for task in pending]

    return pd.concat(computed, axis=0).reset_index(drop=True)

In [26]:
def cut_local_roads(gdf, date:str) -> gpd.GeoDataFrame:
    """
    Cut all the local roads.

    Args:
        gdf: the local roads to cut
        date: analysis date (YYYY-MM-DD); used to stamp the output
        file name.

    Returns:
        The segmented roads, also written to
        segmented_local_rds_{date with underscores}.parquet in SHARED_GCS.
    """
    start = datetime.datetime.now()
    print("Begin cutting local roads")

    # Divide the gdf into equal sized chunks (roughly)
    # and turn them into dask gdfs
    print("Split into mulitple parquets")
    ddf_list = chunk_dask_df(gdf)

    cut_df = cut_geometry_compute(ddf_list)

    file_date = date.replace('-','_')
    cut_df.to_parquet(f"{SHARED_GCS}segmented_local_rds_{file_date}.parquet")

    end = datetime.datetime.now()
    # end-start is a timedelta (h:mm:ss), so don't label it as minutes
    print(f"Done cutting local roads. Time lapsed: {end-start}")
    return cut_df

In [27]:
# test = cut_local_roads(analysis_date)

In [28]:
# all_ops.shape

In [29]:
#  all_ops.sample()

In [30]:
# all_ops.plot()

In [31]:
# ddfs, length = chunk_dask_df(all_ops, 10000)

In [32]:
# len(ddfs)

In [33]:
# type(ddfs)

In [34]:
# type(ddfs[0]), type(ddfs[15])

In [35]:
# test = dask_segment(ddfs, [0,1])

In [36]:
# test.shape

In [37]:
# test = cut_local_roads(analysis_date, 10000)

###  Monthly run 

In [52]:
def monthly_linearids(date:str, last_month_segmented_local_roads: str) -> gpd.GeoDataFrame:
    """
    Instead of re-cutting all the primary and secondary roads and
    local roads found from the last run, only cut the new local roads
    that are found. Delete out any local roads that aren't found in
    this month's routes.

    Args:
        date: analysis_date
        last_month_segmented_local_roads: file name of last month's local roads that
        have been cut. Don't include .parquet.

    Returns:
        This month's segmented local roads stacked with the already
        segmented primary/secondary roads.

    Side effects:
        Writes segmented_local_roads_{date}.parquet and
        segmented_all_roads_{date}.parquet to SHARED_GCS (plus the
        files written by sjoin_local_roads and cut_local_roads).
    """
    start = datetime.datetime.now()
    print(f"Start: {start}")

    # Sjoin this month's data to tiger roads
    this_month_gdf = sjoin_local_roads(date)

    # Find this month's linearids
    this_month_linearid = set(this_month_gdf.linearid.unique().tolist())

    # Grab last month's results that have already been cut - local roads only
    last_month_gdf = gpd.read_parquet(f"{SHARED_GCS}{last_month_segmented_local_roads}.parquet")
    last_month_linearid = set(last_month_gdf.linearid.unique().tolist())

    # Have to cut linear ids that appear in this month but not last month
    linearids_to_cut = list(this_month_linearid - last_month_linearid)
    print(f"There are {len(linearids_to_cut)} new linear ids found this month that didn't appear last month.")

    # Have to delete linear ids that appear in last month but not this month.
    linearids_to_delete = list(last_month_linearid - this_month_linearid)
    print(f"There are {len(linearids_to_delete)} that didn't appear this month that will be deleted.")

    # Filter out linear ids that are no longer relevant to this month
    cut_linearid_1 = last_month_gdf.loc[~last_month_gdf.linearid.isin(linearids_to_delete)].reset_index(drop = True)

    # Pieces that make up this month's local roads
    local_road_parts = [cut_linearid_1]

    # Cut the linearids that are only found in this month.
    # Skip the cutting pipeline entirely when there is nothing new:
    # it would otherwise push an empty frame through the dask
    # partitioning / concat steps.
    new_roads = this_month_gdf.loc[this_month_gdf.linearid.isin(linearids_to_cut)].reset_index(drop = True)
    if not new_roads.empty:
        local_road_parts.append(cut_local_roads(new_roads, date))

    # Compare lengths of last versus this month's local roads
    this_month_local_roads = pd.concat(local_road_parts, axis = 0)
    this_month_len = this_month_local_roads.geometry.length.sum()
    last_month_len = last_month_gdf.geometry.length.sum()
    print(f"This month's local roads length: {this_month_len}. Last month: {last_month_len}. Diff: {last_month_len-this_month_len}")

    # Read in primary & secondary roads that have already been cut
    primary_secondary = gpd.read_parquet(f"{SHARED_GCS}segmented_primary_secondary_roads.parquet")

    # Concat everything
    this_month_segmented = pd.concat(local_road_parts + [primary_secondary], axis=0)

    # Save
    file_date = date.replace('-','_')
    this_month_local_roads.to_parquet(f"{SHARED_GCS}segmented_local_roads_{file_date}.parquet")
    this_month_segmented.to_parquet(f"{SHARED_GCS}segmented_all_roads_{file_date}.parquet")

    end = datetime.datetime.now()
    print(f"Done: {end-start}")
    return this_month_segmented

In [39]:
april_month = "2023-04-12"

In [53]:
april_df = monthly_linearids(april_month, "segmented_local_rds")

Start: 2023-05-26 14:34:02.338678
Begin sjoin
Done buffering
Done with sjoin with stops with local roads. Time lapsed: 0:03:01.625797
Done with sjoin with routes and local roads. Time lapsed: 0:09:48.755078
Done with doing an sjoin for all local roads. Time lapsed: 0:22:02.441069
There are 11418 new linear ids found this month that didn't appear last month.
There are 8673 that didn't appear this month that will be deleted.
Begin cutting local roads
Split into mulitple parquets
Cut geometry
Begin computing
Done cutting local roads in 0:02:32.191740 minutes
This month's local roads length: 174846164.15138897. Last month: 171163870.38026056. Diff: -3682293.771128416
Done: 0:28:42.743699


In [None]:
#march_linearids = set(march_linearis.linearid.unique().tolist())

In [None]:
# all_ops = sjoin_local_roads(april_month, "April_2023")

In [None]:
#april_linearids = set(april_linearid.linearid.unique().tolist())

In [None]:
#len(april_linearids)

In [None]:
#len(march_linearids)

In [None]:
#new_linearids_to_cut = list(april_linearids - march_linearids)

In [None]:
# linearids found in this month but not last month's
#f"{len(new_linearids_to_cut)} new linearids"

In [None]:
# Linearids only found in last month but not this month's
#linearids_to_delete = list(march_linearids - april_linearids)

In [None]:
#f"{len(linearids_to_delete)} linearids to be deleted"

In [None]:
# Delete off roads 
#march_all_segmented_roads = gpd.read_parquet(f"{SHARED_GCS}segmented_all_roads.parquet")

In [None]:
# arch_all_segmented_roads.sample()

In [None]:
#april_segmented_roads = march_all_segmented_roads[~march_all_segmented_roads.linearid.isin(linearids_to_delete)].reset_index(drop = True)

In [None]:
#march_all_segmented_roads.linearid.nunique() - april_segmented_roads.linearid.nunique()

In [None]:
# Find the linearids from last month
#april_roads_to_cut = (gpd
 #             .read_parquet(f"{SHARED_GCS}local_roads_all_routes_stops_sjoin_April_2023.parquet")
 #            )


In [None]:
# april_roads_to_cut = april_roads_to_cut.loc[april_roads_to_cut.linearid.isin(new_linearids_to_cut)].reset_index(drop = True)

In [None]:
# len(april_roads_to_cut)

In [None]:
# april_cut_segments = cut_local_roads_monthly(april_roads_to_cut, april_month)

In [None]:
# type(april_cut_segments)

In [None]:
# april_local_roads = pd.concat([april_segmented_roads, april_cut_segments], axis=0)

In [None]:
# april_local_roads.sample()

### Concat local roads and primary/secondary ones

In [None]:
def cut_primary_secondary_roads() -> gpd.GeoDataFrame:
    """
    Segment every primary (S1100) and secondary (S1200) road,
    whether or not it intersects any GTFS shape.

    Returns:
        The segmented roads, also written to
        segmented_primary_secondary_roads.parquet in SHARED_GCS.
    """
    started = datetime.datetime.now()
    print("Cutting primary/secondary roads")

    roads = load_roads(["S1100", "S1200"])

    # Segment length of 1_000 (original note: 1 km segments)
    segments = geography_utils.cut_segments(roads, ["linearid", "fullname"], 1_000)
    segments.to_parquet(f"{SHARED_GCS}segmented_primary_secondary_roads.parquet")

    finished = datetime.datetime.now()
    print(f"Done cutting primary & secondary roads: {finished-started}")
    return segments

In [None]:
# primary_secondary = cut_primary_secondary_roads()

### Cut everything from top to bottom

In [None]:
def cut_all_roads(date:str) -> gpd.GeoDataFrame:
    """
    Cut all roads: primary, secondary, and the local roads
    that overlap with bus routes or stops.

    Takes about 1.5 hours.

    Args:
        date (str): analysis date

    Returns:
        Segmented primary/secondary roads stacked with the segmented
        local roads. The same frame is written to
        segmented_all_roads_{date with underscores}.parquet in SHARED_GCS.
    """
    start = datetime.datetime.now()
    print(f"Start cutting all roads: {start}")
    # Find local roads that intersect  with GTFS shapes, then
    # segment them
    local_roads_unsegmented = sjoin_local_roads(date)
    local_roads_gdf = cut_local_roads(local_roads_unsegmented, date)

    # Segment primary and secondary roads
    segmented_primary_secondary_rds = cut_primary_secondary_roads()

    # Concat
    file_date = date.replace('-','_')
    all_roads = pd.concat([segmented_primary_secondary_rds, local_roads_gdf], axis=0)
    all_roads.to_parquet(f"{SHARED_GCS}segmented_all_roads_{file_date}.parquet")

    end = datetime.datetime.now()
    print(f"Time lapsed for cutting all roads: {end-start}")

    # Previously nothing was returned, so callers assigning the result got None
    return all_roads

In [None]:
# all_roads = cut_all_roads(analysis_date)

In [None]:
# all_cut_roads = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/shared_data/segmented_all_roads.parquet")

In [None]:
#all_cut_roads.linearid.nunique()

In [None]:
#all_cut_roads.linearid.value_counts().head()

In [None]:
#all_cut_roads.linearid.value_counts().describe()

In [None]:
#linearid_subset = ['11018382472869','1105640135361']

In [None]:
#all_cut_roads.head(100).explore('segment_sequence', cmap = 'tab10', style_kwds = {'weight':10}, legend = False)

In [None]:
# all_cut_roads.loc[all_cut_roads.linearid.isin(linearid_subset)].explore('segment_sequence', cmap = 'tab10', legend = False)

### If Main ??
* As of 5/18, cutting all roads from top to bottom takes about 1.5 hours.

In [None]:
def cut_all_or_month(date:str, last_month_segmented_local_roads: str, run_monthly:bool = True):
    """
    Entry point: run either the incremental monthly update or a
    full re-cut of every road.

    Args:
        date: analysis date
        last_month_segmented_local_roads: file name (without .parquet)
        of last month's segmented local roads. Only used when
        run_monthly is True.
        run_monthly: True to call monthly_linearids, False to call
        cut_all_roads.

    Returns:
        Whatever the selected function returns. Previously the result
        was stored in an unused local and dropped, so callers always
        received None.
    """
    if run_monthly:
        return monthly_linearids(date, last_month_segmented_local_roads)
    return cut_all_roads(date)

In [None]:
analysis_date

In [None]:
"""
Cut all the roads top to bottom
test = cut_all_or_month(analysis_date, "", False)

Start cutting all roads: 2023-05-18 10:30:27.686491
Begin sjoin
Done buffering
Done with sjoin with stops with local roads. Time lapsed: 0:03:20.079432
Done with sjoin with routes and local roads. Time lapsed: 0:07:06.883445
Done with doing an sjoin for all local roads. Time lapsed: 0:18:44.197788
Begin cutting local roads
Split into mulitple parquets
Cut geometry
Begin computing
Done cutting local roads in 1:04:27.639005 minutes
Cutting primary/secondary roads 2023-05-18 11:53:45.703569
Done cutting primary & secondary roads: 0:05:24.448465
Time lapsed for cutting all roads: 1:29:23.356601
"""

In [None]:
april_month

In [None]:
# Takes about 23 minutes
# test2 = cut_all_or_month(april_month, "segmented_all_roads_2023_03_15", True)

In [None]:
april = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/shared_data/segmented_all_roads_2023_04_12.parquet")

In [None]:
april.linearid.nunique()

In [None]:
april.sample()

In [None]:
april.shape

In [None]:
april.linearid.value_counts().head(30)

In [None]:
april2 = april.loc[april.linearid.isin(["11012815158651"])].reset_index(drop = True)

### Draft

In [None]:
def cut_local_roads(date, chunk_row_size: int) -> gpd.GeoDataFrame:
    """
    Draft: sjoin the local roads for a date, then segment them in
    two halves, saving each half and the combined result.

    Args:
        date: analysis date
        chunk_row_size: number of rows per chunk handed to chunk_dask_df

    Returns:
        All segmented local roads (both halves stacked).
    """
    started = datetime.datetime.now()
    print(f"Cut local roads {started}")

    # Local roads that intersect with stops and routes
    unsegmented = sjoin_local_roads(date)

    # Roughly equal sized chunks, each as a dask dataframe
    ddfs, length = chunk_dask_df(unsegmented, chunk_row_size)

    # Positions of the chunks belonging to each half
    midpoint = length // 2
    first_half = list(range(0, midpoint))
    second_half = list(range(midpoint, length))

    # Cut geometry, saving each half as soon as it is done
    part1 = dask_segment(ddfs, first_half)
    part1.to_parquet(f"{SHARED_GCS}segmented_local_rds_first_pt.parquet")
    print("Done with cutting part1")

    part2 = dask_segment(ddfs, second_half)
    part2.to_parquet(f"{SHARED_GCS}segmented_local_rds_second_pt.parquet")
    print("Done with cutting part2")

    combined = pd.concat([part1, part2])
    combined.to_parquet(f"{SHARED_GCS}segmented_local_rds.parquet")

    finished = datetime.datetime.now()
    print(f"Done cutting local roads in {finished-started} minutes")
    return combined

In [None]:
def dask_segment(ddf_list: list, ddfs_range: list) -> gpd.GeoDataFrame:
    """
    Cut the selected dask dataframes into road segments and return
    the stacked results.

    Args:
        ddf_list: dask dataframes stored in a list.
        ddf_list[0] will yield a ddf.

        ddfs_range: positions within ddf_list to process.
    """
    # Lazy cut_segments tasks, one per selected dataframe
    pending = []
    for position in ddfs_range:
        pending.append(
            delayed(geography_utils.cut_segments)(
                ddf_list[position], ["linearid", "fullname"], 1_000
            )
        )
        print(f"done with {position}")

    # Materialise each task in turn, then stack the results
    return pd.concat([compute(task)[0] for task in pending])

In [None]:
# Read in sjoined files
local_roads_unsegmented = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/shared_data/local_roads_all_routes_stops_sjoin.parquet"
)

In [None]:
local_roads_unsegmented.shape

In [None]:
# Turn sjoin local roads to dask
ddf1 = dd.from_pandas(local_roads_unsegmented, npartitions=1)

In [None]:
# Partition the sjoin stuff automatically
ddf1_partitioned = ddf1.repartition(partition_size="1MB")

In [None]:
#Save
ddf1_partitioned.to_parquet(f"{SHARED_GCS}daskutilstest")

In [None]:
file_names_dask = extract_number("part")

In [None]:
# https://www.geeksforgeeks.org/read-multiple-csv-files-into-separate-dataframes-in-python/
# Read every partition file back into its own dask-geopandas dataframe.
# The file numbers live in `file_names_dask` (set by extract_number("part")
# in the cell above); the loop previously used an undefined `file_names`.
# NOTE(review): extract_number lists the partitioned_tiger folder, while the
# files read here are under daskutilstest - confirm the numbers line up.
gcs_file_path = "gs://calitp-analytics-data/data-analyses/shared_data/daskutilstest/part."
dataframes_list = [
    dg.read_parquet(f"{gcs_file_path}{file_number}.parquet")
    for file_number in file_names_dask
]

In [None]:
len(dataframes_list)

In [None]:
dataframes_list[0]

In [None]:
# Empty dataframe
my_results = []

In [None]:
# For each dask dataframe int the list
# cut them and append the results into the empty df.
for ddf in dataframes_list:
    cut_geometry = delayed(geography_utils.cut_segments)(ddf, ["linearid", "fullname"], 1_000)
    my_results.append(cut_geometry)

In [None]:
my_results

In [None]:
# Took 1:06:37.299317
#start = datetime.datetime.now()
#print(start)
#results2 = [compute(i)[0] for i in my_results]  # 9;12
#end = datetime.datetime.now()
#print(end)

In [None]:
# print(end-start)

In [None]:
# type(results2)

In [None]:
# testpd = pd.concat(results2, axis=0).reset_index(drop=True)

In [None]:
# type(testpd)

In [None]:
# testpd.to_parquet(f"{SHARED_GCS}dask_test2.parquet")

In [None]:
# testpd.shape

In [None]:
dask2 = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/shared_data/dask_test2.parquet")

In [None]:
dask2.linearid.nunique()

In [None]:
dask2.linearid.value_counts().head()

In [None]:
test_linearids = [ '11019653760031',
 '1106092764328',
 '11011135052903',
 '110411099535',
 '11011135055229',
 '11011135055553',
 '11011135056214',
 '11012028306122',
 '11012812027422',
 '11012812027505',
 '11012812035943',
 '11012812038881',
 '1106073054809']

In [None]:
# load_roads takes only the list of road types; the extra positional
# argument (a leftover buffer flag) raised a TypeError.
og_tiger = load_roads(['S1400'])

In [None]:
og_tiger_filtered = og_tiger[og_tiger.linearid.isin(test_linearids)].reset_index()

In [None]:
# og_tiger_filtered.explore('linearid',cmap = "tab20c", style_kwds = {'weight':5})

In [None]:
dask2_filtered = dask2[dask2.linearid.isin(test_linearids)].reset_index()

In [None]:
dask2_filtered.shape

In [None]:
dask2_filtered.sample()

In [None]:
# dask2_filtered.explore('segment_sequence',cmap = "tab20c", style_kwds = {'weight':5})

In [None]:
def chunk_dask_df(gdf, chunk_row_size: int):
    """
    Draft: slice a dataframe into consecutive row chunks and wrap
    each chunk in a single-partition dask dataframe.

    Args:
        gdf: the local roads that intersect w/ stops and routes
        chunk_row_size(int): maximum number of rows per chunk.

    Returns:
        Tuple of (list of dask dataframes, number of dataframes
        in that list).
    """
    # Row offset at which each chunk begins
    row_starts = range(0, gdf.shape[0], chunk_row_size)

    my_ddfs = [
        dd.from_pandas(gdf[begin : begin + chunk_row_size], npartitions=1)
        for begin in row_starts
    ]

    return my_ddfs, len(my_ddfs)