# Speed up spatial join

In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd

import calitp.magics

DASK_GCS = "gs://calitp-analytics-data/data-analyses/dask_test/"
ALL_BUS = f"{DASK_GCS}all_bus.parquet"



In [2]:
def sjoin_keep_orthogonal(left_df: dg.GeoDataFrame, 
                          right_df: dg.GeoDataFrame) -> dd.DataFrame:
    route_cols = ["hqta_segment_id", "route_direction"]

    s1 = dg.sjoin(
        left_df[route_cols + ["geometry"]], 
        right_df[route_cols  + ["geometry"]],
        how = "inner",
        predicate = "intersects"
    ).drop(columns = ["index_right", "geometry"])

    # Only allow orthogonal!
    s2 = (s1[s1.route_direction_left != s1.route_direction_right]
                   [["hqta_segment_id_left", "hqta_segment_id_right"]]
                  )
    route_pairs = (s2.rename(columns = {
                    "hqta_segment_id_left": "hqta_segment_id", 
                    "hqta_segment_id_right": "intersect_hqta_segment_id"})
          .drop_duplicates()
          .reset_index(drop=True)
         )

    return route_pairs

In [3]:
def compile_within_operator_intersections(
    operator_df: dg.GeoDataFrame) -> dd.DataFrame:
    """
    Grab one operator's routes, and do sjoin against the other routes 
    of same operator.
    Look WITHIN operators.
    
    Concatenate all the small dask dfs into 1 dask df by the end.
    
    https://stackoverflow.com/questions/56072129/scale-and-concatenate-pandas-dataframe-into-a-dask-dataframe
    """
    results = []

    operator_routes = operator_df.route_identifier.unique()
    
    for r in operator_routes:
        this_route = operator_df[operator_df.route_identifier == r]
        other_routes = operator_df[operator_df.route_identifier != r]

        results.append(sjoin_keep_orthogonal(
            this_route, other_routes)) 

    # Concatenate all the dask dfs in the list and get it into one dask df
    ddf = dd.multi.concat(results, axis=0).drop_duplicates()

    return ddf  

### Set up dataset
* Keep just 5 LA area transit agencies.
* Repartition it, do it in a way where multiple transit operator are in the same partition.


#### Method 1: Loop, but save results in a list and unpack at the same time

Ex: https://stackoverflow.com/questions/56072129/scale-and-concatenate-pandas-dataframe-into-a-dask-dataframe


#### Method 2: try to use map partitions
If I set the index to be the transit operator, but expect multiple transit operators in the same partition, how can I make sure the spatial join is only operating only within an operator?

Check within the partition to see what operators are present, and introduce the loop there?

In [4]:
bus_segments = dg.read_parquet(ALL_BUS)

keep_cols = ["calitp_itp_id", "route_identifier", "route_direction",
             "hqta_segment_id", "geometry"]

test_ids = [
    182, # LA Metro
    183, # DASH
    300, # Big Blue Bus 
    17, # Arcadia transit
    45, # Burbank Bus
]

gdf = (bus_segments[(bus_segments.hq_transit_corr==True) & 
                    (bus_segments.calitp_itp_id.isin(test_ids))]
       [keep_cols].reset_index(drop=True)
      )

### Method 1: loop and append results in list, concat and compute at the end

In [5]:
%%timeit
gdf1 = gdf.repartition(npartitions=3)

results = []

IDS_TO_LOOP = gdf1.calitp_itp_id.unique()

for itp_id in IDS_TO_LOOP:
    operator_results = compile_within_operator_intersections(
        gdf1[gdf1.calitp_itp_id==itp_id]) 
    results.append(operator_results)
    
    
full = dd.multi.concat(results, axis=0).drop_duplicates()
full.compute()

1min 39s ± 1.26 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Method 2: map_partitions

Rewrite the function to account for multiple operators within a partition?

In [6]:
# use map_partitions
gdf2 = gdf.set_index("calitp_itp_id").repartition(npartitions=3)

In [7]:
def compile_within_operator_intersections_partitioned(
    operator_df: dg.GeoDataFrame) -> dd.DataFrame:
    """
    Grab one operator's routes, and do sjoin against the other routes 
    of same operator.
    Look WITHIN operators.
    
    Concatenate all the small dask dfs into 1 dask df by the end.
    
    https://stackoverflow.com/questions/56072129/scale-and-concatenate-pandas-dataframe-into-a-dask-dataframe
    """
    results = []
    
    these_operators_present = operator_df.index.unique()
    
    for itp_id in these_operators_present:
        one_operator_df = operator_df.loc[itp_id]
        operator_routes = one_operator_df.route_identifier.unique()
    
        for r in operator_routes:
            this_route = one_operator_df[one_operator_df.route_identifier == r]
            other_routes = one_operator_df[one_operator_df.route_identifier != r]

        results.append(sjoin_keep_orthogonal(
            this_route, other_routes)) 

    # Concatenate all the dask dfs in the list and get it into one dask df
    ddf = dd.multi.concat(results, axis=0).drop_duplicates()

    return ddf.persist() 

In [8]:
partition_full_results = gdf2.map_partitions(
    compile_within_operator_intersections_partitioned(gdf2),
                    meta = {"hqta_segment_id": int,
                           "intersect_hqta_segment_id": int})

AssertionError: 

In [None]:
partition_full_results.compute()