In [None]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd

# Base GCS location; the read_parquet cells below prefix it onto file names.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/py_crow_flies/"

from dask import delayed, compute

# Regional POI dataset names. Not referenced by the cells visible here.
files = ["CentralCal_POIs", "Mojave_POIs", "NorCal_POIs", "SoCal_POIs"]
# CRS label. Not referenced by the cells visible here.
# NOTE(review): the buffer step assumes a CRS measured in meters -- confirm
# the stored geometries are already in this CRS.
CRS = "EPSG:3857"

In [None]:
def chunk_origin_points(n: int) -> list:
    """
    Split the POI table into consecutive chunks of at most `n` rows.

    Reads the `region`, `poi_index` and `geometry` columns of
    `./all_pois.parquet` (relative to the working directory) and wraps each
    row slice in `dask.delayed`.

    Args:
        n: maximum number of rows per chunk; must be positive.

    Returns:
        A list of delayed row slices in row order (empty if the table has
        no rows).

    Raises:
        ValueError: if `n` is not positive.
    """
    # https://stackoverflow.com/questions/33367142/split-dataframe-into-relatively-even-chunks-according-to-length
    if n <= 0:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"n must be a positive integer, got {n}")

    keep_cols = ["region", "poi_index", "geometry"]

    # NOTE(review): this reads a local copy, while the other cells read the
    # file of the same name from GCS_FILE_PATH -- confirm which is intended.
    df = gpd.read_parquet(
        "./all_pois.parquet",
        columns = keep_cols
    )

    list_df = [delayed(df[i:i+n]) for i in range(0, df.shape[0], n)]

    return list_df

In [None]:
# Maximum number of origin rows per chunk.
n = 15_000
# List of delayed row slices; not consumed by the cells visible here.
list_df = chunk_origin_points(n)

In [None]:
# Origin points for this run: the first 100,000 rows whose region is
# "Mojave", keeping only the id and geometry columns.
df = gpd.read_parquet(f"{GCS_FILE_PATH}all_pois.parquet", 
                     filters = [[("region", "==", "Mojave")]],
                      columns = ["poi_index", "geometry"]
                     ).head(100_000)

In [None]:
# Candidate destinations: only POIs with grid_code > 0, the same filter the
# original R script uses to build Dest_Points.
valid_destinations = gpd.read_parquet(
    f"{GCS_FILE_PATH}all_pois.parquet",
    filters = [[("grid_code", ">", 0)]],
    columns = ["poi_index", "grid_code", "geometry"]
)

In [None]:
def buffer_origin_and_sjoin(
    origin_gdf: gpd.GeoDataFrame,
    destination_gdf: gpd.GeoDataFrame,
    buffer_miles: int = 20
) -> dd.DataFrame:
    """
    Pair each origin point with the destination points inside its buffer
    and attach the origin-to-destination distance.

    Steps:
      1. Buffer every origin point by `buffer_miles` (converted to meters,
         because the working CRS is in meters).
      2. Spatially join the buffers to the destination points.
      3. Merge the origin and destination point geometries back onto each
         pair and pass them to `calculate_distance`.

    Args:
        origin_gdf: origin points; needs `poi_index` and `geometry`.
        destination_gdf: destination points; needs `poi_index`,
            `grid_code` and `geometry`.
        buffer_miles: buffer radius around each origin, in miles.

    Returns:
        The output of `calculate_distance` for the origin/destination pairs,
        keyed by `origin_poi_index` and `destination_poi_index` and carrying
        the destination's `grid_code`.
    """
    METERS_IN_MILES = 1609.34

    origin_gdf = dg.from_geopandas(origin_gdf, npartitions=1)
    origin_gdf = origin_gdf.repartition(partition_size="50MB")

    # Replace each origin point with its buffer polygon for the join.
    origin_buffered = origin_gdf.assign(
        geometry = origin_gdf.geometry.buffer(
            buffer_miles * METERS_IN_MILES)
    )

    sjoin_to_destination = dg.sjoin(
        origin_buffered,
        destination_gdf,
        how = "inner",
        predicate = "intersects"
    )[["poi_index_left",
       "poi_index_right", "grid_code"]].drop_duplicates()

    sjoin_results = (sjoin_to_destination.rename(
        columns = {
            "poi_index_left": "origin_poi_index",
            "poi_index_right": "destination_poi_index"}
        ).reset_index(drop=True)
        .repartition(npartitions=1)
    )

    # Merge the origin point geometry back in (the join used the buffer).
    with_origin_point_geom = dd.merge(
        origin_gdf,
        sjoin_results,
        left_on = "poi_index",
        right_on = "origin_poi_index",
        how = "inner"
    ).drop(columns = "poi_index")

    # Destination geometry must come from the destination table. Looking it
    # up in origin_gdf would, with an inner join, drop every destination
    # that is not also one of the origins.
    with_destin_point_geom = dd.merge(
        with_origin_point_geom,
        destination_gdf[["poi_index", "geometry"]],
        left_on = "destination_poi_index",
        right_on = "poi_index",
        how = "inner"
    ).drop(columns = "poi_index")

    # Both sides carry a `geometry` column, so the merge suffixes them:
    # geometry_x is the origin point, geometry_y the destination point.
    with_distance = calculate_distance(
        with_destin_point_geom,
        "geometry_x",
        "geometry_y"
    )

    return with_distance

In [None]:
def calculate_distance(
    gdf: dg.GeoDataFrame,
    origin_col: str,
    destination_col: str
) -> dg.GeoDataFrame:
    """
    Add a `dist` column holding the distance between two geometry columns.

    Args:
        gdf: frame containing both geometry columns.
        origin_col: name of the column with the origin geometries.
        destination_col: name of the column with the destination geometries.

    Returns:
        `gdf` without the two geometry columns and with a new `dist` column
        (distance from `origin_col` to `destination_col`, row by row).
    """
    # Each column has to be the active geometry before it is pulled out
    # as a geometry series.
    origins = gdf.set_geometry(origin_col)[origin_col]
    destinations = gdf.set_geometry(destination_col)[destination_col]

    return (
        gdf.drop(columns = [origin_col, destination_col])
        .assign(dist = origins.distance(destinations))
    )

In [None]:
# Build the origin/destination pair table: the origin sample against all
# valid destinations, using a 20-mile buffer.
sjoin_pairs = buffer_origin_and_sjoin(
    df,
    valid_destinations,
    buffer_miles = 20
)

In [None]:
# Sanity check: show the pairs where a point is matched with itself.
sjoin_pairs[sjoin_pairs.origin_poi_index==
           sjoin_pairs.destination_poi_index].compute()

In [None]:
def decay_weighted_opportunities(
    df: pd.DataFrame,
    speed_mph: float = 10,
    cutoff_minutes: float = 60,
) -> pd.DataFrame:
    """
    Decay-weight each destination's opportunities by crow-flies travel time.

    The weight is exponential with a half-life of `cutoff_minutes`:

        decay_weighted_opps = grid_code * exp(
            log(0.5) / cutoff_seconds * travel_seconds
        )

    which matches the original R script. A destination one cutoff away
    counts for half of its `grid_code`, two cutoffs away for a quarter, etc.

    Uses only column arithmetic, so it works on a pandas DataFrame and is
    written to work on a dask DataFrame exposing the same columns.

    Args:
        df: origin/destination pairs with `grid_code`, `dist` (meters),
            `origin_poi_index` and `destination_poi_index`.
        speed_mph: assumed travel speed, in miles per hour.
        cutoff_minutes: travel time, in minutes, at which the weight is 0.5.

    Returns:
        A copy of `df` with a `decay_weighted_opps` column added.
    """
    MILES_PER_METER = 0.000621371

    # meters -> miles -> hours -> minutes -> seconds
    travel_seconds = ((60 * df.dist * MILES_PER_METER) / speed_mph) * 60
    cutoff_seconds = cutoff_minutes * 60

    # The whole exponent belongs inside exp(); exp(log(0.5)) alone is just
    # the constant 0.5 and would make the weight linear in distance.
    decay_weight = np.exp(np.log(0.5) / cutoff_seconds * travel_seconds)
    weighted = df.grid_code * decay_weight

    # Adjust own opportunities to have full value
    is_own_point = df.origin_poi_index == df.destination_poi_index

    return df.assign(
        decay_weighted_opps = weighted.mask(is_own_point, df.grid_code)
    )

In [None]:
# Apply the decay weighting to every origin/destination pair.
decay_df = decay_weighted_opportunities(sjoin_pairs)

In [None]:
# Materialize the whole weighted pair table for inspection.
decay_df.compute()

In [None]:
# Sanity check: self-pairs are meant to carry their full grid_code value.
decay_df[decay_df.origin_poi_index==
         decay_df.destination_poi_index].compute()

In [None]:
def aggregate_by_origin(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Total the decay-weighted opportunities reachable from each origin.

    Args:
        df: origin/destination pairs with `origin_poi_index` and
            `decay_weighted_opps` columns.

    Returns:
        One row per origin with columns `poi_index` (the origin id) and
        `decay_weighted_opps` (the summed value).
    """
    totals_by_origin = df.groupby("origin_poi_index")["decay_weighted_opps"].sum()

    # Turn the group key back into a column and give it the name used by
    # the POI table.
    totals = totals_by_origin.reset_index()

    return totals.rename(columns = {"origin_poi_index": "poi_index"})

In [None]:
# Sum the decay-weighted opportunities per origin.
access = aggregate_by_origin(decay_df)

In [None]:
# Materialize the per-origin totals for inspection.
access.compute()

In [None]:
# A left join at the end because we want to keep every POI,
# including the ones whose access is zero.
def full_merge_onto_all_pois(df: gpd.GeoDataFrame,
                             results: dd.DataFrame):
    """
    Attach the access results to the full POI table.

    Args:
        df: all POIs; needs a `poi_index` column.
        results: per-origin results with `poi_index` and
            `decay_weighted_opps`.

    Returns:
        `df` left-joined to `results` on `poi_index`, with
        `decay_weighted_opps` set to 0 for POIs that have no result row,
        repartitioned to a single partition.
    """
    # combine all the results
    final = dd.merge(
        df,
        results,
        on = "poi_index",
        how = "left",
    )

    # Fill from the merged frame: the NaNs for unmatched POIs only exist
    # after the left join, not in `results` itself.
    final = final.assign(
        decay_weighted_opps = final.decay_weighted_opps.fillna(0)
    ).repartition(npartitions=1)

    return final

Original R script

In [None]:
# NOTE(review): this cell is R, kept as the reference implementation the
# Python cells above are ported from. It is not valid Python -- presumably
# it is not meant to be executed by this notebook's kernel; confirm.
#
# Read in Shapefile of grid (or origin) points. Points must have two required columns:
  #1. grid_code: The value of the opportunities being measured. In this case, the number of opportunities within the grid cell.
  #2. Point_ID: A unique id for each grid in character format.
grid_points <- st_read("Path to shapefile")

# Transform the grid points to your preferred CRS
grid_points <- st_as_sf(grid_points) %>%
  st_transform(crs = 3857)

# Create an SF dataset of destination points by filtering grid points to only those with opportunities > 0 (this reduces computing time)
Dest_Points <- grid_points %>%
  filter(grid_code > 0)

# Define origin points for analysis. If the dataset is large, it may be useful to limit these or break them into chunks
origins <- grid_points

# For loop to perform crow-flies access calculations for each origin in the defined dataset
out = NULL
for(i in 1:nrow(origins)) {
  
  # Create a buffer around the origin point and select points within that buffer
  # Define buffer distance (in miles)
  buffer_dis <- 20
  buffer <- st_buffer(origins[i, ], (buffer_dis * 1609.34))
  intersected_points <- st_intersection(Dest_Points, buffer)
  
  # Select origin point ID and remove spatial data
  origin <- origins[i, ] %>%
    select(Point_ID) %>%
    st_drop_geometry()
  
  # If at least one destination point falls within the buffer, perform access calculations
    if(nrow(intersected_points) >= 1) {
  
      # Calculate the distance between the origin point and every destination point within the buffer
      dist <- st_distance(origins[i, ], intersected_points)
      dist <- matrix(dist, ncol = 1)
      # Add distance to intersected points DF
      intersected_points$dist <- dist
      
      # Decay-weight opportunities by travel time
      # Define travel speed (in MPH)
      speed <- 10
      # Define time cutoff (in minutes)
      cutoff <- 60
      # The weight is exp(log(0.5) / cutoff_seconds * travel_seconds), i.e. it
      # halves for every `cutoff` of travel time.
      # 0.000621371 presumably converts dist from meters to miles -- confirm units.
      intersected_points <- as.data.frame(intersected_points) %>%
      mutate(decay_weighted_opps = grid_code * exp(log(0.5) / (cutoff * 60) * (((60 * dist * 0.000621371) / speed) * 60)))
      
      # Sum decay-weighted opportunities by origin
      access <- sum(intersected_points$decay_weighted_opps)
      access_df <- data.frame(origin, access)
  
      out <- rbind.data.frame(access_df, out)
      
      # Print % progress (optional)
      print(i / nrow(origins))
    
    # If no destination points fall within the buffer, access is zero
    } else {
      access <- 0
      access_df <- data.frame(origin, access)
      out <- rbind.data.frame(access_df, out)
      print(i / nrow(origins))
    }
}

# Write output to CSV
write.csv(out, "/Users/Username/Downloads/CrowsFlyWeighted.csv", na = "")
