In [1]:
import geopandas as gpd
import pandas as pd
from siuba import *

from tqdm.notebook import tqdm

import shared_utils


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


# Five Days of the Crow: a quick attempt to port over to Python

## Differences:

* Drop extra string id column
* Do all buffering first

## Components

* 3 remaining iterative steps: clip, measure distance, apply decay weights
* Similar to original script
* About 5 days in total, give or take

In [2]:
# Base GCS location (bucket + prefix) that the regional POI parquet files are read from.
crow_folder = 'gs://calitp-analytics-data/data-analyses/py_crow_flies/'

In [3]:
# path = f'{crow_folder}CentralCal_POIs.zip'

# Datasets:

* CentralCal_POIs
* NorCal_POIs
* Mojave_POIs
* SoCal_POIs

In [4]:
# Read the grid (origin) points for the Central California region from parquet.
# Columns this notebook relies on:
#   1. grid_code: the value of the opportunities being measured — here, the
#      number of opportunities within the grid cell.
#   2. pointid: numeric id of each grid point; becomes the index after reprojection.
# Point_ID (the same id in character format, e.g. "id_1") is also present but is
# dropped further down.
central = gpd.read_parquet(f'{crow_folder}CentralCal_POIs.parquet')

In [5]:
central

Unnamed: 0,pointid,grid_code,Point_ID,geometry
0,1,0.0,id_1,POINT (-13783277.814 4813851.167)
1,2,0.0,id_2,POINT (-13782972.066 4813851.167)
2,3,0.0,id_3,POINT (-13782666.318 4813851.167)
3,4,0.0,id_4,POINT (-13782360.570 4813851.167)
4,5,0.0,id_5,POINT (-13782054.822 4813851.167)
...,...,...,...,...
2850974,3935450,0.0,id_3935450,POINT (-13109408.973 4283378.191)
2850975,3935451,0.0,id_3935451,POINT (-13109103.225 4283378.191)
2850976,3935452,0.0,id_3935452,POINT (-13108797.477 4283378.191)
2850977,3935453,0.0,id_3935453,POINT (-13108491.729 4283378.191)


In [77]:
# nor = gpd.read_parquet(f'{crow_folder}NorCal_POIs.parquet')

# so = gpd.read_parquet(f'{crow_folder}SoCal_POIs.parquet')

# mo = gpd.read_parquet(f'{crow_folder}Mojave_POIs.parquet')

In [7]:
# Reproject to California Albers (metre units), index the points by their
# numeric id, and drop the redundant string id column — all in one chain.
# Not re-runnable as-is: a second run finds neither 'pointid' nor 'Point_ID'.
central = (
    central
    .to_crs(shared_utils.geography_utils.CA_NAD83Albers)
    .set_index('pointid')
    >> select(-_.Point_ID)
)

In [8]:
central.crs

<Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

### keep point geometries around since we'll buffer actual geom later...

In [10]:
# Second geometry column holding the original points. The active geometry is
# replaced by buffers later on, and distances are measured back to point_geom.
central['point_geom'] = central.geometry.copy()

In [11]:
# Destination points: only the grid points whose grid_code is positive, i.e.
# those that actually hold opportunities. Fewer destinations reduces computing
# time in every clip that follows.
dest_points = central >> filter(_.grid_code > 0)

In [12]:
dest_points >> head(3)

Unnamed: 0_level_0,grid_code,geometry,point_geom
pointid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
736,1.0,POINT (-154261.074 181991.837),POINT (-154261.074 181991.837)
2280,1.0,POINT (-324523.035 186771.201),POINT (-324523.035 186771.201)
4547,1.0,POINT (-324532.413 186536.371),POINT (-324532.413 186536.371)


In [13]:
# dest_stripped = dest_points >> select(_.geometry)

In [14]:
# dest_stripped

## Look up all distances in advance?

* This would be a distinct step, but it is actually massively more work than measuring distances later, after clipping.

In [15]:
# import numpy as np

In [16]:
# origins = central

In [17]:
# origins_stripped = origins.reset_index() >> select(-_.grid_code)

In [18]:
# origins_stripped 

In [16]:
# result = {}

# def create_od(row):
#     global result
#     # print(row)
#     # print(row.Name)
#     result[row.pointid] = dest_stripped.distance(row.geometry)
#     return

In [17]:
# origins_stripped[:100].apply(create_od, axis = 1)

In [18]:
# result[1]

In [19]:
# dest_stripped.distance(origins.geometry.iloc[0])

## Buffering and so on

* buffer everything first instead of one at a time

In [19]:
# Search radius around each origin: 20 miles, converted to metres because the
# projected CRS (EPSG:3310) uses metre units. 1609.34 is metres per mile.
buffer_mi = 20
buffer_m = buffer_mi * 1609.34

In [20]:
# Work on a copy: a plain `origins = central` only aliases the same object, so
# replacing the origins geometry with buffers would also overwrite the point
# geometry in `central` (and in anything re-derived from it afterwards).
origins = central.copy()

In [21]:
# Replace each origin's active geometry with a buffer of buffer_m metres around it.
# Not idempotent: re-running this cell buffers the already-buffered polygons.
# The original points remain available in the point_geom column.
origins.geometry = origins.buffer(buffer_m)

In [22]:
# you wish, runs out of RAM

# origins_joined = origins.sjoin(dest_points, how='left', predicate='intersects',
#                               rsuffix='dest')

In [23]:
origins

Unnamed: 0_level_0,grid_code,geometry,point_geom
pointid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,"POLYGON ((-295151.631 187119.328, -295306.620 ...",POINT (-327338.431 187119.328)
2,0.0,"POLYGON ((-294916.235 187109.849, -295071.223 ...",POINT (-327103.035 187109.849)
3,0.0,"POLYGON ((-294680.838 187100.376, -294835.827 ...",POINT (-326867.638 187100.376)
4,0.0,"POLYGON ((-294445.441 187090.910, -294600.430 ...",POINT (-326632.241 187090.910)
5,0.0,"POLYGON ((-294210.044 187081.451, -294365.032 ...",POINT (-326396.844 187081.451)
...,...,...,...
3935450,0.0,"POLYGON ((233848.011 -235671.905, 233693.023 -...",POINT (201661.211 -235671.905)
3935451,0.0,"POLYGON ((234095.655 -235666.059, 233940.667 -...",POINT (201908.855 -235666.059)
3935452,0.0,"POLYGON ((234343.299 -235660.207, 234188.311 -...",POINT (202156.499 -235660.207)
3935453,0.0,"POLYGON ((234590.943 -235654.347, 234435.955 -...",POINT (202404.143 -235654.347)


## what if we clip each row?

* row-wise clip, since a spatial join of everything at once exhausts memory

In [24]:
# First origin, used as the sample for the per-origin timing experiments.
one_row = origins.iloc[0]

In [25]:
one_row

grid_code                                                   0.0
geometry      POLYGON ((-295151.6312776939 187119.3281538663...
point_geom        POINT (-327338.4312776939 187119.32815386634)
Name: 1, dtype: object

In [26]:
%%timeit
# Time one clip of all destinations to a single origin's buffer.
# NOTE(review): names assigned inside a %%timeit cell are not kept afterwards —
# presumably why the same clip is repeated in the following cell.
one_clipped = dest_points.clip(one_row.geometry)

6.7 ms ± 656 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
# Destinations that fall inside this origin's buffer. clip() hands back a slice
# of dest_points, so take an explicit copy before adding columns to it — this
# avoids pandas' SettingWithCopyWarning on the assignments that follow.
one_clipped = dest_points.clip(one_row.geometry).copy()

In [46]:
# Extrapolate the single-origin clip timing to every origin in the region.
# NOTE(review): 14.8 ms does not match the 6.7 ms %%timeit result displayed for
# the clip cell — presumably taken from a different run; confirm which is current.
each_clip_ms = 14.8  # from timeit
clip_sec = len(central) * each_clip_ms * 10**-3

clip_hours = clip_sec / 3600
clip_hours

11.720691444444446

In [43]:
%%timeit
# Time the distance measurement for one origin. Unlike a plain name assignment,
# this column assignment modifies the existing one_clipped object on every loop.
one_clipped['distance'] = one_clipped.geometry.distance(one_row.point_geom)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


304 µs ± 4.28 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [47]:
# Extrapolate the single-origin distance timing to every origin in the region.
each_dist_ms = .304  # from timeit
dist_sec = len(central) * each_dist_ms * 10**-3

dist_hours = dist_sec / 3600
dist_hours

0.24074933777777777

In [44]:
# Straight-line distance from each clipped destination to the origin's original
# (un-buffered) point, in CRS units — metres for EPSG:3310.
one_clipped['distance'] = one_clipped.geometry.distance(one_row.point_geom)

In [45]:
one_clipped

Unnamed: 0_level_0,grid_code,geometry,point_geom,distance
pointid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
299245,1.0,POINT (-328589.703 156058.796),POINT (-328589.703 156058.796),31085.725136
281110,1.0,POINT (-328277.478 157935.423),POINT (-328277.478 157935.423),29199.009447
281109,1.0,POINT (-328513.720 157944.936),POINT (-328513.720 157944.936),29198.055622
269776,1.0,POINT (-327993.825 159104.515),POINT (-327993.825 159104.515),28022.478282
256175,1.0,POINT (-327700.774 160509.093),POINT (-327700.774 160509.093),26612.702170
...,...,...,...,...
122435,1.0,POINT (-324077.427 174277.317),POINT (-324077.427 174277.317),13249.580751
88433,1.0,POINT (-323229.960 177776.695),POINT (-323229.960 177776.695),10206.092570
29485,1.0,POINT (-324400.107 183943.345),POINT (-324400.107 183943.345),4326.733478
4547,1.0,POINT (-324532.413 186536.371),POINT (-324532.413 186536.371),2865.933776


In [35]:
import math

In [34]:
# Parameters read by decay_weight_opportunity.
# Units as implied by the conversions in the decay formula: speed in miles per
# hour, cutoff in minutes (the travel time at which the weight drops to 0.5).
speed, cutoff = 10, 60

In [40]:
def decay_weight_opportunity(row, speed_mph=None, cutoff_min=None):
    """Weight opportunities by an exponential travel-time decay.

    The weight halves for every ``cutoff_min`` minutes of travel::

        decay = 0.5 ** (travel_minutes / cutoff_min)

    where ``travel_minutes`` is the time needed to cover ``distance`` (metres)
    at ``speed_mph``. This is algebraically the same as the original
    ``e ** (ln(0.5) / (cutoff * 60) * travel_seconds)`` expression.

    Parameters
    ----------
    row : pandas.Series or pandas.DataFrame
        Must provide ``distance`` (metres) and ``grid_code``. A single row (as
        passed by ``DataFrame.apply(..., axis=1)``) or a whole frame both work;
        with a frame the weights are computed for all rows at once.
    speed_mph : float, optional
        Travel speed in miles per hour. Defaults to the notebook-level ``speed``.
    cutoff_min : float, optional
        Travel time in minutes at which the weight equals 0.5. Defaults to the
        notebook-level ``cutoff``.

    Returns
    -------
    Same type as ``row``
        A copy of ``row`` with ``decay_weighted_opps`` added. The input is left
        untouched, because mutating the object handed over by ``apply`` is not
        safe.
    """
    # Fall back to the notebook globals so existing `apply` calls keep working.
    if speed_mph is None:
        speed_mph = speed
    if cutoff_min is None:
        cutoff_min = cutoff

    miles_per_metre = 0.000621371

    # Bracket access on purpose: on a GeoDataFrame, `.distance` is a method and
    # would shadow the column of the same name.
    travel_minutes = 60 * row['distance'] * miles_per_metre / speed_mph
    decay_factor = 0.5 ** (travel_minutes / cutoff_min)

    weighted = row.copy()
    weighted['decay_weighted_opps'] = row['grid_code'] * decay_factor
    return weighted

In [42]:
%%timeit
# Time the row-by-row decay weighting for one origin's clipped destinations.
one_clipped.apply(decay_weight_opportunity, axis = 1)

37.8 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [49]:
# Extrapolate the single-origin weighting time to every origin in the region.
# The row-wise apply is the largest of the three per-origin timings used here.
each_weight_ms = 37.8  # from timeit
weight_sec = len(central) * each_weight_ms * 10**-3

weight_hours = weight_sec / 3600
weight_hours

29.935279499999996

In [66]:
# Vectorised form of decay_weight_opportunity: the weight halves for every
# `cutoff` minutes of straight-line travel at `speed` mph. One column expression
# replaces the per-row apply (the slowest step timed in this notebook) and, via
# assign(), hands back a frame of the same type as one_clipped.
travel_minutes = 60 * one_clipped['distance'] * 0.000621371 / speed  # metres -> miles -> minutes
one_clipped = one_clipped.assign(
    decay_weighted_opps=one_clipped['grid_code'] * 0.5 ** (travel_minutes / cutoff)
)

In [67]:
one_clipped

Unnamed: 0_level_0,grid_code,geometry,point_geom,distance,decay_weighted_opps
pointid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
299245,1.0,POINT (-328589.703 156058.796),POINT (-328589.7027530935 156058.7964954544),31085.725136,0.262143
281110,1.0,POINT (-328277.478 157935.423),POINT (-328277.47819791455 157935.42259707302),29199.009447,0.284334
281109,1.0,POINT (-328513.720 157944.936),POINT (-328513.71974135406 157944.93614463136),29198.055622,0.284346
269776,1.0,POINT (-327993.825 159104.515),POINT (-327993.8249839457 159104.51514123473),28022.478282,0.299113
256175,1.0,POINT (-327700.774 160509.093),POINT (-327700.7741634822 160509.09282183833),26612.702170,0.317838
...,...,...,...,...,...
122435,1.0,POINT (-324077.427 174277.317),POINT (-324077.4269945271 174277.31736346334),13249.580751,0.565151
88433,1.0,POINT (-323229.960 177776.695),POINT (-323229.9601252843 177776.69520912413),10206.092570,0.644308
29485,1.0,POINT (-324400.107 183943.345),POINT (-324400.10666450474 183943.34467456443),4326.733478,0.829980
4547,1.0,POINT (-324532.413 186536.371),POINT (-324532.4132057393 186536.3710456635),2865.933776,0.883878


In [72]:
# answer for one row...
one_sum = one_clipped.decay_weighted_opps.sum()
one_sum

57.5342901693008

## How long does it take?

In [60]:
# Estimated hours for the Central region: clip + distance + decay weighting.
central_hours = clip_hours + dist_hours + weight_hours

In [61]:
central_hours

41.89672028222222

In [62]:
# Share of all grid points that belong to the Central region. The other three
# regional frames are not loaded anywhere in this notebook (their read cell is
# commented out), so count their rows here, one file at a time, without keeping
# the frames in memory.
other_region_rows = 0
for region in ('NorCal', 'SoCal', 'Mojave'):
    other_region_rows += gpd.read_parquet(f'{crow_folder}{region}_POIs.parquet').shape[0]

central_proportion = central.shape[0] / (other_region_rows + central.shape[0])

In [63]:
# Scale the Central estimate up to all regions, assuming run time grows in
# proportion to the number of grid points.
total_hours = central_hours / central_proportion

In [64]:
total_hours

122.32880233777777

In [65]:
# Convert the overall estimate from hours to days.
total_days = total_hours / 24
total_days

5.097033430740741