In [1]:
%load_ext memory_profiler

## Run the benchmark with `mprof`

Run the following from the repository root (base) folder:

```bash
mprof run python extract_point_from_raster_buffer.py -f srg-dev/test-data/pop_density/pop_density/*.tif -g srg-dev/test-data/1000_testing_points.rds
```

## Run all the cells below to record peak memory and time

In [2]:
import numpy as np
import pandas as pd
# import geopandas as gpd
import rioxarray as riox
import dask.dataframe as dd

from pyproj import Transformer
from shapely.geometry import mapping, Point

# Wall-clock start marker; a later cell subtracts it from the current time to
# report the notebook's total running time.
start = pd.Timestamp('now')
# Radius handed to `Point.buffer` for every sample point.
# NOTE(review): presumably metres in the raster's projected CRS — confirm.
buffer_value = 10_000

## Run the analysis in a scalable way

Load the raster (band 1) and mask its nodata cells, which become NaN.

In [3]:
# Open the GeoTIFF through rioxarray and keep only band 1 of it.
myraster = riox.open_rasterio('test-data/apg18e_APPMA_NSW.tif').sel(band=1)

In [4]:
# Keep only cells that differ from the raster's declared nodata value; the
# others are masked, and `drop=True` trims coordinate labels that are left
# with no valid cell at all.
myraster = myraster.where(
    cond=myraster != myraster.rio.nodata,
    drop=True,
)

In [5]:
# Display the raster's repr to inspect it after the nodata masking.
myraster

In [6]:
# Total number of raster cells (rows x columns), shown with thousands separators.
f"{myraster.data.shape[0] * myraster.data.shape[1]:,}"

'1,280,566'

Load points for data extraction and create buffers

In [7]:
# Coordinate transformer from EPSG:3577 (the CRS the test points are read in)
# to whatever CRS the raster uses; always_xy=True fixes the axis order to (x, y).
transformer = Transformer.from_crs(
    crs_from="EPSG:3577",
    crs_to=myraster.rio.crs,
    always_xy=True,
)

In [8]:
%%time
points = (
#     pd.read_csv('test-data/1000_testing_points.csv')
    pd.read_csv('test-data/100k_testing_points.csv')
    .rename(columns={'X': 'x', 'Y': 'y'})
    .assign(
        lat_lon_tuple = lambda columns: columns[['x', 'y']].apply(lambda row: transformer.transform(row['x'], row['y']), axis=1),
        lat = lambda columns: columns['lat_lon_tuple'].apply(lambda el: el[0]),
        lon = lambda columns: columns['lat_lon_tuple'].apply(lambda el: el[1]),
        points = lambda columns: columns['lat_lon_tuple'].apply(Point),
        points_buffer = lambda columns: columns['points'].apply(lambda x: x.buffer(buffer_value))
    )
)
points = dd.from_pandas(points, npartitions=24)# assign partitions equal to 2 x Nr logical cores in my machine

CPU times: user 1min 10s, sys: 1.34 s, total: 1min 11s
Wall time: 1min 11s


Memory of dataframe in MB

In [9]:
def extract_mean_from_buffer(raster, geom):
    """Mean raster value sampled at the vertices of a polygon's exterior ring.

    For every vertex of ``geom.exterior`` the nearest raster cell (nearest ``x``
    label and nearest ``y`` label) is looked up, and the mean of those cell
    values is returned. Only the ring's vertices are sampled, not the cells
    inside the polygon.

    Parameters
    ----------
    raster : 2-D labelled array with ``x`` and ``y`` dimensions
        Must expose ``indexes["x"]`` / ``indexes["y"]`` (pandas indexes),
        ``transpose`` and ``data``, as an xarray ``DataArray`` does.
        NOTE(review): assumes ``data`` is an in-memory NumPy array (not a
        chunked dask array) — confirm if the raster is ever opened with chunks.
    geom : polygon-like object
        Must expose ``exterior.coords`` as an iterable of ``(x, y, ...)`` tuples.

    Returns
    -------
    NumPy floating scalar: the mean of the sampled cells. It is NaN if any
    sampled cell is NaN, because a plain (not NaN-aware) mean is used.
    """
    ring = geom.exterior.coords
    xs = [vertex[0] for vertex in ring]
    ys = [vertex[1] for vertex in ring]

    # Nearest-label lookup done independently on each axis index.
    col_idx = raster.indexes["x"].get_indexer(xs, method="nearest")
    row_idx = raster.indexes["y"].get_indexer(ys, method="nearest")

    # Pointwise (paired) indexing: one cell per vertex. Selecting with two plain
    # lists via `raster.sel(x=[...], y=[...])` would instead build the full
    # N x N outer-product block only to keep its diagonal.
    cells = raster.transpose("y", "x").data
    return cells[row_idx, col_idx].mean()

In [10]:
# %%memit
# Add the per-buffer raster mean as a new (still lazy) dask column, converting
# the stored values with the raster's `scale_factor` and `add_offset` attributes.
points = points.assign(
    extracted_mean=points['points_buffer'].apply(
        lambda buffer_geom: extract_mean_from_buffer(myraster, buffer_geom),
        meta=float
    ) * myraster.attrs['scale_factor'] + myraster.attrs['add_offset']
)

In [11]:
# Execute the lazy dask graph using the multiprocessing scheduler. The name
# `points` is rebound to the computed result, so this cell is not meant to be
# re-run on its own without re-running the cells that build the dask frame.
points = points.compute(scheduler="processes")

In [12]:
# Report the wall-clock time elapsed since `start` was recorded.
print("running time: {}".format(pd.Timestamp('now') - start))

running time: 0 days 00:05:53.784430


In [13]:
# Show a random sample of the results; the fixed seed makes the displayed rows
# identical on every Restart & Run All.
points.sample(n=20, random_state=42)

Unnamed: 0,x,y,lat_lon_tuple,lat,lon,points,points_buffer,extracted_mean
279640,1298630.0,-3187270.0,"(1298630.2040837258, -3187270.093680637)",1298630.0,-3187270.0,POINT (1298630.204083726 -3187270.093680637),POLYGON ((1308630.204083726 -3187270.093680637...,0.0
864829,1426953.0,-3193294.0,"(1426952.9356282484, -3193294.1014502207)",1426953.0,-3193294.0,POINT (1426952.935628248 -3193294.101450221),POLYGON ((1436952.935628248 -3193294.101450221...,0.0
616146,1969909.0,-3190680.0,"(1969909.2585584368, -3190679.909399269)",1969909.0,-3190680.0,POINT (1969909.258558437 -3190679.909399269),POLYGON ((1979909.258558437 -3190679.909399269...,0.809785
534302,1435250.0,-3189884.0,"(1435250.1538769198, -3189884.2857315885)",1435250.0,-3189884.0,POINT (1435250.15387692 -3189884.285731588),"POLYGON ((1445250.15387692 -3189884.285731588,...",0.0
478011,1299880.0,-3189316.0,"(1299880.4698472242, -3189315.9831118165)",1299880.0,-3189316.0,POINT (1299880.469847224 -3189315.983111816),POLYGON ((1309880.469847224 -3189315.983111816...,0.021631
301499,1278058.0,-3187497.0,"(1278057.6492479788, -3187497.414728546)",1278058.0,-3187497.0,POINT (1278057.649247979 -3187497.414728546),POLYGON ((1288057.649247979 -3187497.414728546...,0.010815
965023,1542205.0,-3194317.0,"(1542204.7069180142, -3194317.0461658104)",1542205.0,-3194317.0,POINT (1542204.706918014 -3194317.04616581),"POLYGON ((1552204.706918014 -3194317.04616581,...",0.033692
866939,1666777.0,-3193294.0,"(1666776.6411720412, -3193294.1014502207)",1666777.0,-3193294.0,POINT (1666776.641172041 -3193294.101450221),POLYGON ((1676776.641172041 -3193294.101450221...,0.050077
23620,1007659.0,-3184656.0,"(1007659.262760451, -3184655.901629686)",1007659.0,-3184656.0,POINT (1007659.262760451 -3184655.901629686),POLYGON ((1017659.262760451 -3184655.901629686...,0.0
277107,1010728.0,-3187270.0,"(1010728.0969072198, -3187270.093680637)",1010728.0,-3187270.0,POINT (1010728.09690722 -3187270.093680637),"POLYGON ((1020728.09690722 -3187270.093680637,...",0.0
