In [1]:
%load_ext memory_profiler

In [2]:
import pandas as pd
import rioxarray as riox

from pyproj import Transformer
from shapely.geometry import mapping, Point

## Run the analysis in a scalable way

Load the raster (band 1 only) and replace nodata cells with 0.

In [3]:
# Open the impervious-surface GeoTIFF and keep only band 1.
myraster = riox.open_rasterio(
    'test-data/Impervious_Surface_NOAA_Satellite_2010/Impervious_Surface_NOAA_Satellite_2010/data_provided/impsa_2010_20210519.tif'
).sel(band=1)

In [4]:
# Keep every cell that differs from the raster's declared nodata value;
# cells equal to it are replaced with 0.
myraster = myraster.where(
    cond=myraster != myraster.rio.nodata,
    other=0,
)

Load the points for data extraction, reproject them from EPSG:3577 to the raster CRS, and create buffers.

In [5]:
# Reprojects point coordinates from EPSG:3577 into the raster's CRS.
# always_xy=True fixes the axis order to (x, y) on both input and output.
transformer = Transformer.from_crs(
    crs_from="EPSG:3577",
    crs_to=myraster.rio.crs,
    always_xy=True,
)

In [6]:
%%time
points = (
    pd.read_csv('test-data/one-million.csv')
    .rename(columns={'X': 'x', 'Y': 'y'})
    .assign(
        lat_lon_tuple = lambda columns: columns[['x', 'y']].apply(lambda row: transformer.transform(row['x'], row['y']), axis=1),
        lat = lambda columns: columns['lat_lon_tuple'].apply(lambda el: el[0]),
        lon = lambda columns: columns['lat_lon_tuple'].apply(lambda el: el[1]),
        points = lambda columns: columns['lat_lon_tuple'].apply(Point),
        points_buffer = lambda columns: columns['points'].apply(lambda x: x.buffer(10))
    )
)

CPU times: user 1min 13s, sys: 876 ms, total: 1min 14s
Wall time: 1min 14s


In [7]:
# Peek at the first rows to sanity-check the reprojected coordinates and buffers.
points.head()

Unnamed: 0,x,y,lat_lon_tuple,lat,lon,points,points_buffer
0,828075.634912,-3184429.0,"(140.6104924557582, -29.04751940499192)",140.610492,-29.047519,POINT (140.6104924557582 -29.04751940499192),POLYGON ((150.6104924557582 -29.04751940499192...
1,828189.295436,-3184429.0,"(140.61167074356007, -29.04745115348828)",140.611671,-29.047451,POINT (140.6116707435601 -29.04745115348828),POLYGON ((150.6116707435601 -29.04745115348828...
2,828302.95596,-3184429.0,"(140.61284902989465, -29.047382892661382)",140.612849,-29.047383,POINT (140.6128490298946 -29.04738289266138),POLYGON ((150.6128490298946 -29.04738289266138...
3,828416.616484,-3184429.0,"(140.61402731476173, -29.047314622511333)",140.614027,-29.047315,POINT (140.6140273147617 -29.04731462251133),POLYGON ((150.6140273147617 -29.04731462251133...
4,828530.277008,-3184429.0,"(140.61520559816114, -29.04724634303804)",140.615206,-29.047246,POINT (140.6152055981611 -29.04724634303804),POLYGON ((150.6152055981611 -29.04724634303804...


Memory of dataframe in MB

In [8]:
# Bytes -> MiB (divide by 1024**2). deep=True asks pandas to measure the Python
# objects stored in object-dtype columns instead of only the pointers to them.
# NOTE(review): this presumably does not include memory held natively by the
# geometry objects -- treat the figure as a lower bound; confirm.
points.memory_usage(deep=True).sum() / 1024**2

152.5880126953125

In [9]:
def extract_mean_from_buffer(raster, geom):
    """Mean of the raster values sampled at the vertices of a polygon's exterior ring.

    Each vertex of ``geom.exterior`` is looked up in ``raster`` with
    nearest-neighbour selection on the ``x``/``y`` coordinates, and the sampled
    values are averaged. Only the ring vertices are sampled; cells in the
    interior of the polygon are not.

    Parameters
    ----------
    raster : object exposing ``sel(x=..., y=..., method="nearest")`` whose
        result has ``.item()`` (the notebook passes the single-band raster).
    geom : polygon-like object exposing ``exterior.coords``.

    Returns
    -------
    float
        Unweighted mean of the sampled values; NaN if the ring has no vertices.
    """
    ring = list(geom.exterior.coords)

    # A closed ring repeats its first vertex as its last one. Drop the repeat so
    # that vertex is not counted twice in the mean.
    if len(ring) > 1 and tuple(ring[0]) == tuple(ring[-1]):
        ring = ring[:-1]

    samples = [
        raster.sel(x=vertex[0], y=vertex[1], method="nearest").item()
        for vertex in ring
    ]
    # Plain NumPy mean: NaN samples propagate instead of being skipped.
    return pd.Series(samples, dtype="float64").to_numpy().mean()

In [None]:
%%time
%%memit
points['extracted_mean'] = points['points_buffer'].apply(lambda x: extract_mean_from_buffer(myraster, x)) * myraster.attrs['scale_factor'] + myraster.attrs['add_offset']

In [None]:
# Fixed seed so the spot-check shows the same rows on every Restart & Run All.
points.sample(20, random_state=0)

In [None]:
# (rows, columns) of the points frame after the extracted_mean column was added.
points.shape