In [None]:
# Some helper functions

import os
def set_secrets(con,
                key = os.getenv("AWS_ACCESS_KEY_ID", ""),
                secret = os.getenv("AWS_SECRET_ACCESS_KEY", ""),
                endpoint = os.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com"),
                bucket = '',
                url_style = "path",
                region = os.getenv("AWS_REGION",  "us-east-1"),
                use_ssl = os.getenv("AWS_HTTPS", "TRUE"),
                url_compatibility_mode = True,
                session_token = os.getenv("AWS_SESSION_TOKEN", ""),
                type = "S3",
               ):
    """Create (or replace) a DuckDB secret for S3-compatible storage on ``con``.

    The secret is named ``s3_<key>`` and is issued through ``con.raw_sql``.

    Note that the environment-derived defaults are read once, when this
    function is defined, not on each call.

    Parameters
    ----------
    con : connection object exposing ``raw_sql(str)``.
    key, secret : access key id and secret access key.
    endpoint : S3 endpoint host. If it contains ``amazonaws.com`` the URL
        style is forced to ``'vhost'`` regardless of ``url_style``.
    bucket : when non-empty, the secret is scoped to ``s3://<bucket>``.
    url_style : ``'path'`` or ``'vhost'`` (see ``endpoint``).
    region : region name.
    use_ssl : inserted unquoted as the ``USE_SSL`` value (e.g. ``TRUE``).
    url_compatibility_mode : inserted unquoted as ``URL_COMPATIBILITY_MODE``.
    session_token : temporary-credential session token; emitted as
        ``SESSION_TOKEN`` only when non-empty.
    type : secret type keyword, inserted unquoted after ``TYPE``.
    """

    def _sql_str(value):
        # Double single quotes so the value cannot terminate the SQL literal.
        return str(value).replace("'", "''")

    if 'amazonaws.com' in endpoint:
        url_style = 'vhost'

    options = [
        f"TYPE {type}",
        f"KEY_ID '{_sql_str(key)}'",
        f"SECRET '{_sql_str(secret)}'",
    ]
    # Previously the token was accepted but dropped, which breaks temporary
    # (STS-style) credentials that are only valid together with their token.
    if session_token:
        options.append(f"SESSION_TOKEN '{_sql_str(session_token)}'")
    options += [
        f"ENDPOINT '{_sql_str(endpoint)}'",
        f"REGION '{_sql_str(region)}'",
        f"URL_COMPATIBILITY_MODE {url_compatibility_mode}",
        f"USE_SSL {use_ssl}",
    ]
    if bucket != '':
        options.append(f"SCOPE 's3://{_sql_str(bucket)}'")
    options.append(f"URL_STYLE '{_sql_str(url_style)}'")

    body = ",\n        ".join(options)
    query = f'''
    CREATE OR REPLACE SECRET s3_{key} (
        {body}
    );
    '''
    con.raw_sql(query)

def install_h3():
    """Install the ``h3`` DuckDB extension from the community repository.

    Uses a throwaway DuckDB connection that is always closed, even when the
    installation fails (for example when there is no network access).
    """
    import duckdb
    db = duckdb.connect()
    try:
        db.install_extension("h3", repository = "community")
    finally:
        # Release the temporary connection whether or not the install succeeded.
        db.close()





import ibis
import ibis.expr.datatypes as dt
@ibis.udf.scalar.builtin
def ST_Multi (geom) -> dt.geometry:
    """Stub declaring the backend's ``ST_Multi`` function, typed as returning a geometry.

    The body is intentionally empty; the ``builtin`` decorator is what makes
    the name callable inside ibis expressions. ``geom_to_cell`` applies it to
    rows whose geometry type is ``POLYGON``.
    """
    ...
    
    
def geom_to_cell (df, zoom = 5):
    """Compute H3 cells at resolution ``zoom`` for the geometries in ``df``.

    ``df`` must be an ibis table with ``id``, ``region`` and ``geom`` columns
    (those are the names selected by the SQL below). The result is built with
    raw SQL on the table's own backend and has columns ``id``, ``region``,
    ``geom`` (one row per dumped geometry part) and ``h<zoom>`` (the output of
    ``h3_polygon_wkt_to_cells_string``; callers in this notebook unnest it).
    """
    backend = df.get_backend() # ibis >= 10.0

    # Normalise geometry types: wrap plain POLYGONs with ST_Multi so the
    # column does not mix polygons and multipolygons.
    as_multi = ibis.cases(
        (df.geom.geometry_type() == 'POLYGON' , ST_Multi(df.geom)),
        else_=df.geom,
    )

    # Compile the normalised table to SQL so it can be embedded as a subquery.
    inner_sql = ibis.to_sql(df.mutate(geom = as_multi))

    query = f'''
        WITH t1 AS (
        SELECT id, region, UNNEST(ST_Dump(ST_GeomFromWKB(geom))).geom AS geom 
        FROM ({inner_sql})
        ) 
        SELECT *, h3_polygon_wkt_to_cells_string(geom, {zoom}) AS h{zoom}  FROM t1
    '''

    return backend.sql(query)




@ibis.udf.scalar.builtin
def h3_cell_to_parent(cell, zoom: int) -> int:
    """Stub declaring the backend's ``h3_cell_to_parent(cell, zoom)`` function.

    The body is intentionally empty; the ``builtin`` decorator is what makes
    the name callable inside ibis expressions.

    NOTE(review): the return type is annotated ``int``, but this notebook
    passes string cell ids and the displayed ``h0`` values look like hex
    strings -- confirm whether the annotation should be ``str``.
    """
    ...



In [2]:
import ibis
from ibis import _

# Install the community h3 extension first: the connection below asks for
# "h3" at startup, so on a fresh machine it has to be installed already.
install_h3()

con = ibis.duckdb.connect(extensions = ["spatial", "h3"])
endpoint = os.getenv("AWS_S3_ENDPOINT", "minio.carlboettiger.info")

# Pass the resolved endpoint explicitly; otherwise set_secrets falls back to
# its own default endpoint and the MinIO fallback above is never used.
set_secrets(con, endpoint = endpoint)


In [None]:
# open an arbitrary area via overture maps.  

# Alternatively, we can read these from the official Overture S3 bucket, but our MinIO copy is faster:
# set_secrets(con, "", "", "s3.amazonaws.com", "overturemaps-us-west-2", 'vhost')
#overture = con.read_parquet('s3://overturemaps-us-west-2/release/2024-11-13.0/theme=divisions/type=division_area/*', filename=True, hive_partitioning=1)

# Overture region polygons, restricted to the United States.
area_of_interest = (
    con
    .read_parquet('s3://public-overturemaps/regions.parquet')
    .filter(_.country == "US")
)

# Preview the first rows.
area_of_interest.head().execute()




Unnamed: 0,id,geometry,bbox,country,version,sources,subtype,class,names,is_land,is_territorial,region,division_id,theme,type,primary,filename
0,c5a8ead2-6a3b-4186-b6df-12eac8d44f38,"POLYGON ((-85.08187 34.98666, -85.06497 34.986...","{'xmin': -90.31031036376953, 'xmax': -81.64721...",US,2,"[{'property': '', 'dataset': 'OpenStreetMap', ...",region,land,"{'primary': 'Tennessee', 'common': {'hy': 'Թեն...",True,True,US-TN,f5763fdc-5c84-473a-b852-c4986bbc4600,divisions,division_area,Tennessee,s3://public-overturemaps/regions.parquet
1,24b25f46-b60a-4df8-b9f3-0a583f9abf99,"MULTIPOLYGON (((-89.51484 36.49773, -89.51391 ...","{'xmin': -89.5715103149414, 'xmax': -81.964538...",US,2,"[{'property': '', 'dataset': 'OpenStreetMap', ...",region,land,"{'primary': 'Kentucky', 'common': {'hy': 'Կենտ...",True,True,US-KY,5d02d18d-7037-4f8f-920f-19288fa7c55e,divisions,division_area,Kentucky,s3://public-overturemaps/regions.parquet
2,0a45f157-a06f-4eb1-ab0c-71675ec2b68e,"MULTIPOLYGON (((-89.52954 30.19091, -89.52875 ...","{'xmin': -91.65502166748047, 'xmax': -88.09779...",US,2,"[{'property': '', 'dataset': 'OpenStreetMap', ...",region,land,"{'primary': 'Mississippi', 'common': {'hy': 'Մ...",True,False,US-MS,2462eb22-d8b1-4302-b031-b42b9af57a1a,divisions,division_area,Mississippi,s3://public-overturemaps/regions.parquet
3,c1136e11-530c-4057-895b-7ae0c1b349ac,"MULTIPOLYGON (((-85.25795 33.24107, -85.2815 3...","{'xmin': -88.47311401367188, 'xmax': -84.88827...",US,2,"[{'property': '', 'dataset': 'OpenStreetMap', ...",region,land,"{'primary': 'Alabama', 'common': {'hy': 'Ալաբա...",True,False,US-AL,76553122-6157-43cc-90f5-cc9aef992922,divisions,division_area,Alabama,s3://public-overturemaps/regions.parquet
4,1269d82a-28cf-436b-9e5c-e5baab7eb800,"MULTIPOLYGON (((-81.53353 30.71324, -81.53365 ...","{'xmin': -85.60517883300781, 'xmax': -80.84108...",US,2,"[{'property': '', 'dataset': 'OpenStreetMap', ...",region,land,"{'primary': 'Georgia', 'common': {'hy': 'Ջորջի...",True,False,US-GA,9bf5c55b-bf85-4884-88c5-0af917c46fa9,divisions,division_area,Georgia,s3://public-overturemaps/regions.parquet


In [None]:
# hex the area of interest:


zoom = 6
# geom_to_cell names its H3 column after the zoom level, so derive the column
# name from `zoom` instead of hard-coding "h6" a second time.
# Later cells still join on the literal "h6" column; update them as well if
# `zoom` is changed.
h_col = f"h{zoom}"

hexed_aoi = (
    geom_to_cell(area_of_interest.rename(geom = "geometry"), zoom)
    .mutate(**{h_col: _[h_col].unnest()})           # one row per H3 cell
    .mutate(h0 = h3_cell_to_parent(_[h_col], 0))    # resolution-0 parent cell
)

hexed_aoi.head().execute()



Unnamed: 0,id,region,geom,h6,h0
0,c5a8ead2-6a3b-4186-b6df-12eac8d44f38,US-TN,"POLYGON ((-85.08187 34.98666, -85.06497 34.986...",862648057ffffff,8027fffffffffff
1,c5a8ead2-6a3b-4186-b6df-12eac8d44f38,US-TN,"POLYGON ((-85.08187 34.98666, -85.06497 34.986...",8644c8c1fffffff,8045fffffffffff
2,c5a8ead2-6a3b-4186-b6df-12eac8d44f38,US-TN,"POLYGON ((-85.08187 34.98666, -85.06497 34.986...",86264dcdfffffff,8027fffffffffff
3,c5a8ead2-6a3b-4186-b6df-12eac8d44f38,US-TN,"POLYGON ((-85.08187 34.98666, -85.06497 34.986...",862649b97ffffff,8027fffffffffff
4,c5a8ead2-6a3b-4186-b6df-12eac8d44f38,US-TN,"POLYGON ((-85.08187 34.98666, -85.06497 34.986...",8644ca75fffffff,8045fffffffffff


In [37]:
# Open GBIF 


# Distinct resolution-0 parent cells of the area of interest, upper-cased to
# build the partition paths below.
h0 = (
    hexed_aoi
    .select(_.h0)
    .distinct()
    .mutate(h0 = _.h0.cast('string').upper())
    .execute()
    ["h0"]
)

# For efficiency, open only the partitions whose h0 occurs in the area of interest.
gbif = con.read_parquet('s3://public-gbif/hex/h0=' + h0 + "/*")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [38]:
# Show the GBIF table expression as this cell's output.
gbif

In [None]:
# Count the distinct taxonkey values recorded in each h6 cell of the area of
# interest, and preview the first few cells.
# NOTE(review): the join compares h6 strings exactly -- confirm both tables
# use the same letter case for cell ids.
(
    gbif
    .inner_join(hexed_aoi, "h6")
    .select('taxonkey', 'h6')
    .distinct()
    .group_by('h6')
    .agg(n = _.taxonkey.count())
    .head()
    .execute()
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))