In [1]:
# Some helper functions
import ibis
from ibis import _

import os
def set_secrets(con,
                key = os.getenv("AWS_ACCESS_KEY_ID", ""), 
                secret = os.getenv("AWS_SECRET_ACCESS_KEY", ""), 
                endpoint = os.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com"),
                bucket = '',
                url_style = "path",
                region = os.getenv("AWS_REGION",  "us-east-1"),
                use_ssl = os.getenv("AWS_HTTPS", "TRUE"),
                url_compatibility_mode = True,
                session_token = os.getenv("AWS_SESSION_TOKEN", ""),
                type = "S3",
               ):
    """Create (or replace) a DuckDB secret holding object-store credentials.

    The statement is sent through ``con.raw_sql``. Defaults that come from
    environment variables are read once, when this function is defined.

    Parameters
    ----------
    con : object exposing ``raw_sql(str)`` (e.g. an ibis DuckDB connection).
    key, secret : access key id and secret key; empty strings are passed through as-is.
    endpoint : host of the S3-compatible service.
    bucket : optional bucket name; when given, the secret is scoped to ``s3://<bucket>``.
    url_style : ``"path"`` or ``"vhost"``; forced to ``"vhost"`` when the endpoint
        contains ``amazonaws.com``.
    region : region name.
    use_ssl : inserted verbatim as the ``USE_SSL`` value (e.g. ``"TRUE"``).
    url_compatibility_mode : inserted verbatim as the ``URL_COMPATIBILITY_MODE`` value.
    session_token : optional temporary-credential token; added as ``SESSION_TOKEN``
        only when non-empty.
    type : secret type keyword placed after ``TYPE`` (default ``"S3"``).

    Returns
    -------
    None
    """

    def _quote(value):
        # Render a SQL string literal, doubling embedded single quotes so that
        # credentials containing ' cannot break out of the literal.
        return "'" + str(value).replace("'", "''") + "'"

    if 'amazonaws.com' in endpoint:
        url_style = 'vhost'

    # The secret name is an unquoted identifier: keep ASCII letters, digits and
    # underscores from the key and replace anything else with '_'.
    suffix = ''.join(
        ch if (ch.isascii() and ch.isalnum()) or ch == '_' else '_'
        for ch in str(key)
    )

    options = [
        f"TYPE {type}",
        f"KEY_ID {_quote(key)}",
        f"SECRET {_quote(secret)}",
        f"ENDPOINT {_quote(endpoint)}",
        f"REGION {_quote(region)}",
        f"URL_COMPATIBILITY_MODE {url_compatibility_mode}",
        f"USE_SSL {use_ssl}",
    ]
    if session_token:
        # Temporary credentials are only valid together with their session token.
        options.append(f"SESSION_TOKEN {_quote(session_token)}")
    if bucket:
        options.append(f"SCOPE {_quote(f's3://{bucket}')}")
    options.append(f"URL_STYLE {_quote(url_style)}")

    body = ",\n        ".join(options)
    query = f'''
    CREATE OR REPLACE SECRET s3_{suffix} (
        {body}
    );
    '''
    con.raw_sql(query)

def install_h3():
    """Install the ``h3`` DuckDB extension from the community repository.

    Opens a separate, default (no-argument) DuckDB connection just for the
    install and always closes it, even when the install raises.
    """
    import duckdb
    db = duckdb.connect()
    try:
        db.install_extension("h3", repository = "community")
    finally:
        # Release the throwaway connection on both success and failure.
        db.close()





import ibis.expr.datatypes as dt
@ibis.udf.scalar.builtin
def ST_Multi (geom) -> dt.geometry:
    """Declare the backend's ``ST_Multi`` SQL function for use in ibis expressions.

    This is only a signature stub: the ``builtin`` decorator makes calls resolve
    to a backend function with this name, so the ``...`` body is never the
    implementation. The result is typed as a geometry.

    In this file it is applied to POLYGON geometries so that a column does not
    mix polygons and multipolygons.
    """
    ...
    
    
def geom_to_cell(df, zoom = 5):
    """Attach H3 cells at resolution ``zoom`` to every polygon part of ``df``.

    Parameters
    ----------
    df : ibis table with at least the columns ``id``, ``region`` and ``geom``.
    zoom : value interpolated into the SQL both as the H3 resolution and as the
        suffix of the output column name (``h<zoom>``).

    Returns
    -------
    The backend's ``sql(...)`` result with columns ``id``, ``region``, ``geom``
    (one row per dumped geometry part) and ``h<zoom>`` as produced by
    ``h3_polygon_wkt_to_cells_string``.
    """
    backend = df.get_backend() # ibis >= 10.0

    # Promote plain polygons to multipolygons so the column is not a mix of both.
    is_polygon = df.geom.geometry_type() == 'POLYGON'
    as_multi = ibis.cases((is_polygon, ST_Multi(df.geom)), else_=df.geom)

    # Compile the normalised table to SQL so it can be embedded as a subquery.
    inner_sql = ibis.to_sql(df.mutate(geom = as_multi))

    query = f'''
        WITH t1 AS (
        SELECT id, region, UNNEST(ST_Dump(ST_GeomFromWKB(geom))).geom AS geom 
        FROM ({inner_sql})
        ) 
        SELECT *, h3_polygon_wkt_to_cells_string(geom, {zoom}) AS h{zoom}  FROM t1
    '''
    return backend.sql(query)




@ibis.udf.scalar.builtin
def h3_cell_to_parent(cell, zoom: int) -> int:
    """Declare the backend's ``h3_cell_to_parent`` SQL function for ibis expressions.

    This is only a signature stub: the ``builtin`` decorator makes calls resolve
    to a backend function with this name, so the ``...`` body is never the
    implementation.

    Parameters
    ----------
    cell : H3 cell expression.
    zoom : target (coarser) resolution, declared as an integer.

    NOTE(review): the return type is declared as ``int``, but this notebook
    passes cells that come from ``h3_polygon_wkt_to_cells_string`` and later
    casts the result to string. Confirm the declared type matches what the
    backend returns for string cells.
    """
    ...



In [2]:

# h3 is installed from DuckDB's community repository, so install it before
# opening a connection that asks to load it by name; otherwise a fresh
# environment fails at connect time.
install_h3()
con = ibis.duckdb.connect(extensions = ["spatial", "h3"])

# Use our MinIO host unless AWS_S3_ENDPOINT overrides it, and pass the value on
# so the secret actually targets that endpoint instead of set_secrets' own default.
endpoint = os.getenv("AWS_S3_ENDPOINT", "minio.carlboettiger.info")
set_secrets(con, endpoint = endpoint)


In [3]:
# Open an arbitrary area of interest from Overture Maps.

# Alternatively, these can be read from the official S3 bucket, but our MinIO copy is faster:
# set_secrets(con, "", "", "s3.amazonaws.com", "overturemaps-us-west-2", 'vhost')
#overture = con.read_parquet('s3://overturemaps-us-west-2/release/2024-11-13.0/theme=divisions/type=division_area/*', filename=True, hive_partitioning=1)

area_of_interest = (
    con
    .read_parquet('s3://public-overturemaps/regions.parquet')
    .filter(_.country == "US")
)


In [4]:
# Hex the area of interest. IMPORTANT: the zoom (H3 resolution) may be varied here.
# The hex column name is derived from `zoom` so this cell keeps working when it changes;
# any later cell that names the hex column literally must match this zoom.

zoom = 6
hex_col = f"h{zoom}"  # geom_to_cell names its cell column h<zoom>

hexed_aoi = (
    geom_to_cell(area_of_interest.rename(geom = "geometry"), zoom)
    # One row per cell instead of one array of cells per polygon.
    .mutate(**{hex_col: _[hex_col].unnest()})
    # Resolution-0 parent of each cell.
    .mutate(h0 = h3_cell_to_parent(_[hex_col], 0))
)



# peek at hexed area of interest data
# hexed_aoi.head().execute()



In [5]:
# Open GBIF.
# For efficiency, open only the h0 partitions that occur in the area of interest
# (filtering afterwards is probably not much slower, given the hive partitioning).

h0 = (
    hexed_aoi
    .select(_.h0)
    .distinct()
    .mutate(h0 = _.h0.cast('string').upper())
    .execute()["h0"]
)

# `h0` is the executed column, so the concatenation below builds one glob per partition.
gbif = con.read_parquet('s3://public-gbif/hex/h0=' + h0 + "/*")


In [None]:
%%time

gbif.inner_join(hexed_aoi, "h6").select('taxonkey', 'h6').distinct().group_by('h6').agg(n = _.taxonkey.count()).to_parquet("example.parquet")