In [1]:
from cng.utils import *
import ibis
from ibis import _
import pathlib

# Helper pulled in by the star import above; judging by its name it installs the
# DuckDB h3 extension so it can be loaded below -- TODO confirm in cng.utils.
duckdb_install_h3()

# On-disk DuckDB database opened through ibis with the extensions the later
# cells rely on (spatial functions and H3 indexing).
duckdb_extensions = ["spatial", "h3"]
con = ibis.duckdb.connect("duck.db", extensions=duckdb_extensions)

# Register object-store credentials on the connection (helper from the star import).
set_secrets(con)


In [None]:
def geom_to_h3(con, 
               taxa, 
               cols = "taxon_id, parent_taxon_id, name, rank, iconic_taxon_id, iconic_taxon_name",
               zoom = "4",
               gpkg = None):
    """Convert one GeoPackage of range polygons into H3 cells and write them as parquet.

    The GeoPackage is registered on ``con`` under the table name ``taxa``; every
    geometry is dumped into its parts, each part is converted to the H3 cells
    covering it at resolution ``zoom``, and the result is unnested to one row
    per cell before being written to ``s3://public-inat/hex/{taxa}.parquet``.

    Parameters
    ----------
    con
        Connection object providing ``read_geo`` and ``sql`` (the notebook
        passes its ibis DuckDB connection).
    taxa
        Name used both for the registered table and for the output file.
        It is interpolated into the SQL text, so it must be a trusted identifier.
    cols
        Comma-separated list of attribute columns carried through to the output.
    zoom
        H3 resolution; also used as the suffix of the output column (``h{zoom}``).
    gpkg
        Path to the GeoPackage file to read. When omitted, the function falls
        back to a notebook-level variable named ``gpkg``, which is what earlier
        revisions of this function silently depended on.

    Raises
    ------
    NameError
        If ``gpkg`` is not passed and no notebook-level ``gpkg`` exists.
    """
    if gpkg is None:
        # Backward compatibility: the original body read a global `gpkg` set by
        # the calling cell. Prefer passing it explicitly.
        try:
            gpkg = globals()["gpkg"]
        except KeyError:
            raise NameError(
                "geom_to_h3() needs the GeoPackage path: pass gpkg=... "
                "or define a notebook-level `gpkg` variable"
            ) from None

    con.read_geo(gpkg, taxa) # FIXME allow overwrite

    
    con.sql(f'''
      WITH t2 AS (
        WITH t1 AS (
          SELECT {cols},  ST_Dump(geom) AS geom 
          FROM {taxa}
        ) 
        SELECT {cols},
              h3_polygon_wkt_to_cells_string(UNNEST(geom).geom, {zoom}) AS h{zoom}
        FROM t1
      )
      SELECT {cols}, UNNEST(h{zoom}) AS h{zoom} FROM t2
      ''').to_parquet(f"s3://public-inat/hex/{taxa}.parquet")




In [None]:
## assumes files have all been downloaded from https://www.inaturalist.org/pages/range_maps
files = pathlib.Path('~/nvme/public-inat').expanduser().rglob('*.gpkg')

for f in files:
    # Table / output name: the file name without its extension and common prefix.
    taxa = f.name.removesuffix('.gpkg').replace('iNaturalist_geomodel_', '')
    # Read the file that rglob actually found. Rebuilding the path from `taxa`
    # breaks for files in sub-directories, files without the common prefix, or
    # a home directory other than /home/jovyan.
    gpkg = str(f)
    # NOTE(review): `taxon` is not read by any visible cell; kept in case a
    # later cell depends on it.
    taxon = taxa
    con.read_geo(gpkg, taxa).to_parquet(f"s3://public-inat/polygon/{taxa}.parquet")


In [None]:

files = pathlib.Path('~/nvme/public-inat').expanduser().rglob('*.gpkg')

for f in files:
    # Table / output name: the file name without its extension and common prefix.
    taxa = f.name.removesuffix('.gpkg').replace('iNaturalist_geomodel_', '')
    # Point at the file that rglob actually found instead of rebuilding a
    # hard-coded /home/jovyan path, which is wrong for nested or unprefixed files.
    # geom_to_h3(con, taxa) picks up this notebook-level `gpkg` when no path is
    # passed to it, so the assignment must stay before the call.
    gpkg = str(f)
    # NOTE(review): `taxon` is not read by any visible cell; kept in case a
    # later cell depends on it.
    taxon = taxa
    geom_to_h3(con, taxa)

In [2]:
# Preview the first rows across all per-taxon H3 range-map files.
hex_preview = con.read_parquet("s3://public-inat/hex/**").head()
hex_preview.execute()

Unnamed: 0,taxon_id,parent_taxon_id,name,rank,iconic_taxon_id,iconic_taxon_name,h4
0,47174,47175,Nelusetta ayraud,species,47178,Actinopterygii,84a700dffffffff
1,47174,47175,Nelusetta ayraud,species,47178,Actinopterygii,84a7047ffffffff
2,47174,47175,Nelusetta ayraud,species,47178,Actinopterygii,84a7041ffffffff
3,47174,47175,Nelusetta ayraud,species,47178,Actinopterygii,84a7007ffffffff
4,47174,47175,Nelusetta ayraud,species,47178,Actinopterygii,84c9931ffffffff


In [None]:
from cng.h3 import *

# DuckDB resource settings for the repartitioning job below.
# NOTE(review): a single thread is presumably used to limit the number of
# output shards / peak memory -- confirm before changing.
con.raw_sql("SET memory_limit = '100GB';")
con.raw_sql("SET threads TO 1;")


# Attach the parent cell at resolutions 0-3 to every h4 cell, then write the
# table partitioned by the resolution-0 parent.
(con.read_parquet("s3://public-inat/hex/**")
.mutate(
    h0 = h3_cell_to_parent(_.h4, 0),
    h1 = h3_cell_to_parent(_.h4, 1),
    h2 = h3_cell_to_parent(_.h4, 2),
    h3 = h3_cell_to_parent(_.h4, 3),
)
.to_parquet("s3://public-inat/tmp/", partition_by = "h0", per_thread_output=False)
)


# To avoid shards we have to round-trip this again to its final location!
# The partitioned output is a tree of files, so it is read back with a
# recursive glob, the same way every other read in this notebook is written.
(con
.read_parquet("s3://public-inat/tmp/**")
.to_parquet("s3://public-inat/range-maps/hex",
            partition_by = "h0", per_thread_output=False)
)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
# Total number of rows in the partitioned range-map hex dataset.
range_map_hexes = con.read_parquet("s3://public-inat/range-maps/hex/**")
range_map_hexes.count().execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

377964016

In [None]:
@ibis.udf.scalar.builtin
def h3_cell_to_children(cell, zoom: int) -> list[int]:
    """Signature-only declaration of a backend built-in scalar function.

    The decorator registers this as a builtin UDF, so the Python body is never
    the implementation; only the name and annotations matter.

    cell: the H3 cell to expand (left unannotated here).
    zoom: integer resolution argument passed to the backend function.

    NOTE(review): the return annotation is a list of integers, while the h4
    column shown elsewhere in this notebook holds string cell ids -- confirm the
    annotation matches the cell type this is actually called with.
    """


In [1]:
import duckdb

# Plain duckdb connection to the same database file (no ibis wrapper).
con = duckdb.connect("duck.db")

# One-off session setup: single thread, an S3 secret pointing at the MinIO
# endpoint (key and secret left empty), and the h3 extension.
setup_sql = '''
SET threads TO 1;
CREATE SECRET my_secret (
    TYPE s3,
    KEY_ID '',
    SECRET '',
    ENDPOINT 'minio.carlboettiger.info',
    URL_STYLE 'path');
INSTALL h3;
LOAD h3;
'''
con.execute(setup_sql)

## this creates shards
# Add the resolution-0 parent of each h4 cell and write a local parquet
# dataset under tmp/, partitioned by that parent.
shard_copy_sql = """
    COPY (
        SELECT *, 
               h3_cell_to_parent(h4, 0) AS h0
        FROM read_parquet('s3://public-inat/hex/**')
    ) TO 'tmp/' (FORMAT PARQUET, PARTITION_BY h0, OVERWRITE_OR_IGNORE)
"""
con.execute(shard_copy_sql)



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Error: KeyboardInterrupt: <EMPTY MESSAGE>

At:
  /opt/conda/lib/python3.12/site-packages/traitlets/traitlets.py(708): __set__
  /tmp/ipykernel_409338/2197136462.py(15): <module>
  /opt/conda/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3699): run_code
  /opt/conda/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3639): run_ast_nodes
  /opt/conda/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3394): run_cell_async
  /opt/conda/lib/python3.12/site-packages/IPython/core/async_helpers.py(128): _pseudo_sync_runner
  /opt/conda/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3171): _run_cell
  /opt/conda/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3116): run_cell
  /opt/conda/lib/python3.12/site-packages/ipykernel/zmqshell.py(577): run_cell
  /opt/conda/lib/python3.12/site-packages/ipykernel/ipkernel.py(455): do_execute
  /opt/conda/lib/python3.12/site-packages/ipykernel/kernelbase.py(767): execute_request
  /opt/conda/lib/python3.12/site-packages/ipykernel/ipkernel.py(368): execute_request
  /opt/conda/lib/python3.12/site-packages/ipykernel/kernelbase.py(400): dispatch_shell
  /opt/conda/lib/python3.12/site-packages/ipykernel/kernelbase.py(508): process_one
  /opt/conda/lib/python3.12/site-packages/ipykernel/kernelbase.py(519): dispatch_queue
  /opt/conda/lib/python3.12/asyncio/events.py(88): _run
  /opt/conda/lib/python3.12/asyncio/base_events.py(1999): _run_once
  /opt/conda/lib/python3.12/asyncio/base_events.py(645): run_forever
  /opt/conda/lib/python3.12/site-packages/tornado/platform/asyncio.py(211): start
  /opt/conda/lib/python3.12/site-packages/ipykernel/kernelapp.py(739): start
  /opt/conda/lib/python3.12/site-packages/traitlets/config/application.py(1075): launch_instance
  /opt/conda/lib/python3.12/site-packages/ipykernel_launcher.py(18): <module>
  <frozen runpy>(88): _run_code
  <frozen runpy>(198): _run_module_as_main


In [None]:
# To avoid shards we have to round-trip this again to its final location!
# Uses `con`, the raw duckdb connection opened in the previous cell; the name
# `duck_con` used here before is never defined in this notebook.
con.execute("""
    COPY (
        SELECT * FROM read_parquet('tmp/**')
    ) TO 'hex/' (FORMAT PARQUET, PARTITION_BY h0, OVERWRITE_OR_IGNORE)
""")