# Using Source.Coop to add value to a half-terabyte public dataset


_The magic of object stores_



## 1. Stream data _from_ source.coop

## 2. Stream data _to_ source.coop


In [2]:
import ibis
from ibis import _
import pydeck as pdk
from boilerplate import *



First I computed **[H3 hierarchical spatial indexes](https://h3geo.org)** using [duckdb](https://github.com/boettiger-lab/gbif-maps/blob/main/gbif_as_h3.py) for the Global Biodiversity Information Facility (GBIF) data and uploaded the result to [source.coop/cboettig/gbif](https://source.coop/cboettig/gbif/).   (***> 24 hrs, 430 GB, 3 billion+ observations***)

Now let's compute over _all the data in seconds_.


In [3]:
# Open a DuckDB connection through ibis, requesting the httpfs, spatial and
# h3 extensions at connect time.
con = ibis.duckdb.connect(
    extensions=["httpfs", "spatial", "h3"],
)

# s3 credentials (set_secrets is presumably supplied by the boilerplate
# star-import -- confirm there for what exactly it configures).
set_secrets(con)


In [6]:
%%time

dest = "csv/gbif_demo.csv"
(con
  .read_parquet("s3://cboettig/gbif/2024-10-01/**")
  .filter(_["class"].isin(["Insecta"]))
  .rename(hex = "h5")                   # h3 == 41,150 hexes.  h5 == 2,016,830 hexes	
  .group_by(_.hex)
  .agg(n = _.count())
  .mutate(logn = _.n.log())
  .mutate(value = (255 * _.logn / _.logn.max()).cast("int")) # normalized color-scale
  .to_csv("s3://cboettig/gbif/" + dest)
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

CPU times: user 2min 8s, sys: 18.4 s, total: 2min 26s
Wall time: 11.8 s


# Cloud-native visualization

## from source.coop

Use deck.gl, where the data layer is passed as a URL and is only read client-side.

In [7]:
# Public address of the aggregated CSV: base_url, then the bucket path, then
# the object key held in dest.
url = f"{base_url}/cboettig/gbif/{dest}"

# Render with deck-gl: the layer is handed only the URL, not the data itself.
layer = HexagonLayer(url)
deck = DeckGlobe(layer)

# Kept as the last expression of the cell so that whatever to_html returns
# is still shown as the cell output.
deck.to_html("globe.html")

In [None]:
# Post the map itself to source
client = s3_client()

# Upload the local globe.html to bucket "cboettig" under gbif/maps/.
# The content type is set explicitly: assuming s3_client() returns a MinIO
# client, fput_object otherwise stores the object as application/octet-stream,
# and browsers then download the file instead of rendering the map.
client.fput_object(
    "cboettig",
    "gbif/maps/globe.html",
    "globe.html",
    content_type="text/html",
)


---


# Zooming in

In [None]:
## grab polygon of a National park:
# The NPS geodatabase is read over HTTPS through the /vsicurl/ path prefix.
nps_gdb = "/vsicurl/https://huggingface.co/datasets/cboettig/biodiversity/resolve/main/data/NPS.gdb"

# Keep only the Yosemite unit, convert its SHAPE column from EPSG:3857 to
# EPSG:4326, and execute the expression to get a local result.
polygon = (
    con.read_geo(nps_gdb)
    .filter(_["UNIT_NAME"] == "Yosemite National Park")
    .mutate(SHAPE=_["SHAPE"].convert("EPSG:3857", "EPSG:4326"))
    .execute()
)

# Wrap the first geometry of the result as an ibis literal so it can be used
# inside later ibis expressions.
poly_expr = ibis.literal(polygon.geometry.iloc[0])

In [None]:
%%time
subset = (con
  .read_parquet("s3://cboettig/gbif/2024-10-01/**")
  .filter( _.geom.within(poly_expr))
  .filter(_["genus"] == "Ursus") 
 # .filter(_["species"] == "Marmota flaviventris")
  .rename(hex = "h9")
  .group_by(_.hex)
  .agg(n = _.count())
  .mutate(logn = _.n.log())
  .mutate(value = (255 * _.logn / _.logn.max()).cast("int"))
)

# .to_json() doesn't exist in ibis, use SQL
query = ibis.to_sql(subset)
con.raw_sql(f"COPY ({query}) TO 's3://cboettig/gbif/json/example.json' (FORMAT JSON, ARRAY true);")



# Cloud-native map layers with leafmap



In [None]:
import leafmap.maplibregl as leafmap

# Public address of the per-hexagon JSON written to json/example.json.
url2 = f"{base_url}/cboettig/gbif/json/example.json"

# Tilted, rotated view; terrain_style is expected to already be in scope
# (presumably from boilerplate).
m = leafmap.Map(
    style=terrain_style,
    center=[-120, 37.6],
    zoom=9,
    pitch=55,
    bearing=20,
)

# The second positional argument (0.5) is passed through to HexagonLayer
# unchanged; see its definition for what it controls.
layer = HexagonLayer(url2, 0.5)
m.add_deck_layers([layer])

# Save a standalone copy of the map, then display it as the cell output.
m.to_html("yosemite_bears.html", overwrite=True)
m