# Using Source.Coop to add value to a half-terabyte public dataset


_The magic of object stores_



## 1. Stream data _from_ source.coop

## 2. Stream data _to_ source.coop


In [1]:
import ibis
from ibis import _
import pydeck as pdk
from boilerplate import *



First I computed **[H3 hierarchical spatial indexes](https://h3geo.org)** using [duckdb](https://github.com/boettiger-lab/gbif-maps/blob/main/gbif_as_h3.py) for the Global Biodiversity Information Facility (GBIF) and uploaded them to [source.coop/cboettig/gbif](https://source.coop/cboettig/gbif/).   (***> 24 hrs, 430 GB, 3 billion+ observations***)

Now let's compute over _all the data in seconds_.


In [2]:
# Open a DuckDB-backed ibis connection, requesting the extensions used below:
# httpfs (remote object-store reads/writes), spatial (geometry ops), h3 (hex indexes).
duckdb_extensions = ['httpfs', 'spatial', 'h3']
con = ibis.duckdb.connect(extensions=duckdb_extensions)

# s3 credentials -- `set_secrets` is presumably provided by the `boilerplate` star import
set_secrets(con)


In [3]:
%%time

# Object key (appended to the s3://cboettig/gbif/ prefix below) for the CSV summary.
dest = "csv/gbif_demo.csv"

# Every record under the 2024-10-01 prefix whose class is Insecta.
insect_records = (
    con.read_parquet("s3://cboettig/gbif/2024-10-01/**")
    .filter(_["class"].isin(["Insecta"]))
)

# Number of records per hexagon.
# h3 == 41,150 hexes.  h5 == 2,016,830 hexes
insect_hex_counts = (
    insect_records
    .rename(hex="h3")
    .group_by(_.hex)
    .agg(n=_.count())
)

# Log-transform the counts and rescale against the largest log-count so that
# `value` is an integer on a 0-255 normalized color-scale.
insect_hex_scaled = (
    insect_hex_counts
    .mutate(logn=_.n.log())
    .mutate(value=(255 * _.logn / _.logn.max()).cast("int"))
)

# Stream the result straight to the object store as CSV.
insect_hex_scaled.to_csv("s3://cboettig/gbif/" + dest)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

CPU times: user 2min 2s, sys: 14.2 s, total: 2min 17s
Wall time: 10.5 s


# Cloud-native visualization

## from source.coop

Use deck.gl, where the data layer is passed as a URL and is only read client-side.

In [4]:
# Public address of the CSV stored under `dest`; only this URL is handed to the layer.
url = f"{base_url}/cboettig/gbif/{dest}"

# Render with deck-gl.
# `HexagonLayer` / `DeckGlobe` are presumably helpers from the `boilerplate` star import.
layer = HexagonLayer(url)
deck = DeckGlobe(layer)

# Write the rendered page to a local HTML file.
deck.to_html("globe.html")

In [5]:
# Post the map itself to source
client = s3_client()

# Upload the local globe.html to bucket "cboettig" under gbif/maps/.
# The content type is set explicitly: minio's fput_object otherwise stores the
# object as "application/octet-stream", which browsers download rather than render.
client.fput_object(
    "cboettig",
    "gbif/maps/" + "globe.html",
    "globe.html",
    content_type="text/html",
)

<minio.helpers.ObjectWriteResult at 0x721d25b83ce0>


---


# Zooming in

In [6]:
## grab polygon of a National park:
# Read the NPS geodatabase over HTTP (GDAL /vsicurl/) and keep the Yellowstone unit.
nps_units = con.read_geo("/vsicurl/https://huggingface.co/datasets/cboettig/biodiversity/resolve/main/data/NPS.gdb")
yellowstone = nps_units.filter(_.UNIT_NAME == "Yellowstone National Park")

# Reproject SHAPE from EPSG:3857 to EPSG:4326, then buffer it.
# NOTE(review): the buffer is applied after conversion to EPSG:4326, so the
# distance of 2 is presumably in degrees -- confirm this is intended.
yellowstone_buffered = (
    yellowstone
    .mutate(SHAPE=_.SHAPE.convert('EPSG:3857', 'EPSG:4326'))
    .mutate(SHAPE=_.SHAPE.buffer(2))
)

# Materialize locally.
polygon = yellowstone_buffered.execute()

# Wrap the first geometry as an ibis literal so it can be used in later filters.
poly_expr = ibis.literal(polygon.geometry.iloc[0])

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
# Show the column names of the parquet dataset (taxonomy, coordinates, geom, h0..h11, ...).
gbif_columns = con.read_parquet("s3://cboettig/gbif/2024-10-01/**").columns
gbif_columns

['gbifid',
 'datasetkey',
 'occurrenceid',
 'kingdom',
 'phylum',
 'class',
 'order',
 'family',
 'genus',
 'species',
 'infraspecificepithet',
 'taxonrank',
 'scientificname',
 'verbatimscientificname',
 'verbatimscientificnameauthorship',
 'countrycode',
 'locality',
 'stateprovince',
 'occurrencestatus',
 'individualcount',
 'publishingorgkey',
 'decimallatitude',
 'decimallongitude',
 'coordinateuncertaintyinmeters',
 'coordinateprecision',
 'elevation',
 'elevationaccuracy',
 'depth',
 'depthaccuracy',
 'eventdate',
 'day',
 'month',
 'year',
 'taxonkey',
 'specieskey',
 'basisofrecord',
 'institutioncode',
 'collectioncode',
 'catalognumber',
 'recordnumber',
 'identifiedby',
 'dateidentified',
 'license',
 'rightsholder',
 'recordedby',
 'typestatus',
 'establishmentmeans',
 'lastinterpreted',
 'mediatype',
 'issue',
 'geom',
 'h0',
 'h1',
 'h2',
 'h3',
 'h4',
 'h5',
 'h6',
 'h7',
 'h8',
 'h9',
 'h10',
 'h11']

In [8]:
# Per-species record counts inside the park polygon, for records after 2000
# belonging to the selected families or genera.
subset = (con
  .read_parquet("s3://cboettig/gbif/2024-10-01/**")
  .filter(_.geom.within(poly_expr))
  .filter(_.year > 2000)
  .filter(_["family"].isin(['Antilocapridae', 'Cervidae', 'Suidae']) | _["genus"].isin(["Canis", 'Ursus', 'Bison']))
  # No hex rename here: grouping is by species only, so a renamed hex column
  # would be discarded by the aggregation.
  .group_by(_.species)
  .agg(n = _.count())
  .filter(_.n > 1)          # keep species with more than one record
  .order_by(_.n.desc())     # sort last so the row order of the result is guaranteed
  .execute()
)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [9]:
# Display the per-species counts held in `subset`.
subset

Unnamed: 0,species,n
0,Bison bison,5286
1,Cervus elaphus,3465
2,Alces alces,1893
3,Odocoileus hemionus,1747
4,Antilocapra americana,1665
5,Ursus americanus,1594
6,Canis latrans,1096
7,Ursus arctos,1086
8,Canis lupus,608
9,Odocoileus virginianus,330


In [10]:
# One record per species: (scientific name, display name, RGB fill colour).
# Keeping each species' attributes on a single row keeps the three columns aligned.
species_palette = [
    ("Bison bison",            "American Bison",    [128, 0, 128]),
    ("Cervus elaphus",         "Red Deer",          [0, 0, 200]),
    ("Alces alces",            "Moose",             [100, 100, 0]),
    ("Odocoileus hemionus",    "Mule Deer",         [150, 75, 50]),
    ("Antilocapra americana",  "Pronghorn",         [255, 0, 0]),
    ("Ursus americanus",       "Black Bear",        [0, 255, 0]),
    ("Canis latrans",          "Coyote",            [0, 200, 0]),
    ("Ursus arctos",           "Brown Bear",        [200, 0, 0]),
    ("Canis lupus",            "Gray Wolf",         [150, 50, 150]),
    ("Odocoileus virginianus", "White-tailed Deer", [200, 100, 0]),
    ("Alces americanus",       "American Moose",    [50, 200, 150]),
    ("Sus scrofa",             "Wild Boar",         [100, 50, 200]),
]

# Transpose the records into the column-oriented mapping passed to create_table.
data = {
    "species": [record[0] for record in species_palette],
    "name": [record[1] for record in species_palette],
    "fill": [record[2] for record in species_palette],
}

# (Re)create the lookup table on the connection; it is joined on "species" later.
colors = con.create_table("colors", data, overwrite=True)


In [11]:
# Row-level predicates, named so the pipeline below reads as a sentence.
inside_park = _.geom.within(poly_expr)
after_2000 = _.year > 2000
focal_taxa = (
    _["family"].isin(['Antilocapridae', 'Cervidae', 'Suidae'])
    | _["genus"].isin(["Canis", 'Ursus', 'Bison'])
)

park_records = (
    con.read_parquet("s3://cboettig/gbif/2024-10-01/**")
    .filter(inside_park)
    .filter(after_2000)
    .filter(focal_taxa)
)

# Records per (species, hexagon), using the h11 index column as `hex`.
species_hex_counts = (
    park_records
    .rename(hex="h11")
    .group_by(_["species"], _["hex"])
    .agg(n=_.count())
)

# Log-scale the counts, rescale to an integer 0-255 `value`, then attach the
# display name and fill colour for each species from the `colors` table.
subset = (
    species_hex_counts
    .mutate(logn=_.n.log())
    .mutate(value=(255 * _.logn / _.logn.max()).cast("int"))
    .join(colors, "species")
)

# .to_json() doesn't exist in ibis, use SQL
query = ibis.to_sql(subset)
con.raw_sql(f"COPY ({query}) TO 's3://cboettig/gbif/json/mammals.json' (FORMAT JSON, ARRAY true);")



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x72213406fbf0>

In [12]:
# Public address of the JSON file written to json/mammals.json.
url2 = f"{base_url}/cboettig/gbif/json/mammals.json"

# deck.gl accessors refer to fields of each JSON record by name:
# "hex" (cell id), "value" (elevation) and "fill" (RGB colour).
h3_layer_options = {
    "id": "gbif",
    "data": url2,
    "extruded": True,
    "get_elevation": "value",
    "get_hexagon": "hex",
    "elevation_scale": 200 * .1,
    "elevation_range": [0, 1],
    "pickable": True,
    "auto_highlight": True,
    "get_fill_color": "fill",
}

layer = pdk.Layer("H3HexagonLayer", **h3_layer_options)

# Cloud-native map layers with leafmap



In [13]:
import leafmap.maplibregl as leafmap

# Camera settings for the initial view; `center` is [longitude, latitude].
map_view = {"pitch": 55, "bearing": 20, "center": [-111, 44.5], "zoom": 11}
m = leafmap.Map(style=terrain_style, **map_view)

# Overlay the deck.gl hexagon layer; the tooltip template is filled from each
# record's `name` and `n` fields.
m.add_deck_layers([layer], tooltip="Species: {{ name }}, count: {{ n }}")
# Optional overlay of the park polygon (currently disabled):
#m.add_gdf(polygon[["SHAPE"]], "fill", paint = {"fill-opacity": 0.2})

# Save a standalone copy, then display the map as the cell output.
m.to_html("common-mammals.html", overwrite=True)
m

Map(height='600px', map_options={'bearing': 20, 'center': (-111, 44.5), 'pitch': 55, 'style': {'version': 8, '…