In [3]:
# Using polars more elegant s3-style access, but fails with GPU engine.  Can add auth for private data + writes
import polars as pl

endpoint = "minio.carlboettiger.info"
storage_options = {
    "endpoint_url": "https://" + endpoint,
    "aws_skip_signature": "true",
}

upper_districts = pl.scan_parquet(
    "s3://public-census/2024/sld/upper/z8/*",
    storage_options=storage_options
)
# error
# upper_districts.head().collect(engine="gpu")


In [None]:
import polars as pl
import pyarrow.fs as fs
import os

# assumes public URLs
def pl_scan_parquet_gpu(base_path, endpoint = os.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com")):
    s3 = fs.S3FileSystem(endpoint_override = endpoint, anonymous = True)
    file_stats = s3.get_file_info(fs.FileSelector(base_path, recursive = True))
    
    # 3. Construct "Direct HTTP" URLs
    file_urls = [
        f"https://{endpoint}/{info.path}" 
        for info in file_stats 
        if info.is_file and info.path.endswith(".parquet")
    ]
    return pl.scan_parquet(file_urls)


upper_districts = pl_scan_parquet_gpu("public-census/2024/sld/upper/z8/")
lower_districts = pl_scan_parquet_gpu("public-census/2024/sld/lower/z8/")

# upper_districts.head().collect(engine="gpu")

STATEFP,SLDUST,GEOID,GEOIDFQ,NAMELSAD,LSAD,LSY,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,h8,h0
str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""8826493243fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264930cbfffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264932c5fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649a9e3fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649ad55fffff""","""8027fffffffffff"""


In [7]:
%%time
wetlands = pl_scan_parquet_gpu("public-wetlands/hex/")
(
wetlands
    .filter(pl.col("Z") != 255)
    .group_by("Z")
    .agg(pl.len().alias("n"))
    .sort("n", descending=True)
    .collect(engine="gpu")
)

CPU times: user 2.85 s, sys: 3.43 s, total: 6.28 s
Wall time: 10 s


Z,n
i32,u32
0,1006929452
22,118737775
23,105021290
24,64233490
1,40473541
…,…
27,1792308
28,1751918
16,1370685
5,904302


In [8]:
%%time
import polars as pl

pad_z8 = pl.scan_parquet(
    "https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet"
)

tracts_z8 = (
    pl.scan_parquet("https://minio.carlboettiger.info/public-social-vulnerability/2022-tracts-h3-z8.parquet")
    .with_columns(pl.col("h8").str.to_lowercase())
)

mobi = (
    pl.scan_parquet("https://minio.carlboettiger.info/public-mobi/hex/all-richness-h8.parquet")
    .select(["richness", "h8"])
)

svi = (
    pl.scan_parquet("https://minio.carlboettiger.info/public-social-vulnerability/2022/SVI2022_US_tract.parquet")
    .select(["FIPS", "RPL_THEMES"])
)

# Constant definition (standard Python)
hectres_h8 = 737327.598 / 10000

# 1. Join the LazyFrames
combined = (
    tracts_z8
    .join(mobi, on="h8", how="inner")
    .join(pad_z8, on="h8", how="inner")
    .join(svi, on="FIPS", how="inner")
    .join(upper_districts, on="h8", how="inner")
)

stats = (
    combined
    .group_by(["NAMELSAD", "STATE"])
    .agg([
        pl.col("richness").mean().alias("richness"),
        (pl.len() * hectres_h8).alias("area"),
        pl.col("RPL_THEMES").mean().alias("svi")
    ])
    .sort("richness", descending=True)
)

stats.collect().head()

CPU times: user 9.5 s, sys: 7.25 s, total: 16.7 s
Wall time: 4.58 s


NAMELSAD,STATE,richness,area,svi
str,str,f64,f64,f64
"""State Senate District 9""","""California""",10.210274,107649.829308,0.29136
"""State Senate District 54""","""Georgia""",9.626004,100940.148166,0.626828
"""State Senate District 51""","""Indiana""",9.142857,516.129319,0.448214
"""State Senate District 40""","""California""",9.026264,300387.263425,0.406408
"""State Senate District 28""","""Tennessee""",8.277985,39520.759253,0.340525


In [6]:

import ibis

con = ibis.duckdb.connect()


set_secrets = f'''
CREATE OR REPLACE SECRET s3_key (
    TYPE S3,
    ENDPOINT 'minio.carlboettiger.info',
    URL_STYLE 'path'
);
'''

con.raw_sql(set_secrets)
con.read_parquet("s3://public-census/2024/sld/lower/z8/**").head().execute()



Unnamed: 0,STATEFP,SLDLST,GEOID,GEOIDFQ,NAMELSAD,LSAD,LSY,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,h8,h0
0,1,103,1103,620L900US01103,State House District 103,LL,2024,G5220,N,66409356,7339298,30.6278069,-88.1138273,8844500261fffff,8045fffffffffff
1,1,103,1103,620L900US01103,State House District 103,LL,2024,G5220,N,66409356,7339298,30.6278069,-88.1138273,884450014bfffff,8045fffffffffff
2,1,103,1103,620L900US01103,State House District 103,LL,2024,G5220,N,66409356,7339298,30.6278069,-88.1138273,8844500035fffff,8045fffffffffff
3,1,103,1103,620L900US01103,State House District 103,LL,2024,G5220,N,66409356,7339298,30.6278069,-88.1138273,8844500159fffff,8045fffffffffff
4,1,103,1103,620L900US01103,State House District 103,LL,2024,G5220,N,66409356,7339298,30.6278069,-88.1138273,8844500301fffff,8045fffffffffff


In [5]:
import ibis
from ibis import _
# env vars
import os
endpoint = os.getenv("AWS_S3_ENDPOINT", "s3-west.nrp-nautilus.io")
url_style = "path"
use_ssl = os.getenv("AWS_HTTPS", "TRUE")
url_compatibility_mode = True

con = ibis.duckdb.connect()


set_secrets = f'''
CREATE OR REPLACE SECRET s3_key (
    TYPE S3,
    KEY_ID '',
    SECRET '',
    ENDPOINT '{endpoint}',
    URL_STYLE '{url_style}'
);
'''


con.raw_sql(set_secrets)

red = con.read_parquet("s3://public-redlining/hex/**")
#us_nwi_wetlands = con.read_parquet("s3://public-nwi/hex/**")
#us_nwi_wetlands_geoms = con.read_parquet("s3://public-nwi/geoms/*")



In [6]:
import duckdb
endpoint = os.getenv("AWS_S3_ENDPOINT", "s3-west.nrp-nautilus.io")
url_style = "path"
use_ssl = os.getenv("AWS_HTTPS", "TRUE")

set_secrets = f'''
CREATE OR REPLACE SECRET s3_key (
    TYPE S3,
    KEY_ID '',
    SECRET '',
    ENDPOINT '{endpoint}',
    URL_STYLE '{url_style}'
);
'''

duckdb.sql(set_secrets)
duckdb.read_parquet("s3://public-redlining/hex/**").limit(1)

┌─────────┬─────────┬─────────┬─────────────┬──────────────────────┬─────────┬─────────┬─────────────┬────────────┬────────────┬─────────┬─────────────────────────────────────────────────────────────────────────────┬─────────────────┬─────────────────┬─────────────────┐
│ area_id │  city   │  state  │ city_survey │       category       │  grade  │  label  │ residential │ commercial │ industrial │  fill   │                                geometry_bbox                                │       h10       │       h9        │       h8        │
│  int32  │ varchar │ varchar │   boolean   │       varchar        │ varchar │ varchar │   boolean   │  boolean   │  boolean   │ varchar │           struct(xmin float, ymin float, xmax float, ymax float)            │     varchar     │     varchar     │     varchar     │
├─────────┼─────────┼─────────┼─────────────┼──────────────────────┼─────────┼─────────┼─────────────┼────────────┼────────────┼─────────┼─────────────────────────────────────────────────

In [9]:
biggest = us_nwi_wetlands_geoms.group_by(_.WETLAND_TYPE).agg(biggest = _.ACRES.max())

biggest.execute()

Unnamed: 0,WETLAND_TYPE,biggest
0,Lakes,19555.58
1,RIverine,0.8797203
2,Estuarine and Marine Wetland,103952.1
3,,1.512783
4,Lake,18585760.0
5,Other,22846.28
6,Estuarine and Marine Deepwater,3950070.0
7,Freshwater Forested/Shrub Wetland,113763.6
8,Freshwater Pond,12177.89
9,Freshwater Emergent Wetland,1711031.0


In [15]:

# Hex is in NWI but not global
us_only = (us_nwi_wetlands
           .left_join(global_wetlands, "h8")
           .filter(_.Z.isnull())
           )
us_only.count().execute()

164263

In [None]:
tracts_z8 = con.read_parquet("https://minio.carlboettiger.info/public-social-vulnerability/2022-tracts-h3-z8.parquet").mutate(h8 = _.h8.lower()).select(_.h8)


# Hex is in US global collection but not NWI:
globe_has = (global_wetlands
           .inner_join(tracts_z8, "h8")
           .left_join(us_nwi_wetlands, "h8")
           .filter(_.Z.isnull())
           )
globe_has.count().execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

97425434

In [20]:
(con
.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet")
.group_by(_.h8)
.agg(overlaps = _.h8.count())
.order_by(_.overlaps.desc())
).head().execute()

Unnamed: 0,h8,overlaps
0,882aac85abfffff,48
1,882aac1425fffff,48
2,882aac85d7fffff,47
3,882aac8f55fffff,47
4,882aac81a3fffff,46


In [62]:
pad_hex = con.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet")
pad_geom = con.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-us-4.parquet")

# few overlaps involve different gap codes
(pad_hex
.inner_join(pad_geom, "row_n")
.select(_.GAP_Sts, _.h8)
.distinct()
.group_by(_.h8)
.agg(overlaps = _.h8.count())
.filter(_.overlaps > 1)
.order_by(_.overlaps.desc())
).head().execute()



Unnamed: 0,h8,overlaps
0,8827592e3dfffff,4
1,882a123585fffff,4
2,882a33b357fffff,4
3,882aa039b7fffff,4
4,882aa060d5fffff,4


In [70]:
pad_hex = con.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet")
pad_geom = con.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-us-4.parquet")

# few overlaps involve different gap codes
gap12_hexes = (pad_hex
.inner_join(pad_geom, "row_n")
.filter(_.GAP_Sts.isin(['1', '2']))
.select( _.h8)
.distinct()
).count().execute()


hectres_h8 = 737327.598	/ 10000

gap12_hexes * hectres_h8


302987522.93230677

In [54]:
(pad_hex
.inner_join(pad_geom, "row_n")
#.select(_.GAP_Sts, _.h8)
#.distinct()
.filter(_.h8 == "882a993a8dfffff")
.select(_.Unit_Nm, _.GAP_Sts, _.FeatClass)
.execute()
)

Unnamed: 0,Unit_Nm,GAP_Sts,FeatClass
0,NC Clean Water Management Trust Fund Easement,2,Easement
1,Pond Mountain Game Land,3,Fee
2,NC Land and Water Fund Project,4,Fee
3,NC Land and Water Fund Project,1,Fee
4,Blue Ridge Rural Land Trust Easement,4,Easement


In [23]:
so_many = (con
.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet")
.filter(_.h8 == "882aac85abfffff")
.execute()
)

In [None]:
(con
.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet")
.group_by(_.h8)
.agg(overlaps = _.h8.count())
.order_by(_.overlaps.desc())
.mutate(more_than_2 = _.overlaps > 2)
.group_by(_.more_than_2)
.agg(n = _.more_than_2.count())
).execute()

Unnamed: 0,more_than_2,n
0,False,8572785
1,True,320621


In [4]:
us_nwi_wetlands.head().execute()

Unnamed: 0,ATTRIBUTE,WETLAND_TYPE,state_code,h8,h0
0,L1UBH,Lake,ID,8812d9d43dfffff,8013fffffffffff
1,L1UBH,Lake,ID,8812dd6de3fffff,8013fffffffffff
2,L2AB3/4Hh,Lake,ID,8812d9c057fffff,8013fffffffffff
3,PAB3H,Freshwater Pond,ID,8812dd6da3fffff,8013fffffffffff
4,PAB4H,Freshwater Pond,ID,8812dd6d13fffff,8013fffffffffff


In [None]:

lower_districts = con.read_parquet("s3://public-census/2024/sld/lower/z8/**")
upper_districts = con.read_parquet("s3://public-census/2024/sld/upper/z8/**")
pad_z8 =  con.read_parquet("https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet")
tracts_z8 = con.read_parquet("https://minio.carlboettiger.info/public-social-vulnerability/2022-tracts-h3-z8.parquet").mutate(h8 = _.h8.lower())
mobi = con.read_parquet("https://minio.carlboettiger.info/public-mobi/hex/all-richness-h8.parquet").select('richness', 'h8')
svi = con.read_parquet("https://minio.carlboettiger.info/public-social-vulnerability/2022/SVI2022_US_tract.parquet").select('FIPS', 'RPL_THEMES')



In [None]:
%%time
(wetlands
.filter(_.Z != 255)
.group_by(_.Z)
.agg(n = _.Z.count())
.order_by(_.n.desc())
).execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

CPU times: user 23.4 s, sys: 3.14 s, total: 26.6 s
Wall time: 7.84 s


Unnamed: 0,Z,n
0,0,1006929452
1,22,118737775
2,23,105021290
3,24,64233490
4,1,40473541
5,15,40294454
6,26,33736217
7,7,28838006
8,25,24330662
9,33,22282826


In [None]:

wetlands.inner_join(tracts_z8, "h8").head().execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Z,h8,h0,STATE,COUNTY,FIPS
0,0,8813969003fffff,800dfffffffffff,Alaska,Yakutat City and Borough,2282000100
1,0,8813969015fffff,800dfffffffffff,Alaska,Yakutat City and Borough,2282000100
2,0,8813969015fffff,800dfffffffffff,Alaska,Yakutat City and Borough,2282000100
3,0,8813969011fffff,800dfffffffffff,Alaska,Yakutat City and Borough,2282000100
4,0,8813969011fffff,800dfffffffffff,Alaska,Yakutat City and Borough,2282000100


In [22]:
pad_z8.head().execute()

Unnamed: 0,Unit_Nm,row_n,h8
0,Pyramid Mountain Natural Historic Area,490053,882a1041e3fffff
1,Lake Rickabear Camp,490059,882a104131fffff
2,Pyramid Mountain Natural Historic Area,490064,882a1041c7fffff
3,Pyramid Mountain Natural Historic Area,490069,882a1041ebfffff
4,Pyramid Mountain,490070,882a1041ebfffff


In [6]:
%%time
combined = (tracts_z8
    .inner_join(mobi, "h8")
    .inner_join(pad_z8, "h8")
    .inner_join(svi, "FIPS")
    .inner_join(upper_districts, "h8")
           )

hectres_h8 = 737327.598	/ 10000

stats = (
    combined
    #.group_by("COUNTY", "STATE")
    .group_by("NAMELSAD", "STATE")
    .agg(richness = _.richness.mean(), 
         
         area = _.count() * hectres_h8,
         svi = _.RPL_THEMES.mean() 
         )
    .order_by(_.richness.desc())
)

stats.head().execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

CPU times: user 12 s, sys: 4.46 s, total: 16.4 s
Wall time: 4.38 s


Unnamed: 0,NAMELSAD,STATE,richness,area,svi
0,State Senate District 9,California,10.210274,107649.829308,0.29136
1,State Senate District 54,Georgia,9.626004,100940.148166,0.626828
2,State Senate District 51,Indiana,9.142857,516.129319,0.448214
3,State Senate District 40,California,9.026264,300387.263425,0.406408
4,State Senate District 28,Tennessee,8.277985,39520.759253,0.340525


---

## Polars with pyarrow

Uses s3 directly but very slow

In [26]:
%%time

## Polars with pyarrow is crazy slow
import polars as pl
import pyarrow.dataset as ds
from pyarrow import fs

s3 = fs.S3FileSystem(endpoint_override=endpoint)

# Define the dataset
dataset = ds.dataset(
    "public-census/2024/sld/upper/z8/", 
    filesystem=s3, 
    format="parquet"
)

upper_districts = pl.scan_pyarrow_dataset(dataset)
upper_districts.head().collect(engine="gpu")

CPU times: user 234 ms, sys: 97 ms, total: 331 ms
Wall time: 28.3 s


STATEFP,SLDUST,GEOID,GEOIDFQ,NAMELSAD,LSAD,LSY,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,h8,h0
str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""8826493243fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264930cbfffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264932c5fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649a9e3fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649ad55fffff""","""8027fffffffffff"""
