In [None]:
import polars

In [2]:
# Polars' native S3-style access is the more elegant option, but it fails with the GPU engine. Auth can be added for private data and writes.
import polars as pl

endpoint = "minio.carlboettiger.info"

# Options handed to polars' S3 reader: point it at the custom endpoint and
# skip request signing (no credentials are supplied in this cell).
storage_options = dict(
    endpoint_url="https://" + endpoint,
    aws_skip_signature="true",
)

# Lazy scan over every object matching the glob; nothing is collected here.
upper_districts = pl.scan_parquet(
    "s3://public-census/2024/sld/upper/z8/*", storage_options=storage_options
)
# error
# upper_districts.head().collect(engine="gpu")


In [3]:
%%time
import polars as pl
import pyarrow.fs as fs
import os

# assumes public URLs
def pl_scan_parquet_gpu(base_path, endpoint = os.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com")):
    """Lazily scan all ``.parquet`` objects under ``base_path`` through plain HTTPS URLs.

    The objects are listed anonymously with pyarrow's ``S3FileSystem`` and each
    one is turned into a ``https://{endpoint}/{path}`` URL, so the data must be
    publicly readable. The notebook uses this instead of an ``s3://`` scan so
    that the result can be collected with ``engine="gpu"``.

    Parameters
    ----------
    base_path : str
        Bucket-relative prefix (``"bucket/key/prefix/"``) that is searched
        recursively.
    endpoint : str
        Host name of the S3-compatible service. The default is read from the
        ``AWS_S3_ENDPOINT`` environment variable once, when the function is
        defined, and falls back to ``"s3.amazonaws.com"``.

    Returns
    -------
    The LazyFrame returned by ``pl.scan_parquet`` for the list of URLs.
    """
    bucket_fs = fs.S3FileSystem(endpoint_override = endpoint, anonymous = True)
    selector = fs.FileSelector(base_path, recursive = True)

    parquet_urls = []
    for entry in bucket_fs.get_file_info(selector):
        # Skip directories and any object that is not a parquet file.
        if not entry.is_file:
            continue
        if not entry.path.endswith(".parquet"):
            continue
        parquet_urls.append(f"https://{endpoint}/{entry.path}")

    return pl.scan_parquet(parquet_urls)


# Build the lazy scan from direct HTTPS URLs. No endpoint is passed, so the
# function's default (taken from AWS_S3_ENDPOINT when it was defined) is used.
upper_districts = pl_scan_parquet_gpu("public-census/2024/sld/upper/z8/")
#lower_districts = pl_scan_parquet_gpu("public-census/2024/sld/lower/z8/")

# Preview the first rows, executing the query on the GPU engine.
upper_districts.head().collect(engine="gpu")

CPU times: user 1.52 s, sys: 1.83 s, total: 3.35 s
Wall time: 2.12 s


STATEFP,SLDUST,GEOID,GEOIDFQ,NAMELSAD,LSAD,LSY,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,h8,h0
str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""8826493243fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264930cbfffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264932c5fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649a9e3fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649ad55fffff""","""8027fffffffffff"""


In [4]:
%%time
import polars as pl

# Lazy scans of the input tables; the query only runs at .collect() below.
pad_z8 = pl.scan_parquet(
    "https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet"
)

tracts_z8 = (
    pl.scan_parquet("https://minio.carlboettiger.info/public-social-vulnerability/2022-tracts-h3-z8.parquet")
    # Lower-case the join key; presumably this file stores hex ids in a
    # different case than the other tables - TODO confirm.
    .with_columns(pl.col("h8").str.to_lowercase())
)

# Only the columns needed downstream are kept from these two tables.
mobi = (
    pl.scan_parquet("https://minio.carlboettiger.info/public-mobi/hex/all-richness-h8.parquet")
    .select(["richness", "h8"])
)

svi = (
    pl.scan_parquet("https://minio.carlboettiger.info/public-social-vulnerability/2022/SVI2022_US_tract.parquet")
    .select(["FIPS", "RPL_THEMES"])
)

# Per-hex area used to turn a row count into an area.
# NOTE(review): 737327.598 is presumably the average area in m^2 of an H3
# resolution-8 cell, and / 10000 converts it to hectares - confirm against the
# H3 cell statistics table.
hectres_h8 = 737327.598 / 10000

# Inner-join all tables. Three joins use the h8 hex id; the SVI table is
# joined on FIPS. `upper_districts` is the LazyFrame created in an earlier cell.
combined = (
    tracts_z8
    .join(mobi, on="h8", how="inner")
    .join(pad_z8, on="h8", how="inner")
    .join(svi, on="FIPS", how="inner")
    .join(upper_districts, on="h8", how="inner")
)

# One row per (NAMELSAD, STATE): mean richness, area as row count times the
# per-hex area, and mean RPL_THEMES; highest mean richness first.
stats = (
    combined
    .group_by(["NAMELSAD", "STATE"])
    .agg([
        pl.col("richness").mean().alias("richness"),
        (pl.len() * hectres_h8).alias("area"),
        pl.col("RPL_THEMES").mean().alias("svi")
    ])
    .sort("richness", descending=True)
)

# Apply head() before collect() so the row limit is part of the lazy plan
# instead of materialising the full sorted result first. This matches how the
# other cells in this notebook preview results.
stats.head().collect()

CPU times: user 8.81 s, sys: 4.59 s, total: 13.4 s
Wall time: 4.28 s


NAMELSAD,STATE,richness,area,svi
str,str,f64,f64,f64
"""State Senate District 9""","""California""",10.210274,107649.829308,0.29136
"""State Senate District 54""","""Georgia""",9.626004,100940.148166,0.626828
"""State Senate District 51""","""Indiana""",9.142857,516.129319,0.448214
"""State Senate District 40""","""California""",9.026264,300387.263425,0.406408
"""State Senate District 28""","""Tennessee""",8.277985,39520.759253,0.340525


In [2]:
%%time

import ibis
from ibis import _
# env vars
import os
# Connection settings; the endpoint and SSL flag fall back to defaults when the
# environment variables are not set.
endpoint = os.getenv("AWS_S3_ENDPOINT", "minio.carlboettiger.info")
url_style = "path"
# NOTE(review): use_ssl and url_compatibility_mode are assigned here but are
# not referenced by the secret definition below.
use_ssl = os.getenv("AWS_HTTPS", "TRUE")
url_compatibility_mode = True

con = ibis.duckdb.connect()

# S3 secret telling DuckDB which endpoint and URL style to use for s3:// paths.
set_secrets = f"""
CREATE OR REPLACE SECRET s3_key (
    TYPE S3,
    ENDPOINT '{endpoint}',
    URL_STYLE '{url_style}'
);
"""
con.raw_sql(set_secrets)

# Table expressions over the remote parquet files.
upper_districts = con.read_parquet("s3://public-census/2024/sld/upper/z8/**")
pad_z8 = con.read_parquet(
    "https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-h3-z8.parquet"
)
tracts_z8 = (
    con.read_parquet(
        "https://minio.carlboettiger.info/public-social-vulnerability/2022-tracts-h3-z8.parquet"
    )
    # Lower-case the h8 join key.
    .mutate(h8=_.h8.lower())
)
mobi = (
    con.read_parquet(
        "https://minio.carlboettiger.info/public-mobi/hex/all-richness-h8.parquet"
    )
    .select("richness", "h8")
)
svi = (
    con.read_parquet(
        "https://minio.carlboettiger.info/public-social-vulnerability/2022/SVI2022_US_tract.parquet"
    )
    .select("FIPS", "RPL_THEMES")
)



In [3]:
# combined


In [4]:
%%time
# Inner-join all tables: h8 is the key for the hex-indexed tables, FIPS for SVI.
combined = (
    tracts_z8
    .inner_join(mobi, "h8")
    .inner_join(pad_z8, "h8")
    .inner_join(svi, "FIPS")
    .inner_join(upper_districts, "h8")
)

# Per-hex area factor used to convert a row count into an area.
hectres_h8 = 737327.598 / 10000

# Aggregate per (NAMELSAD, STATE) and order by mean richness, highest first.
# (Alternative grouping previously tried: "COUNTY", "STATE".)
stats = (
    combined
    .group_by("NAMELSAD", "STATE")
    .agg(
        richness=_.richness.mean(),
        area=_.count() * hectres_h8,
        svi=_.RPL_THEMES.mean(),
    )
    .order_by(_.richness.desc())
)

# Only the first rows are executed and displayed.
stats.head().execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,NAMELSAD,STATE,richness,area,svi
0,State Senate District 9,California,10.210274,107649.829308,0.29136
1,State Senate District 54,Georgia,9.626004,100940.148166,0.626828
2,State Senate District 51,Indiana,9.142857,516.129319,0.448214
3,State Senate District 40,California,9.026264,300387.263425,0.406408
4,State Senate District 28,Tennessee,8.277985,39520.759253,0.340525


In [26]:
%%time

## Polars with pyarrow is crazy slow
import polars as pl
import pyarrow.dataset as ds
from pyarrow import fs

# `endpoint` is not defined in this cell; it must already exist from an
# earlier cell.
# The bucket is read without credentials elsewhere in this notebook, so connect
# anonymously here as well. Per the pyarrow docs, anonymous=True skips the
# standard AWS credential lookup; that lookup may account for part of the slow
# wall time recorded for this cell - re-time to confirm.
s3 = fs.S3FileSystem(endpoint_override=endpoint, anonymous=True)

# Define the dataset over every parquet file under the prefix
dataset = ds.dataset(
    "public-census/2024/sld/upper/z8/", 
    filesystem=s3, 
    format="parquet"
)

# Wrap the pyarrow dataset as a polars LazyFrame and preview the first rows.
upper_districts = pl.scan_pyarrow_dataset(dataset)
upper_districts.head().collect(engine="gpu")

CPU times: user 234 ms, sys: 97 ms, total: 331 ms
Wall time: 28.3 s


STATEFP,SLDUST,GEOID,GEOIDFQ,NAMELSAD,LSAD,LSY,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,h8,h0
str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""8826493243fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264930cbfffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""88264932c5fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649a9e3fffff""","""8027fffffffffff"""
"""01""","""002""","""01002""","""610U900US01002""","""State Senate District 2""","""LU""","""2024""","""G5210""","""N""",353315068,1820551,"""+34.7832617""","""-086.7348706""","""882649ad55fffff""","""8027fffffffffff"""
