In [6]:
import pathlib
import urllib.request

import ibis
from ibis import _
from cng.utils import *
# Helper pulled in by the `cng.utils` star import; presumably makes the DuckDB
# `h3` extension installable/loadable for the connection below -- confirm in
# cng.utils.
duckdb_install_h3()

# DuckDB-backed ibis connection with the `spatial` and `h3` extensions loaded.
con = ibis.duckdb.connect(extensions = ["spatial", "h3"])
# Helper from `cng.utils`; presumably registers object-store credentials on
# `con` (later cells write to s3://public-census) -- TODO confirm.
set_secrets(con)

# Cap DuckDB at two worker threads for this session.
con.raw_sql("SET threads = 2;")

<duckdb.duckdb.DuckDBPyConnection at 0x72a36ed4c1b0>

In [7]:

# Congress number embedded in the TIGER/Line file names (119th Congress).
congress_code = "119"

# Read the national state layer straight from the zipped shapefile on the
# Census server, then pull out the sorted STATEFP column as a pandas Series.
states = con.read_geo(
    "/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2024/STATE/tl_2024_us_state.zip"
)
state_fips = (
    states
    .select("STATEFP")
    .order_by(_.STATEFP)
    .execute()["STATEFP"]
)

def generate_urls(congress_code, state_fips, year=2024):
    """Build the TIGER/Line congressional-district zip URL for each state.

    Parameters
    ----------
    congress_code : str
        Congress number used in the file name, e.g. ``"119"``.
    state_fips : iterable of str
        State FIPS codes (e.g. ``"01"``); a list or a pandas Series both work.
    year : int or str, default 2024
        TIGER/Line vintage. It appears in both the directory
        (``TIGER{year}``) and the file name (``tl_{year}_...``).

    Returns
    -------
    list of str
        One URL per entry of ``state_fips``, in the same order.
    """
    base_url = f"https://www2.census.gov/geo/tiger/TIGER{year}/CD/tl_{year}_"
    extension = f"_cd{congress_code}.zip"
    return [base_url + state_code + extension for state_code in state_fips]

# The zips for FIPS 45 and 60 fail to stream through /vsicurl/ (cause
# unknown), so drop them from the list used for remote reads.
state_fips = state_fips[~state_fips.isin(["45", "60"])]

In [8]:
# The loop below writes into data/; create it first so the cell also works on
# a fresh checkout where the directory does not exist yet.
pathlib.Path("data").mkdir(parents=True, exist_ok=True)

# Stream each state's zipped shapefile from the Census server and save it as
# data/<zip stem>.parquet.
urls = generate_urls(congress_code, state_fips)
for url in urls:
    vurl = "/vsizip//vsicurl/" + url
    dest = "data/" + pathlib.Path(vurl).with_suffix(".parquet").name
    con.read_geo(vurl).to_parquet(dest)
    

In [11]:
# FIPS 45 and 60 cannot be read through /vsicurl/, so they are converted from
# zip files in the working directory. Fetch each zip if it is not already
# there, so the cell no longer depends on files placed by hand.
pathlib.Path("data").mkdir(parents=True, exist_ok=True)
for fips in ("45", "60"):
    stem = f"tl_2024_{fips}_cd119"
    local_zip = pathlib.Path(f"{stem}.zip")
    if not local_zip.exists():
        urllib.request.urlretrieve(
            f"https://www2.census.gov/geo/tiger/TIGER2024/CD/{stem}.zip",
            local_zip,
        )
    con.read_geo(f"/vsizip/{stem}.zip").to_parquet(f"data/{stem}.parquet")

In [12]:

# Read every file under data/ (the per-state outputs written above) as one
# table and publish it as a single parquet object on S3.
df = con.read_parquet("data/**")
df.to_parquet("s3://public-census/year=2024/CD/cd.parquet")



In [26]:

import ibis.expr.datatypes as dt
@ibis.udf.scalar.builtin
def ST_Multi (geom) -> dt.geometry:
    """Expose the backend's built-in ``ST_Multi`` function to ibis expressions.

    Declared with ``ibis.udf.scalar.builtin``, so the Python body is only a
    placeholder: calls are expected to compile to ``ST_Multi(geom)`` in the
    generated SQL. Takes a geometry expression and is typed as returning a
    geometry.
    """
    ...
    
def geom_to_cell(df, zoom=8):
    """Split geometries into single polygons and list the H3 cells covering each.

    Parameters
    ----------
    df : ibis table expression
        Must have a ``geom`` geometry column.
    zoom : int, default 8
        H3 resolution; also used to name the output column ``h{zoom}``.

    Returns
    -------
    The table expression produced by the backend's ``.sql()``: every input
    column, with ``geom`` replaced by one dumped polygon per row, plus an
    ``h{zoom}`` column holding that polygon's H3 cells.
    """
    # Private ibis accessor; the original note suggests df.get_backend() on ibis >= 10.0.
    backend = df._find_backend()

    # Normalise geometry types first: plain POLYGONs are wrapped with ST_Multi
    # so the column is not a mix of single- and multi-part geometries.
    is_single_polygon = df.geom.geometry_type() == 'POLYGON'
    unified_geom = ibis.cases(
        (is_single_polygon, ST_Multi(df.geom)),
        else_=df.geom,
    )

    # Render the ibis expression to SQL so it can be nested inside raw SQL that
    # uses ST_Dump/UNNEST and the h3 extension function.
    inner_sql = ibis.to_sql(df.mutate(geom = unified_geom))
    query = f'''
        WITH t1 AS (
        SELECT * EXCLUDE (geom), UNNEST(ST_Dump(ST_GeomFromWKB(geom))).geom AS geom 
        FROM ({inner_sql})
        ) 
        SELECT *, h3_polygon_wkt_to_cells_string(geom, {zoom}) AS h{zoom}  FROM t1
    '''

    return backend.sql(query)


In [27]:

# Re-read all per-state files written under data/.
df = con.read_parquet("data/**")

# Compute resolution-8 H3 cells for each polygon and write the result to S3.
geom_to_cell(df, 8).to_parquet("s3://public-census/year=2024/CD/cd-hex-z8.parquet")



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [28]:
# Preview the first rows of the published hex table as a sanity check of the upload.
con.read_parquet("s3://public-census/year=2024/CD/cd-hex-z8.parquet").head().execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,STATEFP,CD119FP,GEOID,GEOIDFQ,NAMELSAD,LSAD,CDSESSN,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geom,h8,year
0,1,1,101,5001900US0101,Congressional District 1,C2,119,G5200,N,18753464839,2274273696,31.0328895,-86.798975,"POLYGON ((-88.41482 30.78562, -88.41444 30.785...","[884452ab33fffff, 8844e1b8c5fffff, 8844508811f...",2024
1,1,2,102,5001900US0102,Congressional District 2,C2,119,G5200,N,24514317067,234734076,31.7619148,-86.6281876,"POLYGON ((-88.46443 31.69795, -88.46418 31.697...","[8844ecd443fffff, 8844ec9037fffff, 884453c337f...",2024
2,1,3,103,5001900US0103,Congressional District 3,C2,119,G5200,N,17327582348,466368976,33.4140097,-85.7577724,"POLYGON ((-86.5778 33.76652, -86.5778 33.76831...","[8844eb4293fffff, 8844eaa8e3fffff, 8844ee6707f...",2024
3,1,4,104,5001900US0104,Congressional District 4,C2,119,G5200,N,22351892592,578754652,34.1302579,-87.2816902,"POLYGON ((-88.27459 33.53425, -88.2745 33.5349...","[8844ed214bfffff, 8844ebad39fffff, 8844eb3133f...",2024
4,1,5,105,5001900US0105,Congressional District 5,C2,119,G5200,N,10105546447,427050369,34.6916846,-86.6808412,"POLYGON ((-86.15427 34.53026, -86.15425 34.532...",[8844eb2b29fffff],2024
