In [1]:
# !pip install git+https://github.com/boettiger-lab/cng-python

In [None]:

import ibis
from ibis import _
from cng.utils import *
from cng.h3 import * 
import os
# Open the DuckDB database file "local.db" through ibis, passing the
# "spatial" and "h3" extensions to be loaded on this connection.
con = ibis.duckdb.connect("local.db", extensions = ["spatial", "h3"])
# Star-imported helper (presumably from cng.h3); called with no arguments.
# NOTE(review): confirm it targets the connection opened above.
install_h3()


# Must use scoped secrets with different names for the different endpoints
set_secrets(con, name = "minio") # read/write using AWS env var credentials
# Second secret, named "source", scoped to the Source Cooperative bucket on
# s3.amazonaws.com (us-west-2). The two empty positional strings are
# presumably a blank key/secret for anonymous reads -- TODO confirm against
# the set_secrets signature.
set_secrets(con, "", "", endpoint = "s3.amazonaws.com", region="us-west-2", name = "source", bucket = "us-west-2.opendata.source.coop")

def geom_to_cell(df, zoom=8, keep_cols=None):
    """Split each geometry into its component polygons and index them with H3.

    The ibis expression is compiled to SQL and wrapped in a raw query that
    dumps every (multi)polygon into individual polygons and calls
    ``h3_polygon_wkt_to_cells_string`` on each one.

    Parameters
    ----------
    df : ibis table expression
        Must have a geometry column named ``geom``.
    zoom : int, default 8
        H3 resolution passed to ``h3_polygon_wkt_to_cells_string``.
    keep_cols : list of str, optional
        Column names carried through to the result. Defaults to every column
        of ``df`` except ``geom``. An empty list is allowed.

    Returns
    -------
    ibis table expression
        Built with ``con.sql`` on the backend of ``df``. Contains the kept
        columns, ``geom`` (one dumped polygon per row) and ``h3id`` (the H3
        cells for that polygon; callers in this notebook unnest it, so it is
        presumably a list of cell-id strings).
    """
    con = df.get_backend()

    # Default to keeping all columns except geom if not specified
    if keep_cols is None:
        keep_cols = [col for col in df.columns if col != 'geom']

    # Build the SELECT list item by item so that an empty keep_cols does not
    # leave a dangling comma, and double-quote every identifier so names with
    # spaces, punctuation or reserved words remain valid SQL.
    select_items = ['"' + str(col).replace('"', '""') + '"' for col in keep_cols]
    select_items.append('UNNEST(ST_Dump(ST_GeomFromWKB(geom))).geom AS geom')
    select_list = ', '.join(select_items)

    # all types must be multi-polygons
    cases = ibis.cases(
        (df.geom.geometry_type() == 'POLYGON', ST_Multi(df.geom)),
        else_=df.geom,
    )

    df = df.mutate(geom=cases)
    sql = ibis.to_sql(df)

    expr = f'''
        WITH t1 AS (
            SELECT {select_list}
            FROM ({sql})
        )
        SELECT *, h3_polygon_wkt_to_cells_string(geom, {zoom}) AS h3id FROM t1
    '''

    return con.sql(expr)







In [None]:
# Upstream public copy on Source Cooperative, kept for reference only (it was
# previously assigned to SOURCE and immediately overwritten):
#   s3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/**
# The data is read from this location (presumably a mirror -- confirm):
SOURCE = "s3://public-nwi/aws/us-west-2.opendata.source.coop/giswqs/nwi/wetlands/**"

nwi = (
    con
    .read_parquet(SOURCE, filename=True)
    .select('geometry', 'ATTRIBUTE', 'WETLAND_TYPE', 'filename')
    .rename(geom="geometry")
    # State code = the two capital letters directly before
    # "_Wetlands.parquet" in the file name. The dot is escaped so it matches
    # a literal "." rather than any character.
    .mutate(state_code=_.filename.re_extract(r"([A-Z]{2})_Wetlands\.parquet", 1))
    # Reproject from EPSG:5070 to EPSG:4326.
    .mutate(geom=_.geom.convert('EPSG:5070', 'EPSG:4326'))
    .drop('filename')
)


# Pull the first rows to confirm that the lazy pipeline above executes.
x = nwi.head().execute()

In [None]:
# --- Settings for the chunked run ---
MEMORY_LIMIT = '20GB'
CHUNK_SIZE = 10_000
OUTPUT_PATH = "s3://public-wetlands/nwi/"

# Apply the memory cap to the DuckDB connection.
con.raw_sql(f"SET memory_limit='{MEMORY_LIMIT}';")

# The chunking cells below operate on `table`.
table = nwi

# Count the rows once, then take the ceiling of total_rows / CHUNK_SIZE so a
# partial final chunk is still counted.
total_rows = table.count().execute()
num_chunks = -(-total_rows // CHUNK_SIZE)

print(f"Total rows: {total_rows:,}")
print(f"Chunk size: {CHUNK_SIZE:,}")
print(f"Number of chunks: {num_chunks}")


Total rows: 38,065,251
Chunk size: 10,000
Number of chunks: 3807


In [7]:

# Dry run: send only the first chunk through the pipeline and write the
# result to a local parquet file.
chunk_id = 0
offset = chunk_id * CHUNK_SIZE
print(f"\nProcessing chunk {chunk_id + 1}/{num_chunks} (rows {offset:,} to {min(offset + CHUNK_SIZE, total_rows):,})")

chunk = table.limit(CHUNK_SIZE, offset=offset)
cells = geom_to_cell(chunk, zoom=8)

# Explode the h3id column into one row per cell (h8), then drop the original
# column.
result = cells.mutate(h8=_.h3id.unnest()).drop('h3id')

result.to_parquet("test_10.parquet")


Processing chunk 1/3807 (rows 0 to 10,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [8]:
# Read the dry-run file back and display its first rows.
test_rows = con.read_parquet("test_10.parquet")
test_rows.head().execute()

Unnamed: 0,ATTRIBUTE,WETLAND_TYPE,geom,h8
0,E1AB3L,Estuarine and Marine Deepwater,"POLYGON ((-96.06927 29.60529, -96.06948 29.605...",884893604bfffff
1,E1AB3L,Estuarine and Marine Deepwater,"POLYGON ((-96.04739 29.64596, -96.0478 29.6462...",8848936007fffff
2,E1AB3L,Estuarine and Marine Deepwater,"POLYGON ((-96.04739 29.64596, -96.0478 29.6462...",884893603dfffff
3,E1AB3L,Estuarine and Marine Deepwater,"POLYGON ((-96.04739 29.64596, -96.0478 29.6462...",8848936001fffff
4,E1AB3L,Estuarine and Marine Deepwater,"POLYGON ((-96.04739 29.64596, -96.0478 29.6462...",8848936063fffff


In [9]:

# Process each chunk: slice CHUNK_SIZE rows, compute the H3 cells, explode to
# one row per cell and write the result as its own parquet file.
# NOTE(review): limit/offset is applied without an explicit ordering, so the
# chunks only partition the table if the engine returns rows in the same
# order on every query -- confirm, or chunk on a key such as state_code.
for chunk_id in range(num_chunks):
    offset = chunk_id * CHUNK_SIZE
    print(f"\nProcessing chunk {chunk_id + 1}/{num_chunks} (rows {offset:,} to {min(offset + CHUNK_SIZE, total_rows):,})")

    # Get chunk with row filtering
    chunk = table.limit(CHUNK_SIZE, offset=offset)

    # Drop h3id after unnesting, matching the single-chunk dry run; otherwise
    # every exploded row would still carry the polygon's complete cell list.
    result = (
        geom_to_cell(chunk, zoom=8)
        .mutate(h8=_.h3id.unnest())
        .drop('h3id')
    )

    # Write to parquet
    output_file = f"{OUTPUT_PATH}chunks/chunk_{chunk_id:04d}.parquet"
    result.to_parquet(output_file)

    print(f"  ✓ Chunk {chunk_id} written")




Processing chunk 1/3807 (rows 0 to 10,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 0 written

Processing chunk 2/3807 (rows 10,000 to 20,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 1 written

Processing chunk 3/3807 (rows 20,000 to 30,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 2 written

Processing chunk 4/3807 (rows 30,000 to 40,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 3 written

Processing chunk 5/3807 (rows 40,000 to 50,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 4 written

Processing chunk 6/3807 (rows 50,000 to 60,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 5 written

Processing chunk 7/3807 (rows 60,000 to 70,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 6 written

Processing chunk 8/3807 (rows 70,000 to 80,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 7 written

Processing chunk 9/3807 (rows 80,000 to 90,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 8 written

Processing chunk 10/3807 (rows 90,000 to 100,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 9 written

Processing chunk 11/3807 (rows 100,000 to 110,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 10 written

Processing chunk 12/3807 (rows 110,000 to 120,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 11 written

Processing chunk 13/3807 (rows 120,000 to 130,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 12 written

Processing chunk 14/3807 (rows 130,000 to 140,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 13 written

Processing chunk 15/3807 (rows 140,000 to 150,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 14 written

Processing chunk 16/3807 (rows 150,000 to 160,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 15 written

Processing chunk 17/3807 (rows 160,000 to 170,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 16 written

Processing chunk 18/3807 (rows 170,000 to 180,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 17 written

Processing chunk 19/3807 (rows 180,000 to 190,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 18 written

Processing chunk 20/3807 (rows 190,000 to 200,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 19 written

Processing chunk 21/3807 (rows 200,000 to 210,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 20 written

Processing chunk 22/3807 (rows 210,000 to 220,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 21 written

Processing chunk 23/3807 (rows 220,000 to 230,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 22 written

Processing chunk 24/3807 (rows 230,000 to 240,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 23 written

Processing chunk 25/3807 (rows 240,000 to 250,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 24 written

Processing chunk 26/3807 (rows 250,000 to 260,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 25 written

Processing chunk 27/3807 (rows 260,000 to 270,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 26 written

Processing chunk 28/3807 (rows 270,000 to 280,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 27 written

Processing chunk 29/3807 (rows 280,000 to 290,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 28 written

Processing chunk 30/3807 (rows 290,000 to 300,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 29 written

Processing chunk 31/3807 (rows 300,000 to 310,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 30 written

Processing chunk 32/3807 (rows 310,000 to 320,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 31 written

Processing chunk 33/3807 (rows 320,000 to 330,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 32 written

Processing chunk 34/3807 (rows 330,000 to 340,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 33 written

Processing chunk 35/3807 (rows 340,000 to 350,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 34 written

Processing chunk 36/3807 (rows 350,000 to 360,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 35 written

Processing chunk 37/3807 (rows 360,000 to 370,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 36 written

Processing chunk 38/3807 (rows 370,000 to 380,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 37 written

Processing chunk 39/3807 (rows 380,000 to 390,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 38 written

Processing chunk 40/3807 (rows 390,000 to 400,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 39 written

Processing chunk 41/3807 (rows 400,000 to 410,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 40 written

Processing chunk 42/3807 (rows 410,000 to 420,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 41 written

Processing chunk 43/3807 (rows 420,000 to 430,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 42 written

Processing chunk 44/3807 (rows 430,000 to 440,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 43 written

Processing chunk 45/3807 (rows 440,000 to 450,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 44 written

Processing chunk 46/3807 (rows 450,000 to 460,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 45 written

Processing chunk 47/3807 (rows 460,000 to 470,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 46 written

Processing chunk 48/3807 (rows 470,000 to 480,000)
  ✓ Chunk 47 written

Processing chunk 49/3807 (rows 480,000 to 490,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 48 written

Processing chunk 50/3807 (rows 490,000 to 500,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 49 written

Processing chunk 51/3807 (rows 500,000 to 510,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 50 written

Processing chunk 52/3807 (rows 510,000 to 520,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 51 written

Processing chunk 53/3807 (rows 520,000 to 530,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 52 written

Processing chunk 54/3807 (rows 530,000 to 540,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 53 written

Processing chunk 55/3807 (rows 540,000 to 550,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 54 written

Processing chunk 56/3807 (rows 550,000 to 560,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 55 written

Processing chunk 57/3807 (rows 560,000 to 570,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 56 written

Processing chunk 58/3807 (rows 570,000 to 580,000)
  ✓ Chunk 57 written

Processing chunk 59/3807 (rows 580,000 to 590,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 58 written

Processing chunk 60/3807 (rows 590,000 to 600,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 59 written

Processing chunk 61/3807 (rows 600,000 to 610,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 60 written

Processing chunk 62/3807 (rows 610,000 to 620,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 61 written

Processing chunk 63/3807 (rows 620,000 to 630,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 62 written

Processing chunk 64/3807 (rows 630,000 to 640,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 63 written

Processing chunk 65/3807 (rows 640,000 to 650,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 64 written

Processing chunk 66/3807 (rows 650,000 to 660,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 65 written

Processing chunk 67/3807 (rows 660,000 to 670,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 66 written

Processing chunk 68/3807 (rows 670,000 to 680,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 67 written

Processing chunk 69/3807 (rows 680,000 to 690,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 68 written

Processing chunk 70/3807 (rows 690,000 to 700,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 69 written

Processing chunk 71/3807 (rows 700,000 to 710,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 70 written

Processing chunk 72/3807 (rows 710,000 to 720,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 71 written

Processing chunk 73/3807 (rows 720,000 to 730,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 72 written

Processing chunk 74/3807 (rows 730,000 to 740,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 73 written

Processing chunk 75/3807 (rows 740,000 to 750,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 74 written

Processing chunk 76/3807 (rows 750,000 to 760,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 75 written

Processing chunk 77/3807 (rows 760,000 to 770,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 76 written

Processing chunk 78/3807 (rows 770,000 to 780,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 77 written

Processing chunk 79/3807 (rows 780,000 to 790,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 78 written

Processing chunk 80/3807 (rows 790,000 to 800,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 79 written

Processing chunk 81/3807 (rows 800,000 to 810,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 80 written

Processing chunk 82/3807 (rows 810,000 to 820,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 81 written

Processing chunk 83/3807 (rows 820,000 to 830,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 82 written

Processing chunk 84/3807 (rows 830,000 to 840,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 83 written

Processing chunk 85/3807 (rows 840,000 to 850,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 84 written

Processing chunk 86/3807 (rows 850,000 to 860,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 85 written

Processing chunk 87/3807 (rows 860,000 to 870,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 86 written

Processing chunk 88/3807 (rows 870,000 to 880,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 87 written

Processing chunk 89/3807 (rows 880,000 to 890,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 88 written

Processing chunk 90/3807 (rows 890,000 to 900,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 89 written

Processing chunk 91/3807 (rows 900,000 to 910,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 90 written

Processing chunk 92/3807 (rows 910,000 to 920,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 91 written

Processing chunk 93/3807 (rows 920,000 to 930,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 92 written

Processing chunk 94/3807 (rows 930,000 to 940,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 93 written

Processing chunk 95/3807 (rows 940,000 to 950,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 94 written

Processing chunk 96/3807 (rows 950,000 to 960,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 95 written

Processing chunk 97/3807 (rows 960,000 to 970,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 96 written

Processing chunk 98/3807 (rows 970,000 to 980,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 97 written

Processing chunk 99/3807 (rows 980,000 to 990,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 98 written

Processing chunk 100/3807 (rows 990,000 to 1,000,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 99 written

Processing chunk 101/3807 (rows 1,000,000 to 1,010,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 100 written

Processing chunk 102/3807 (rows 1,010,000 to 1,020,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 101 written

Processing chunk 103/3807 (rows 1,020,000 to 1,030,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 102 written

Processing chunk 104/3807 (rows 1,030,000 to 1,040,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 103 written

Processing chunk 105/3807 (rows 1,040,000 to 1,050,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 104 written

Processing chunk 106/3807 (rows 1,050,000 to 1,060,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 105 written

Processing chunk 107/3807 (rows 1,060,000 to 1,070,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 106 written

Processing chunk 108/3807 (rows 1,070,000 to 1,080,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 107 written

Processing chunk 109/3807 (rows 1,080,000 to 1,090,000)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ✓ Chunk 108 written

Processing chunk 110/3807 (rows 1,090,000 to 1,100,000)


OutOfMemoryException: Out of Memory Error: failed to allocate data of size 21.4 MiB (18.6 GiB/18.6 GiB used)

Possible solutions:
* Reducing the number of threads (SET threads=X)
* Disabling insertion-order preservation (SET preserve_insertion_order=false)
* Increasing the memory limit (SET memory_limit='...GB')

See also https://duckdb.org/docs/stable/guides/performance/how_to_tune_workloads

In [None]:
print("\n✅ All chunks processed!")

# Combine all chunks
print("\nCombining chunks...")
# OUTPUT_PATH already ends with "/", so paths are built by plain
# concatenation, exactly as the chunk-writing loop does. Adding another "/"
# would produce ".../nwi//chunks/...", which is a different object prefix
# from the one the chunks were written under.
combined = con.read_parquet(f'{OUTPUT_PATH}chunks/chunk_*.parquet')
combined.to_parquet(f'{OUTPUT_PATH}hex/combined_results.parquet')
print("✅ Combined file created!")

# Close the DuckDB connection now that the final file is written.
con.disconnect()