# 2000 census county data

In [25]:
import os
import io
import requests
import zipfile
import tempfile
import shutil
import geopandas as gpd
import ibis
from ibis import _
from cng.utils import *
from cng.h3 import *

# Session setup shared by the cells below.
# duckdb_install_h3 comes from cng.h3; presumably it installs DuckDB's h3
# extension so it can be loaded by the connection below — TODO confirm.
duckdb_install_h3()
# DuckDB-backed ibis connection with the "spatial" and "h3" extensions requested.
con = ibis.duckdb.connect(extensions = ["spatial", "h3"])
# Sets DuckDB's THREADS setting to 100.
# NOTE(review): 100 is hard-coded; confirm it suits the machine running this notebook.
con.raw_sql("SET THREADS=100;")
# set_secrets comes from a cng star-import; presumably it registers the S3
# credentials needed to write to s3:// paths — TODO confirm.
set_secrets(con)

# Destination bucket and key prefix; both are passed to shape_to_parquet below.
bucket = "public-census"
s3_prefix = "2000/county"

In [40]:
import geopandas as gpd
def shape_to_parquet(url, s3_prefix, bucket):
    """Download a zipped county shapefile and write it to S3 as Parquet.

    The archive at ``url`` is extracted into a temporary directory, the
    ``co99_d00.shp`` layer is read through the notebook-level DuckDB
    connection ``con``, its geometry is converted from EPSG:4269 to
    EPSG:4326, state names/abbreviations are joined on from the Census
    state reference table, and the result is written to
    ``s3://{bucket}/{s3_prefix}/co99_d00.parquet``.

    Parameters
    ----------
    url : str
        URL of the zip archive containing ``co99_d00.shp`` at its top level.
    s3_prefix : str
        Key prefix (without leading or trailing slash) inside ``bucket``.
    bucket : str
        Name of the destination S3 bucket.

    Raises
    ------
    RuntimeError
        If the download does not return HTTP 200.
    FileNotFoundError
        If the archive does not contain ``co99_d00.shp`` at its top level.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    """
    # Shapefile stem inside the archive; also reused for the output file name.
    shapefile_prefix = "co99_d00"
    shp_name = f"{shapefile_prefix}.shp"
    # Census reference table mapping state FIPS codes to names/abbreviations.
    fips_url = 'https://www2.census.gov/geo/docs/reference/codes2020/national_state2020.txt'

    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        # Stop here: the body is not the archive, so unzipping it would only
        # produce a confusing secondary error.
        raise RuntimeError(
            f"Failed to download {url} (HTTP status {response.status_code})"
        )

    zip_bytes = io.BytesIO(response.content)

    # Extract the archive into a temp directory that is removed on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_bytes) as zf:
            zf.extractall(temp_dir)

        shp_path = os.path.join(temp_dir, shp_name)
        if not os.path.exists(shp_path):
            raise FileNotFoundError(
                f"{shp_name} not found in archive downloaded from {url}"
            )

        # ibis rename takes new_name=old_name: STATE -> state,
        # STATE_NAME -> state_name. STATEFP is kept as the join key.
        fips_codes = (
            con.read_csv(fips_url)
            .rename(state='STATE', state_name='STATE_NAME')
            .drop('STATENS')
        )

        # The shapefile's STATE column is renamed to STATEFP so it can be
        # joined against the reference table.
        counties = (
            con.read_geo(shp_path)
            .rename(name='NAME', geometry='geom', STATEFP='STATE')
            .mutate(geometry=_.geometry.convert('EPSG:4269', 'EPSG:4326'))
            .join(fips_codes, 'STATEFP', how='inner')
        )

        # Write while the temp directory still exists: the ibis expression is
        # only evaluated here, so the extracted files must still be on disk.
        parquet_name = f"{shapefile_prefix}.parquet"
        parquet_path = f"s3://{bucket}/{s3_prefix}/{parquet_name}"
        counties.to_parquet(parquet_path)

In [41]:
# Census-hosted zip archive passed to shape_to_parquet, which expects it to
# contain the co99_d00 shapefile; the result is written under
# s3://{bucket}/{s3_prefix}/.
url='https://www2.census.gov/geo/tiger/PREVGENZ/co/co00shp/co99_d00_shp.zip'
shape_to_parquet(url, s3_prefix, bucket)