## setup

In [1]:
# !pip install awscli

In [2]:
import os
from getpass import getpass

# AWS-style credentials picked up by the `aws s3 sync` call in the download
# section. Values already present in the environment are kept; anything
# missing is prompted for, so no secret is ever stored in the notebook.
for _var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    if not os.environ.get(_var):
        os.environ[_var] = getpass(f'{_var}: ')
# os.environ['AWS_DEFAULT_REGION'] = 'us-west-2'


In [8]:
# --- Google Cloud settings ---
project_id = 'ee-biplov-sandbox'       # project used for the BigQuery client and tables
bucket_name = 'clay-embeddings-v1-5'   # GCS bucket that gets mounted with gcsfuse
mount_dir = 'mounted_' + bucket_name   # local mount point for that bucket

# --- NAIP batch handled by this run ---
state, year = 'wa', 2021

# --- BigQuery tables (both live in the `naip_embeddings` dataset) ---
_dataset = project_id + '.naip_embeddings'
staging_table_id = _dataset + '.raw_embeddings'
prod_table_id = _dataset + '.embeddings'


In [4]:
!mkdir -p $mount_dir

In [5]:
!gcsfuse $bucket_name $mount_dir

{"timestamp":{"seconds":1745546314,"nanos":628954733},"severity":"INFO","message":"Start gcsfuse/2.11.1 (Go version go1.24.0) for app \"\" using mount point: /home/jupyter/mounted_clay-embeddings-v1-5\n"}
{"timestamp":{"seconds":1745546314,"nanos":629006069},"severity":"INFO","message":"GCSFuse config","config":{"AppName":"","CacheDir":"","Debug":{"ExitOnInvariantViolation":false,"Fuse":false,"Gcs":false,"LogMutex":false},"EnableAtomicRenameObject":false,"EnableHns":true,"FileCache":{"CacheFileForRangeRead":false,"DownloadChunkSizeMb":50,"EnableCrc":false,"EnableODirect":false,"EnableParallelDownloads":false,"ExperimentalParallelDownloadsDefaultOn":false,"MaxParallelDownloads":64,"MaxSizeMb":-1,"ParallelDownloadsPerFile":16,"WriteBufferSize":4194304},"FileSystem":{"DirMode":"755","DisableParallelDirops":false,"FileMode":"644","FuseOptions":[],"Gid":-1,"HandleSigterm":true,"IgnoreInterrupts":true,"KernelListCacheTtlSecs":0,"PreconditionErrors":true,"RenameDirLimit":0,"TempDir":"","Uid":

## download

In [None]:
!aws s3 sync s3://clay/clay-v1-5-naip-2/{state}/{year} ./{mount_dir}/naip_{state}_{year} \
  --endpoint-url=https://data.source.coop


# gcp parquet host

## load into staging table

In [6]:
from google.cloud import bigquery

# BigQuery client bound to `project_id`; no explicit credentials are passed
# here. The load/query helper functions below all go through this client.
client = bigquery.Client(project=project_id)


In [7]:
def load_to_staging(gcs_uri: str):
    """Overwrite the staging table with the Parquet data found at `gcs_uri`.

    Submits a BigQuery load job targeting `staging_table_id` and waits for it
    to finish. The job runs with WRITE_TRUNCATE, so whatever the staging
    table held before is replaced rather than appended to.

    Args:
        gcs_uri: Source URI passed straight to `client.load_table_from_uri`.
    """
    parquet_truncate_cfg = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",
        autodetect=True,
        source_format=bigquery.SourceFormat.PARQUET,
    )
    # .result() blocks until the load job has completed (or raises on failure).
    client.load_table_from_uri(
        gcs_uri, staging_table_id, job_config=parquet_truncate_cfg
    ).result()
    print(f"Staging table {staging_table_id} loaded from {gcs_uri}")


In [9]:
def build_or_replace_prod(state: str, year: int):
    """Create the production table from the current staging table.

    Runs a CREATE OR REPLACE TABLE ... AS SELECT against `prod_table_id`,
    partitioned by `dt` and clustered by `geohash5`. Each staging row keeps
    its `geometry`, gets `embeddings.list` flattened into an array of the
    `element` values, and is tagged with constant `state` / `year` columns,
    `dt` (January 1st of `year`) and the geohash of the geometry's centroid.

    Because the statement is CREATE OR REPLACE, running this again discards
    the existing production table and rebuilds it from staging only.

    Args:
        state: Value written to the `state` column (interpolated into the SQL).
        year: Value written to the `year` column and used to build `dt`.
    """
    ctas_statement = f"""
    CREATE OR REPLACE TABLE `{prod_table_id}`
    PARTITION BY dt
    CLUSTER BY geohash5 AS
    SELECT
      geometry,
      -- flatten the nested RECORD into a true ARRAY<FLOAT64>
      ARRAY(SELECT r.element
            FROM UNNEST(embeddings.list) AS r) AS embeddings,
      "{state}" AS state,
      {year}     AS year,
      DATE({year}, 1, 1) AS dt,
      SUBSTR(ST_GEOHASH(ST_CENTROID(geometry), 5), 1, 5) AS geohash5
    FROM `{staging_table_id}`;
    """
    query_job = client.query(ctas_statement)
    query_job.result()  # wait for the statement to finish
    print(f"Production table {prod_table_id} CREATED for {state}/{year}.")


In [10]:
def insert_into_prod(state: str, year: int):
    """Append the current staging rows to the production table.

    Runs an INSERT INTO `prod_table_id` ... SELECT over `staging_table_id`,
    producing the columns geometry, embeddings (the `element` values of
    `embeddings.list` as an array), state, year, dt (January 1st of `year`)
    and geohash5.

    The statement is a plain INSERT with no de-duplication: calling it twice
    for the same staging contents adds the rows twice.

    Args:
        state: Value written to the `state` column (interpolated into the SQL).
        year: Value written to the `year` column and used to build `dt`.
    """
    insert_statement = f"""
    INSERT INTO `{prod_table_id}`
    SELECT
      geometry,
      ARRAY(SELECT r.element
            FROM UNNEST(embeddings.list) AS r) AS embeddings,
      "{state}" AS state,
      {year}     AS year,
      DATE({year},1,1) AS dt,
      SUBSTR(ST_GEOHASH(ST_CENTROID(geometry), 5),1,5) AS geohash5
    FROM `{staging_table_id}`;
    """
    query_job = client.query(insert_statement)
    query_job.result()  # wait for the statement to finish
    print(f"Inserted {state}/{year} into {prod_table_id}.")


In [11]:
# first time load: all Parquet files in the bucket's naip_{state}_{year} folder
gcs_wa21 = "gs://{}/naip_{}_{}/*.parquet".format(bucket_name, state, year)

# load to staging (replaces whatever the staging table held before)
load_to_staging(gcs_wa21)



Staging table ee-biplov-sandbox.naip_embeddings.raw_embeddings loaded from gs://clay-embeddings-v1-5/naip_wa_2021/*.parquet


In [12]:
# load to prod
# NOTE: this issues CREATE OR REPLACE TABLE, so running the cell again
# rebuilds the production table from the current staging data only.
build_or_replace_prod(state=state, year=year)


Production table ee-biplov-sandbox.naip_embeddings.embeddings CREATED for wa/2021.


In [None]:
# update later: append another state/year batch to the production table

new_state = None  # TODO: set to the state of the new batch, e.g. 'or'
new_year = None   # TODO: set to the year of the new batch, e.g. 2022

if new_state is None or new_year is None:
    # Placeholders not filled in yet: skip instead of failing, so the
    # notebook still parses and runs from top to bottom.
    print("Set new_state and new_year above to load an additional batch.")
else:
    gcs_new = f"gs://{bucket_name}/naip_{new_state}_{new_year}/*.parquet"

    # Staging is overwritten with the new batch, then appended to prod.
    load_to_staging(gcs_new)

    insert_into_prod(state=new_state, year=new_year)


In [16]:
# !pip install bigquery-jupyter-plugin google-cloud-bigquery-storage[pyarrow]

## create vector index in BQ

In [None]:
%%bigquery
-- Build an IVF vector index (cosine distance) on the `embeddings` column.
-- IF NOT EXISTS keeps the cell re-runnable once the index has been created.
CREATE VECTOR INDEX IF NOT EXISTS naip_index ON `naip_embeddings.embeddings`(embeddings)
OPTIONS(distance_type='COSINE', index_type='IVF', ivf_options='{"num_lists": 1000}');

In [None]:
%%bigquery
-- Check whether all vector indexes have been created:
-- confirm that the coverage_percentage value is 100.
SELECT * FROM naip_embeddings.INFORMATION_SCHEMA.VECTOR_INDEXES;
