# Pre-processing

### Setting up

In [None]:
import ibis
from ibis import _
import geopandas as gpd
import duckdb

# --- Input data -------------------------------------------------------------
# CA Nature protected-areas dataset (remote copy).
ca_parquet = "https://data.source.coop/cboettig/ca30x30/ca_areas.parquet"
# or use local copy:
# ca_parquet = "../data/ca_areas.parquet" # CA Nature Data 

# California boundary shapefile.
ca_boundary = "../data/ca_shape"

# --- Intermediate outputs written by the cells below -------------------------
ca_boundary_parquet = "../data/ca_boundary.parquet"
ca_nonconserved_parquet = "../data/ca_notPAD_500m_simplify.parquet"
ca_nonconserved_clean_parquet = "../data/ca_notPAD_500m_simplify_clean.parquet"

# --- Database connection ------------------------------------------------------
# DuckDB connection (database "tmp") with the spatial extension loaded; used by
# every cell except the long-running difference computation, which opens its own.
conn = ibis.duckdb.connect("tmp", extensions=["spatial"])

### Computing all "non-conserved" areas

In [None]:
# Read the CA boundary shapefile, reproject it to EPSG:3310, and store the
# result as a parquet file for the difference computation below.
ca_boundary_gdf = gpd.read_file(ca_boundary)
ca_all = ca_boundary_gdf.to_crs(epsg=3310)
ca_all.to_parquet(ca_boundary_parquet)

In [None]:
# Computing difference: CA Boundary - Conserved Areas = Non-conserved areas
# This chunk will take ~2 hours to run 
con = ibis.duckdb.connect("tmp10", extensions=["spatial"]) #save to disk

# CA Boundary 
ca_all_tbl = (
    con.read_parquet(ca_boundary_parquet)
    .rename(geom = "geometry")
    .cast({"geom": "geometry"})
)


# CA-Nature data / protected areas 
tbl = (
    con.read_parquet(ca_parquet)
    .cast({"SHAPE": "geometry"})
    .rename(geom = "SHAPE", gid = "OBJECTID")
)

# Materialise both inputs as tables; overwrite so the cell can be re-run.
con.create_table("t1", ca_all_tbl, overwrite = True)
con.create_table("t2", tbl.filter(_.Release_Year == 2024), overwrite = True)

# The protected-area geometries (t2) are simplified with a tolerance of 500
# before being unioned so the kernel doesn't crash; the boundary (t1) is used
# as-is. The difference of the boundary and that union is the non-conserved area.
# CREATE OR REPLACE keeps the cell idempotent: the database is persisted on
# disk, so a plain CREATE TABLE fails on any re-run because the table exists.
con.con.execute('''
CREATE OR REPLACE TABLE not_in_pad AS
WITH t2_simplified AS (
    SELECT ST_Simplify(geom, 500) AS geom
    FROM t2
),
t2_union AS (
    SELECT ST_Union_Agg(geom) AS geom
    FROM t2_simplified
)
SELECT 
    ST_Difference(t1.geom, t2_union.geom) AS geom
FROM 
    t1, t2_union;
''')


# save to parquet file 
ca = con.table("not_in_pad")
ca.execute().to_parquet(ca_nonconserved_parquet)

In [None]:
# Give the non-conserved polygons the same columns as the CA Nature table.
# Insertion order of this dict fixes the order of the added columns.
filler_columns = dict(
    established=ibis.null(),
    gap_code=ibis.null(),
    name=ibis.literal("Non-Conserved Areas"),
    access_type=ibis.null(),
    manager=ibis.null(),
    manager_type=ibis.null(),
    ecoregion=ibis.null(),
    easement=ibis.null(),
    id=ibis.null(),
    type=ibis.null(),
    status=ibis.literal("non-conserved"),
    acres=_.geom.area() / 4046.8564224,  # convert sq meters to acres
)

# Column dtypes to cast to so the schema matches CA Nature.
ca_nature_dtypes = {
    "established": "string",
    "gap_code": "int16",
    "status": "string",
    "name": "string",
    "access_type": "string",
    "manager": "string",
    "manager_type": "string",
    "ecoregion": "string",
    "easement": "string",
    "id": "int64",
    "type": "string",
    "acres": "float32",
}

non_conserved_raw = conn.read_parquet(ca_nonconserved_parquet).cast({"geom": "geometry"})
non_conserved = non_conserved_raw.mutate(**filler_columns).cast(ca_nature_dtypes)

# Persist the cleaned table.
non_conserved.execute().to_parquet(ca_nonconserved_clean_parquet)

### Separating pre-2024 and 2024 protected areas

In [None]:
# Shrink the 2023 polygons slightly (negative buffer) so that polygons which
# merely share or slightly overlap a boundary are not treated as intersecting.
buffer = -30 #30m buffer 

protected_areas = conn.read_parquet(ca_parquet).cast({"SHAPE": "geometry"})
# only gap 1 and 2 count towards 30x30
tbl = protected_areas.rename(geom = "SHAPE").filter(_.reGAP < 3)

# Release_Year 2024 polygons are a superset of Release_Year 2023, so the newly
# established ones are the 2024 polygons with no matching 2023 polygon.
tbl_2024 = tbl.filter(_.Release_Year == 2024)
tbl_2023 = tbl.filter(_.Release_Year == 2023).mutate(geom=_.geom.buffer(buffer))

# NOTE: despite its name, `intersects` holds the 2024 rows that do NOT
# intersect any (shrunken) 2023 polygon -- that is what the anti-join keeps.
overlaps_2023 = _.geom.intersects(tbl_2023.geom)
intersects = tbl_2024.anti_join(tbl_2023, overlaps_2023)

### Merging data into a single parquet file 

In [None]:
%%time
new2024 = intersects.select("OBJECTID").mutate(established = ibis.literal("2024")) # saving IDs to join on

ca = (conn
      .read_parquet(ca_parquet)
      .cast({"SHAPE": "geometry"})
      .mutate(area = _.SHAPE.area())
      .filter(_.Release_Year == 2024) # having both 2023 and 2024 is redudant since 2024 is the superset.
      .left_join(new2024, "OBJECTID") # newly established 2024 polygons 
      .mutate(established=_.established.fill_null("pre-2024")) 
      .mutate(geom = _.SHAPE.convert("epsg:3310","epsg:4326"))
      .rename(name = "cpad_PARK_NAME", access_type = "cpad_ACCESS_TYP", manager = "cpad_MNG_AGENCY",
              manager_type = "cpad_MNG_AG_LEV", id = "OBJECTID", type = "TYPE", 
              ecoregion = "CA_Ecoregion_Name", acres = "Acres", gap_code = "reGAP")
      .mutate(manager = _.manager.substitute({"": "Unknown"})) 
      .mutate(manager_type = _.manager_type.substitute({"": "Unknown"}))
      .mutate(access_type = _.access_type.substitute({"": "Unknown Access"}))
      .mutate(name = _.name.substitute({"": "Unknown"}))
      .mutate(manager_type = _.manager_type.substitute({"Home Owners Association": "HOA"}))
      .mutate(easement=_.Easement.cast("string").substitute({"0": "False", "1": "True"}))
      .mutate(status=_.gap_code.cast("string")
              .substitute({"1": "30x30-conserved", "2": "30x30-conserved", "3": "other-conserved", 
                           "4": "other-conserved"}))
      .select(_.established, _.gap_code, _.status, _.name, _.access_type, _.manager, _.manager_type,
              _.ecoregion, _.easement, _.acres, _.id, _.type, _.geom)
      .union(non_conserved)
      .mutate(acres = _.acres.round(4))
     )


ca2024 = ca.execute()
ca2024.to_parquet("../data/ca-all.parquet")

### Upload file to Hugging Face dataset repo

In [None]:
from huggingface_hub import HfApi, login
import streamlit as st
# Authenticate with the Hugging Face Hub using the token kept in Streamlit
# secrets, then create the API client used by the upload helper.
login(token=st.secrets["HF_TOKEN"])
api = HfApi()

def hf_upload(file, path):
    """Upload a local file to the ``boettiger-lab/ca-30x30`` dataset repo.

    Args:
        file: Destination path of the file inside the repository
            (forwarded as ``path_in_repo``).
        path: Local path of the file to upload
            (forwarded as ``path_or_fileobj``).

    Returns:
        The value returned by ``api.upload_file``, so the caller can inspect
        the outcome of the upload instead of it being silently discarded.
    """
    return api.upload_file(
        path_or_fileobj=path,
        path_in_repo=file,
        repo_id="boettiger-lab/ca-30x30",
        repo_type="dataset",
    )
    
# Publish the merged dataset: `file` is the name in the repo, `path` the local copy.
hf_upload(file="ca-all.parquet", path="../data/ca-all.parquet")