# Pre-processing script

In [None]:
import ibis
from ibis import _
import geopandas as gpd
import duckdb
from cng.utils import ST_MakeValid

# DuckDB connection (no database path given) with the spatial extension loaded.
conn = ibis.duckdb.connect(extensions=["spatial"])

# CA Nature (conserved areas) source data, read over HTTPS.
ca_parquet = "https://data.source.coop/cboettig/ca30x30/ca_areas.parquet"
# To use a local copy instead, uncomment:
# ca_parquet = "../data/ca_areas.parquet"

# Directory that holds every intermediate and auxiliary data layer.
path = '../data/ca-layers/'

# CA Nature data: inputs and intermediate outputs of this notebook.
ca_boundary = "../data/ca_shape"  # CA boundary shapefile (input)
ca_boundary_parquet = path + "ca_boundary.parquet"  # boundary converted to parquet
ca_nonconserved_parquet = path + "ca_notPAD_500m_simplify.parquet"  # boundary minus conserved areas
ca_all_parquet = path + "ca-all.parquet"  # conserved + non-conserved polygons
ca_all_stats =  path + "ca-all-stats.parquet"  # ca-all plus zonal statistics
# Bare file name (no directory): later code prefixes it with `path` for the
# local copy and reuses it unchanged as the remote object name.
ca_final_parquet = "ca_30x30_stats.parquet"

# Vector data layers. No file extension here: later code appends '.parquet'.
# Trailing notes record the source CRS as given by the original author.
svi = path + 'SVI2022_US_tract'  # EPSG:4326
fire = path + 'calfire-2023'  # EPSG:4326
rxburn = path + 'calfire-rxburn-2023'  # EPSG:4326

# Raster data layers. No file extension here: later code appends '.tif'.
irrecoverable_c = path + 'ca_irrecoverable_c_2018_cog'  # EPSG:3857
manageable_c = path + 'ca_manageable_c_2018_cog'  # EPSG:3857
richness = path + 'SpeciesRichness_All'  # EPSG:3857
rsr = path + 'RSR_All'  # EPSG:3857


# Step 1: Computing all "non-conserved" areas

#### Convert CA Boundary file to parquet 

In [None]:
# Read the CA boundary shapefile, reproject it to EPSG:3310, and store the
# result as a parquet file for the later steps.
ca_all = gpd.read_file(ca_boundary)
ca_all = ca_all.to_crs(epsg=3310)
ca_all.to_parquet(ca_boundary_parquet)

#### Computing difference: CA Boundary - Conserved Areas = Non-conserved areas
(This chunk will take ~2 hours to run)

In [None]:
# Computing difference: CA Boundary - Conserved Areas = Non-conserved areas.
# This chunk will take ~2 hours to run.
con = ibis.duckdb.connect("tmp", extensions=["spatial"])  # save to disk

# CA boundary: rename the geometry column to `geom` and type it as geometry.
ca_all_tbl = (
    con.read_parquet(ca_boundary_parquet)
    .rename(geom="geometry")
    .cast({"geom": "geometry"})
)

# CA Nature data / protected areas.
tbl = (
    con.read_parquet(ca_parquet)
    .cast({"SHAPE": "geometry"})
    .rename(geom="SHAPE", gid="OBJECTID")
)

# Materialize both inputs as tables so the raw SQL below can reference them.
con.create_table("t1", ca_all_tbl, overwrite=True)
con.create_table("t2", tbl.filter(_.Release_Year == 2024), overwrite=True)

# The protected-area geometries (t2) are simplified with a tolerance of 500
# before being unioned, so the kernel doesn't crash; the boundary (t1) is used
# as-is. The difference t1 - union(t2) is the non-conserved area.
#
# CREATE OR REPLACE keeps this cell re-runnable: the database is persisted, so
# a plain CREATE TABLE fails on a second run because `not_in_pad` already
# exists (t1/t2 above are likewise created with overwrite=True).
con.con.execute('''
CREATE OR REPLACE TABLE not_in_pad AS
WITH t2_simplified AS (
    SELECT ST_Simplify(geom, 500) AS geom
    FROM t2
),
t2_union AS (
    SELECT ST_Union_Agg(geom) AS geom
    FROM t2_simplified
)
SELECT 
    ST_Difference(t1.geom, t2_union.geom) AS geom
FROM 
    t1, t2_union;
''')


# Save the result to a parquet file.
ca = con.table("not_in_pad")
ca.execute().to_parquet(ca_nonconserved_parquet)

#### Non-conserved areas need to match CA Nature schema

In [None]:
# Give the non-conserved polygons the same columns and types as the CA Nature
# table so the two can be combined later.
non_conserved = conn.read_parquet(ca_nonconserved_parquet).cast({"geom": "geometry"})

# Placeholder values: attributes that only apply to conserved areas are null.
non_conserved = non_conserved.mutate(
    established=ibis.null(),
    gap_code=0,
    name=ibis.literal("Non-Conserved Areas"),
    access_type=ibis.null(),
    manager=ibis.null(),
    manager_type=ibis.null(),
    ecoregion=ibis.null(),
    easement=ibis.null(),
    id=0,
    type=ibis.literal("Land"),
    status=ibis.literal("non-conserved"),
    acres=_.geom.area() / 4046.8564224,  # convert sq meters to acres
)

# Cast every column to the type used by the CA Nature schema.
non_conserved = non_conserved.cast(
    {
        "established": "string",
        "gap_code": "int16",
        "status": "string",
        "name": "string",
        "access_type": "string",
        "manager": "string",
        "manager_type": "string",
        "ecoregion": "string",
        "easement": "string",
        "id": "int64",
        "type": "string",
        "acres": "float32",
    }
)

# Step 2: Isolate pre-2024 from 2024 polygons

In [None]:
# The 2023 polygons are shrunk with a negative buffer (-30, i.e. 30 m inward)
# so that polygons which only overlap along their boundaries are not treated
# as intersecting.
buffer = -30

# CA Nature polygons, restricted to GAP 1 and 2 (only those count towards 30x30).
tbl = conn.read_parquet(ca_parquet)
tbl = tbl.cast({"SHAPE": "geometry"}).rename(geom="SHAPE")
tbl = tbl.filter(_.reGAP < 3)

# Polygons with release_year 2024 are a superset of release_year 2023.
tbl_2024 = tbl.filter(_.Release_Year == 2024)
tbl_2023 = tbl.filter(_.Release_Year == 2023).mutate(geom=_.geom.buffer(buffer))

# The anti-join keeps the 2024 polygons that intersect NO (shrunken) 2023
# polygon, i.e. the newly established ones. Despite its name, `intersects`
# therefore holds the non-intersecting rows.
intersects = tbl_2024.anti_join(tbl_2023, _.geom.intersects(tbl_2023.geom))

# Step 3: Join all protected land data into single parquet file 

In [None]:
%%time
# IDs of the newly established polygons (the `intersects` table from Step 2),
# tagged with established = "2024"; used as the right side of the join below.
new2024 = intersects.select("OBJECTID").mutate(established = ibis.literal("2024")) # saving IDs to join on

# Build the combined table: 2024 CA Nature polygons, renamed and recoded to the
# output schema, followed by the non-conserved polygons.
ca = (conn
      .read_parquet(ca_parquet)
      .cast({"SHAPE": "geometry"})
      # NOTE(review): `area` is not among the columns selected below, so it
      # does not reach the output.
      .mutate(area = _.SHAPE.area())
      .filter(_.Release_Year == 2024) # having both 2023 and 2024 is redundant since 2024 is the superset.
      .left_join(new2024, "OBJECTID") # newly established 2024 polygons 
      # Rows without a match in new2024 have a null `established`: label them.
      .mutate(established=_.established.fill_null("pre-2024")) 
      # Rename source columns to the output schema names.
      .rename(name = "cpad_PARK_NAME", access_type = "cpad_ACCESS_TYP", manager = "cpad_MNG_AGENCY",
              manager_type = "cpad_MNG_AG_LEV", id = "OBJECTID", type = "TYPE", 
              ecoregion = "CA_Ecoregion_Name", acres = "Acres", gap_code = "reGAP", geom = "SHAPE")
      .cast({"gap_code": "int16"})
      .cast({"id": "int64"})
      # Replace empty strings with explicit "Unknown" labels.
      .mutate(manager = _.manager.substitute({"": "Unknown"})) 
      .mutate(manager_type = _.manager_type.substitute({"": "Unknown"}))
      .mutate(access_type = _.access_type.substitute({"": "Unknown Access"}))
      .mutate(name = _.name.substitute({"": "Unknown"}))
      # Abbreviate the long manager type label.
      .mutate(manager_type = _.manager_type.substitute({"Home Owners Association": "HOA"}))
      # Easement flag as the strings "False"/"True".
      .mutate(easement=_.Easement.cast("string").substitute({"0": "False", "1": "True"}))
      # GAP codes 1-2 count as 30x30-conserved, 3-4 as other-conserved.
      .mutate(status=_.gap_code.cast("string")
              .substitute({"1": "30x30-conserved", "2": "30x30-conserved", "3": "other-conserved", 
                           "4": "other-conserved"}))
      # Keep only the output schema columns before appending non_conserved.
      .select(_.established, _.gap_code, _.status, _.name, _.access_type, _.manager, _.manager_type,
              _.ecoregion, _.easement, _.acres, _.id, _.type, _.geom)
      .union(non_conserved)
      .mutate(acres = _.acres.round(4))
      # Repair geometries, then drop rows whose geometry is null.
      .mutate(geom = ST_MakeValid(_.geom))
      .drop_null(['geom'],how = "any")
     )


# Execute the query, tag the result with EPSG:3310, and save it.
ca2024 = ca.execute()
ca2024 = ca2024.set_crs("epsg:3310")
ca2024.to_parquet(ca_all_parquet)

# Step 4: Compute zonal stats

#### Functions: Reproject and compute overlap for vector data 

In [None]:
# Rebind `con` to the database "tmp2" with the spatial extension loaded;
# vector_vector_stats() below reads its inputs through this module-level name.
con = ibis.duckdb.connect("tmp2", extensions=["spatial"])

def reproject_vectors(file, gdf_temp):
    """Write a repaired copy of a vector layer, reprojected to EPSG:3310.

    The geometry column is renamed to ``geom``, geometries are passed through
    ``make_valid()``, and the layer is reprojected to match the CA Nature
    data before being saved.

    Args:
        file: Path prefix of the layer; the output is written to
            ``file + '-epsg3310.parquet'``.
        gdf_temp: The layer to reproject (not modified; a renamed copy is used).

    Returns:
        None. The result is only written to disk.
    """
    layer = gdf_temp.rename_geometry('geom')
    layer["geom"] = layer["geom"].make_valid()
    layer.to_crs("EPSG:3310").to_parquet(file + '-epsg3310.parquet')

def vector_vector_stats(base, data_layer):
    """Compute, per base polygon, the fraction of its area covered by a layer.

    Both inputs are read through the module-level ``con`` connection. Each
    base polygon is left-joined to the layer polygons it intersects; the
    intersection areas are divided by the base polygon's area and summed.
    Base polygons with no intersecting layer polygon get 0.

    NOTE(review): intersection areas are summed per matching pair, so if
    polygons in ``data_layer`` overlap one another the fraction can exceed 1.

    Args:
        base: Path to the parquet file of base polygons; must provide ``id``
            and ``geom`` columns.
        data_layer: Path to the parquet file of the overlay layer; must
            provide a ``geom`` column.

    Returns:
        The executed result restricted to the columns ``id`` and
        ``overlap_fraction`` (rounded to 3 decimals).
    """
    t1 = con.read_parquet(base).select(_.id, _.geom)
    t2 = con.read_parquet(data_layer).select(_.geom)

    # Share of the base polygon covered by one matching layer polygon.
    pair_fraction = t1.geom.intersection(t2.geom).area() / t1.geom.area()

    expr = (
        t1.left_join(t2, t1.geom.intersects(t2.geom))
        .group_by(t1.id, t1.geom)
        # Unmatched rows sum to NULL, which coalesce() turns into 0.
        .agg(overlap_fraction=pair_fraction.sum().coalesce(0).round(3))
    )
    gdf = expr.execute()
    return gdf[['id', 'overlap_fraction']]
    

#### Compute zonal stats with vector data 

In [None]:
%%time
vectors = [svi, fire, rxburn]
names = ['svi', 'fire', 'rxburn']

# CA Nature data, indexed by the column we join on (gpd.join() joins on index).
gdf = gpd.read_parquet(ca_all_parquet).set_index('id')

# For each layer: clean, reproject, compute zonal stats, attach as a new column.
for file, name in zip(vectors, names):
    layer = gpd.read_parquet(file + '.parquet')  # load the vector data layer

    if name == 'svi':
        # SVI is nationwide; keep only the California rows.
        layer = layer[layer['STATE'] == "California"]
    elif name in ('fire', 'rxburn'):
        # Fire layers: keep only the 10-year range starting in 2013.
        layer = layer[layer['YEAR_'] >= 2013]

    # Match the projection of the CA Nature data, then measure the overlap.
    reproject_vectors(file, layer)
    reprojected = file + '-epsg3310.parquet'
    zonal = vector_vector_stats(ca_all_parquet, reprojected)

    # Name the stats column after the layer and join it on `id`.
    zonal = zonal.rename(columns={'overlap_fraction': name}).set_index('id')
    gdf = gdf.join(zonal)

# Save CA Nature + zonal stats.
gdf.to_parquet(ca_all_stats)

#### Function: Reproject raster data

In [None]:
import subprocess

def raster_reprojection(input_file, output_file, epsg="EPSG:3310"):
    """Reproject a raster by running the ``gdalwarp`` command-line tool.

    Failures are reported with a printed message rather than an exception.
    This now includes the case where ``gdalwarp`` cannot be launched at all
    (e.g. it is not installed), which previously raised an unhandled
    ``FileNotFoundError``.

    Args:
        input_file: Path of the raster to reproject.
        output_file: Path the reprojected raster is written to.
        epsg: Target spatial reference passed to ``-t_srs``.

    Returns:
        True if gdalwarp ran and exited successfully, False otherwise.
    """
    cmd = [
        "gdalwarp",
        "-t_srs", epsg,
        input_file,
        output_file,
    ]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: gdalwarp exited non-zero.
        # OSError: the executable could not be started.
        print(f"Error occurred during reprojection: {e}")
        return False
    print(f"Reprojection successful! Output saved to: {output_file}")
    return True

#### Compute zonal stats with raster data

In [None]:
%%time
import rasterio
from exactextract import exact_extract

rasters = [irrecoverable_c, manageable_c, richness, rsr]
names = ['irrecoverable_carbon', 'manageable_carbon', 'richness', 'rsr']

# Start from the output of the vector zonal stats step.
gdf = gpd.read_parquet(ca_all_stats)

# Prepare the file for exact_extract() (constraints as noted by the original
# author): the key is called "ca_id" instead of "id" to avoid a name clash
# with a field in the raster data, and it is a regular column, not an index.
gdf.index.names = ['ca_id']
gdf = gdf.reset_index()
gdf.to_parquet(ca_all_stats)  # exact_extract() reads the polygons from this file

for file, name in zip(rasters, names):
    # Reproject the raster to match the CA Nature data, then take the mean
    # raster value inside each polygon.
    warped = file + '_epsg3310.tif'
    raster_reprojection(file + '.tif', warped)
    temp = exact_extract(warped, ca_all_stats, ["mean"], include_cols=["ca_id"], output='pandas')

    # The carbon rasters have multiple bands, so their result column is
    # 'band_1_mean'; the single-band rasters report plain 'mean'.
    if name in ('irrecoverable_carbon', 'manageable_carbon'):
        mean_col = 'band_1_mean'
    else:
        mean_col = 'mean'

    temp = temp[['ca_id', mean_col]].rename(columns={mean_col: name})
    temp[name] = temp[name].round(3)  # rounding stats

    # gpd.join() joins on the index, so index both sides by "ca_id" and then
    # turn it back into a column for the next iteration.
    gdf = gdf.set_index("ca_id").join(temp.set_index("ca_id")).reset_index()

# Finished with exact_extract(): restore the "id" column name and save.
gdf = gdf.rename(columns={'ca_id': 'id'})
gdf.to_parquet(ca_all_stats)

# Step 5: Upload file

In [None]:
from cng.utils import set_secrets, hf_upload, s3_cp 
conn = ibis.duckdb.connect(extensions=["spatial"])
set_secrets(conn)

# PMTiles needs EPSG:4326, so convert the geometries from EPSG:3310.
ca_all = conn.read_parquet(ca_all_stats)
ca_all = ca_all.mutate(geom=_.geom.convert("epsg:3310", "epsg:4326"))

# Run the query, tag the result with its CRS, and write the final file.
ca_all = ca_all.execute().set_crs("epsg:4326")
ca_all.to_parquet(path + ca_final_parquet)

# Upload to HF and minio.
hf_upload(ca_final_parquet, path + ca_final_parquet)
s3_cp(path + ca_final_parquet, "s3://public-ca30x30/" + ca_final_parquet, "minio")


### PMTiles for app display

In [None]:
from cng.utils import to_geojson, to_pmtiles
# Local paths of the two derived files.
geojson_file = path + 'ca_30x30_stats.geojson'
pmtiles_file = path + 'ca_30x30_stats.pmtiles'

# Final parquet -> GeoJSON -> PMTiles.
to_geojson(path + ca_final_parquet, geojson_file)
pmtiles = to_pmtiles(geojson_file, pmtiles_file)

# Upload the tiles to HF and minio.
hf_upload("ca_30x30_stats.pmtiles", pmtiles_file)
s3_cp(pmtiles_file, "s3://public-ca30x30/ca_30x30_stats.pmtiles", "minio")
