# Preprocessing CBN Data

In [None]:
from cng.utils import set_secrets, s3_client, to_pmtiles
s3 = s3_client()

import zipfile
import os
import subprocess
os.chdir('../data/')

import geopandas as gpd
import ibis
from ibis import _
con = ibis.duckdb.connect(extensions=["spatial"])

import rasterio
import numpy as np

#### Helper functions

In [None]:
def info(folder, file, bucket = "public-ca30x30", base_folder = 'CBN-data/'):
    """Return the ``(bucket, object_key)`` pair that addresses a file.

    The object key is ``base_folder``, ``folder`` and ``file`` joined with
    ``os.path.join``; ``bucket`` is passed through unchanged.
    """
    object_key = os.path.join(base_folder, folder, file)
    return (bucket, object_key)
    
def download(folder, file, file_name = None):
    """Fetch ``file`` from the bucket folder ``folder`` to a local path.

    The object is written to ``file_name``; when ``file_name`` is missing or
    falsy, the remote name ``file`` is reused as the local path.
    """
    local_path = file_name or file
    bucket, object_key = info(folder, file)
    s3.fget_object(bucket, object_key, local_path)

def upload(folder, file):
    """Send the local file ``file`` to the bucket folder ``folder``.

    The same name is used for the local path and for the remote object.
    """
    target_bucket, object_key = info(folder, file)
    s3.fput_object(target_bucket, object_key, file)

def unzip(folder, file):
    """Download a zip archive from the bucket and extract every member.

    ``extractall`` is called without a target, so members land in the
    current working directory.
    """
    download(folder, file)
    archive = zipfile.ZipFile(file, 'r')
    with archive:
        archive.extractall()

def upload_parquet(folder, file, gdf):
    """Write ``gdf`` to a parquet file and upload it to ``folder``.

    The parquet name is ``file`` with its extension swapped for ``.parquet``.
    """
    stem = os.path.splitext(file)[0]
    parquet_file = stem + '.parquet'
    gdf.to_parquet(parquet_file)
    upload(folder, parquet_file)

def process_vector(folder, file, file_name = None, gdf = None, crs="EPSG:3310"):
    """Reproject a vector layer and publish it as parquet.

    The layer is read from ``file`` unless a ready-made ``gdf`` is supplied.
    It is reprojected to ``crs``, its geometry column is renamed to ``geom``,
    and the result is uploaded to ``folder``. The uploaded name is derived
    from ``file_name`` when given, otherwise from ``file``.
    """
    layer = gpd.read_file(file) if gdf is None else gdf
    layer = layer.to_crs(crs).rename_geometry('geom')
    output_name = file_name if file_name else file
    upload_parquet(folder, output_name, layer)

def reproject_raster(input_file, crs="EPSG:3310"):
    """Warp ``input_file`` to ``crs`` by shelling out to ``gdalwarp``.

    The output path is the input path with ``_processed`` inserted before the
    extension. A non-zero exit from ``gdalwarp`` is printed, not raised, so
    the output path is returned whether or not the file was produced.
    """
    stem, extension = os.path.splitext(input_file)
    output_file = stem + '_processed' + extension
    warp_args = ["gdalwarp", "-t_srs", crs, input_file, output_file]
    try:
        subprocess.run(warp_args, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred during reprojection: {e}")
    else:
        print("Reprojection successful!")
    return output_file

def make_cog(input_file, crs="EPSG:4326"):
    """Warp ``input_file`` to ``crs`` and write it with the COG driver.

    The output path is the input path with ``_COG`` inserted before the
    extension. A non-zero exit from ``gdalwarp`` is printed, not raised, so
    the output path is returned whether or not the file was produced.
    """
    stem, extension = os.path.splitext(input_file)
    output_file = stem + '_COG' + extension
    warp_args = ["gdalwarp", "-t_srs", crs, "-of", "COG", input_file, output_file]
    try:
        subprocess.run(warp_args, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred during processing: {e}")
    else:
        print("Successful!")
    return output_file

def process_raster(folder, file, file_name = None):
    """Reproject a local raster, derive a COG from it, and upload both.

    Args:
        folder: Bucket sub-folder that receives the outputs.
        file: Local path of the source raster.
        file_name: Optional name to process the raster under; the output
            names are derived from it. If no local file with that name
            exists yet but ``file`` does, ``file`` is copied to
            ``file_name`` first so the renamed input can actually be read.

    Returns:
        None. The reprojected raster and its COG are uploaded to ``folder``.
    """
    import shutil  # local import: only needed for the copy-on-rename case

    source = file
    if file_name:
        # Previously the input path was simply replaced by file_name, which
        # pointed gdalwarp at a file that had never been created when the
        # raster was downloaded under its original name.
        if not os.path.exists(file_name) and os.path.exists(file):
            shutil.copyfile(file, file_name)
        source = file_name

    output_file = reproject_raster(source)
    upload(folder, output_file)
    output_cog_file = make_cog(output_file)
    upload(folder, output_cog_file)
    return

def filter_raster(folder, file, percentile):
    """Keep only cells at or above a percentile, then process and upload.

    Band 1 of ``file`` is read, the ``percentile``-th percentile of its
    non-nodata cells is computed, and every cell below that threshold is
    replaced by the nodata value. The result is written as float64 to
    ``<name>_<percentile>percentile<ext>`` and handed to ``process_raster``.

    Args:
        folder: Bucket sub-folder that receives the processed outputs.
        file: Local path of the input raster.
        percentile: Percentile in the 0-100 range used by ``np.percentile``.

    Note:
        Assumes the raster declares a nodata value; behaviour with
        ``nodata`` unset has not been checked.
    """
    with rasterio.open(file) as src:
        data = src.read(1)  # Read the first band
        profile = src.profile
        nodata = src.nodata  # capture while the dataset is still open

    # Exclude nodata cells so they do not skew the percentile.
    valid_values = np.ma.masked_equal(data, nodata).compressed()

    # Cells below the threshold become nodata.
    threshold = np.percentile(valid_values, percentile)
    filtered = np.where(data >= threshold, data, nodata)

    name, ext = os.path.splitext(file)
    new_file = f"{name}_{percentile}percentile{ext}"

    profile.update(dtype=rasterio.float64)
    with rasterio.open(new_file, "w", **profile) as dst:
        # Cast explicitly so the array matches the dtype declared in the profile.
        dst.write(filtered.astype(rasterio.float64), 1)

    # Publish the filtered raster; the unfiltered input was being processed before.
    process_raster(folder, new_file)
    return

def convert_pmtiles(folder, parquet_file):
    """Turn a local parquet layer into PMTiles and upload the tiles.

    The parquet file is read through the DuckDB connection, tagged as
    EPSG:3310, reprojected to EPSG:4326 and written to GeoJSON. That GeoJSON
    is then converted with ``to_pmtiles`` and the ``.pmtiles`` file is
    uploaded to ``folder``.
    """
    stem = os.path.splitext(parquet_file)[0]
    geojson_file = stem + '.geojson'
    pmtiles_file = stem + '.pmtiles'

    # NOTE(review): set_crs assumes the parquet was written in EPSG:3310 - confirm for new inputs.
    layer = con.read_parquet(parquet_file).execute()
    layer.set_crs('epsg:3310').to_crs('epsg:4326').to_file(geojson_file)

    to_pmtiles(geojson_file, pmtiles_file, options = ['--extend-zooms-if-still-dropping'])
    upload(folder, pmtiles_file)

## Counties **

In [None]:
# Extract the county archive, then publish the shapefile as parquet.
counties_folder = 'Counties'
unzip(counties_folder, '30x30_Counties.zip')
process_vector(counties_folder, 'CA_counties.shp')

## Climate Zones **

In [None]:
# Save the download under the published name: process_raster reads its input
# from `file_name` when one is given, so the raster must exist locally under
# that name (it was previously saved as clusters_10.tif and never found).
download(folder = 'Climate_zones', file = 'clusters_10.tif', file_name = 'climate_zones_10.tif')
process_raster(folder = 'Climate_zones', file = 'clusters_10.tif', file_name =  'climate_zones_10.tif')

## Ecoregions **

In [None]:
# Extract the ecoregion archive, then publish the shapefile as parquet.
ecoregion_folder = 'Ecoregion'
unzip(ecoregion_folder, '30x30_Ecoregions.zip')
process_vector(ecoregion_folder, 'ACE_ecoregions.shp')

## Habitat

#### 13 class major habitat types **

In [None]:
# download(folder = 'Habitat', file = 'CWHR13_2022.tif')
# process_raster(folder = 'Habitat', file = 'CWHR13_2022.tif')

In [None]:
habitat_folder = 'Habitat'
unzip(habitat_folder, 'fveg221gdb.zip')

# Convert the extracted geodatabase to GeoTIFF so the shared raster helpers can take over.
command = ["gdalwarp", "-of", "GTiff", 'fveg22_1.gdb', 'fveg22_1.tif']
subprocess.run(command, check=True)

process_raster(habitat_folder, 'fveg22_1.tif')
# Also publish the .aux.xml sidecar that sits next to the reprojected raster.
upload(habitat_folder, 'fveg22_1_processed.tif.aux.xml')


#### 60+ class habitat types

## ACE Biodiversity

In [None]:
# Fetch the full ACE summary layer under a shorter local name.
ace_geojson = 'ACE_biodiversity_all_ds2739.geojson'
ace_pmtiles = 'ACE_biodiversity_ds2739_all.pmtiles'
download('ACE_biodiversity',
         'Terrestrial_Biodiversity_Summary_-_ACE_[ds2739].geojson',
         file_name = ace_geojson)

# Keep the layer in `gdf`; the ACE cells below filter it.
gdf = gpd.read_file(ace_geojson)

# Publish the unfiltered layer as PMTiles.
to_pmtiles(ace_geojson, ace_pmtiles, options = ['--extend-zooms-if-still-dropping'])
upload('ACE_biodiversity', ace_pmtiles)

#### ACE BioRank and Rare Rank 

In [None]:
# Publish one layer per rank column, keeping only hexagons whose rank is 5.
ACE_rank_files = ['ACE_biorank_statewide','ACE_biorank_ecoregion',
                  'ACE_rarerank_statewide','ACE_rarerank_ecoregion']

ACE_rank_cols =  ['BioRankSW','BioRankEco','RarRankSW','RarRankEco']

for col, name in zip(ACE_rank_cols, ACE_rank_files):
    # Shared attribute columns plus the rank column for this layer.
    cols = ['OBJECTID', 'Hex_ID', 'Eco_Sect', 'Eco_Name',
            'County', 'Shape__Area', 'Shape__Length', 'geometry', col]
    is_top_rank = gdf[col] == 5
    rank_df = gdf[is_top_rank][cols]

    rank_folder = 'ACE_biodiversity/' + name
    rank_parquet = name + '.parquet'
    process_vector(folder = rank_folder, file = rank_parquet, gdf = rank_df)
    convert_pmtiles(folder = rank_folder, parquet_file = rank_parquet)


#### Other ACE Biodiversity **

In [None]:
ACE_files = ['ACE_amphibian_richness','ACE_reptile_richness',
            'ACE_bird_richness','ACE_mammal_richness',
            'ACE_rare_amphibian_richness','ACE_rare_reptile_richness',
            'ACE_rare_bird_richness','ACE_rare_mammal_richness',
            'ACE_endemic_amphibian_richness','ACE_endemic_reptile_richness',
            'ACE_endemic_bird_richness','ACE_endemic_mammal_richness']

ACE_cols = ['NtvAmph','NtvRept','NtvBird','NtvMamm','RarAmph','RarRept','RarBird','RarMamm',
            'AmphEndem','ReptEndem','BirdEndem','MammEndem']

# Native richness columns are cut at the 80th percentile, all others at the 95th.
native_cols = {'NtvAmph','NtvRept','NtvBird','NtvMamm'}

for col,name in zip(ACE_cols,ACE_files):
    # Shared attribute columns plus the richness column for this layer.
    cols = ['OBJECTID', 'Hex_ID', 'Eco_Sect', 'Eco_Name',
            'County', 'Shape__Area', 'Shape__Length', 'geometry', col]
    # Test the column code, not the output name: `name` holds values such as
    # 'ACE_amphibian_richness' and never matched, so 0.8 was never applied.
    percentile = 0.8 if col in native_cols else 0.95
    threshold = gdf[col].quantile(percentile)
    ace = gdf[gdf[col]>=threshold][cols]
    process_vector(folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = ace)
    convert_pmtiles(folder ='ACE_biodiversity/'+name, parquet_file = name+'.parquet')


# calculate 80% percentile, filter to those >= threshold. 
# subset to calculate acres within each network, % of feature conserved and % of network 

## Biodiversity

#### Plant richness **

In [None]:
# Fetch the plant richness raster and keep cells at or above the 80th percentile.
plant_folder = 'Biodiversity_unique/Plant_richness'
download(plant_folder, 'species_D.tif')
filter_raster(plant_folder, 'species_D.tif', percentile = 80)


#### Rarity-weighted endemic plant richness **

In [None]:
# Fetch the endemic plant richness raster and keep cells at or above the 80th percentile.
endemic_folder = 'Biodiversity_unique/Rarityweighted_endemic_plant_richness'
download(endemic_folder, 'endemicspecies_E.tif')
filter_raster(endemic_folder, 'endemicspecies_E.tif', percentile = 80)

#### Abundance for 26 bird species

## Connectivity and Resilience
#### Resilient Connected Network - all categories **

In [None]:
# NOTE(review): unlike the other raster cells, no download precedes this call,
# so the .tif has to be present in the working directory already - confirm.
rcn_folder = 'Connectivity_resilience/Resilient_connected_network_allcategories'
process_raster(rcn_folder, 'rcn_wIntactBioCat_caOnly_2020-10-27.tif')

#### Present day connectivity - all categories

#### Climate migration routes

## Freshwater Resources

#### Freshwater species richness

#### Wetlands **

In [None]:
wetlands_folder = 'Freshwater_resources/Wetlands'
unzip(wetlands_folder, 'CA_geodatabase_wetlands.zip')

# Only these wetland types are published.
wetlands = ['Freshwater Emergent Wetland', 'Freshwater Forested/Shrub Wetland', 'Estuarine and Marine Wetland']
gdf = gpd.read_file('CA_geodatabase_wetlands.gdb')
is_selected_type = gdf['WETLAND_TYPE'].isin(wetlands)
gdf = gdf[is_selected_type]

process_vector(wetlands_folder, 'CA_wetlands.parquet', gdf = gdf)
convert_pmtiles(wetlands_folder, 'CA_wetlands.parquet')


#### Groundwater dependent ecosystems

#### Streams by order

#### Perennial streams

#### Fish passage barriers

## NBS and Agriculture

#### Drinking water source watersheds

#### Farmland + Land suitable for grazing **

In [None]:
farmland_folder = 'NBS_agriculture/Farmland'
grazing_folder = 'NBS_agriculture/Lands_suitable_grazing'
unzip(farmland_folder, 'Important_Farmland_2018.zip')

# Both layers are subsets of the same geodatabase, split on `polygon_ty`.
gdf = gpd.read_file('Important_Farmland_2018.gdb')
polygon_type = gdf['polygon_ty']

# Farmland: prime, statewide importance, local importance, unique
farmland_type = ['P','S','L','U']
gdf_farmland = gdf[polygon_type.isin(farmland_type)]
process_vector(farmland_folder, 'Farmland_2018.parquet', gdf = gdf_farmland)
convert_pmtiles(farmland_folder, 'Farmland_2018.parquet')

# Grazing land: polygon type 'G'
gdf_grazing = gdf[polygon_type == 'G']
process_vector(grazing_folder, 'Grazing_land_2018.parquet', gdf = gdf_grazing)
convert_pmtiles(grazing_folder, 'Grazing_land_2018.parquet')


#### Carbon storage **

## Climate Risks

#### Fire perimeters **

Only YEAR >= 2014. 

In [None]:
unzip(folder = 'Climate_risks/Historical_fire_perimeters', file = 'fire23-1gdb.zip')
gdf = gpd.read_file('fire23_1.gdb')

# Drop perimeters without a year. Take an explicit copy so the column
# assignment below acts on an independent frame instead of a filtered slice
# (which triggers pandas' SettingWithCopyWarning).
gdf = gdf[gdf['YEAR_'].notna()].copy()
gdf['YEAR_'] = gdf['YEAR_'].astype('int64')

# Only fires from 2014 onwards are published.
gdf = gdf[gdf['YEAR_']>=2014]

process_vector(folder = 'Climate_risks/Historical_fire_perimeters', file = 'calfire_2023.parquet', gdf = gdf)
convert_pmtiles(folder ='Climate_risks/Historical_fire_perimeters', parquet_file ='calfire_2023.parquet')


#### Flood hazard zones **

#### Sea level rise

#### Mid-century habitat climate exposure **

In [None]:
# Processing plan for this layer:
#   First mask out non-natural lands.
#   A binary natural vs. non-natural land mask is included in the data package.
#   Use the combined group of all values < 0 and >= 0.95 as exposed.
#   Do separately for both climate models - CNRM and MIROC.

exposure_folder = 'Climate_risks/Mid-century_habitat_climate_exposure'
unzip(exposure_folder, 'Midcentury_habitat_climate_exposure.zip')

# still need to do

## Progress data - newly protected

#### Newly counted

In [None]:
# Extract the shapefile, publish it as parquet, then tile that parquet.
newly_counted_folder = 'Progress_data_new_protection/Newly_counted_lands'
unzip(newly_counted_folder, 'newly_counted_lands_2024.shp.zip')
process_vector(newly_counted_folder, 'newly_counted_lands_2024.shp')
convert_pmtiles(newly_counted_folder, 'newly_counted_lands_2024.parquet')


#### DAC **

In [None]:
# Extract the geodatabase and publish it under the shorter DAC_2022 name.
dac_folder = 'Progress_data_new_protection/DAC'
dac_parquet = 'DAC_2022.parquet'
unzip(dac_folder, 'sb535dacgdbf2022gdb.zip')
process_vector(dac_folder, 'SB535DACgdb_F_2022.gdb', file_name = dac_parquet)
convert_pmtiles(dac_folder, dac_parquet)


#### Priority populations

#### Low income communities **

In [None]:
low_income_folder = 'Progress_data_new_protection/Low_income_communities'
low_income_parquet = 'low_income_CalEnviroScreen4.parquet'
unzip(low_income_folder, 'Priority Populations 4.0 Geodatabase.zip')

# Keep only the rows designated as low-income communities.
gdf = gpd.read_file('Priority Populations 4.0 Combined Layer.gdb')
is_low_income = gdf['Designatio'] == 'Low-income community'
gdf = gdf[is_low_income]

process_vector(low_income_folder, low_income_parquet, gdf = gdf)
convert_pmtiles(low_income_folder, low_income_parquet)


## Base layer for denominator 

In [None]:
# Extract the base layer shapefile and publish it under its parquet name.
base_folder_name = 'Progress_data_new_protection/Land_Status_Zone_Ecoregion_Counties'
base_parquet = 'all_regions_reGAP_county_eco.parquet'
unzip(base_folder_name, 'Land_Status_Zone_Ecoregion_Counties.shp.zip')
process_vector(base_folder_name, 'Land_Status_Zone_Ecoregion_Counties.shp',
               file_name = base_parquet)
convert_pmtiles(base_folder_name, base_parquet)
