In [None]:
%load_ext autoreload
%autoreload 2

import dask
import numpy as np
import pandas as pd
import xarray as xr

from carbonplan_trace.tiles import tiles
from carbonplan_trace.v1 import biomass_rollup

import fsspec
import h5py
import regionmask
from rasterio.session import AWSSession
from rasterio.warp import Resampling

from carbonplan_trace.v1 import utils
from collections import defaultdict
from carbonplan_trace.v1.landsat_preprocess import access_credentials
from carbonplan_trace.v0.core import compute_grid_area, coarsen_emissions
import rioxarray as rio
import geopandas as gpd

# from dask.distributed import Client
from carbonplan_data.utils import set_zarr_encoding

In [None]:
from carbonplan_trace.v1.landsat_preprocess import access_credentials

# Project AWS key pair; both values are also handed to utils.write_parquet further down.
access_key_id, secret_access_key = access_credentials()

# Authenticated S3 filesystem handle used by the listing cells below.
fs = fsspec.filesystem("s3", key=access_key_id, secret=secret_access_key)

In [None]:
# Disable dask's "split_large_chunks" behaviour for array slicing for the whole session.
dask.config.set({"array.slicing.split_large_chunks": False})

# our biomass


## 3km


In [None]:
# Settings shared by the 3 km rollup cells below.
skip_existing = True  # forwarded to coarsen_biomass_one_tile
version = "v1.2"  # results version, interpolated into the S3 paths
bucket = f"s3://carbonplan-climatetrace/{version}/results/"
# "{tile_id}" is intentionally left unformatted: the string is passed on as a template.
coarse_tile_template = bucket + "tiles/3000m/{tile_id}_biomass.zarr"
variables = ["AGB", "BGB", "dead_wood", "litter"]

In [None]:
# Coarsen every tile by a factor of 100, passing coarse_tile_template as the output
# template, and print whatever coarsen_biomass_one_tile returns for that tile.
for tile in tiles:
    result = biomass_rollup.coarsen_biomass_one_tile(
        tile_id=tile,
        get_biomass_ds_func=biomass_rollup.open_biomass_tile,
        output_template=coarse_tile_template,
        variables=variables,
        version=version,
        skip_existing=skip_existing,
        coarsening_factor=100,
        coarse_chunks={"lat": 400, "lon": 400},
    )
    print(tile, result)

In [None]:
# Path of the combined global 3 km store.
coarse_full_template = bucket + "global/3000m/raster_biomass.zarr"

# Note: skip_existing is hard-coded to False here, unlike the per-tile step, which
# uses the notebook-level skip_existing flag.
biomass_rollup.combine_all_tiles(
    input_tile_template=coarse_tile_template,
    output_global_fn=coarse_full_template,
    variables=variables,
    skip_existing=False,
)

In [None]:
# visualize
# Plot AGB at the first time index with the colour scale fixed to 0-500.
ds = xr.open_zarr(coarse_full_template)
ds.isel(time=0).AGB.plot(vmin=0, vmax=500)

## half degree


In [None]:
# Re-open the global 3 km store by its explicit path (rebinds `ds`).
ds = xr.open_zarr(
    f"s3://carbonplan-climatetrace/{version}/results/global/3000m/raster_biomass.zarr"
)
ds

In [None]:
# Ratio of the 0.5 target cell size to the 1/40 input cell size. round() is used
# instead of int() because int() truncates: a quotient that came out fractionally
# below the exact ratio would silently give a factor that is one too small.
coarsening_factor = round(0.5 / (1 / 40))
print(f"coarsening by {coarsening_factor}x")
# Sum of the four biomass pools; also used as mask_var in the coarsening call.
ds["total_biomass"] = ds["AGB"] + ds["BGB"] + ds["dead_wood"] + ds["litter"]
half_degree = coarsen_emissions(
    ds[["total_biomass", "AGB"]], factor=coarsening_factor, mask_var="total_biomass", method="mean"
)
# Stack both variables along a new "variable" dimension, average over time, keep
# everything in a single chunk and switch to x/y dimension names.
half_degree = half_degree.to_array(dim="variable")
half_degree = (
    half_degree.mean(dim="time")
    .chunk({"lat": -1, "lon": -1, "variable": -1})
    .rename({"lat": "y", "lon": "x"})
)
half_degree = half_degree.to_dataset(name="biomass")
half_degree = set_zarr_encoding(
    half_degree, codec_config={"id": "zlib", "level": 1}, float_dtype="float32", int_dtype="i4"
)
# mode="w" overwrites any existing store at this path.
half_degree.to_zarr(
    "s3://carbonplan-climatetrace/v1.2/map/v2/half_degree_biomass.zarr", mode="w", consolidated=True
)

# harris biomass 2000

## warning! the land area used here has been masked to: 1) land (no ocean), and 2) the available landsat area in collection 2 between 2014-2020. This is a different area compared to the Harris analysis region, and thus the calculation here will need to be redone.


In [None]:
# Input: Harris biomass already summed over 100x100 blocks (3 km grid).
input_filename = "s3://carbonplan-climatetrace/v0.4/global/3000m/raster_biomass.zarr"
# Output: the same grid expressed as a mean rather than a sum.
output_filename = "s3://carbonplan-climatetrace/validation/harris_3000m_agbd.zarr"
var = "agb"
# Store whose land_area variable supplies the per-cell land area used as the divisor.
land_area_filename = (
    f"s3://carbonplan-climatetrace/{version}/results/global/3000m/raster_biomass.zarr"
)

In [None]:
# load in the sum version and convert into average by dividing with area
# Open the summed Harris store (rebinds `ds`).
ds = xr.open_zarr(input_filename)
ds

In [None]:
# Take land_area at the first time step and overwrite its lat/lon coordinates with
# those of `ds`, so the division below lines up cell by cell.
area = xr.open_zarr(land_area_filename)
area = area.assign_coords({"lat": ds.lat, "lon": ds.lon}).isel(time=0).drop("time")[["land_area"]]
area

In [None]:
# Summed biomass divided by land area gives the mean density per cell.
ds["agbd"] = ds.agb / area.land_area

In [None]:
ds.agbd.plot(vmin=0, vmax=500)

In [None]:
# Write only the derived agbd variable; mode="w" overwrites an existing store.
ds[["agbd"]].to_zarr(output_filename, consolidated=True, mode="w")

# GlobBiomass


In [None]:
import os
import subprocess

In [None]:
# Local directory that the "{tile}_agb.tif" rasters are read from below.
folder = "/home/jovyan/globbiomass/"
# Tile ids are the part of each filename before the first "_". The set comprehension
# removes duplicates and sorted() makes the processing order deterministic.
all_tiles = sorted({t.split("_")[0] for t in os.listdir(folder)})

In [None]:
# Harris 3 km grid with x/y dimension names; used below as the reprojection target.
harris = xr.open_zarr("s3://carbonplan-climatetrace/validation/harris_3000m_agbd.zarr").rename(
    {"lon": "x", "lat": "y"}
)
harris.attrs["crs"] = "EPSG:4326"
harris

In [None]:
# For every GlobBiomass tile: mask to land, compute an area-weighted sum on the Harris
# grid, divide by the summed land area, and write the resulting mean density per tile.
for tile in all_tiles:
    print(tile)
    #     subprocess.run(["unzip", f"{folder}{tile}_agb.zip", "-d", f"{folder}"])
    # NOTE(review): xr.open_rasterio is deprecated in recent xarray releases — confirm the
    # pinned xarray version still provides it.
    da = xr.open_rasterio(f"{folder}{tile}_agb.tif")
    da = da.squeeze("band", drop=True)
    da = da.rename({"x": "lon", "y": "lat"})
    # flip the latitude axis when it is stored in descending order
    if da.lat[0] > da.lat[-1]:
        da = da.reindex(lat=da.lat[::-1])

    # apply land mask
    min_lat = da.lat.min().values
    max_lat = da.lat.max().values
    min_lon = da.lon.min().values
    max_lon = da.lon.max().values
    igbp = utils.open_global_igbp_data(lat_lon_box=[min_lat, max_lat, min_lon, max_lon])
    # a cell counts as land if igbp > 0 in any year
    land_mask = (igbp.igbp > 0).any(dim="year")
    land_mask = utils.find_matching_records(data=land_mask, lats=da.lat, lons=da.lon)
    # non-land cells become 0 so they add nothing to the sums below
    da = da.where(land_mask).fillna(0)

    # compute area + apply land mask
    da_area = compute_grid_area(da)
    da_area = da_area.where(land_mask).fillna(0)
    da_area.attrs["crs"] = "EPSG:4326"

    # calc biomass * area
    summed = da * da_area
    summed.attrs["crs"] = "EPSG:4326"
    # the reprojection step below is given x/y dimension names
    summed = summed.rename({"lon": "x", "lat": "y"})
    da_area = da_area.rename({"lon": "x", "lat": "y"})

    # resample
    # target grid: the Harris cells covering this tile's bounding box, widened to whole degrees
    sub_harris = harris.sel(
        y=slice(np.floor(min_lat), np.ceil(max_lat)), x=slice(np.floor(min_lon), np.ceil(max_lon))
    )
    summed = summed.rio.reproject_match(sub_harris, resampling=Resampling.sum)
    summed_area = da_area.rio.reproject_match(sub_harris, resampling=Resampling.sum)
    # drop values >= 1e100 (presumably a nodata fill from the reprojection — TODO confirm)
    summed = summed.where(summed < 1e100)
    # NOTE(review): this condition tests the already-masked `summed`, not `summed_area`;
    # the area is therefore masked wherever the biomass sum is — confirm this is intended.
    summed_area = summed_area.where(summed < 1e100)
    summed = summed.rename({"x": "lon", "y": "lat"})
    summed_area = summed_area.rename({"x": "lon", "y": "lat"})
    # force identical coordinates on both arrays before dividing
    summed_area = summed_area.assign_coords({"lat": summed.lat, "lon": summed.lon})

    # divide by total area
    out = summed / summed_area
    out.to_dataset(name="agbd").to_zarr(
        f"s3://carbonplan-climatetrace/inputs/processed/GlobBiomass/{tile}_agb.zarr",
        mode="w",
        consolidated=True,
    )

In [None]:
out.plot()

In [None]:
# concat everything into one file
# Open each per-tile store written by the loop above and merge them on their coordinates.
ds_list = [
    xr.open_zarr(f"s3://carbonplan-climatetrace/inputs/processed/GlobBiomass/{tile}_agb.zarr")
    for tile in all_tiles
]

ds = xr.combine_by_coords(ds_list, combine_attrs="drop_conflicts")
ds

In [None]:
# Write the merged GlobBiomass grid, leaving out the spatial_ref variable.
output_filename = "s3://carbonplan-climatetrace/validation/globbiomass_3000m_agbd.zarr"
ds.drop("spatial_ref").to_zarr(output_filename, consolidated=True)

# gedi biomass 2019-2020

https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=1907


In [None]:
# Temporary NASA Earthdata S3 credentials (they expire after about an hour).
# Obtain a fresh set from https://data.ornldaac.earthdata.nasa.gov/s3credentials and
# export the values as environment variables before running this cell.
# Secrets must never be pasted into the notebook: anything saved here is leaked to
# everyone who can read the file or its history.
import os

# nasa_creds key -> environment variable that supplies it
_NASA_CRED_ENV = {
    "accessKeyId": "NASA_S3_ACCESS_KEY_ID",
    "secretAccessKey": "NASA_S3_SECRET_ACCESS_KEY",
    "sessionToken": "NASA_S3_SESSION_TOKEN",
}

_missing_env = [name for name in _NASA_CRED_ENV.values() if not os.environ.get(name)]
if _missing_env:
    raise RuntimeError(
        "Missing NASA S3 credential environment variable(s): " + ", ".join(_missing_env)
    )

nasa_creds = {key: os.environ[name] for key, name in _NASA_CRED_ENV.items()}
# Informational only; nothing in this notebook reads it.
nasa_creds["expiration"] = os.environ.get("NASA_S3_EXPIRATION", "")

In [None]:
def process_one_gedi_granule(
    granule, beams, fields, bucket, nasa_creds, access_key_id, secret_access_key
):
    """Extract selected fields from one GEDI granule and write them as parquet.

    Parameters
    ----------
    granule : str
        S3 path of the ``.h5`` granule; its basename (without ``.h5``) names the output.
    beams : iterable of str
        Beam ids; each is read from the HDF5 group ``BEAM{beam}``.
    fields : dict
        Maps output column name -> dataset path inside each beam group. Must include
        ``l4_quality_flag``, which is used for filtering.
    bucket : str
        Output prefix; the file is written to ``{bucket}{basename}.parquet``.
    nasa_creds : dict
        Needs ``accessKeyId``, ``secretAccessKey`` and ``sessionToken`` for reading.
    access_key_id, secret_access_key : str
        Credentials forwarded to ``utils.write_parquet`` for writing.

    Only rows with ``l4_quality_flag == 1`` are kept, and the flag column itself is
    dropped from the output.
    """
    fs = fsspec.get_filesystem_class("s3")(
        key=nasa_creds["accessKeyId"],
        secret=nasa_creds["secretAccessKey"],
        token=nasa_creds["sessionToken"],
    )
    fn = granule.split("/")[-1].split(".h5")[0]
    output_filepath = f"{bucket}{fn}.parquet"
    out = defaultdict(list)

    # Both the remote file object and the HDF5 handle are closed on exit, even on error.
    with fs.open(granule, "rb") as f, h5py.File(f, "r") as fi:
        for beam in beams:
            for k, v in fields.items():
                out[k].extend(fi[f"BEAM{beam}/{v}"][:])

    out = pd.DataFrame(out)
    out = out.loc[out.l4_quality_flag == 1]
    # DataFrame.drop returns a new frame; the result has to be assigned to take effect.
    out = out.drop(columns=["l4_quality_flag"])

    utils.write_parquet(out, output_filepath, access_key_id, secret_access_key)


process_one_gedi_granule_delayed = dask.delayed(process_one_gedi_granule)

In [None]:
# NOTE: this rebinds `bucket`, which earlier pointed at the versioned results prefix.
bucket = "s3://carbonplan-climatetrace/inputs/processed/gedi_agbd/"
# Beam ids; each is read from the HDF5 group "BEAM{beam}".
beams = ["0000", "0001", "0010", "0011", "0101", "0110", "1000", "1011"]
# Output column name -> dataset path inside each beam group.
fields = {
    "agbd": "agbd",
    "agbd_pi_lower": "agbd_pi_lower",
    "agbd_pi_upper": "agbd_pi_upper",
    "l4_quality_flag": "l4_quality_flag",
    "lat_lowestmode": "lat_lowestmode",
    "lon_lowestmode": "lon_lowestmode",
    "leaf_off_flag": "land_cover_data/leaf_off_flag",
}

In [None]:
# Basenames (without ".parquet") of the objects already present under `bucket`;
# used below to skip granules that have been processed.
completed = fs.ls(bucket)
completed = [c.split("/")[-1].split(".parquet")[0] for c in completed]
len(completed)

In [None]:
# One granule path per line. Blank lines (including the empty string a trailing
# newline would produce) are skipped so that no task is created for an empty path.
with open("./gedi_agb.txt") as f:
    granules = [line.strip() for line in f.read().splitlines() if line.strip()]

# Set membership keeps the "already processed?" check constant-time per granule.
completed_names = set(completed)

tasks = []

for granule in granules:
    fn = granule.split("/")[-1].split(".h5")[0]
    if fn not in completed_names:
        tasks.append(
            process_one_gedi_granule_delayed(
                granule=granule,
                beams=beams,
                fields=fields,
                bucket=bucket,
                nasa_creds=nasa_creds,
                access_key_id=access_key_id,
                secret_access_key=secret_access_key,
            )
        )

len(tasks)

In [None]:
from dask.distributed import Client, wait

client = Client(n_workers=30, threads_per_worker=1)
# Hold on to the futures: the scheduler may drop work whose futures are no longer referenced.
futures = client.compute(tasks, retries=1)
# Block until every task has finished or failed, so the bucket listing in the next
# cell sees all outputs even when the notebook is run top to bottom.
wait(futures)

In [None]:
# Every object currently under the GEDI output prefix (fs.ls returns keys without "s3://").
files = fs.ls(bucket)

In [None]:
def _read_parts(paths, report_every=100):
    """Yield one DataFrame per parquet object, printing the index every `report_every` files."""
    for position, path in enumerate(paths):
        if position % report_every == 0:
            print(position)
        yield pd.read_parquet("s3://" + path)


# Concatenate all per-granule frames into a single DataFrame.
df = pd.concat(_read_parts(files))

In [None]:
# Snap every shot onto a regular grid with 1/40 spacing: pd.cut assigns each
# lat/lon value to a bin and labels it with the bin centre.
for v in ["lat", "lon"]:
    vmin = np.floor(df[f"{v}_lowestmode"].min())
    vmax = np.ceil(df[f"{v}_lowestmode"].max())
    res = 1 / 40
    # bin edges from vmin up to vmax; the + res / 2 keeps the final edge in the range
    bins = np.arange(vmin, vmax + res / 2, res)
    # bin centres: exactly one fewer than the edges (checked by the assert)
    bin_labels = np.arange(vmin + res / 2, vmax, res)

    assert len(bins) == len(bin_labels) + 1

    # rounding to 4 decimals strips float noise from np.arange
    df[f"{v}"] = pd.cut(
        df[f"{v}_lowestmode"], bins=np.round(bins, 4), labels=np.round(bin_labels, 4)
    )

In [None]:
# Mean agbd per (lat, lon) grid cell, written out as one parquet file.
# NOTE(review): lat/lon come from pd.cut and are therefore categorical; depending on
# the pandas default for groupby(observed=...), this may also produce rows for
# empty cells — confirm against the installed pandas version.
df = df[["lat", "lon", "agbd"]].groupby(["lat", "lon"]).mean().reset_index()
output_filepath = "s3://carbonplan-climatetrace/inputs/processed/gedi_agbd_3000m.parquet"
utils.write_parquet(df, output_filepath, access_key_id, secret_access_key)

In [None]:
df

In [None]:
def turn_point_cloud_to_grid(df, precision=4):
    """Place per-cell ``agbd`` values on a fixed global 1/40 degree lat/lon grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``lat``, ``lon`` and ``agbd`` columns with at most one row per
        (lat, lon) pair. ``lat``/``lon`` may be numeric or categorical with numeric
        categories. The frame is not modified.
    precision : int, default 4
        Decimals that both the input coordinates and the grid coordinates are rounded
        to, so that they match exactly when reindexing.

    Returns
    -------
    xarray.Dataset
        One float32 variable ``agbd`` on dims ``("lat", "lon")`` covering latitudes
        -60..80 and longitudes -180..180 at cell centres; cells without data are NaN.
    """
    pixel_size = 1 / 40
    # cell centres sit half a pixel in from the lower edge of the domain
    offset = pixel_size / 2
    min_lat, max_lat = -60, 80
    min_lon, max_lon = -180, 180

    lats = np.arange(min_lat + offset, max_lat, pixel_size).round(precision)
    lons = np.arange(min_lon + offset, max_lon, pixel_size).round(precision)

    # Work on a copy so the caller's frame is left untouched. The float cast makes the
    # rounding well-defined when lat/lon arrive as categoricals (e.g. from pd.cut).
    points = df.assign(
        lat=df.lat.astype("float64").round(precision),
        lon=df.lon.astype("float64").round(precision),
    )
    pivot = points.pivot(columns="lon", index="lat", values="agbd")
    del points
    # expand to the full global grid; coordinates absent from the input become NaN
    reindexed = pivot.reindex(index=lats, columns=lons)
    del pivot
    ds_grid = xr.DataArray(
        data=reindexed.values,
        dims=["lat", "lon"],
        coords=[lats, lons],
    ).astype("float32")
    del reindexed
    ds_grid = ds_grid.to_dataset(name="agbd", promote_attrs=True)
    return ds_grid

In [None]:
# Grid the per-cell GEDI means (rebinds `ds`).
ds = turn_point_cloud_to_grid(df)

In [None]:
ds.agbd.plot(vmin=0, vmax=500)

In [None]:
# No mode is passed here, so xarray's default write mode applies.
output_filename = "s3://carbonplan-climatetrace/validation/gedi_3000m_agbd.zarr"
ds.to_zarr(output_filename, consolidated=True)

# land mask


In [None]:
# The Spawn 3 km store is opened for its lon/lat coordinates, which define the mask grid.
spawn = xr.open_zarr(f"s3://carbonplan-climatetrace/validation/spawn_3000m_agbd.zarr")
spawn = spawn.rio.write_crs("EPSG:4326")

In [None]:
# Rasterize the Natural Earth land_110 regions onto that grid.
land = regionmask.defined_regions.natural_earth.land_110
land_mask = land.mask(spawn.lon, spawn.lat)

In [None]:
land_mask.plot()

In [None]:
# Rebinds land_mask from a DataArray to a Dataset with one variable, "land_mask".
land_mask = land_mask.to_dataset(name="land_mask", promote_attrs=True)

In [None]:
# mode="w" overwrites an existing store.
land_mask.to_zarr(
    f"s3://carbonplan-climatetrace/validation/land_mask.zarr", consolidated=True, mode="w"
)

# landsat mask


In [None]:
from s3fs import S3FileSystem
import geopandas

# NOTE: this rebinds `fs` to an S3FileSystem created with no arguments, replacing the
# handle built from access_credentials() earlier in the notebook.
fs = S3FileSystem()
with fs.open(f"s3://carbonplan-climatetrace/{version}/masks/valid_landsat.shp.zip") as f:
    landsat_shape = geopandas.read_file(f)

In [None]:
# Give every row the same "name" value.
landsat_shape["name"] = "study domain"

In [None]:
landsat_shape

In [None]:
# Export the shape as GeoJSON next to the source mask.
landsat_shape.to_file(
    f"s3://carbonplan-climatetrace/{version}/masks/study_domain.geojson", driver="GeoJSON"
)

# regions


In [None]:
# The Spawn 3 km store is opened for its lon/lat coordinates, which define the mask grid.
spawn = xr.open_zarr(f"s3://carbonplan-climatetrace/validation/spawn_3000m_agbd.zarr")
spawn = spawn.rio.write_crs("EPSG:4326")

In [None]:
ar6 = regionmask.defined_regions.ar6.all

In [None]:
# Quick look at the name of the first region.
ar6[0].name

In [None]:
# Rasterize the AR6 regions onto the Spawn grid.
ar6_mask = ar6.mask(spawn.lon, spawn.lat)

In [None]:
ar6_mask.plot()

In [None]:
# Rebinds ar6_mask from a DataArray to a Dataset with one variable, "ar6_mask".
ar6_mask = ar6_mask.to_dataset(name="ar6_mask", promote_attrs=True)

In [None]:
# mode="w" overwrites an existing store.
ar6_mask.to_zarr(
    f"s3://carbonplan-climatetrace/validation/ar6_mask.zarr", consolidated=True, mode="w"
)

# realm mask


In [None]:
# The Spawn 3 km store is opened for its lon/lat coordinates, which define the mask grid.
spawn = xr.open_zarr(f"s3://carbonplan-climatetrace/validation/spawn_3000m_agbd.zarr")
spawn = spawn.rio.write_crs("EPSG:4326")

In [None]:
fp = "s3://carbonplan-climatetrace/inputs/shapes/inputs_Ecoregions2017_Ecoregions2017.shp"
ecoregions = gpd.read_file(fp)

In [None]:
# Rasterize the ecoregion polygons onto the Spawn grid, using each polygon's ECO_ID
# as the cell value.
ecoregion_mask = regionmask.mask_geopandas(
    ecoregions, numbers="ECO_ID", lon_or_obj=spawn.lon, lat=spawn.lat
)

In [None]:
from carbonplan_trace.v1 import glas_allometric_eq as allo

# Presumably maps each ecoregion id to its realm — confirm in glas_allometric_eq.
realm_mask = allo.get_realm_from_ecoregion(ecoregion_mask)

In [None]:
# Rebinds realm_mask to a Dataset with one variable, "realm".
realm_mask = realm_mask.to_dataset(name="realm")

In [None]:
# mode="w" overwrites an existing store.
realm_mask.to_zarr(
    f"s3://carbonplan-climatetrace/validation/realm_mask.zarr", consolidated=True, mode="w"
)

# xu 2021 biomass - standardizing to 3 km and upscaling to 10 km

https://doi.org/10.5281/zenodo.4161694.


In [None]:
# NOTE(review): xr.open_rasterio is deprecated in recent xarray releases — confirm the
# pinned xarray version still provides it.
xu = xr.open_rasterio(
    "s3://carbonplan-climatetrace/validation/xu2021/test10a_cd_ab_pred_corr_2000_2019_v2.tif"
)

In [None]:
# Relabel the band dimension as "year" with values 2000..2019.
# Not re-runnable as-is: after the first run there is no "band" dimension left to rename.
xu = xu.rename({"band": "year"}).assign_coords({"year": np.arange(2000, 2020)})

Raw units for Xu are MgC/ha, so we convert carbon to biomass by multiplying by the inverse of the IPCC carbon fraction [C/biomass] factor (0.47).


In [None]:
# In-place scaling: re-running this cell applies the factor a second time.
xu *= 1 / 0.47

In [None]:
# Write with lon/lat dimension names under the variable name "agbd".
# No mode is passed here, so xarray's default write mode applies.
xu.rename({"x": "lon", "y": "lat"}).to_dataset(name="agbd").to_zarr(
    "s3://carbonplan-climatetrace/validation/xu_10km_agbd.zarr/"
)

In [None]:
# for comparison with our biomass we'll upscale ours to the 10 km resolution, averaging
# Re-open our 3 km biomass explicitly. By this point `ds` has been rebound by the
# GlobBiomass and GEDI sections, so the leftover variable no longer holds a dataset
# with the "AGB" variable and "time" dimension that the following cells use.
ds = xr.open_zarr("s3://carbonplan-climatetrace/v1.2/results/global/3000m/raster_biomass.zarr")
ds = ds.rio.write_crs("EPSG:4326")

In [None]:
# Reproject AGB onto the Xu grid (first year used as the template), relabel time as
# "year" with values 2014..2020, and wrap the result in a Dataset.
# NOTE(review): no `resampling` argument is passed to reproject_match here, so the
# library default applies — confirm that it matches the intended averaging.
ds_10km = (
    ds["AGB"]
    .rename({"lat": "y", "lon": "x"})
    .rio.reproject_match(xu.isel(year=0))
    .rename({"time": "year"})
    .assign_coords({"year": np.arange(2014, 2021)})
    .to_dataset(name="AGB")
)

In [None]:
# No mode is passed here, so xarray's default write mode applies.
ds_10km.to_zarr("s3://carbonplan-climatetrace/v1.2/results/global/10km/raster_biomass.zarr")

In [None]:
# also write out a 3km product for easy comparison with other datasets

In [None]:
# Reproject Xu onto the grid of `ds` using nearest-neighbour resampling.
xu_3000m = xu.rio.reproject_match(
    ds.rename({"lat": "y", "lon": "x"}), resampling=Resampling.nearest
).to_dataset(name="agbd")

In [None]:
# Overwrite the coordinates with the exact lon/lat values of `ds`.
xu_3000m = xu_3000m.rename({"x": "lon", "y": "lat"}).assign_coords(
    {"lon": ds.lon.values, "lat": ds.lat.values}
)

In [None]:
# mode="w" overwrites an existing store.
xu_3000m.to_zarr("s3://carbonplan-climatetrace/validation/xu_3000m_agbd.zarr/", mode="w")

# roll up to country level


In [None]:
def rollup_shapes(variables, input_filename, output_filename_template):
    """Write one CSV of area-weighted country means for each variable.

    Parameters
    ----------
    variables : list of str
        Names of the data variables in the input store to roll up. The first one is
        also the variable that the grid-cell area is computed from.
    input_filename : str
        Zarr store to open (consolidated metadata); needs ``lon`` and ``lat``.
    output_filename_template : str
        Output path containing a ``{var}`` placeholder, filled in per variable.

    Each CSV has an ``iso3_country`` column (the ``alpha3`` code from the countries
    shapefile) and an ``agbd`` column holding sum(value * area) / sum(area) per country.
    """
    print("rollup_shapes")
    shapes_file = "s3://carbonplan-climatetrace/inputs/shapes/countries.shp"
    shapes_df = gpd.read_file(shapes_file)
    # assign each country a number
    shapes_df["numbers"] = np.arange(len(shapes_df))
    # number -> alpha3 lookup; it does not depend on the variable, so build it once
    number_to_alpha3 = dict(zip(shapes_df["numbers"], shapes_df["alpha3"]))

    ds = xr.open_zarr(input_filename, consolidated=True)
    ds["area"] = compute_grid_area(ds[variables[0]])
    # turn each density into a per-cell total so that summing within a country is meaningful
    for var in variables:
        ds[var] = ds[var] * ds["area"]

    mask = regionmask.mask_geopandas(shapes_df, ds["lon"], ds["lat"], numbers="numbers")

    for var in variables:
        # this will trigger dask compute
        df = ds[[var, "area"]].groupby(mask).sum().to_dataframe()

        # map the numbers back to country code
        df = df.reset_index()
        df["region"] = df.region.apply(lambda x: number_to_alpha3[int(x)])
        df = df.rename(columns={"region": "iso3_country"})
        # rename variables if needed
        if "agbd" not in df:
            df = df.rename(columns={var: "agbd"})

        # compute average instead of sum
        df["agbd"] = df["agbd"] / df["area"]
        df = df.drop("area", axis=1)

        # write out
        uri = output_filename_template.format(var=var)
        print(f"writing data to {uri}")
        df.to_csv(uri, index=False)

In [None]:
# Country rollups for the comparison datasets listed in `comps`.
comps = ["Xu"]  # "Harris", "Spawn", "GEDI",
variables = ["agbd"]
for name in comps:
    input_filename = f"s3://carbonplan-climatetrace/validation/{name.lower()}_3000m_agbd.zarr"
    # "{var}" sits outside the f-string so that it survives for rollup_shapes to format.
    output_filename_template = (
        f"s3://carbonplan-climatetrace/validation/{name.lower()}" + "_country_rollups_{var}.csv"
    )
    rollup_shapes(variables, input_filename, output_filename_template)

In [None]:
# Country rollups for our own 3 km biomass (rebinds the names used in the loop above).
version = "v1.2"
input_filename = f"s3://carbonplan-climatetrace/{version}/results/global/3000m/raster_biomass.zarr"
output_filename_template = f"s3://carbonplan-climatetrace/{version}/" + "country_rollups_{var}.csv"
variables = ["AGB", "AGB_na_filled"]

In [None]:
rollup_shapes(variables, input_filename, output_filename_template)

# roll up to realm level


In [None]:
def rollup_realm(variables, input_filename, output_filename_template):
    """Write one CSV of area-weighted realm means for each variable.

    Parameters
    ----------
    variables : list of str
        Names of the data variables in the input store to roll up. The first one is
        also the variable that the grid-cell area is computed from.
    input_filename : str
        Zarr store to open (consolidated metadata).
    output_filename_template : str
        Output path containing a ``{var}`` placeholder, filled in per variable.

    Each CSV has a ``realm`` column and an ``agbd`` column holding
    sum(value * area) / sum(area) per realm.
    """
    # The original logged "rollup_shapes" here, a copy-paste slip from the country rollup.
    print("rollup_realm")
    ds = xr.open_zarr(input_filename, consolidated=True)

    realms = xr.open_zarr("s3://carbonplan-climatetrace/validation/realm_mask.zarr")
    # round the mask coordinates to 4 decimals before attaching it to ds, which aligns on lat/lon
    realms = realms.assign_coords({"lat": realms.lat.round(4), "lon": realms.lon.round(4)})
    ds["realm"] = realms.realm
    ds["area"] = compute_grid_area(ds[variables[0]])
    # turn each density into a per-cell total so that summing within a realm is meaningful
    for var in variables:
        ds[var] = ds[var] * ds["area"]

    for var in variables:
        # this will trigger dask compute
        df = ds[[var, "area", "realm"]].groupby("realm").sum().to_dataframe()
        df = df.reset_index()

        # rename variables if needed
        if "agbd" not in df:
            df = df.rename(columns={var: "agbd"})

        # compute average instead of sum
        df["agbd"] = df["agbd"] / df["area"]
        df = df.drop("area", axis=1)
        print(df)

        # write out
        uri = output_filename_template.format(var=var)
        print(f"writing data to {uri}")
        df.to_csv(uri, index=False)

In [None]:
# Realm rollups for our own 3 km biomass.
version = "v1.2"
input_filename = f"s3://carbonplan-climatetrace/{version}/results/global/3000m/raster_biomass.zarr"
# "{var}" sits outside the f-string so that it survives for rollup_realm to format.
output_filename_template = f"s3://carbonplan-climatetrace/{version}/" + "realm_rollups_{var}.csv"
variables = ["AGB", "AGB_na_filled"]

In [None]:
rollup_realm(variables, input_filename, output_filename_template)

In [None]:
# Realm rollups for the comparison datasets listed in `comps`.
comps = ["Harris", "GEDI"]
# comps = ['Spawn']
variables = ["agbd"]
for name in comps:
    input_filename = f"s3://carbonplan-climatetrace/validation/{name.lower()}_3000m_agbd.zarr"
    output_filename_template = (
        f"s3://carbonplan-climatetrace/validation/{name.lower()}" + "_realm_rollups_{var}.csv"
    )
    rollup_realm(variables, input_filename, output_filename_template)