# Make allometric equations mask

Following Harris et al 2021 ([paper](https://www.nature.com/articles/s41558-020-00976-6),
[spreadsheet](https://docs.google.com/spreadsheets/d/1Hb67l3xYCfgxKu9TnpbEfY6iQo4ISNEvQewpmYvH9yQ/edit#gid=1620488341)),
and Farina et al
([paper](https://docs.google.com/document/d/1qoIoYBghr7FfqZlcT8h5BGMww_Obtnrw/edit)), we want to use
the following datasets to determine which allometric equations to use:

- Ecoregions2017. Dinerstein, Eric, David Olson, Anup Joshi, Carly Vynne, Neil D Burgess, Eric
  Wikramanayake, Nathan Hahn, et al. 2017. “An Ecoregion-Based Approach to Protecting Half the
  Terrestrial Realm.” BioScience 67 (6): 534–45. https://doi.org/10.1093/biosci/bix014. Retrieved
  from https://ecoregions2017.appspot.com/ on Mar 5th, 2021.

- NLCD. Retrieved from CarbonPlan data storage on GCP.

- EOSD.

- IGBP. Friedl, M.A., A.H. Strahler, and J. Hodges. 2010. ISLSCP II MODIS (Collection 4) IGBP Land
  Cover, 2000-2001. In Hall, Forrest G., G. Collatz, B. Meeson, S. Los, E. Brown de Colstoun, and D.
  Landis (eds.). ISLSCP Initiative II Collection. Data set. Available on-line
  [http://daac.ornl.gov/] from Oak Ridge National Laboratory Distributed Active Archive Center, Oak
  Ridge, Tennessee, U.S.A. doi:10.3334/ORNLDAAC/968. Retrieved from
  https://daac.ornl.gov/daacdata/islscp_ii/vegetation/modis_landcover_xdeg/data/. Documented in
  https://daac.ornl.gov/daacdata/islscp_ii/vegetation/modis_landcover_xdeg/comp/1_modis_landcover_doc.pdf.

In this notebook, we load in each dataset, transform everything to the target grid, and store the
output.


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

import fsspec
import os
import shutil
import regionmask
import rioxarray

from itertools import product
from zarr.errors import GroupNotFoundError

from shapely.geometry import Polygon

import carbonplan_trace.v1.utils as utils

# from carbonplan_trace.v1.glas_allometric_eq import (
#     get_lat_lon_tags_from_tile_path,
#     parse_bounding_lat_lon_for_tile,
# )

from gcsfs import GCSFileSystem

fs = GCSFileSystem(cache_timeout=0)

In [None]:
def get_tile_in_xr(path):
    """
    Open the zarr store at `path` as an xarray Dataset and tag it with crs "EPSG:4326".

    When the store raises GroupNotFoundError a message is printed and None is returned.
    """
    store = fsspec.get_mapper(path)
    try:
        tile = xr.open_zarr(store, chunks=None)
    except GroupNotFoundError:
        print(f"{path} empty, skipping")
        return None

    tile.attrs["crs"] = "EPSG:4326"
    return tile


import regionmask


def convert_gdf_into_tiles(tile_ds, gdf, value_col, value_name):
    """
    Rasterize `gdf` onto a grid rebuilt from the lon/lat coordinates of `tile_ds`.

    Each axis is regenerated with np.arange from the first coordinate, the last coordinate
    and the spacing between the first two coordinates. `value_col` is forwarded to
    regionmask as `numbers`, and the returned mask is named `value_name`.
    """

    def rebuild_axis(coord_values):
        step = coord_values[1] - coord_values[0]
        # pad the stop by half a step so the last coordinate is included
        return np.arange(coord_values[0], coord_values[-1] + (step / 2), step)

    # turn gdf into xarray
    mask = regionmask.mask_geopandas(
        gdf,
        numbers=value_col,
        lon_or_obj=rebuild_axis(tile_ds.lon.values),
        lat=rebuild_axis(tile_ds.lat.values),
    )
    mask.name = value_name

    return mask


def convert_raster_into_tiles(tile_ds, raster):
    """Return `raster` reprojected onto `tile_ds` via the `.rio.reproject_match` accessor."""
    return raster.rio.reproject_match(tile_ds)

# Find target tiles


In [None]:
# Target tiles: latitude tags run from 80N down to 50S, longitude tags from 010W to 180W
# followed by 000E to 170E, all in 10 degree steps.
lat_tags = [f"{deg:02}N" for deg in range(80, -10, -10)] + [
    f"{deg:02}S" for deg in range(10, 60, 10)
]
lon_tags = [f"{deg:03}W" for deg in range(10, 190, 10)] + [
    f"{deg:03}E" for deg in range(0, 180, 10)
]

# one zarr store per (lat, lon) tag combination
tile_paths = [
    f"gs://carbonplan-climatetrace/v0/tiles/{lat}_{lon}.zarr"
    for lat, lon in product(lat_tags, lon_tags)
]

In [None]:
len(lat_tags) * len(lon_tags)

In [None]:
# try reading each tile; collect the paths for which get_tile_in_xr returns a falsy value
empty_tiles = [tp for tp in tile_paths if not get_tile_in_xr(tp)]

# drop those paths from tile_paths in place
for tp in empty_tiles:
    tile_paths.remove(tp)

In [None]:
tile_paths

# Read in each dataset, consolidate as appropriate, then turn into a raster in the same format as target tiles


### Ecoregions


In [None]:
fp = "gs://carbonplan-climatetrace/inputs/Ecoregions2017/Ecoregions2017.shp"
ecoregions = gpd.read_file(fp)
ecoregions.head()

In [None]:
def get_list_of_mask_tiles(include=""):
    """
    List the (lat, lon) tags of ecoregion mask tiles that have no biomass output yet.

    The ecoregions mask is stored in 10 degree tiles. Only paths containing `include` are
    considered. A tile is left out when its biomass `.zmetadata` already exists or when it
    is one of the hard-coded no-data tiles.
    """
    no_data_tiles = ["00N_070E", "20N_120W", "30N_170W", "40N_070W"]

    gcs = GCSFileSystem(cache_timeout=0)
    mask_folder = "gs://carbonplan-climatetrace/intermediates/ecoregions_mask/"

    pending = []
    for mask_path in gcs.ls(mask_folder):
        # fs.ls includes the parent folder itself, skip that link
        if mask_path.endswith("/") or include not in mask_path:
            continue

        lat, lon = utils.get_lat_lon_tags_from_tile_path(mask_path)
        biomass_marker = (
            f"gs://carbonplan-climatetrace/intermediates/biomass/{lat}_{lon}.zarr/.zmetadata"
        )
        if gcs.exists(biomass_marker):
            continue
        if f"{lat}_{lon}" in no_data_tiles:
            continue
        pending.append((lat, lon))

    return pending


lat_lon_tags = get_list_of_mask_tiles()
# this should be in the order of min_lat, max_lat, min_lon, max_lon
bounding_boxes = [utils.parse_bounding_box_from_lat_lon_tags(lat, lon) for lat, lon in lat_lon_tags]

In [None]:
len(bounding_boxes)

In [None]:
# Rasterize the ecoregions shapefile onto every target tile that has no mask output yet.
for tp in tile_paths:
    # use the same filename as target tiles for output
    fn = tp.split("/")[-1]
    output_path = f"gs://carbonplan-climatetrace/intermediates/ecoregions_mask/{fn}"

    if fs.exists(output_path):
        print(f"Skipping {fn}")
        continue

    print(f"Processing {fn}")
    # read in the target tile
    target_tile = get_tile_in_xr(tp)

    # convert ecoregions shapefile into target tile format
    output_da = convert_gdf_into_tiles(
        tile_ds=target_tile,
        gdf=ecoregions,
        value_col="ECO_ID",
        value_name="ecoregion",
    )

    # save the output; save_to_zarr is reached through the utils module because the bare
    # name is never imported in this notebook
    utils.save_to_zarr(ds=output_da.to_dataset(), url=output_path)

In [None]:
# load a tile to double check output
ds = get_tile_in_xr(f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/50N_130W.zarr")

In [None]:
ds

In [None]:
ds.ecoregion.isnull().sum().values

In [None]:
ds.ecoregion[::100, ::100].plot()

In [None]:
import time

In [None]:
# Re-rasterize the listed ecoregion mask tiles using only the ecoregion shapes that
# intersect each tile, then replace the copy stored in the cloud bucket.
reruns = ["60N_010E"]
# NOTE(review): these paths carry no "gs://" prefix; presumably utils.open_zarr_file
# accepts bucket-relative paths -- confirm.
rerun_paths = [f"carbonplan-climatetrace/intermediates/ecoregions_mask/{fn}.zarr" for fn in reruns]

for path in rerun_paths:
    print(path)
    lat, lon = utils.get_lat_lon_tags_from_tile_path(path)
    bounding_box = utils.parse_bounding_box_from_lat_lon_tags(lat, lon)
    min_lat, max_lat, min_lon, max_lon = bounding_box
    # closed ring around the tile bounding box, given as [lon, lat] pairs
    tile = Polygon(
        [
            [min_lon, max_lat],
            [max_lon, max_lat],
            [max_lon, min_lat],
            [min_lon, min_lat],
            [min_lon, max_lat],
        ]
    )
    # keep only the ecoregion shapes that touch this tile
    shapes = ecoregions.loc[(ecoregions.intersects(tile))]

    local_path = f"/home/jovyan/temp/{lat}_{lon}.zarr"
    cloud_path = f"gs://carbonplan-climatetrace/intermediates/ecoregions_mask/{lat}_{lon}.zarr"
    source_path = f"gs://carbonplan-climatetrace/v0/tiles/{lat}_{lon}.zarr"

    # report the state of the existing mask before overwriting it
    eco = utils.open_zarr_file(path)
    list_of_vars = list(eco.variables.keys())
    null_frac = eco.ecoregion.isnull().mean().values
    print(list_of_vars)
    print("null fraction = ", null_frac)

    print("reprocessing")
    target_tile = utils.open_zarr_file(source_path)
    output_da = convert_gdf_into_tiles(
        tile_ds=target_tile,
        # reset the index so row positions run 0..n-1 after filtering
        gdf=shapes.reset_index(drop=True),
        value_col="ECO_ID",
        value_name="ecoregion",
    )

    new_frac = output_da.isnull().mean().values
    print("new null fraction = ", new_frac)

    print("re saving")
    # write locally first, then swap the cloud copy
    utils.save_to_zarr(
        ds=output_da.to_dataset(promote_attrs=True),
        url=local_path,
        mode="w",
    )

    # NOTE(review): the 5 minute pauses presumably give each storage operation time to
    # settle before the next one -- confirm they are still needed.
    time.sleep(300)
    fs.rm(cloud_path, recursive=True)
    time.sleep(300)
    fs.put(local_path, cloud_path, recursive=True)
    time.sleep(300)
    shutil.rmtree(local_path)
    del output_da
    del eco

In [None]:
# Scan every ecoregion mask tile and re-rasterize those that still contain nulls and have
# no nlcd/eosd variable; the new mask is uploaded only when it lowers the null fraction by
# more than 0.01.
# NOTE(review): `reruns` and `rerun_paths` are not used by the loop below, which iterates
# over `mask_paths` instead.
reruns = ["30N_090E"]
rerun_paths = [f"carbonplan-climatetrace/intermediates/ecoregions_mask/{fn}.zarr" for fn in reruns]

mask_folder = "gs://carbonplan-climatetrace/intermediates/ecoregions_mask/"
# fs.ls includes the parent folder itself, skip that link
mask_paths = [tp for tp in fs.ls(mask_folder) if not tp.endswith("/")]

for path in mask_paths:
    print(path)
    eco = utils.open_zarr_file(path)
    list_of_vars = list(eco.variables.keys())
    null_frac = eco.ecoregion.isnull().mean().values
    print(list_of_vars)
    print("null fraction = ", null_frac)

    # only tiles holding just the ecoregion variable are rewritten
    if "nlcd" not in list_of_vars and "eosd" not in list_of_vars and null_frac > 0:
        print("reprocessing")
        lat, lon = utils.get_lat_lon_tags_from_tile_path(path)
        bounding_box = utils.parse_bounding_box_from_lat_lon_tags(lat, lon)
        min_lat, max_lat, min_lon, max_lon = bounding_box

        # closed ring around the tile bounding box, given as [lon, lat] pairs
        tile = Polygon(
            [
                [min_lon, max_lat],
                [max_lon, max_lat],
                [max_lon, min_lat],
                [min_lon, min_lat],
                [min_lon, max_lat],
            ]
        )
        # keep only the ecoregion shapes that touch this tile
        shapes = ecoregions.loc[(ecoregions.intersects(tile))]

        # the existing mask itself supplies the target grid here
        output_da = convert_gdf_into_tiles(
            tile_ds=eco,
            gdf=shapes.reset_index(drop=True),
            value_col="ECO_ID",
            value_name="ecoregion",
        )

        local_path = f"/home/jovyan/temp/{lat}_{lon}.zarr"
        cloud_path = f"gs://carbonplan-climatetrace/intermediates/ecoregions_mask/{lat}_{lon}.zarr"

        new_frac = output_da.isnull().mean().values
        print("new null fraction = ", new_frac)

        # replace the stored tile only when the null fraction dropped by more than 0.01
        if new_frac < (null_frac - 0.01):
            print("re saving")
            utils.save_to_zarr(
                ds=output_da.to_dataset(promote_attrs=True),
                url=local_path,
                mode="w",
            )

            # NOTE(review): the pauses presumably let each storage operation settle
            # before the next one -- confirm they are still needed.
            time.sleep(60)
            fs.rm(cloud_path, recursive=True)
            time.sleep(60)
            fs.put(local_path, cloud_path, recursive=True)
            time.sleep(60)
            shutil.rmtree(local_path)
        del output_da
    del eco

### NLCD


In [None]:
nlcd_conus = xr.open_rasterio(
    "gs://carbonplan-data/raw/nlcd/conus/30m/2001.tif", parse_coordinates=True
)

In [None]:
nlcd_conus[::100, ::100].plot()

In [None]:
# look at the bounding box of NLCD data

from pyproj import Transformer

transformer = Transformer.from_crs(nlcd_conus.crs, "EPSG:4326")

lat, lon = transformer.transform(nlcd_conus.x.values[0], nlcd_conus.y.values[0])
lat, lon

In [None]:
lat, lon = transformer.transform(nlcd_conus.x.values[-1], nlcd_conus.y.values[-1])
lat, lon

In [None]:
# Reproject the CONUS NLCD raster onto each overlapping mask tile and append it to the tile
# store as the "nlcd" variable.
lons_of_interest = ["130W", "120W", "110W", "100W", "090W", "080W"]
lats_of_interest = ["50N", "40N", "30N"]

for lat, lon in product(lats_of_interest, lons_of_interest):
    # use the same filename as target tiles for output
    fn = f"{lat}_{lon}.zarr"
    #     output_path = f'gs://carbonplan-scratch/trace_scratch/nlcd_cache/{fn}'
    output_path = f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/{fn}"

    if not fs.exists(output_path):
        # nothing to append to when the target tile doesn't exist
        print(f"Skipping {fn}, file does not exist")
        continue

    if fs.exists(output_path + "/nlcd/"):
        # this tile has already been processed
        print(f"Skipping {fn}, NLCD data already present")
        continue

    # otherwise reproject the tile
    print(f"Processing {fn}")
    # read in the target tile; reproject_match is called with x/y dimension names
    target_tile = get_tile_in_xr(output_path)
    target_tile = target_tile.rename(lon="x", lat="y")

    # convert NLCD raster into target tile format
    output_da = convert_raster_into_tiles(tile_ds=target_tile, raster=nlcd_conus)
    output_da = output_da.drop_vars("spatial_ref")
    output_da = output_da.squeeze(dim="band", drop=True)
    output_da.attrs = {"crs": "EPSG:4326"}
    # reuse the tile's own coordinate arrays on the reprojected output
    output_da.coords["x"] = target_tile.x
    output_da.coords["y"] = target_tile.y

    target_tile["nlcd"] = output_da
    target_tile = target_tile.rename(x="lon", y="lat")

    # save the output; save_to_zarr is reached through the utils module because the bare
    # name is never imported in this notebook
    utils.save_to_zarr(
        ds=target_tile,
        url=output_path,
        list_of_variables=["nlcd"],
        mode="a",
    )

In [None]:
# load a tile to double check output
ds = get_tile_in_xr(f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/50N_130W.zarr")
ds

In [None]:
ds.nlcd[::100, ::100].plot()

In [None]:
np.unique(ds.nlcd.values)

In [None]:
ds.nlcd.isnull().sum().values

In [None]:
ds.ecoregion.isnull().sum().values

In [None]:
ds.ecoregion[::100, ::100].plot()

In [None]:
nlcd_ak = xr.open_rasterio(
    "https://storage.googleapis.com/carbonplan-data/raw/nlcd/ak/30m/2011.tif",
    parse_coordinates=True,
)

In [None]:
nlcd_ak[::100, ::100].plot()

In [None]:
# look at the bounding box of NLCD data

from pyproj import Transformer

transformer = Transformer.from_crs(nlcd_ak.crs, "EPSG:4326")

lat, lon = transformer.transform(nlcd_ak.x.values[0], nlcd_ak.y.values[0])
lat, lon

In [None]:
lat, lon = transformer.transform(nlcd_ak.x.values[-1], nlcd_ak.y.values[-1])
lat, lon

In [None]:
# Reproject the Alaska NLCD raster onto each overlapping mask tile and append it to the tile
# store as the "nlcd" variable.
lons_of_interest = [
    "150E",
    "160E",
    "170E",
    "180W",
    "170W",
    "160W",
    "150W",
    "140W",
]
lats_of_interest = ["70N", "60N"]

for lat, lon in product(lats_of_interest, lons_of_interest):
    # use the same filename as target tiles for output
    fn = f"{lat}_{lon}.zarr"
    output_path = f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/{fn}"

    if not fs.exists(output_path):
        # nothing to append to when the target tile doesn't exist
        print(f"Skipping {fn}, file does not exist")
        continue

    if fs.exists(output_path + "/nlcd/"):
        # this tile has already been processed
        print(f"Skipping {fn}, NLCD data already present")
        continue

    # otherwise reproject the tile
    print(f"Processing {fn}")
    # read in the target tile; reproject_match is called with x/y dimension names
    target_tile = get_tile_in_xr(output_path)
    target_tile = target_tile.rename(lon="x", lat="y")

    # convert NLCD raster into target tile format
    output_da = convert_raster_into_tiles(tile_ds=target_tile, raster=nlcd_ak)
    output_da = output_da.drop_vars("spatial_ref")
    output_da = output_da.squeeze(dim="band", drop=True)
    output_da.attrs = {"crs": "EPSG:4326"}
    # reuse the tile's own coordinate arrays on the reprojected output
    output_da.coords["x"] = target_tile.x
    output_da.coords["y"] = target_tile.y

    target_tile["nlcd"] = output_da
    target_tile = target_tile.rename(x="lon", y="lat")

    # save the output; save_to_zarr is reached through the utils module because the bare
    # name is never imported in this notebook
    utils.save_to_zarr(
        ds=target_tile,
        url=output_path,
        list_of_variables=["nlcd"],
        mode="a",
    )

In [None]:
# load a tile to double check output
ds = get_tile_in_xr(f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/70N_140W.zarr")
ds

In [None]:
np.unique(ds.nlcd.values)

In [None]:
ds.nlcd[::100, ::100].plot()

In [None]:
ds.ecoregion[::100, ::100].plot()

### IGBP old version

Documentation: https://daac.ornl.gov/daacdata/islscp_ii/vegetation/modis_landcover_xdeg/comp/


In [None]:
# fn = '../data/IGBP/modis_landcover_class_qd.asc'
# # the first 6 lines are additional info not data
# headers = 6
# igbp = np.genfromtxt(fn, skip_header=headers)

In [None]:
# with open(fn) as f:
#     head = [next(f) for x in range(headers)]
# print(head)

In [None]:
# from rasterio.transform import Affine

# # use info in the headers
# ncols = 1440
# nrows = 720
# xll = -180
# yll = -90
# res = .25

# transform = Affine.translation(xll, yll+res*nrows) * Affine.scale(res, -res)
# transform

In [None]:
# import rasterio
# import xarray as xr

# fn = 'IGBP.tif'
# local_target = f"../data/{fn}"
# remote_target = f"gs://carbonplan-scratch/trace_scratch/{fn}"

# os.remove(local_target)
# with rasterio.open(
#     local_target,
#     'w',
#     driver='GTiff',
#     height=igbp.shape[0],
#     width=igbp.shape[1],
#     count=1,
#     dtype=igbp.dtype,
#     crs='+proj=latlong',
#     transform=transform,
# ) as dst:
#     dst.write(igbp, 1)

# dst.close()

# fs.put_file(local_target, remote_target)
# os.remove(local_target)

In [None]:
# igbp = xr.open_rasterio(
#     "https://storage.googleapis.com/carbonplan-scratch/trace_scratch/IGBP.tif",
#     parse_coordinates=True,
# )
# igbp = igbp.squeeze(dim="band", drop=True)
# igbp = igbp.rename(x="lon", y="lat")
# igbp

In [None]:
# igbp.plot()

## IGBP new version

https://lpdaac.usgs.gov/products/mcd12q1v006/  
https://lpdaac.usgs.gov/documents/101/MCD12_User_Guide_V6.pdf


In [None]:
d = "gs://carbonplan-climatetrace/inputs/igbp/"
files = [f for f in fs.ls(d) if not f.endswith("/") and not f.endswith("zarr")]
years = [f.split("/")[-1].split(".")[1] for f in files]
file_df = pd.DataFrame({"file_path": files, "year": years})

In [None]:
# Build one mosaic per year from the raw IGBP rasters and store it as <year>.zarr.
for yr, group in file_df.groupby("year"):
    #     if '2009' in yr:
    # characters 1-4 of the year token are used as the four-digit year
    # (presumably the token looks like "A2003001" -- TODO confirm)
    print(yr[1:5])
    igbp = []
    for i, file in group.iterrows():
        f = xr.open_rasterio(f"gs://{file.file_path}").squeeze(dim="band", drop=True)
        igbp.append(f.to_dataset(name="igbp", promote_attrs=True).chunk({"x": 2400, "y": 2400}))
    # stitch the individual rasters together by their x/y coordinates
    igbp = xr.combine_by_coords(igbp, combine_attrs="drop_conflicts")
    # attrs are saved and restored by hand around the xr.where calls below
    attrs = igbp.attrs
    # NOTE(review): 255 and 17 are presumably the fill value and the water class in the
    # MCD12Q1 legend -- confirm against the user guide linked above.
    igbp = xr.where(igbp == 255, np.nan, igbp)
    igbp = xr.where(igbp == 17, np.nan, igbp)
    igbp.attrs = attrs
    mapper = fsspec.get_mapper(f"gs://carbonplan-climatetrace/inputs/igbp/{yr[1:5]}.zarr")
    igbp.to_zarr(mapper, mode="w")

In [None]:
igbp.igbp[::20, ::20].plot()

In [None]:
tilepaths = [
    f
    for f in fs.ls("gs://carbonplan-climatetrace/intermediates/ecoregions_mask/")
    if not f.endswith("/")
]
len(tilepaths)

In [None]:
# for each tile
for tp in tilepaths[4:]:
    print(tp)
    # load tile
    target_tile = get_tile_in_xr("gs://" + tp)
    # preprocess
    target_tile = target_tile.rename(lon="x", lat="y")
    target_tile = target_tile.coarsen({"x": 10, "y": 10}).mean()
    target_tile.attrs["crs"] = "EPSG:4326"
    # get file names
    fn = tp.split("/")[-1].split(".")[0]
    local_path = f"/home/jovyan/temp/{fn}.zarr"
    cloud_path = f"gs://carbonplan-climatetrace/intermediates/igbp/{fn}.zarr"
    if os.path.exists(local_path):
        shutil.rmtree(local_path)

    # load igbp per year
    for yr in np.arange(2003, 2010):
        print(yr)
        mapper = fsspec.get_mapper(f"gs://carbonplan-climatetrace/inputs/igbp/{yr}.zarr")
        igbp = xr.open_zarr(mapper)
        attrs = igbp.attrs
        igbp = igbp.igbp
        igbp.attrs = attrs

        # transform
        output_da = convert_raster_into_tiles(tile_ds=target_tile, raster=igbp)
        output_da.attrs = {"crs": "EPSG:4326"}
        output_da.coords["x"] = target_tile.x
        output_da.coords["y"] = target_tile.y
        output_da = output_da.rename(x="lon", y="lat")
        output_da = output_da.assign_coords(year=yr).expand_dims("year")

        if not os.path.exists(local_path):
            output_da.to_dataset(promote_attrs=True).to_zarr(local_path, mode="w")
        else:
            output_da.to_dataset(promote_attrs=True).to_zarr(local_path, append_dim="year")

    fs.put(local_path, cloud_path, recursive=True)
    shutil.rmtree(local_path)

In [None]:
mapper = fsspec.get_mapper(f"gs://carbonplan-climatetrace/intermediates/igbp/50N_130W.zarr")
test = xr.open_zarr(mapper)

test.sel(year=2003).igbp[::10, ::10].plot()

In [None]:
test

## Burned area

https://lpdaac.usgs.gov/products/mcd64a1v006/  
https://lpdaac.usgs.gov/documents/875/MCD64_User_Guide_V6.pdf


In [None]:
d = "gs://carbonplan-climatetrace/inputs/modis_burned_area/"
files = [f for f in fs.ls(d) if not f.endswith("/") and not f.endswith("zarr")]
reference_date = [f.split("/")[-1].split(".")[1] for f in files]
rowpath = [f.split("/")[-1].split(".")[2] for f in files]

file_df = pd.DataFrame({"file_path": files, "reference_date": reference_date, "rowpath": rowpath})
file_df["year"] = file_df.reference_date.str[1:5]
file_df["day"] = file_df.reference_date.str[5:]

In [None]:
file_df.head()

In [None]:
len(file_df)

In [None]:
file_df.groupby("rowpath").file_path.count().unique()

In [None]:
len(file_df.rowpath.unique())

In [None]:
# Build a composite of the minimum encoded burn date per pixel for every MODIS
# rowpath, then mosaic all rowpaths into a single zarr store.
final = []

# NOTE(review): `sub` is not defined in any visible cell; presumably it is `file_df` or a
# subset of it -- confirm before running.
for rp, group in sub.groupby("rowpath"):
    print(rp)
    rowpath = []
    # running count of positive pixels over all files of this rowpath
    total = 0
    for i, file in group.iterrows():
        f = xr.open_rasterio(f"gs://{file.file_path}").squeeze(dim="band", drop=True)
        attrs = f.attrs
        # NOTE(review): 0, -1 and -2 are presumably the non-burn codes of MCD64A1 --
        # confirm against the user guide linked above.
        f = xr.where(f.isin([0, -1, -2]), np.nan, f)
        total += (f > 0).astype(int).sum().values
        # add year * 1000 to the pixel values so dates from different years sort correctly
        f += float(file.year) * 1000
        f.attrs = attrs
        rowpath.append(
            f.assign_coords(day=float(file.year) * 1000 + float(file.day)).expand_dims("day")
        )

    # keep the smallest encoded value per pixel across all files
    rowpath = xr.concat(rowpath, dim="day").min(dim="day", keep_attrs=True)
    print(f"total = {total}")
    print("composite = ", (rowpath > 0).astype(int).sum().values)
    print("composite = ", (rowpath > 0).astype(int).mean().values)

    final.append(
        rowpath.to_dataset(name="burned date", promote_attrs=True).chunk({"x": 2400, "y": 2400})
    )

# stitch the rowpath composites together by their x/y coordinates
final = xr.combine_by_coords(final, combine_attrs="drop_conflicts")
mapper = fsspec.get_mapper(f"gs://carbonplan-climatetrace/inputs/modis_burned_area/composite.zarr")
final.to_zarr(mapper, mode="w")

In [None]:
mapper = fsspec.get_mapper(f"gs://carbonplan-climatetrace/inputs/modis_burned_area/composite.zarr")
burned = xr.open_zarr(mapper)
burned["burned date"].attrs = burned.attrs

In [None]:
(burned["burned date"] < 2000000).sum().values

In [None]:
((burned["burned date"] % 1000) == 0).astype(int).sum().values

In [None]:
(burned["burned date"] > 0).sum().values

In [None]:
burned["burned date"][::20, ::20].plot()

In [None]:
tilepaths = [
    f
    for f in fs.ls("gs://carbonplan-climatetrace/intermediates/ecoregions_mask/")
    if not f.endswith("/")
]
len(tilepaths)

In [None]:
# For each mask tile, reproject the burned-date composite onto a 10x coarsened version of
# the tile grid and upload it as its own zarr store.
for tp in tilepaths:
    print(tp)
    # load tile
    target_tile = get_tile_in_xr("gs://" + tp)
    # preprocess: rename to x/y and average over 10 x 10 pixel windows
    target_tile = target_tile.rename(lon="x", lat="y")
    target_tile = target_tile.coarsen({"x": 10, "y": 10}).mean()
    target_tile.attrs["crs"] = "EPSG:4326"
    # get file names
    fn = tp.split("/")[-1].split(".")[0]
    local_path = f"/home/jovyan/temp/{fn}.zarr"
    cloud_path = f"gs://carbonplan-climatetrace/intermediates/modis_burned_area/{fn}.zarr"
    # remove any leftover local store before writing
    if os.path.exists(local_path):
        shutil.rmtree(local_path)

    # transform
    output_da = convert_raster_into_tiles(tile_ds=target_tile, raster=burned["burned date"])
    output_da.attrs = {"crs": "EPSG:4326"}
    # reuse the tile's own coordinate arrays on the reprojected output
    output_da.coords["x"] = target_tile.x
    output_da.coords["y"] = target_tile.y
    output_da = output_da.rename(x="lon", y="lat")
    # the output variable is named with an underscore, unlike the "burned date" input
    output_da.name = "burned_date"

    output_da.to_dataset(promote_attrs=True).to_zarr(local_path, mode="w")

    fs.put(local_path, cloud_path, recursive=True)
    shutil.rmtree(local_path)

In [None]:
mapper = fsspec.get_mapper(
    f"gs://carbonplan-climatetrace/intermediates/modis_burned_area/50N_130W.zarr"
)
test = xr.open_zarr(mapper)

test.burned_date[::10, ::10].plot()

### EOSD


The following block copies the shapefiles from an FTP site.

Legend can be found
[here](https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_lcc_csc/doc/GeoBase_lcc_en_Catalogue.pdf).
[Other docs](https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_lcc_csc/doc/)

[Alternative source for EOSD data in tif format](http://tree.pfc.forestry.ca/)


In [None]:
# import shutil
# import ftplib
# import urllib.request as request
# from contextlib import closing

# # source filepaths
# ftp_server = 'ftp.maps.canada.ca'
# path = '/pub/nrcan_rncan/vector/geobase_lcc_csc/shp_en/'
# # dest filepaths
# dest_path = 'carbonplan-scratch/trace_scratch/EOSD/'

# ftp = ftplib.FTP(ftp_server)
# ftp.login()
# ftp.cwd(path)
# folders = ftp.nlst()

# for folder in folders:
#     fnames = ftp.nlst(folder)
#     for fn in fnames:
#         fp = f'ftp://{ftp_server}{path}{fn}'
#         print(fp)
#         with closing(request.urlopen(fp)) as r:
#             uri = dest_path + fn
#             with fs.open(uri, 'wb') as f:
#                 shutil.copyfileobj(r, f)

The following block sorts the EOSD raw files according to the bounding box of each tile


In [None]:
from carbonplan_trace.v1.glas_allometric_eq import (
    parse_bounding_lat_lon_for_tile,
)
from shapely import geometry

In [None]:
# get the list of tiles of interest and their respective bounding boxes
all_tiles = [
    p.split("/")[-1].split(".")[0]
    for p in fs.ls("gs://carbonplan-scratch/trace_scratch/ecoregions_mask/")
    if not p.endswith("/")
]

tile_poly = []
for tile in all_tiles:
    min_lat, max_lat, min_lon, max_lon = parse_bounding_lat_lon_for_tile(tile)
    tile_poly.append(
        geometry.Polygon(
            [
                [min_lon, min_lat],
                [min_lon, max_lat],
                [max_lon, max_lat],
                [max_lon, min_lat],
            ]
        )
    )

tile_gdf = gpd.GeoDataFrame({"tile_name": all_tiles, "geometry": tile_poly}, crs="EPSG:4326")

In [None]:
# Copy every raw EOSD file into the folder of each tile whose bounding box it intersects.
eosd_folder = "gs://carbonplan-scratch/trace_scratch/EOSD/"
eosd_subfolders = [p for p in fs.ls(eosd_folder) if not p.endswith("/")]

for eosd_subfolder in eosd_subfolders:
    zip_files = fs.ls(eosd_subfolder)
    for zip_file in zip_files:
        fn = zip_file.split("/")[-1]
        input_file = f"gs://{zip_file}"
        eosd_raw = gpd.read_file(input_file)

        # rectangle covering the full extent of this file
        min_lon, min_lat, max_lon, max_lat = eosd_raw.total_bounds
        extent_corners = [
            [min_lon, min_lat],
            [min_lon, max_lat],
            [max_lon, max_lat],
            [max_lon, min_lat],
        ]
        eosd_poly = geometry.Polygon(extent_corners)

        # figure out which tile it belongs to
        overlaps = tile_gdf.intersects(eosd_poly)
        intersect_tiles = tile_gdf.loc[overlaps].tile_name.values
        for intersect_tile in intersect_tiles:
            destination = f"gs://carbonplan-scratch/trace_scratch/EOSD_sorted/{intersect_tile}/{fn}"
            fs.cp(input_file, destination)

In [None]:
mapper = fsspec.get_mapper(f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/50N_080W.zarr")
ds = xr.open_dataset(mapper, engine="zarr", cache=False)
ds

For each tile, concatenate everything and turn into raster


In [None]:
# For each tile folder of sorted EOSD shapefiles: merge the shapefiles, rasterize them onto
# the tile grid and append the cover type to the tile store as the "eosd" variable.
all_tiles = [
    p for p in fs.ls(f"gs://carbonplan-scratch/trace_scratch/EOSD_sorted/") if not p.endswith("/")
]

# the tile grid is rasterized in four index-range quadrants
# (NOTE(review): presumably to limit peak memory -- confirm)
half_slices = [slice(0, 20000), slice(20000, 40000)]

for tile in all_tiles:
    fn = tile.split("/")[-1]
    tile_path = f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/{fn}.zarr"

    if fs.exists(tile_path + "/eosd/"):
        # this tile has already been processed
        print(f"Skipping {fn}, EOSD data already present")
        continue

    print(f"Processing {fn}")
    eosd = []
    for zf in fs.ls(tile):
        print(f"    reading {zf}")
        temp = gpd.read_file("gs://" + zf)
        print(temp.total_bounds)
        eosd.append(temp)

    print("concat")
    eosd = pd.concat(eosd, ignore_index=True)

    # read in the target tile
    target_tile = get_tile_in_xr(tile_path)

    # sort by cover type so that each cover type occupies one contiguous range of row
    # positions; the rasterization below is called with value_col=None and its output is
    # compared against those row positions further down
    eosd = eosd[["COVTYPE", "geometry"]]
    eosd = eosd.sort_values(by="COVTYPE").reset_index(drop=True)

    print("convert")
    # convert EOSD shapes into target tile format, one quadrant at a time; the outer loop
    # runs over lon so the quadrant order is (lat0, lon0), (lat1, lon0), (lat0, lon1),
    # (lat1, lon1)
    quadrants = [
        convert_gdf_into_tiles(
            tile_ds=target_tile.isel(lat=lat_slice, lon=lon_slice),
            gdf=eosd,
            value_col=None,
            value_name="eosd_index",
        )
        .chunk({"lat": 625, "lon": 1250})
        .to_dataset()
        for lon_slice, lat_slice in product(half_slices, half_slices)
    ]

    eosd_index = xr.combine_by_coords(quadrants)["eosd_index"]

    print("nulls in total dataset", eosd_index.isnull().sum().values)

    print("get output dataset")
    eosd_cov = xr.DataArray(
        np.nan,
        dims=["lat", "lon"],
        coords=[target_tile.coords["lat"], target_tile.coords["lon"]],
    ).chunk({"lat": 625, "lon": 1250})

    print("assigning covers")
    covers = eosd.COVTYPE.unique()
    for c in covers:
        # row positions holding this cover type (contiguous because of the sort above)
        rows = np.where(eosd.COVTYPE == c)[0]
        min_ind = rows.min()
        max_ind = rows.max()
        eosd_cov = xr.where(
            ((eosd_index >= min_ind) & (eosd_index <= max_ind)),
            x=c,
            y=eosd_cov,
        )

    print("put to output ds")
    target_tile["eosd"] = eosd_cov

    print("saving")
    # save the output; save_to_zarr is reached through the utils module because the bare
    # name is never imported in this notebook
    utils.save_to_zarr(ds=target_tile, url=tile_path, list_of_variables=["eosd"], mode="a")

    # release the large intermediates before moving on to the next tile
    del eosd_index
    del quadrants
    del eosd_cov
    del target_tile
    del eosd

In [None]:
mapper = fsspec.get_mapper(f"gs://carbonplan-scratch/trace_scratch/ecoregions_mask/70N_100W.zarr")
ds = xr.open_zarr(mapper)
ds

In [None]:
ds.ecoregion.isnull().sum().values

In [None]:
ds.ecoregion[::100, ::100].plot()

In [None]:
ds.eosd.isnull().sum().values

In [None]:
ds.eosd[::100, ::100].plot()

## SRTM

https://lpdaac.usgs.gov/products/srtmgl1v003/#tools


In [None]:
import math
import random


def parse_bounding_lat_lon_for_srtm_tile(srtm_tile):
    """
    Return (min_lat, max_lat, min_lon, max_lon) for a one-degree SRTM tile name.

    The first three characters hold the latitude (positive with an "N" prefix, negative
    otherwise) and the remaining characters hold the longitude (positive with an "E"
    prefix, negative otherwise), e.g. "N45W123" -> (45, 46, -123, -122).
    """
    lat_tag, lon_tag = srtm_tile[:3], srtm_tile[3:]

    lat_sign = 1 if lat_tag.startswith("N") else -1
    min_lat = lat_sign * int(lat_tag[1:])

    lon_sign = 1 if lon_tag.startswith("E") else -1
    min_lon = lon_sign * int(lon_tag[1:])

    # each SRTM tile spans exactly one degree in both directions
    return min_lat, min_lat + 1, min_lon, min_lon + 1


def convert_srtm_tile_to_10_by_10_tile(srtm_file):
    """
    Map an SRTM file path to `[srtm_tile, tile_name]`.

    `srtm_tile` is the file name up to its first dot. `tile_name` is built from the SRTM
    tile's max latitude rounded up to a multiple of 10 and its min longitude rounded down
    to a multiple of 10, formatted like "50N_130W".
    """
    srtm_tile = srtm_file.split("/")[-1].split(".")[0]
    _, max_lat, min_lon, _ = parse_bounding_lat_lon_for_srtm_tile(srtm_tile)

    upper_lat = int(math.ceil(max_lat / 10.0)) * 10
    left_lon = int(math.floor(min_lon / 10.0)) * 10

    lat_tag = f"{abs(upper_lat):02d}" + ("N" if upper_lat >= 0 else "S")
    lon_tag = f"{abs(left_lon):03d}" + ("E" if left_lon >= 0 else "W")

    return [srtm_tile, f"{lat_tag}_{lon_tag}"]

In [None]:
d = "gs://carbonplan-climatetrace/inputs/srtm/"
files = [f for f in fs.ls(d) if not f.endswith("/") and not f.endswith("zarr")]
srtm_tiles = [convert_srtm_tile_to_10_by_10_tile(f)[0] for f in files]
tile_names = [convert_srtm_tile_to_10_by_10_tile(f)[1] for f in files]

file_df = pd.DataFrame({"file_path": files, "srtm_tile": srtm_tiles, "tile_name": tile_names})

In [None]:
file_df.loc[file_df.srtm_tile.str.startswith("N")].sort_values(by="srtm_tile").tail()

In [None]:
len(file_df.tile_name.unique())

In [None]:
def check(row):
    """
    Verify that an SRTM tile lies inside the 10-degree tile it was assigned to.

    `row` must expose `tile_name` (the 10-degree tile, e.g. "50N_130W") and `srtm_tile`
    (the SRTM tile name, e.g. "N45W123").

    Raises
    ------
    ValueError
        If any edge of the SRTM tile falls outside the 10-degree tile. Both bounding boxes
        are printed before raising.
    """
    # the tag and bounding-box helpers are reached through utils, as in the other cells
    lat, lon = utils.get_lat_lon_tags_from_tile_path(row.tile_name)
    min_lat, max_lat, min_lon, max_lon = utils.parse_bounding_box_from_lat_lon_tags(lat, lon)

    smin_lat, smax_lat, smin_lon, smax_lon = parse_bounding_lat_lon_for_srtm_tile(row.srtm_tile)

    inside = (
        min_lat <= smin_lat <= max_lat
        and min_lat <= smax_lat <= max_lat
        and min_lon <= smin_lon <= max_lon
        and min_lon <= smax_lon <= max_lon
    )
    if not inside:
        print(min_lat, max_lat, min_lon, max_lon)
        print(smin_lat, smax_lat, smin_lon, smax_lon)
        raise ValueError(
            f"SRTM tile {row.srtm_tile} is not contained in 10-degree tile {row.tile_name}"
        )

In [None]:
for i, row in file_df.iterrows():
    check(row)

In [None]:
# Mosaic the SRTM rasters of each 10-degree tile into one zarr store and upload it.
for tn, group in file_df.groupby("tile_name"):
    local_path = f"/home/jovyan/temp/{tn}.zarr"
    cloud_path = f"gs://carbonplan-climatetrace/intermediates/srtm/{tn}.zarr"

    if fs.exists(cloud_path):
        # already uploaded
        continue

    print(f"processing {tn}")
    tile = []
    for _, file in group.iterrows():
        f = xr.open_rasterio(f"gs://{file.file_path}").squeeze(dim="band", drop=True)
        # attrs are saved and restored by hand around the xr.where call below
        attrs = f.attrs
        # NOTE(review): -32768 is presumably the SRTM no-data value -- confirm
        f = xr.where(f.isin([-32768]), np.nan, f)
        f.attrs = attrs
        # keep the first 3600 x 3600 pixels
        # (NOTE(review): presumably drops a row/column shared with the neighbouring
        # tile -- confirm)
        f = f.isel(x=slice(0, 3600), y=slice(0, 3600))
        f = f.rename({"x": "lon", "y": "lat"})
        tile.append(
            f.to_dataset(name="srtm", promote_attrs=True).chunk({"lon": 1200, "lat": 1200})
        )

    # stitch the individual rasters together by their lon/lat coordinates
    tile = xr.combine_by_coords(tile, combine_attrs="drop_conflicts").chunk(
        {"lon": 1200, "lat": 1200}
    )
    tile.attrs = {"crs": "EPSG:4326"}

    # save_to_zarr is reached through the utils module because the bare name is never
    # imported in this notebook
    utils.save_to_zarr(ds=tile, url=local_path, list_of_variables=["srtm"], mode="w")

    fs.put(local_path, cloud_path, recursive=True)
    shutil.rmtree(local_path)

In [None]:
mapper = fsspec.get_mapper(f"gs://carbonplan-climatetrace/intermediates/srtm/50N_130W.zarr")
test = xr.open_zarr(mapper)
test.srtm[::10, ::10].plot()