# Test add water occurrence

The water watch algorithm makes use of the Global Water Occurrence dataset where water presence cannot be detected from the image (clouds, missing data, etc.).
Because the algorithm needs both inputs in a single datacube, we need to add the water occurrence as a band to the datacube that holds the optical data.

In this notebook, this process is tested.

In [None]:
# imports
from typing import List, Dict, Tuple, Union
from pathlib import Path

import geojson
from openeo import connect, Connection
from openeo.rest.datacube import DataCube
from pyproj import CRS, Proj, Transformer
from pyproj.aoi import AreaOfInterest
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import transform

from utils import Reservoir

In [None]:
# Connect to backend:
# Known openEO backend endpoints; `backend_url` below selects the one to use.
openeo_platform_url: str = "openeo.cloud"
vito_url: str = "https://openeo.vito.be/openeo/1.0"
vito_dev_url: str = "https://openeo-dev.vito.be/openeo/1.0"

backend_url = vito_url

# Open the connection and authenticate through OIDC with the "egi" provider.
con: Connection = connect(backend_url)
con.authenticate_oidc(provider_id="egi")

# Switches later cells between a small test AoI / short period (True)
# and the full AoI / long period (False).
debug = True

# Local directory for downloaded results; created here if it does not exist.
out_dir: Path = Path("output")
out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Find Level 1C product of Sentinel 2 mission
collections = con.list_collections()
# The collection id differs per backend, so pick it based on the selected URL.
if backend_url in (vito_url, vito_dev_url):
    collection_id = "SENTINEL2_L1C_SENTINELHUB"
elif backend_url == openeo_platform_url:
    collection_id = "SENTINEL2_L1C"
else:
    # Fail here with a clear message instead of a NameError on
    # `collection_id` when the collection is loaded further down.
    raise ValueError(
        f"No Sentinel-2 L1C collection id is known for backend {backend_url!r}"
    )

In [None]:
# Get reservoirs from database
reservoir_dir: Path = out_dir / "reservoirs"

reservoirs: List[Reservoir] = Reservoir.from_gcp(reservoir_dir)

In [None]:
import math

def get_utm_zone(lon: float) -> int:
    """Return the UTM longitudinal zone number (1-60) containing ``lon``.

    Zones are 6 degrees wide, start at 180°W, and include their western
    boundary: 12.0°E belongs to zone 33 and -180.0 to zone 1. The modulo
    wraps +180.0 (the same meridian as -180.0) back to zone 1.

    The special zone shapes around Norway and Svalbard are not handled.

    :param lon: longitude in decimal degrees, expected in [-180, 180].
    :return: the zone number as an ``int``.
    """
    return math.floor((lon + 180) / 6) % 60 + 1

## Setup AoI and parameters
In case of debug, we just take the bounding box of one of the reservoirs in Czechia that shows seasonal variation and extend it so that the reservoirs fit.
Otherwise the entirety of Czechia is used.

In [None]:
if debug:
    # Small test AoI: a hard-coded GeoJSON polygon, parsed and converted to a
    # shapely Polygon built from its outer ring.
    geojson_str = "{\"type\":\"Polygon\",\"coordinates\":[[[16.258372886421807,49.561646293673824],[16.314909857006697,49.561646293673824],[16.314909857006697,49.58980547068479],[16.258372886421807,49.58980547068479],[16.258372886421807,49.561646293673824]]],\"geodesic\":false}"
    gjson: geojson.Polygon = geojson.loads(geojson_str)
    bbox = Polygon(gjson.coordinates[0])
else:
    # entire Czechia
    bbox = Polygon([[12.09,51.06],[12.09, 48.55], [18.87,48.55], [18.87, 51.06], [12.09,51.06]])

# convert bbox polygon to utm zone
wgs84: CRS = CRS('EPSG:4326')
# The UTM zone is derived from the smallest x (longitude) of the bbox ring.
utm_zone: int = get_utm_zone(min(bbox.exterior.xy[0]))
utm: CRS = CRS(proj='utm', zone=utm_zone)
# always_xy=True makes both transformers take and return (x, y) = (lon, lat) order.
project_to_utm: Transformer = Transformer.from_crs(wgs84, utm, always_xy=True)
project_to_latlon: Transformer = Transformer.from_crs(utm, wgs84, always_xy=True)

bbox_utm = transform(project_to_utm.transform, bbox)
if debug:
    # transform and buffer 1km so all imagery plus buffers is loaded.
    bbox_utm = bbox_utm.buffer(1000.)
    # Keep the lat/lon bbox in sync with the buffered UTM bbox.
    bbox = transform(project_to_latlon.transform, bbox_utm)

# Readable band names and the collection band codes they replace; the two
# lists are paired by position.
band_names = ["green", "nir", "swir", "cloudmask", "cloudp"]
band_codes = ["B03", "B08", "B11", "CLM", "CLP"]

# after crs transform, we get a distorted box, take extremities as bbox
xys = bbox_utm.exterior.coords.xy
bbox_openeo = {
    "west": min(xys[0]),
    "east": max(xys[0]),
    "south": min(xys[1]),
    "north": max(xys[1]),
    # NOTE(review): to_authority() can return None when pyproj finds no
    # matching authority code, which would make this join fail - confirm.
    "crs": ":".join(utm.to_authority())
}

print(f"openeo spatial extent: {bbox_openeo}")
print(f"UTM zone: {utm_zone}")
# Temporal extent: a short period while debugging, a multi-year one otherwise.
if debug:
    start = "2021-05-01"
    stop = "2021-08-01"
else:
    start = "2017-04-01"
    stop = "2021-01-01"

## Buffer reservoirs using 300m buffer
In order to pick up on flooding / high water levels, we buffer the reservoirs using a 300m buffer. As the AoI needs to be given to the `chunk_polygon` method, we do this locally and not on the cluster.

In [None]:
# Select reservoirs within bbox and buffer 300m
from copy import copy

def buffer_in_utm(reservoir, buffer_m):
    """Return a copy of ``reservoir`` with its geometry buffered in UTM space.

    The geometry is projected with the module-level ``project_to_utm``
    transformer, buffered by ``buffer_m`` (in the units of that projection),
    and projected back with ``project_to_latlon``. The input reservoir is
    left untouched.

    :param reservoir: object exposing a shapely geometry as ``.geometry``.
    :param buffer_m: buffer distance passed to shapely's ``buffer``.
    :return: the buffered copy, or ``None`` when the reservoir lies more than
        one UTM zone away from the module-level ``utm_zone`` or when the
        geometry operations raise ``ValueError``. Callers are expected to
        filter out ``None``.
    """
    try:
        geometry = reservoir.geometry
        min_lon = geometry.bounds[0]
        if abs(get_utm_zone(min_lon) - utm_zone) > 1:
            # If not close to utm zone, then not in AoI
            return None
        # The second positional argument of buffer() is the number of segments
        # used per quarter circle; 1 keeps the buffered outline coarse.
        buffered_geom = transform(project_to_utm.transform, geometry).buffer(buffer_m, 1)
        latlon_geom = transform(project_to_latlon.transform, buffered_geom)
    except ValueError as err:
        # Report the failure and skip the reservoir instead of silently
        # returning it with its original, unbuffered geometry.
        print(f"Could not buffer reservoir ({err}): {reservoir.geometry.wkt}")
        return None
    new_res = copy(reservoir)
    new_res.geometry = latlon_geom
    return new_res
    

# Buffer every reservoir by 300 m, drop those buffer_in_utm rejected (None),
# and keep only the ones whose buffered geometry is covered by the AoI.
selected = [
    buffered
    for buffered in (buffer_in_utm(res, 300.) for res in reservoirs)
    if buffered is not None and bbox.covers(buffered.geometry)
]
# All selected geometries combined, for use as the chunking geometry.
selected_mp = MultiPolygon([res.geometry for res in selected])
selected[0].geometry

## Load optical data

In [None]:
# Load the selected bands for the AoI and period, then relabel the band codes
# with the readable names from `band_names` (paired by position).
dc_optical: DataCube = con.load_collection(
        collection_id=collection_id,
        spatial_extent=bbox_openeo,
        temporal_extent=(start, stop),
        bands=band_codes
    ).rename_labels(dimension="bands", source=band_codes, target=band_names)

## Filter optical data

In [None]:
def load_udf(path: Path) -> str:
    """Return the full text of the UDF source file at ``path``.

    The file is opened read-only (plain ``'r'``), so it can be loaded without
    write permission, and decoded as UTF-8 independent of the platform's
    default encoding.

    :param path: location of the UDF source file.
    :return: the file content as a single string.
    :raises OSError: if the file cannot be opened for reading.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

udf_path: Path = Path.cwd().parent / "udfs" / "filter_mostly_clean_images.py"
quality_score_udf = load_udf(udf_path)

In [None]:
from shapely.geometry.base import BaseGeometry

def filter_mostly_clean_images(
    dc: DataCube,
    geometry: BaseGeometry,
    quality_score_udf: str,
    cutoff_percentile: int = 35,
    score_percentile: int = 75,
    quality_band: str = 'cloudp',
) -> DataCube:
    """
    Apply the quality-score UDF to ``dc`` per polygon chunk of ``geometry``.

    ``quality_score_udf`` is the UDF source code. The percentile settings and
    the quality band name are handed to ``chunk_polygon`` as its context.
    """
    udf_settings = {
        "cutoff_percentile": cutoff_percentile,
        "quality_band": quality_band,
        "score_percentile": score_percentile
    }

    def run_quality_udf(data):
        # Executed for every chunk produced by chunk_polygon.
        return data.run_udf(udf=quality_score_udf, runtime="Python")

    return dc.chunk_polygon(chunks=geometry, process=run_quality_udf, context=udf_settings)

# filtered_dc: DataCube = filter_mostly_clean_images(dc_optical, bbox, quality_score_udf)
filtered_dc: DataCube = filter_mostly_clean_images(dc_optical, selected_mp, quality_score_udf)

## Download filtered data

In [None]:
# Download cube for the fix
job = filtered_dc.create_job("netcdf", title="get_filtered_data", description="get filtered datacube")
job = job.start_and_wait()

In [None]:
from openeo.rest.job import BatchJob

job = BatchJob("j-41641109c6d14ad2bc4b6ea9bcb00654", con)
filtered_path = out_dir / "filtered.nc"
job.get_results().get_assets()[0].download(filtered_path)

In [None]:
import rioxarray
import xarray as xr

filtered_path = out_dir / "filtered.nc"
ds_filtered: xr.Dataset = rioxarray.open_rasterio(filtered_path)
ds_filtered

## Visualize filtered DataSet

In [None]:
import cartopy.crs as ccrs

import geoviews as gv
import holoviews as hv
import numpy as np

from holoviews import opts, streams
from holoviews.element.tiles import OSM

gv.extension("bokeh","matplotlib")

In [None]:
kdims = ["x", "y", "t"]
vdims = ["green", "nir", "swir", "cloudmask", "cloudp"]

hv.Dimension.type_formatters[np.datetime64] = '%Y-%m-%d-%H:%M'  # readable time format
gv_filtered = gv.Dataset(ds_filtered, kdims=kdims, vdims=vdims, crs=ccrs.UTM(utm_zone)).redim(x="lon", y="lat")
print(repr(gv_filtered))

In [None]:
dmap = gv_filtered.to(gv.Image, ["lon", "lat"], "green", group="filtered", label="filtered", datatype=["xarray"], dynamic=True)
overlay = OSM() * dmap
overlay.opts(
    opts.Image(cmap="turbo", colorbar=True, clim=(0, 2**12), alpha=0.8, height=500, width=500, tools=["hover"]),
    # opts.Image(cmap="turbo", colorbar=True, clim=(0, 100), alpha=0.8, height=500, width=500, tools=["hover"]),
    opts.Tiles(height=500, width=500))

overlay

## Load udf

In [None]:
def load_udf(path: Path) -> str:
    """Return the full text of the UDF source file at ``path``.

    The file is opened read-only (plain ``'r'``), so it can be loaded without
    write permission, and decoded as UTF-8 independent of the platform's
    default encoding.

    :param path: location of the UDF source file.
    :return: the file content as a single string.
    :raises OSError: if the file cannot be opened for reading.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

udf_path: Path = Path.cwd().parent / "udfs" / "preprocess_polygons.py"
preprocess_polygons_udf = load_udf(udf_path)

## Execute Locally

In [None]:
from openeo.udf import execute_local_udf
from openeo.udf.udf_data import UdfData
preprocessed: UdfData = execute_local_udf(preprocess_polygons_udf, filtered_path, fmt='netcdf')

In [None]:
preprocessed_dcs: List[DataCube] = preprocessed.get_datacube_list()
preprocessed_dcs[0]

## Run UDF on backend

In [None]:
def preprocess_polygons(
    dc: DataCube,
    geometry: BaseGeometry,
    minimum_filled_fraction: float = 0.35,
    quality_check_bands: Union[List[str], None] = None,
) -> DataCube:
    """
    Apply the polygon preprocessing UDF to ``dc`` per polygon chunk of ``geometry``.

    The UDF source is read from the module-level ``preprocess_polygons_udf``.
    ``minimum_filled_fraction`` and ``quality_check_bands`` are handed to
    ``chunk_polygon`` as its context.

    :param dc: datacube to preprocess.
    :param geometry: geometry used to chunk the datacube.
    :param minimum_filled_fraction: value stored under the context key of the
        same name (default 0.35).
    :param quality_check_bands: band names stored under the context key of the
        same name; defaults to ``["green", "nir", "swir"]``.
    :return: the datacube returned by ``chunk_polygon``.
    """
    # Build the default per call so no list object is shared between calls.
    if quality_check_bands is None:
        quality_check_bands = ["green", "nir", "swir"]

    def run_preprocess_udf(data):
        # Executed for every chunk produced by chunk_polygon.
        return data.run_udf(udf=preprocess_polygons_udf, runtime="Python")

    return dc.chunk_polygon(chunks=geometry, process=run_preprocess_udf, context={
        "minimum_filled_fraction": minimum_filled_fraction,
        "quality_check_bands": list(quality_check_bands)
    })

# preprocess_polygons reads its UDF from the module-level
# `preprocess_polygons_udf`; its third positional parameter is
# `minimum_filled_fraction`, so no UDF source may be passed there.
preprocessed_dc: DataCube = preprocess_polygons(filtered_dc, selected_mp)

In [None]:
job = preprocessed_dc.create_job("GTiff", title="quality_score", description="test_quality_score")
job = job.start_and_wait()

In [None]:
from openeo.rest.job import BatchJob, JobResults, ResultAsset
import re

job = BatchJob("j-aa1d35202dfb48af88f2f2aafb3903e2", con)
preprocessed_path: Path = out_dir / "preprocessed"
results: JobResults = job.get_results()
assets: List[ResultAsset] = results.get_assets()
for asset in assets:
    asset.download(preprocessed_path / asset.name)

In [None]:
len(assets)

In [None]:
from datetime import date
import pandas as pd
import rioxarray
import re
import xarray as xr
from typing import Tuple

preprocessed_path: Path = out_dir / "preprocessed"

# Path.glob yields files in arbitrary order; sort them so the dates derived
# below follow file-name order (chronological as long as the names differ only
# in their date part - presumably the case for the downloaded assets).
preprocessed_paths = sorted(preprocessed_path.glob("*.tif"))
# Match each file name once and reuse the (year, month, day) groups for both
# the string dates and the `date` objects.
date_pattern = re.compile(r".+(\d{4})-(\d{2})-(\d{2}).+")
date_parts = [date_pattern.match(path.name).groups() for path in preprocessed_paths]
preprocessed_dates: List[str] = ["-".join(parts) for parts in date_parts]
preprocessed_pd: List[Tuple[date, Path]] = [
    (date(*map(int, parts)), path)
    for parts, path in zip(date_parts, preprocessed_paths)
]

# check same start
print(preprocessed_dates[0])
print(preprocessed_paths[0])
print(preprocessed_pd[0])

t: xr.Variable = xr.Variable("t", pd.DatetimeIndex(preprocessed_dates))
ds = rioxarray.open_rasterio(preprocessed_paths[0])

In [None]:
ds

In [None]:
# Open every downloaded raster, tag it with its date as a "t" coordinate,
# and stack all of them along the `t` index variable.
das = []
for d, p in preprocessed_pd:
    da = rioxarray.open_rasterio(p)
    # Add the date to the array's coordinates before re-assigning them.
    coords = da.coords
    coords.update({"t": d})
    da = da.assign_coords(coords)
    das.append(da)
# `t` was built from `preprocessed_dates`, which derives from the same path
# list as `preprocessed_pd`.
combined: xr.DataArray = xr.concat(das, dim=t)

In [None]:
ds: xr.Dataset = combined.to_dataset('band').rename({1: "green", 2: "nir", 3: "swir", 4: "cloudmask", 5: "cloudp"})
ds

In [None]:
import cartopy.crs as ccrs

import geoviews as gv
import holoviews as hv
import numpy as np

from holoviews import opts, streams
from holoviews.element.tiles import OSM

gv.extension("bokeh","matplotlib")

In [None]:
kdims = ["x", "y", "t"]
vdims = ["green", "nir", "swir", "cloudmask", "cloudp"]

hv.Dimension.type_formatters[np.datetime64] = '%Y-%m-%d-%H:%M'  # readable time format
gv_preprocessed = gv.Dataset(ds, kdims=kdims, vdims=vdims, crs=ccrs.UTM(utm_zone)).redim(x="lon", y="lat")

print(repr(gv_preprocessed))

In [None]:
dmap = gv_preprocessed.to(gv.Image, ["lon", "lat"], "green", group="preprocessed_data", label="preprocessed", datatype=["xarray"], dynamic=True)
overlay = OSM() * dmap
overlay.opts(
    opts.Image(cmap="turbo", colorbar=True, clim=(0, 7000), alpha=0.8, height=500, width=500, tools=["hover"]),
    opts.Tiles(height=500, width=500))

overlay

## Load water occurrence data

In [None]:
con.describe_collection("GLOBAL_SURFACE_WATER")

In [None]:
dc_wo: DataCube = con.load_collection(
    collection_id="GLOBAL_SURFACE_WATER",
    spatial_extent=bbox_openeo,
    bands=["occurrence"]
)

As the temporal extent works in a weird way with the water occurrence data (it covers either 1984 until 2019, or 1984 until 2020), we have to filter after loading both date ranges. After filtering, we want to drop the t-axis, because it does not correlate with time the same way as the t-axis of the optical datacube.

In [None]:
dc_wo_latest: DataCube = dc_wo.filter_temporal(extent=("2019-12-31", "2020-01-02")).drop_dimension("t")

Now we resample spatially onto the optical datacube

In [None]:
dc_wo_resampled: DataCube = dc_wo_latest.resample_cube_spatial(preprocessed_dc, method="nearest")

## Calculate MNDWI

Next step is to calculate the MNDWI of the datacube and merge this cube with the JRC datacube.

In [None]:
green: DataCube = preprocessed_dc.band("green")
swir: DataCube = preprocessed_dc.band("swir")
mndwi: DataCube = (green - swir) / (green + swir)

Can now be done quicker using built-in openeo support

In [None]:
from openeo.extra import spectral_indices
spectral_indices.list_indices()

## Merge Water Occurrence and MNDWI

Finally we can merge the two DataCubes: first we need to add a dimension that differs between both cubes if we want to keep both values.

In [None]:
# Download cube for the fix
job = mndwi.create_job("netcdf", title="get_timestamps", description="get timestamps for aggregate workaround")
job = job.start_and_wait()

In [None]:
timestamps_path = out_dir / "timestamps.nc"
job.get_results().get_assets()[0].download(timestamps_path)

In [None]:
import rioxarray
import xarray as xr

timestamps_path = out_dir / "timestamps.nc"
da_timestamps: xr.DataArray = rioxarray.open_rasterio(timestamps_path)
da_timestamps

In [None]:
import cftime
import datetime

# Format every time coordinate of the downloaded cube as a string.
timestamps = [cftime.DatetimeGregorian.strftime(stamp) for stamp in da_timestamps["t"].values]
# add today as aggregate temporal is left-inclusive
timestamps.append(datetime.datetime.now().strftime("%Y-%m-%d 00:00:00"))
# Pair each timestamp with its successor to get consecutive [start, end] intervals.
daterange = [[begin, end] for begin, end in zip(timestamps, timestamps[1:])]
daterange

In [None]:
from openeo import processes

# Give the MNDWI cube a "bands" dimension with the single label "MNDWI".
mndwi_mergeable = mndwi.add_dimension(name="bands", label="MNDWI", type="bands")
# Workaround for https://discuss.eodc.eu/t/merging-datacubes/310/5?u=jaapel
# Aggregate over the consecutive intervals in `daterange`, taking the max per interval.
mndwi_mergeable = mndwi_mergeable.aggregate_temporal(daterange, reducer=processes.max)
# Show the resulting dimension names as the cell output.
mndwi_mergeable.metadata.dimension_names()

Multiply the datacube by 1.0 otherwise we try to merge cubes with different data types (int16 vs float32)

In [None]:
# Drop the existing "bands" dimension and multiply by 1.0 to get float values.
dc_wo_m: DataCube = dc_wo_resampled.drop_dimension("bands") * 1.0
# dc_wo_m = dc_wo_m.add_dimension(name="", label="", type="temporal")
# Re-add a "bands" dimension, now with the single label "wo".
dc_wo_m = dc_wo_m.add_dimension(name="bands", label="wo", type="bands")
# Show the resulting dimension names as the cell output.
dc_wo_m.metadata.dimension_names()

In [None]:
mndwi_m = mndwi.add_dimension(name="bands", label="MNDWI", type="bands")
mndwi_m.metadata.dimension_names()

## Merge DataCube

In [None]:
from openeo import processes

# dc_wo_m: DataCube = dc_wo_resampled.add_dimension("source", "JRC", type="other")
# dc_optical_m: DataCube = dc_optical.add_dimension("source", "S2_L1C", type="other")
# dc_merged: DataCube = dc_optical_m.merge_cubes(dc_wo_m, overlap_resolver=processes.max)
# `mndwi_m` already carries the "bands" dimension (label "MNDWI"); adding a
# dimension with an existing name is an error, so merge it directly.
# NOTE(review): `mndwi_mergeable` holds the aggregate_temporal workaround for
# merging - confirm which of the two cubes is meant to be merged here.
dc_merged: DataCube = mndwi_m.merge_cubes(dc_wo_m)

Download the to-be-merged dc for the water occurrence

In [None]:
job = dc_wo_m.create_job("netcdf", title="dl_merging_wo", description="download water occurrence merge cube")
job = job.start_and_wait()

In [None]:
dc_wo_m_path = out_dir / "m_wo.nc"
job.get_results().get_assets()[0].download(dc_wo_m_path)

In [None]:
import rioxarray
import xarray as xr

dc_wo_m_path = out_dir / "m_wo.nc"
da_wo_m: xr.Dataset = rioxarray.open_rasterio(dc_wo_m_path)
da_wo_m

In [None]:
import cartopy.crs as ccrs

import geoviews as gv
import holoviews as hv
import numpy as np

from holoviews import opts, streams
from holoviews.element.tiles import OSM

gv.extension("bokeh","matplotlib")

In [None]:
kdims = ["x", "y"]
vdims = ["wo"]

hv.Dimension.type_formatters[np.datetime64] = '%Y-%m-%d-%H:%M'  # readable time format
gv_wo_m = gv.Dataset(da_wo_m, kdims=kdims, vdims=vdims, crs=ccrs.UTM(utm_zone)).redim(x="lon", y="lat")
print(repr(gv_wo_m))

In [None]:
dmap = gv_wo_m.to(gv.Image, ["lon", "lat"], "wo", group="mergeable_wo", label="mergeable_wo", datatype=["xarray"])
overlay = OSM() * dmap
overlay.opts(
    opts.Image(cmap="turbo", colorbar=True, clim=(0, 100), alpha=0.8, height=500, width=500, tools=["hover"]),
    # opts.Image(cmap="turbo", colorbar=True, clim=(0, 100), alpha=0.8, height=500, width=500, tools=["hover"]),
    opts.Tiles(height=500, width=500))

overlay

Downloading the mndwi cube

In [None]:
job = mndwi_mergeable.create_job("netcdf", title="mndwi_merging", description="download mndwi merge cube")
job = job.start_and_wait()

In [None]:
merge_mndwi_path = out_dir / "mergeable_mndwi.nc"
job.get_results().get_assets()[0].download(merge_mndwi_path)

In [None]:
import rioxarray
import xarray as xr

merge_mndwi_path = out_dir / "mergeable_mndwi.nc"
da_merged_mndwi: xr.DataArray = rioxarray.open_rasterio(merge_mndwi_path)
da_merged_mndwi

In [None]:
import cartopy.crs as ccrs

import geoviews as gv
import holoviews as hv
import numpy as np

from holoviews import opts, streams
from holoviews.element.tiles import OSM

gv.extension("bokeh","matplotlib")

In [None]:
kdims = ["x", "y", "t"]
vdims = ["MNDWI"]

hv.Dimension.type_formatters[np.datetime64] = '%Y-%m-%d-%H:%M'  # readable time format
gv_mndwi_m = gv.Dataset(da_merged_mndwi, kdims=kdims, vdims=vdims, crs=ccrs.UTM(utm_zone)).redim(x="lon", y="lat")
print(repr(gv_mndwi_m))

In [None]:
dmap = gv_mndwi_m.to(gv.Image, ["lon", "lat"], "MNDWI", group="mergeable_mndwi", label="mergeable_mndwi", datatype=["xarray"], dynamic=True)
overlay = OSM() * dmap
overlay.opts(
    opts.Image(cmap="turbo", colorbar=True, clim=(-1, 1), alpha=0.8, height=500, width=500, tools=["hover"]),
    # opts.Image(cmap="turbo", colorbar=True, clim=(0, 100), alpha=0.8, height=500, width=500, tools=["hover"]),
    opts.Tiles(height=500, width=500))

overlay

## Download and inspect result

In [None]:
from openeo.rest.job import RESTJob
job: RESTJob = dc_merged.create_job("netcdf", title="merging_wo", description="merging water occurrence.")
job = job.start_and_wait()

In [None]:
merged_path = out_dir / "merged.nc"
job.get_results().get_assets()[0].download(merged_path)

In [None]:
import rioxarray
import xarray as xr

merged_path = out_dir / "merged.nc"
fixed_merged_path: Path = out_dir / "merged_fixed.nc"
ds_merged: xr.Dataset = rioxarray.open_rasterio(merged_path)
ds_merged = ds_merged.drop("crs")
ds_merged.to_netcdf(fixed_merged_path)
ds_merged

In [None]:
import cartopy.crs as ccrs

import geoviews as gv
import holoviews as hv
import numpy as np

from holoviews import opts, streams
from holoviews.element.tiles import OSM

gv.extension("bokeh","matplotlib")

In [None]:
kdims = ["x", "y", "t"]
vdims = ["wo", "MNDWI"]

hv.Dimension.type_formatters[np.datetime64] = '%Y-%m-%d-%H:%M'  # readable time format
gv_merged = gv.Dataset(ds_merged, kdims=kdims, vdims=vdims, crs=ccrs.UTM(utm_zone)).redim(x="lon", y="lat")
# gv_merged = gv.Dataset(da_merged, kdims=kdims, vdims=vdims).redim(x="lon", y="lat")
print(repr(gv_merged))

In [None]:
dmap = gv_merged.to(gv.Image, ["lon", "lat"], "wo", group="raw_data", label="raw", datatype=["xarray"], dynamic=True)
overlay = OSM() * dmap
overlay.opts(
    # opts.Image(cmap="turbo", colorbar=True, clim=(0, 100), alpha=0.8, height=500, width=500, tools=["hover"]),
    opts.Image(cmap="turbo", colorbar=True, clim=(-1, 1), alpha=0.8, height=500, width=500, tools=["hover"]),
    opts.Tiles(height=500, width=500))

overlay