In [None]:
import os

# This is required while pygeos is installed
os.environ['USE_PYGEOS'] = '0'
import json

import folium
import geohash
import geopandas
import holoviews as hv
import hvplot.xarray  # noqa: F401
import numpy as np
import odc.geo.xr
import shapely
import xarray as xr
from dea_tools.spatial import xr_rasterize, xr_vectorize
from fsspec.implementations.http import HTTPFileSystem
from holoviews import opts

from emit_tools import emit_xarray
from utils import get_earthdata_token, get_rgb_dataset, load_emit_granule

# Waterbody Extraction

This notebook is a worked example. The goal of the analysis is to identify waterbodies that
may have an algal bloom. To do this, we identify waterbodies, examine their spectra, and then
calculate an absorption depth, before using a threshold to select the waterbodies with high
values.

## Load data

First we load data.


In [None]:
%%time
# Loading data can take around 3-4 minutes on a 100 Mbps connection

# See README.md for instructions on how to get an Earthdata token
token = get_earthdata_token()

# Refer to the README.md for instructions on how to find different granule IDs
granule = "EMIT_L2A_RFL_001_20230131T221923_2303114_008"  # Kerang Lakes

# Load the granule into `ds`, the dataset every later cell works on.
# The bare `ds` on the last line displays it as the cell output.
ds = load_emit_granule(granule, token)
ds

## Cleaning data

Next we clean up the empty bands, before creating a simple water/not-water
layer using a normalised-difference wetness index. The index is built from two
bands that had high or low values over water, as identified in the previous
animations notebook.

In [None]:
# Clean up empty bands.
# Cells whose reflectance equals -0.01 are treated as empty and are masked out
# of the dataset.
# NOTE(review): fillna(np.nan) looks like a no-op on values; presumably it is
# here to coerce dtypes before masking -- confirm before removing it.
not_empty = ds.reflectance != -0.01
ds = ds.fillna(np.nan).where(not_empty)

In [None]:
# Create a water layer from a normalised difference of two bands: the one
# nearest 450 (high over water) and the one nearest 1275 (low over water).
high = ds.reflectance.sel(bands=450, method="nearest")
low = ds.reflectance.sel(bands=1275, method="nearest")
normalised_difference = (high - low) / (high + low)

# A positive index is classed as water
water = normalised_difference > 0

# Keep only the cells flagged as water; everything else is masked out.
# NOTE(review): the fillna() call presumably converts the boolean flags to
# floats before masking -- confirm before simplifying.
water_as_float = water.fillna(float("nan"))
ds["water"] = water_as_float.where(water)

In [None]:
# Quick-look plot of the water mask
ds["water"].hvplot(frame_width=600, aspect="equal")

## Filtering and creating IDs

First, we want to filter by area so we only have large waterbodies. We
do this using projected coordinates, rather than latitude/longitudes.

Next we compute a geohash of the centroid of each waterbody, which can be
used to help give a label to the waterbody, without needing to know the
name of it.

Note that we also shrink each waterbody inwards by a small amount (`SHRINK_AMOUNT`, in metres) before rasterising it.

In [None]:
MIN_AREA = 80  # Hectares


def add_geohash(row):
    """Return the precision-9 geohash of the centroid of ``row.geometry``.

    The centroid's ``y`` is passed as the first coordinate and its ``x`` as
    the second.
    """
    centroid = row.geometry.centroid
    return geohash.encode(centroid.y, centroid.x, precision=9)


# Vectorise the water layer into polygons, using only the cells flagged as water
is_water = ds.water.values == 1
water_polygons = xr_vectorize(ds.water, crs="epsg:4326", mask=is_water)

# Area is measured in the projected CRS epsg:3577 and divided by 10000 so it is
# comparable with MIN_AREA
projected_area = water_polygons.to_crs("epsg:3577").area
water_polygons["area"] = projected_area / 10000

# Discard every polygon whose area is below MIN_AREA
too_small = water_polygons["area"] < MIN_AREA
water_polygons = water_polygons.drop(index=water_polygons[too_small].index)

# Label each remaining polygon with the level-9 geohash of its centroid
water_polygons["geohash"] = [
    add_geohash(polygon) for _, polygon in water_polygons.iterrows()
]

# Sequential ID column, starting at 1
polygon_count = water_polygons.shape[0]
water_polygons["id"] = range(1, polygon_count + 1)

# Show us what we've got
print(
    f"Found {polygon_count} water polygons that are larger than {MIN_AREA} hectare(s)"
)

In [None]:
# View the water layer on an interactive map

# Reduce the polygons by a small amount (in meters)
SHRINK_AMOUNT = 100

m = folium.Map(control_scale=True, tiles=None)

# Shrink every polygon once, outside the loop. The buffer is applied in the
# projected CRS (epsg:3577) and the result converted back to epsg:4326 for the
# map. This does not modify `water_polygons` itself.
shrunk_geometries = (
    water_polygons.geometry.to_crs("epsg:3577")
    .buffer(-1 * SHRINK_AMOUNT)
    .to_crs("epsg:4326")
)

# Add one layer per waterbody so that each polygon carries its own tooltip and
# popup (pairing is positional: both iterables follow the frame's row order).
for (_, row), geometry in zip(water_polygons.iterrows(), shrunk_geometries):
    # A negative buffer can erase a narrow polygon entirely; nothing to draw then
    if geometry is None or geometry.is_empty:
        continue
    geojson = folium.GeoJson(
        data=json.dumps(shapely.geometry.mapping(geometry)),
        # Leaflet path options are case-sensitive: the stroke option is "color"
        style_function=lambda x: {"fillColor": "blue", "color": "blue"},
        tooltip=f"{row.geohash}",
    )
    folium.Popup(
        f"<p><strong>geohash:</strong> {row.geohash}<br><strong>area:</strong> {row['area']:.3f} Ha</p>"
    ).add_to(geojson)
    geojson.add_to(m)

# Zoom map
m.fit_bounds(ds.odc.map_bounds())

# Satellite imagery base layer
folium.TileLayer(
    tiles="https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
    attr="Esri",
    name="Esri Satellite",
    control=True,
).add_to(m)

folium.LayerControl().add_to(m)
display(m)

In [None]:
# Rasterise the polygons again, so we can join on the geohash later.
# First shrink them by SHRINK_AMOUNT, buffering in the projected CRS.
# NOTE: this overwrites water_polygons.geometry, so re-running this cell
# shrinks the polygons a further SHRINK_AMOUNT each time.
shrunk = water_polygons.geometry.to_crs("epsg:3577").buffer(-1 * SHRINK_AMOUNT)
water_polygons.geometry = shrunk.to_crs("epsg:4326")
water_raster = xr_rasterize(water_polygons, ds, attribute_col="id", crs="epsg:4326")

grid_dims = ("latitude", "longitude")

# Join the rasterised polygon IDs to the dataset
ds["id"] = xr.DataArray(water_raster, dims=grid_dims)

# Start from a grid of empty strings, one cell per pixel
blank_labels = np.full((ds.latitude.size, ds.longitude.size), "", dtype="U9")
ds["geohash"] = xr.DataArray(blank_labels, dims=grid_dims)

# `where(cond, other)` keeps the existing value where `cond` is True and writes
# `other` elsewhere, so the condition selects the pixels to leave untouched.
for polygon_id, polygon_hash in zip(water_polygons["id"], water_polygons["geohash"]):
    keep_existing = ds.id != polygon_id
    ds["geohash"] = ds.geohash.where(keep_existing, polygon_hash)

# Mask the pixels that never received a label
has_label = ds.geohash != ""
ds["geohash"] = ds.geohash.where(has_label, drop=False)

# The numeric ID layer was only needed for the join
del ds["id"]

## Calculate summaries

This next section creates summary values per waterbody (geohash).

These will help us understand the average response and variability of
the waterbody.

In [None]:
%%capture --no-stdout

means = ds.groupby("geohash").mean()
std_dev = ds.groupby("geohash").std()
min = ds.groupby("geohash").min()
max = ds.groupby("geohash").max()

# Create a new dataset with the mean, standard deviation, min and max values
# for each geohash
water_summaries = xr.Dataset(
    {
        "mean": means.reflectance,
        "std_dev": std_dev.reflectance,
        "min": min.reflectance,
        "max": max.reflectance,
    }
)

In [None]:
# Mean and std_dev plots: one small panel per waterbody
color_cycle = hv.Cycle("Category20")

plots = []
# The loop variable is deliberately NOT called `geohash`: that name is the
# imported geohash module, which add_geohash() still needs if an earlier cell
# is re-run.
for waterbody_hash in water_summaries.geohash.values:
    summary = water_summaries.sel(geohash=waterbody_hash)

    # Shaded band of mean +/- std_dev with the mean curve drawn on top
    plots.append(
        (
            hv.Spread(summary, vdims=["mean", "std_dev", "std_dev"], label=f"{waterbody_hash}")
            * hv.Curve(summary, vdims="mean", label=f"{waterbody_hash}")
        )
    )

hv.Layout(plots).opts(
    opts.Spread(color=color_cycle, show_legend=True),
    opts.Curve(color=color_cycle, show_legend=True),
    opts.Overlay(
        show_title=True, frame_width=200, frame_height=50, show_legend=False, yaxis=None
    ),
).cols(4)

## Normalised Difference Chlorophyll Index

Reference: [https://doi.org/10.1016/j.rse.2011.10.016](https://doi.org/10.1016/j.rse.2011.10.016).

The Normalised Difference Chlorophyll Index (NDCI) can be calculated using either Sentinel-2 or EMIT data.
Here we calculate the NDCI for each pixel of each waterbody. This means we can use violin plots to further
understand the variability of NDCI within each waterbody.

In [None]:
# Calculate NDCI as (reference - absorption) / (reference + absorption),
# using the bands nearest to the two values below
reference_band = 700
absorption_band = 671

reflectance = ds.reflectance
ds["reference_band"] = reflectance.sel(bands=reference_band, method="nearest")
ds["absorption_band"] = reflectance.sel(bands=absorption_band, method="nearest")

band_difference = ds.reference_band - ds.absorption_band
band_sum = ds.reference_band + ds.absorption_band
ds["ND"] = band_difference / band_sum

In [None]:
# Plot the index, masked to the water areas we identified earlier
nd_over_water = ds.ND.where(ds.water == 1)
nd_over_water.hvplot(aspect="equal", frame_width=600)

In [None]:
# Simplify to a summary dataset: dropping the bands dimension also drops every
# variable that uses it (such as the reflectance data)
ds_summary = ds.drop_dims("bands")

# Styling for the violin plot
nd_violin_opts = opts.Violin(
    width=1000,
    height=600,
    xrotation=45,
    show_legend=False,
    title="Normalised Difference Chlorophyll Index",
    ylim=(-0.1, 0.5),
    violin_fill_color='ND',
    cmap='Spectral_r',
    clim=(0, 0.4),
)

# Violin plots grouped by geohash
ds_summary.hvplot.violin(y="ND", by="geohash").opts(nd_violin_opts)

## Absorption depth

Reference: [https://doi.org/10.1080/01431161003789549](https://doi.org/10.1080/01431161003789549).

Now we calculate the absorption depth for each waterbody. This is done
on the underlying reflectance values over three different bands, and not
on the summary data. This means we can use violin plots to further
understand the variability of this new index.

In [None]:
# Calculate absorption depth, to be plotted per geohash: the average of the two
# reference bands minus the absorption band
absorption = 627
reference_band = 560
reference_band2 = 648

# Pick the band nearest to each target value (absorption first, then the two
# references); `absorption` is rebound from the target value to the band data
absorption, reference1, reference2 = (
    ds.reflectance.sel(bands=target, method="nearest")
    for target in (absorption, reference_band, reference_band2)
)

reference_average = (reference1 + reference2) / 2
ds["absorption_depth"] = reference_average - absorption

In [None]:
# Simplify to a summary dataset: dropping the bands dimension also drops every
# variable that uses it (such as the reflectance data)
ds_summary_2 = ds.drop_dims("bands")

# Styling for the violin plot
depth_violin_opts = opts.Violin(
    width=1000,
    height=600,
    xrotation=45,
    show_legend=False,
    title="Absorption Depth",
    ylim=(-0.02, 0.03),
    violin_fill_color='absorption_depth',
    cmap='Spectral_r',
    clim=(0, 0.02),
)

# Violin plots grouped by geohash
ds_summary_2.hvplot.violin(y="absorption_depth", by="geohash").opts(depth_violin_opts)


## Picking a threshold

Here we're going to select waterbodies (geohashes) by thresholding
the absorption depth. First we plot the mean absorption depth of each
waterbody on a scatter plot. The x axis is just the waterbody's geohash,
so the horizontal ordering is meaningless.

The following cell uses a threshold of 0.01, so it selects the three
waterbodies with a mean `absorption_depth` above this value and plots the
spectra for these three waterbodies.

In [None]:
# Mean of every remaining variable, per waterbody
geohash_absorption_depths = ds_summary_2.groupby("geohash").mean()

# Marker styling: fill follows the absorption depth, outline is grey
scatter_style = dict(
    color="absorption_depth",
    cmap='magma_r',
    line_color="grey",
    size=40,
)

geohash_absorption_depths.hvplot(
    x="geohash",
    y="absorption_depth",
    kind="scatter",
    title="Mean absorption depth",
    xaxis=None,
    **scatter_style,
)

In [None]:
# Waterbodies whose mean absorption depth exceeds this value are selected
ABSORPTION_DEPTH_THRESHOLD = 0.01

abs_d_gt_001 = geohash_absorption_depths.where(
    geohash_absorption_depths.absorption_depth > ABSORPTION_DEPTH_THRESHOLD,
    drop=True,
)
high_absv = list(abs_d_gt_001.geohash.values)

# Mean and std_dev plots
color_cycle = hv.Cycle("Category20")

plots = []
# The loop variable is deliberately NOT called `geohash`: that name is the
# imported geohash module, which add_geohash() still needs if an earlier cell
# is re-run.
for selected_hash in high_absv:
    summary = water_summaries.sel(geohash=selected_hash)

    # Shaded band of mean +/- std_dev with the mean curve drawn on top
    plots.append(
        (
            hv.Spread(summary, vdims=["mean", "std_dev", "std_dev"], label=f"{selected_hash}")
            * hv.Curve(summary, vdims="mean", label=f"{selected_hash}")
        )
    )

# Add a mean of all waterbodies plot too (each summary variable is averaged
# across the geohash dimension)
mean_all = water_summaries.mean("geohash")
plots.append(
    (
        hv.Spread(mean_all, vdims=["mean", "std_dev", "std_dev"], label="all")
        * hv.Curve(mean_all, vdims="mean", label="all")
    )
)

hv.Overlay(plots).opts(
    opts.Spread(color=color_cycle, show_legend=True),
    opts.Curve(color=color_cycle, show_legend=True),
    opts.Overlay(
        show_title=True, frame_width=600, frame_height=300, show_legend=True, yaxis=None
    ),
)

## Show the selected waterbodies on a map

In [None]:
m = folium.Map(control_scale=True, tiles=None)

# Only draw the waterbodies that passed the threshold (geohash in high_absv)
selected_polygons = water_polygons[water_polygons["geohash"].isin(high_absv)]

for _, row in selected_polygons.iterrows():
    geojson = folium.GeoJson(
        data=json.dumps(shapely.geometry.mapping(row.geometry)),
        # Leaflet path options are case-sensitive: the stroke option is "color"
        style_function=lambda x: {"fillColor": "blue", "color": "blue"},
        tooltip=f"{row.geohash}",
    )
    folium.Popup(
        f"<p><strong>geohash:</strong> {row.geohash}<br><strong>area:</strong> {row['area']:.3f} Ha</p>"
    ).add_to(geojson)
    geojson.add_to(m)

# Zoom map
m.fit_bounds(ds.odc.map_bounds())

# Satellite imagery base layer
folium.TileLayer(
    tiles="https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
    attr="Esri",
    name="Esri Satellite",
    control=True,
).add_to(m)

folium.LayerControl().add_to(m)
display(m)