In [53]:
import os
import sys
import numpy as np
import xarray as xr
import dask
import dask.distributed
import dask.array as da

from typing import Tuple, Optional

import eumdac.collection

sys.path.append(os.path.abspath('..'))
from src.connectors.eumdac_connector import EumdacConnector
from utils.opensearch_query_formatter import OpenSearchQueryFormatter

In [92]:
def identify_leading_edge(waveform: np.array, tau: float) -> np.array:
    """
    Identify the startgate and stopgate of a waveform.
    From algo described here: https://climate.esa.int/sites/default/files/Sea_State_cci_ATBD_v1.1-signed_0.pdf

    :param np.array waveform: numpy array of waveform values
    :param float tau: time spacing between consecutive gates in seconds
    :return: 1D np.array with 2 values:
      - startgate: index of the start gate (Optional[int])
      - stopgate: index of the stop gate (Optional[int])
    :rtype: np.array

    """
    res: np.array = np.array([-1.0, -1.0], dtype=np.float64)
    try:
        # The waveform is normalised with normalisation factor N, 
        # where N = 1.3 * median(waveform)
        N: float = 1.3 * np.median(waveform)
        normalized_waveform: np.ndarray = waveform / N
    
        # The leading edge starts when the normalised waveform has a 
        # rise of 0.01 units compared to the previous gate (startgate)
        startgate: int | None = None
        for i in range(1, len(normalized_waveform)):
            if normalized_waveform[i] - normalized_waveform[i-1] >= 0.01:
                startgate = i
                break
    
        if startgate is None:
            return res  # No valid startgate found
    
        # At this point, the leading edge is considered valid if, for at least four gates 
        # after startgate, it does not decrease below 0.1 units (10% of the normalised power).
        valid_leading_edge: bool = (
            startgate + 4 < len(normalized_waveform) and
            np.all(normalized_waveform[startgate:startgate + 5] >= 0.1)
        )
    
        if not valid_leading_edge:
            return res  # Leading edge is not valid
    
        # The end of the leading edge (stopgate) is fixed at the first gate 
        # in which the derivative changes sign (i.e. the signal start decreasing 
        # and the trailing edge begins), if the change of sign is kept 
        # for the following 3 gates.
        stopgate: int | None = None
        for i in range(startgate + 1, len(normalized_waveform) - 3):
            # Calculate the derivative
            derivative: float = normalized_waveform[i + 1] - normalized_waveform[i]
            if derivative < 0:  # Start of decrease
                # Check if the derivative stays negative for the next 3 gates
                if (normalized_waveform[i + 2] - normalized_waveform[i + 1] < 0 and
                    normalized_waveform[i + 3] - normalized_waveform[i + 2] < 0):
                    stopgate = i
                    break
    
        res = np.array([startgate, stopgate], dtype=np.float64)
        print(res)
        return res
    except:
        return res

In [83]:
if __name__ == "__main__":
    # Simulated waveform data
    waveform: np.ndarray = np.array([0.0, 0.0, 0.0, 0.0001, 0.0015, 0.002, 0.004, 0.01, 0.02, 0.05, 0.15, 0.1, 0.08, 0.07, 0.03, 0.01, 0.0])
    tau: float = 3.125e-9  # Time spacing in seconds

    res = identify_leading_edge(waveform, tau)
    print(res)
    startgate, stopgate = res
    print(f"Startgate: {startgate}, Stopgate: {stopgate}")

(17,)
[ 4. 10.]
Startgate: 4.0, Stopgate: 10.0


In [27]:
COLLECTION_ID: str = "EO:EUM:DAT:0415"
DOWNLOAD_DIR: str = "/tmp/products"
MEASUREMENTS_FILENAME: str = "enhanced_measurement.nc"
INDEX_DIMENSION: str = "time_01"
download_dir: str = os.path.join(os.getcwd(), DOWNLOAD_DIR)

connector: EumdacConnector = EumdacConnector()
datastore: eumdac.datastore.DataStore = connector.datastore

# Query a few data files for Sentinel3A and 3B SRAL (Level2 data)
opensearch_query: str = OpenSearchQueryFormatter(
    query_params={
        "pi": COLLECTION_ID,
        "dtstart": "2024-09-23T00:20:00Z",
        "dtend": "2024-09-23T00:30:00Z",
    }
).format()
products: eumdac.collection.SearchResults = datastore.opensearch(query=opensearch_query)
product_ids: list[str] = [str(x) for x in products]
# If in local mode, process only a subset of the products for faster execution
if os.getenv("LOCAL_MODE", "1"):
    print("Local mode: processing every 50th product to debug faster")
    product_ids = product_ids[::50]
print("%s matching products found", len(product_ids))
print("Listed products are: %s", product_ids)

# Download files - benefits of dask parallelization
print("Downloading products (dask parallelized)...")
downloaded_folders: list[str] = connector.download_products(
    COLLECTION_ID, product_ids, download_dir, MEASUREMENTS_FILENAME
)

Local mode: processing every 50th product to debug faster
%s matching products found 1
Listed products are: %s ['S3B_SR_2_WAT____20240923T002358_20240923T002431_20240923T015119_0033_098_016______MAR_O_NR_005.SEN3']
Downloading products (dask parallelized)...


Perhaps you already have a cluster running?
Hosting the HTTP server on port 40087 instead


In [93]:
# Use a local cluster, and threads only
cluster: dask.distributed.LocalCluster = dask.distributed.LocalCluster(processes=False)
client: dask.distributed.Client = dask.distributed.Client(cluster)

# Process only 1 file
ds = xr.open_dataset(f"{DOWNLOAD_DIR}/S3B_SR_2_WAT____20240923T002358_20240923T002431_20240923T015119_0033_098_016______MAR_O_NR_005.SEN3/{MEASUREMENTS_FILENAME}")
ds.close()

lrm_da = ds["waveform_20_plrm_ku"]

# I know this is very in efficient, to have few kB of data to process in dask -> MBs is the usual dask standard
# But it is for map_blocks learning purpose
lrm_dda = da.from_array(lrm_da, chunks=(1, 128))
tau: float = 3.125e-9

# identify_leading_edge(waveform: np.ndarray, tau: float) -> np.array 1D with 2 values
result_array = lrm_dda.map_blocks(
    # Squeeze the 2D array (1, 128) to 1D (128) as expected by 'identify_leading_edge'
    lambda x: identify_leading_edge(np.squeeze(x), tau),
    meta=np.zeros(2, dtype=np.float64)
)
result = result_array.compute()
print(result)

client.close()
cluster.close()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46883 instead


[20. 53.]
[26. 52.]
[ 5. 14.]
[ 7. 53.]
[ 5. 36.]
[ 1. 83.]
[ 1. 13.]
[2. 3.]
[1. 8.]
[ 2. 13.]
[ 4. 19.]
[ 2. 13.]
[4. 5.]
[1. 5.]
[ 3. 30.]
[1. 3.]
[ 3. 10.]
[1. 4.]
[ 3. 33.]
[ 1. 32.]
[ 2. 35.]
[  7. 102.]
[ 1. 68.]
[1. 6.]
[1. 2.]
[ 4. 21.]
[ 3. 15.]
[1. 9.]
[1. 6.]
[ 1. 14.]
[1. 2.]
[1. 2.]
[1. 2.]
[1. 2.]
[ 3. 17.][ 6. 17.]
[ 4. 20.]
[ 2. 23.]
[ 3. 16.]
[ 4. 66.]
[ 3. 15.]

[7. 8.]
[2. 3.]
[1. 7.]
[1. 5.]
[ 6. 10.]
[2. 9.]
[3. 9.]
[11. 15.]
[ 1. 10.]
[ 6. 41.]
[3. 7.]
[ 3. 14.]
[ 6. 10.]
[ 6. 13.]
[7. 8.]
[4. 8.]
[ 2. 10.]
[ 5. 16.]
[2. 3.]
[ 2. 25.]
[11. 54.]
[11. 56.]
[ 7. 39.]
[42. 66.]
[40. 50.]
[41. 91.]
[40. 62.]
[37. 39.]
[36. 60.]
[ 6. 10.]
[ 7. 32.]
[ 7. 61.]
[ 6. 90.]
[35. 49.]
[34. 45.]
[32. 48.]
[39. 56.]
[34. 93.]
[37. 65.]
[36. 46.]
[40. 51.]
[42. 68.]
[1. 2.]
[41. 61.]
[41. 50.]
[33. 55.]
[39. 50.]
[43. 55.]
[ 9. 12.]
[ 6. 10.]
[ 7. 14.]
[ 8. 19.]
[ 7. 33.]
[ 1. 47.]
[ 7. 27.]
[ 7. 37.]
[12. 35.]
[1. 2.]
[ 7. 27.]
[ 9. 56.]
[48. 57.]
[43. 59.]
[13. 26.]
[12. 56.]


IndexError: tuple index out of range