# STAC EuroSAT

This notebook demonstrates how to convert the annotations provided by the [EuroSAT](https://github.com/phelber/EuroSAT) dataset
into STAC-compatible definitions with extensions relevant for machine learning tasks.
To facilitate parsing of the EuroSAT data,
the [torchgeo.datasets.EuroSAT](https://torchgeo.readthedocs.io/en/stable/api/datasets.html#torchgeo.datasets.EuroSAT)
class is used to handle metadata extraction, split definitions and sample generation.

In [1]:
import os
from typing import Any, Dict, List, Literal, NotRequired, Required, Tuple, TypedDict, Union

from osgeo import gdal

# pick one:
from torchgeo.datasets import EuroSAT100 as DatasetEuroSAT  # subset of (6 train, 2 val, 2 test) images per class
# from torchgeo.datasets import EuroSAT as DatasetEuroSAT  # full dataset

General configurations

In [2]:
# Local layout: <DATA_ROOT_DIR>/EuroSAT/{data,stac}
DATA_ROOT_DIR = os.path.abspath(os.path.join(os.pardir, "data"))
EUROSAT_ROOT_DIR = os.path.join(DATA_ROOT_DIR, "EuroSAT")
EUROSAT_DATA_DIR, EUROSAT_STAC_DIR = (
    os.path.join(EUROSAT_ROOT_DIR, sub_dir) for sub_dir in ("data", "stac")
)
# base URL where samples would be accessible from (links in STAC)
EUROSAT_STAC_URL = "https://example.com/data/"

Base STAC definitions

In [None]:
# Technically, tuples would describe bbox/point better, but JSON only has arrays (lists).
BoundingBox = List[Union[int, float]]  # 4 values
# 'List' accepts a single type parameter: mixed int/float members must go through 'Union'
# ('List[int, float]' raises TypeError as soon as the cell is executed).
Point = List[Union[int, float]]  # 2 values
DateTimeInterval = List[Union[str, None]]  # entries are either a string or None
# GeoJSON geometry restricted to the two polygonal types relevant for this notebook.
GeoJSONGeometry = TypedDict(
    "GeoJSONGeometry",
    {
        "type": Literal["Polygon", "MultiPolygon"],  # others exist, but not really applicable for this case
        # Coordinates are nested, not a flat list of points:
        #   - Polygon:      list of linear rings, each ring being a list of points
        #   - MultiPolygon: list of Polygon coordinate arrays
        "coordinates": Union[List[List[Point]], List[List[List[Point]]]],
    }
)
class SpatialExtent(TypedDict):
    """Spatial portion of an extent: a list of bounding boxes."""

    bbox: Required[List[BoundingBox]]


class TemporalExtent(TypedDict):
    """Temporal portion of an extent: a list of date-time intervals."""

    interval: Required[List[DateTimeInterval]]


class Extent(TypedDict):
    """Combined spatial and temporal extent."""

    spatial: Required[SpatialExtent]
    temporal: Required[TemporalExtent]
class Provider(TypedDict):
    """Entity that provides the data, along with the roles it plays."""

    name: str
    roles: List[str]
    url: str


class Link(TypedDict, total=False):
    """Link to a related resource; every key is optional."""

    rel: str
    href: str
class STACMetadata(TypedDict):
    """Descriptive fields shared by the STAC Collection and Item definitions."""

    stac_version: Required[str]
    stac_extensions: Required[List[str]]
    type: Required[Literal["Collection", "Feature"]]  # NB: Feature == STAC Item
    id: Required[str]
    title: NotRequired[str]
    description: NotRequired[str]
    version: Required[str]
    keywords: NotRequired[List[str]]
    license: Required[str]  # anything, but commonly "CC-BY-SA-4.0"
    links: Required[List[Link]]
class STACCollectionCore(TypedDict):
    """Collection-specific fields: the mandatory extent and the optional providers."""

    extent: Required[Extent]
    providers: NotRequired[List[Provider]]


# NB: 'Union' accepts either member type on its own; it does not merge their keys.
STACCollection = Union[STACMetadata, STACCollectionCore]
# Item-specific fields (footprint, assets and properties).
STACItemCore = TypedDict(
    "STACItemCore",
    {
        "bbox": Required[BoundingBox],
        "geometry": Required[GeoJSONGeometry],
        # Assets are a mapping of unique key -> asset object (not an array),
        # in line with 'STACLabelAssets' that also keys its assets by name.
        "assets": Required[Dict[str, Dict[str, Any]]],
        "properties": Required[Dict[str, Any]],
    }
)
STACLabelRef = TypedDict(
    "STACLabelRef",
    {
        "title": str,
        "href": str,    # URL to GeoJSON FeatureCollection or GeoTiff/COG
        "type": str,    # media-type
    }
)
STACLabelAssets = TypedDict(
    "STACLabelAssets",
    {
        "labels": STACLabelRef,
        "raster": STACLabelRef,
    }
)
STACLabelProperties = TypedDict(
    "STACLabelProperties",
    {
        "datetime": str,
        "license": str,
        "label:properties": List[str],  # label classes

    }
)
STACLabelExtension = TypedDict(
    "STACLabelExtension",
    {
        "assets": Required[STACLabelAssets],
        "properties": Required[STACLabelProperties]
    }
)
# NB: 'Union' means "any one of these types"; it does not combine their keys into a single schema.
STACItemLabel = Union[STACMetadata, STACItemCore, STACLabelExtension]

In [3]:
# Fields common to every generated STAC Collection; other items to fill by script.
EUROSAT_STAC_COLLECTION_BASE: STACCollection = dict(  # type: ignore  # missing field for the moment
    stac_version="1.0.0-rc.1",
    stac_extensions=["https://stac-extensions.github.io/version/v1.0.0/schema.json"],
    type="Collection",
)

**Note** <br>
Because we need the metadata to populate STAC Collections, Items and Assets, the `download=True` parameter is used.
However, we don't need the actual data (imagery pixel values), but instead the metadata that each GeoTiff contains.
Therefore, we override the `_load_image` method to retrieve only metadata, and make parsing faster.
Nevertheless, it can take some time to download and extract the ZIP contents on the first run.

In [None]:
# Create the dataset download directory and the STAC output directory up front;
# 'exist_ok=True' keeps the cell re-runnable when they already exist.
os.makedirs(EUROSAT_DATA_DIR, exist_ok=True)
os.makedirs(EUROSAT_STAC_DIR, exist_ok=True)

# Free-form mappings used as return types of the metadata-only loader defined below.
ImageMetadata = Dict[str, Any]
LabelMetadata = Dict[str, Any]


class DataLoaderEuroSAT(DatasetEuroSAT):
    """
    EuroSAT dataset variant whose image loading step returns metadata instead of pixel values.

    NOTE(review): only ``_load_image`` is overridden. If the parent ``__getitem__`` applies
    tensor operations or transforms to the returned values, it must be overridden as well;
    confirm against the installed torchgeo version.
    """

    def _load_image(
        self,
        sample_index: int,
    ) -> Tuple[ImageMetadata, LabelMetadata]:  # type: ignore  # mismatch 'Tuple[Tensor, Tensor]' on purpose
        """
        Collect the metadata of one sample without reading its raster bands.

        :param sample_index: Position of the sample within ``self.samples``.
        :returns:
            Tuple of ``(image_metadata, label_metadata)`` where the image metadata holds the local
            ``path``, the public ``href`` and the GDAL ``geotransform``, and the label metadata
            holds the class ``index`` and its ``name``.
        :raises RuntimeError: If GDAL cannot open the file of the sample.
        """
        path, target_index = self.samples[sample_index]
        class_name = self.classes[target_index]

        # Derive the public link from the path relative to the dataset root, so the result neither
        # doubles the slash at the junction with the base URL nor leaks OS-specific separators.
        relative_path = os.path.relpath(path, EUROSAT_ROOT_DIR).replace(os.sep, "/")
        url = EUROSAT_STAC_URL.rstrip("/") + "/" + relative_path

        img = gdal.Open(path)
        if img is None:  # GDAL reports a failed open with 'None' unless exceptions are enabled
            raise RuntimeError(f"GDAL could not open EuroSAT sample: {path}")
        geo = img.GetGeoTransform()

        image_metadata: ImageMetadata = {"path": path, "href": url, "geotransform": geo}
        label_metadata: LabelMetadata = {"index": target_index, "name": class_name}
        return image_metadata, label_metadata


# Go through every dataset split and display the samples that carry an annotation.
for split in DatasetEuroSAT.splits:
    # destination of the STAC Collection document describing this split
    stac_collection_file = os.path.join(EUROSAT_STAC_DIR, "collection-{}.json".format(split))
    # shallow copy of the shared base definition
    stac_collection = dict(EUROSAT_STAC_COLLECTION_BASE)
    data_loader = DataLoaderEuroSAT(download=True, split=split, root=EUROSAT_DATA_DIR)
    for sample in data_loader:
        label = sample["label"]
        if label:  # tiles by themselves without annotations are ignored
            image = sample["image"]

            print(sample)
