In [1]:
import os
import sys
from datetime import datetime, timezone

import numpy as np
import pystac
from osgeo import gdal, ogr, osr
from pystac import Link, Asset
from pystac.extensions.label import LabelExtension
from pystac.extensions.label import LabelType
from pystac.extensions.label import LabelClasses
from pystac.extensions.label import LabelStatistics
from pystac.extensions.version import ItemVersionExtension
from pystac_client import Client
from sklearn.model_selection import train_test_split

# sys.path.append(".")
from utils import pixel_to_coords, to_geojson

# Point GDAL and PROJ at the support-data directories of the `env_label`
# conda environment.
# NOTE(review): these are absolute, machine-specific paths and they overwrite
# any value already present in the environment — confirm they match the
# environment this notebook is executed in.
os.environ["GDAL_DATA"] = "/opt/conda/envs/env_label/share/gdal"
os.environ["PROJ_LIB"] = "/opt/conda/envs/env_label/share/proj"

import shapely.wkt

In [2]:
# Open the STAC API that is searched in the following cells.
URL = "https://earth-search.aws.element84.com/v1/"

# Extra HTTP headers sent with every request. `Client.open` expects a mapping
# of header name -> value (or None), so an empty dict is used rather than a
# list; no additional headers are sent.
headers = {}

# NOTE(review): `ignore_conformance` is kept as in the original call; check
# whether the installed pystac-client release still supports this argument.
cat = Client.open(URL, headers=headers, ignore_conformance=True)
cat



In [3]:
# Collection ids to search. This is a flat list of id strings; the previous
# value had a stray trailing comma that turned it into a one-element tuple
# wrapping the list.
collections = ["sentinel-2-l2a"]

# Start and end of the acquisition window (naive datetimes).
start_date = datetime.fromisoformat("2023-05-14T00:00:00")
stop_date = datetime.fromisoformat("2023-06-20T00:00:00")

# Area of interest.
# Presumably [min_lon, min_lat, max_lon, max_lat] in degrees — TODO confirm.
bbox = [-121.698, 37.457, -119.938, 38.295]

# Other metadata
# NOTE(review): the search cell filters with a literal `"lt": 5` and does not
# read this variable — confirm which cloud-cover threshold is intended.
cloud_cover = 30

# Define EPSG code
# NOTE(review): not referenced by any cell visible in this section.
epsg = "EPSG:4326"

In [4]:
# Build the item search: collection, time window, bounding box and a
# cloud-cover filter.
# NOTE(review): the threshold is the literal 5 here, not the `cloud_cover`
# variable defined with the other search parameters — confirm which is intended.
search_params = {
    "collections": collections,
    "datetime": (start_date, stop_date),
    "bbox": bbox,
    "query": {"eo:cloud_cover": {"lt": 5}},
}
query = cat.search(**search_params)

In [5]:
# Tile codes of interest; an item is kept when its id contains any of them
# as a substring.
tiles = ["10SFH", "10SFG", "10SGH", "11SKC", "10SGG", "11SKB"]

selected_items = list(
    filter(
        lambda found: any(code in found.id for code in tiles),
        query.get_all_items(),
    )
)
selected_items



[<Item id=S2A_10SFG_20230618_0_L2A>,
 <Item id=S2A_10SGG_20230618_0_L2A>,
 <Item id=S2A_11SKB_20230618_0_L2A>,
 <Item id=S2A_10SFH_20230618_0_L2A>,
 <Item id=S2B_10SFG_20230613_0_L2A>,
 <Item id=S2B_10SFH_20230613_0_L2A>,
 <Item id=S2B_10SFG_20230603_0_L2A>,
 <Item id=S2B_10SGG_20230603_0_L2A>,
 <Item id=S2B_11SKB_20230603_0_L2A>,
 <Item id=S2B_10SFH_20230603_0_L2A>,
 <Item id=S2B_10SGH_20230603_0_L2A>,
 <Item id=S2B_11SKC_20230603_0_L2A>,
 <Item id=S2B_10SFG_20230531_0_L2A>,
 <Item id=S2B_10SFH_20230531_0_L2A>,
 <Item id=S2A_10SFH_20230529_0_L2A>,
 <Item id=S2A_10SFG_20230526_0_L2A>,
 <Item id=S2A_10SFH_20230526_0_L2A>,
 <Item id=S2B_10SGG_20230524_0_L2A>,
 <Item id=S2B_11SKB_20230524_0_L2A>,
 <Item id=S2B_10SFG_20230521_0_L2A>,
 <Item id=S2B_10SGG_20230521_0_L2A>,
 <Item id=S2B_11SKB_20230521_0_L2A>,
 <Item id=S2B_10SFH_20230521_0_L2A>,
 <Item id=S2A_10SFG_20230519_0_L2A>,
 <Item id=S2A_10SGG_20230519_0_L2A>,
 <Item id=S2A_11SKB_20230519_0_L2A>,
 <Item id=S2A_10SFH_20230519_0_L2A>,
 

In [6]:
# Candidate random seeds.
# NOTE(review): not referenced by any cell visible in this section — the
# sampling loop seeds NumPy with a literal 42. Confirm whether this is still needed.
seeds = [42, 35, 2, 53, 101, 1]

In [7]:
# Properties passed to `LabelClasses` in the label-generation loop: the name
# of the labelled property ("class") and the list of class names it can take.
label_classes = dict(
    name="class",
    classes=[
        "NO_DATA",
        "SATURATED_OR_DEFECTIVE",
        "CAST_SHADOWS",
        "CLOUD_SHADOWS",
        "VEGETATION",
        "NOT_VEGETATED",
        "WATER",
        "UNCLASSIFIED",
        "CLOUD_MEDIUM_PROBABILITY",
        "CLOUD_HIGH_PROBABILITY",
        "THIN_CIRRUS",
        "SNOW or ICE",
    ],
)

In [8]:
# Maps the integer value read from the "scl" raster band to its class name;
# the position of each name in the sequence is its integer code (0-11).
look_up = dict(
    enumerate(
        (
            "NO_DATA",
            "SATURATED_OR_DEFECTIVE",
            "CAST_SHADOWS",
            "CLOUD_SHADOWS",
            "VEGETATION",
            "NOT_VEGETATED",
            "WATER",
            "UNCLASSIFIED",
            "CLOUD_MEDIUM_PROBABILITY",
            "CLOUD_HIGH_PROBABILITY",
            "THIN_CIRRUS",
            "SNOW or ICE",
        )
    )
)

In [9]:


# Sampling parameters for the label-generation loop below.
SAMPLE_SEED = 42  # NumPy is re-seeded per item, so every item is probed at the same pixel positions
CANDIDATE_COUNT = 10000  # random pixel positions drawn per item
PIXEL_INDEX_LOW = 1  # inclusive lower bound for both axes (index 0 is never drawn)
PIXEL_INDEX_HIGH = 5490  # exclusive upper bound; assumes a 5490 x 5490 "scl" raster — TODO confirm
MAX_SAMPLES_PER_GROUP = 150  # cap applied separately to WATER and to all other classes
IGNORED_CLASSES = ("NO_DATA", "UNCLASSIFIED", "SATURATED_OR_DEFECTIVE")

# For every selected item: sample labelled points from its "scl" raster, hand
# them to `to_geojson`, then build, validate and write a STAC label item that
# points back at the source item.
for item in selected_items:
    water_count = 0
    non_water_count = 0

    scl_href = item.assets["scl"].get_absolute_href()
    source = gdal.Open(f"/vsicurl/{scl_href}")
    if source is None:
        # Without GDAL exceptions enabled, gdal.Open signals failure by returning None.
        raise RuntimeError(f"GDAL could not open the 'scl' asset of {item.id}: {scl_href}")
    scl = source.GetRasterBand(1)

    # Read the band once and index it in memory, instead of issuing one remote
    # 1x1 window read per candidate position.
    scl_values = scl.ReadAsArray()
    if scl_values is None:
        raise RuntimeError(f"GDAL could not read the 'scl' band of {item.id}")

    np.random.seed(SAMPLE_SEED)
    xy = np.random.randint(
        PIXEL_INDEX_LOW, PIXEL_INDEX_HIGH, size=(CANDIDATE_COUNT, 2)
    )

    x_values = []
    y_values = []

    for pos in xy:
        # pos[0] is the x (column) offset and pos[1] the y (row) offset; the
        # in-memory array is indexed [row, column].
        observed = look_up[int(scl_values[int(pos[1]), int(pos[0])])]

        if observed in IGNORED_CLASSES:
            continue

        # Count every usable observation, but keep at most
        # MAX_SAMPLES_PER_GROUP points for WATER and for everything else.
        if observed == "WATER":
            water_count += 1
            keep = water_count <= MAX_SAMPLES_PER_GROUP
        else:
            non_water_count += 1
            keep = non_water_count <= MAX_SAMPLES_PER_GROUP

        if keep:
            # `pixel_to_coords` comes from the local `utils` module.
            x_values.append([*pixel_to_coords(source, pos[0], pos[1])])
            y_values.append(observed)

        # Sampling stops once the WATER cap is exceeded, whether or not the
        # non-water cap has been reached.
        if water_count > MAX_SAMPLES_PER_GROUP:
            break

    # NOTE(review): `to_geojson` receives the bare item id while the label asset
    # added below is "label-{item.id}.geojson" — confirm the helper writes a
    # file with that name.
    to_geojson(f"{item.id}", x_values, y_values)

    label_item = pystac.Item(
        id=f"label_{item.id}",
        geometry=item.geometry,
        bbox=item.bbox,
        # Timezone-aware UTC timestamp of label creation (replaces the
        # deprecated, naive datetime.utcnow()).
        datetime=datetime.now(timezone.utc),
        properties={},
    )

    label = LabelExtension.ext(label_item, add_if_missing=True)
    # NOTE(review): the labels are read from the "scl" raster, yet the metadata
    # declares methods=["manual"] and tasks including "regression" — confirm
    # these values are intended.
    label.apply(
        label_description="Land cover labels",
        label_type=LabelType.VECTOR,
        label_tasks=["segmentation", "regression"],
        label_classes=[LabelClasses(label_classes)],
        label_methods=["manual"],
        label_properties=["class"],
    )
    label.add_source(source_item=item)
    label.add_geojson_labels(f"label-{item.id}.geojson")

    version = ItemVersionExtension(label_item)

    version.apply(version="0.1", deprecated=False)

    # The version-extension schema URI is registered by hand on the item.
    label_item.stac_extensions.append(
        "https://stac-extensions.github.io/version/v1.2.0/schema.json"
    )

    label_item.validate()
    pystac.write_file(label_item, dest_href=f"item-label-{item.id}.json")