In [56]:
import os
from osgeo import gdal, ogr, osr
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

gdal.UseExceptions()

In [57]:
os.environ['PROJ_DATA']='/workspace/.conda/envs/env_labels/share/proj' 
# os.environ['GDAL_DATA']='/workspace/.conda/envs/env_labels/share/gdal'
# os.environ['GTIFF_SRS_SOURCE'] = 'EPSG'

In [58]:
import pystac
from pystac import Link, Asset

In [59]:
from pystac.extensions.label import LabelExtension
from pystac.extensions.label import LabelType
from pystac.extensions.label import LabelClasses
from pystac.extensions.label import LabelStatistics

from pystac.extensions.version import ItemVersionExtension

In [60]:
# Set dataframe to None 
df = None

In [61]:
def pixel_to_coords(source, x, y):
    """Returns global coordinates in EPSG:4326 from pixel x, y coords"""

    geo_transform = source.GetGeoTransform()

    x_min = geo_transform[0]
    x_size = geo_transform[1]
    y_min = geo_transform[3]
    y_size = geo_transform[5]
    px = x * x_size + x_min
    py = y * y_size + y_min

    srs = osr.SpatialReference()
    srs.ImportFromWkt(source.GetProjection())

    srs_4326 = srs.CloneGeogCS()
    ct = osr.CoordinateTransformation(srs, srs_4326)

    long, lat, _ = ct.TransformPoint(px, py)

    return long, lat

## Read STAC Item

In [62]:
aws_url = "https://earth-search.aws.element84.com/v0/collections/sentinel-s2-l2a-cogs/items/S2B_10TFK_20210713_0_L2A"

In [63]:
item = pystac.read_file(aws_url)
display(item.properties)

{'datetime': '2021-07-13T19:03:24Z',
 'platform': 'sentinel-2b',
 'constellation': 'sentinel-2',
 'instruments': ['msi'],
 'gsd': 10,
 'view:off_nadir': 0,
 'proj:epsg': 32610,
 'sentinel:utm_zone': 10,
 'sentinel:latitude_band': 'T',
 'sentinel:grid_square': 'FK',
 'sentinel:sequence': '0',
 'sentinel:product_id': 'S2B_MSIL2A_20210713T184919_N0301_R113_T10TFK_20210713T213143',
 'sentinel:data_coverage': 100,
 'eo:cloud_cover': 0,
 'sentinel:valid_cloud_cover': True,
 'created': '2021-07-13T23:57:53.846Z',
 'updated': '2021-07-13T23:57:53.846Z'}

In [64]:
print(f'Available bands: {list(item.assets.keys())}')

Available bands: ['thumbnail', 'overview', 'info', 'metadata', 'visual', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'AOT', 'WVP', 'SCL']


## Extract values of selected band(s) for a defined random number of pixels
Eg start with `SCL` and `B04` bands

In [65]:
band = 'SCL' # 'B04'

In [66]:
print('band:', band)

# Extract band
b_href = item.assets[band].href
print('href:', b_href)

# Get gdal object
b_g = gdal.Open(b_href)
b_rst = b_g.GetRasterBand(1)

# Get array
b_arr = b_rst.ReadAsArray()
print('Shape:', np.shape(b_arr))
n_cl = len(np.unique(b_arr))
print(f'Number of LC classes: {n_cl}')

band: SCL
href: https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/10/T/FK/2021/7/S2B_10TFK_20210713_0_L2A/SCL.tif
Shape: (5490, 5490)
Number of LC classes: 11


In [54]:
item.assets['B04'].to_dict()

{'href': 'https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/10/T/FK/2021/7/S2B_10TFK_20210713_0_L2A/B04.tif',
 'type': 'image/tiff; application=geotiff; profile=cloud-optimized',
 'title': 'Band 4 (red)',
 'gsd': 10,
 'eo:bands': [{'name': 'B04',
   'common_name': 'red',
   'center_wavelength': 0.6645,
   'full_width_half_max': 0.038}],
 'proj:shape': [10980, 10980],
 'proj:transform': [10, 0, 600000, 0, -10, 4500000, 0, 0, 1],
 'roles': ['data']}

In [55]:
item.assets['SCL'].to_dict()

{'href': 'https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/10/T/FK/2021/7/S2B_10TFK_20210713_0_L2A/SCL.tif',
 'type': 'image/tiff; application=geotiff; profile=cloud-optimized',
 'title': 'Scene Classification Map (SCL)',
 'proj:shape': [5490, 5490],
 'proj:transform': [20, 0, 600000, 0, -20, 4500000, 0, 0, 1],
 'roles': ['data']}

### Generate (fixed) random sample of image coordinates 

In [67]:
# Define number of pixels 
no_pixels = 500 
np.random.seed(42) # keep this fixed 

# Generate array representing the pair of coordinates 
xy = np.random.randint(1, np.shape(b_arr)[0], size=(no_pixels, 2))
xy[:5]

array([[ 861, 5391],
       [5227, 5192],
       [3773, 3093],
       [ 467, 5335],
       [4427, 3445]])

Note that With **SCL** I get these:

```
Shape: (5490, 5490)

array([[ 861, 5391],
       [5227, 5192],
       [3773, 3093],
       [ 467, 5335],
       [4427, 3445]])
```

In [68]:
item.properties

{'datetime': '2021-07-13T19:03:24Z',
 'platform': 'sentinel-2b',
 'constellation': 'sentinel-2',
 'instruments': ['msi'],
 'gsd': 10,
 'view:off_nadir': 0,
 'proj:epsg': 32610,
 'sentinel:utm_zone': 10,
 'sentinel:latitude_band': 'T',
 'sentinel:grid_square': 'FK',
 'sentinel:sequence': '0',
 'sentinel:product_id': 'S2B_MSIL2A_20210713T184919_N0301_R113_T10TFK_20210713T213143',
 'sentinel:data_coverage': 100,
 'eo:cloud_cover': 0,
 'sentinel:valid_cloud_cover': True,
 'created': '2021-07-13T23:57:53.846Z',
 'updated': '2021-07-13T23:57:53.846Z'}

### Other band (with different resolution)

In [23]:
# For B04 
# Extract band
b_href2 = item.assets['B04'].href
print('href:', b_href2)

# Get gdal object
b_g2 = gdal.Open(b_href2)
b_rst2 = b_g2.GetRasterBand(1)

# Get array
b_arr2 = b_rst2.ReadAsArray()
print('Shape:', np.shape(b_arr2))
n_cl2 = len(np.unique(b_arr2))
print(f'Number of LC classes: {n_cl2}')

# Define number of pixels 
np.random.seed(42) # keep this fixed 

# Generate array representing the pair of coordinates 
xy_s = np.random.randint(1, np.shape(b_arr2)[0], size=(no_pixels, 2))
xy_s[:5]

href: https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/10/T/FK/2021/7/S2B_10TFK_20210713_0_L2A/B04.tif
Shape: (10980, 10980)
Number of LC classes: 3394


array([[7271,  861],
       [5391, 5192],
       [5735, 6266],
       [ 467, 4427],
       [5579, 8323]])

In [13]:
# bands = ['SCL'] #['SCL', 'B04']
# for band in :
#     # Extract bands
#     b_href = item.assets[band].href

#     # Get gdal object
#     b_g = gdal.Open(b_href)
#     scl_rst = scl_g.GetRasterBand(1)

#     # Info about the image
#     scl_arr = scl_rst.ReadAsArray()
#     print('Shape:', np.shape(scl_arr))
#     n_cl = len(np.unique(scl_arr))
#     print(f'Number of LC classes: {n_cl}')

### Extract values of selected band for each pair of coordinates

In [69]:
x_values = []
y_values = []

# note that this will become a function, and as main paramtere will be source_g which is the gdal object of my input tif image  

for pos in xy:
    
    x_values.append([*pixel_to_coords(b_g, pos[0], pos[1])])

    y_values.append(
        int(
            b_rst.ReadAsArray(
                xoff=int(pos[0]), yoff=int(pos[1]), win_xsize=1, win_ysize=1
            )[0][0]
        )
    )
# print(x_values[:10])
# print(y_values[:10])

## Make Pandas dataframe
**Note**: The *pandas* dataframe will be used as input for the EDA Notebook from the ARSET training.

In [71]:
# make dictionary
data = {'long': [x[0] for x in x_values], 
        'lat': [x[1] for x in x_values], 
        band: y_values}
# data

In [72]:
xy[:10]

array([[ 861, 5391],
       [5227, 5192],
       [3773, 3093],
       [ 467, 5335],
       [4427, 3445],
       [3172, 2920],
       [ 131, 1686],
       [ 770, 2392],
       [2434, 5312],
       [5052, 1185]])

In [73]:
if df is None: 
    print('Creating Dataframe')
    # Create a DataFrame from the dictionary
    df = pd.DataFrame(data)
    df.index.name = 'Index'
else: 
    print('Adding to existing Dataframe')
    
print(df)


Creating Dataframe
            long         lat  SCL
Index                            
0      39.671405 -121.633305    5
1      39.690800 -120.614640    4
2      40.075266 -120.942279    4
3      39.682539 -121.724972    5
4      40.009060 -120.791033    4
...          ...         ...  ...
495    39.785007 -120.631006    4
496    40.459491 -121.303745    4
497    40.241506 -121.186469    6
498    40.091115 -121.533920    4
499    39.902230 -120.754232    4

[500 rows x 3 columns]


In [74]:
import copy
data_bk = copy.deepcopy(df)
data_bk

Unnamed: 0_level_0,long,lat,SCL
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,39.671405,-121.633305,5
1,39.690800,-120.614640,4
2,40.075266,-120.942279,4
3,39.682539,-121.724972,5
4,40.009060,-120.791033,4
...,...,...,...
495,39.785007,-120.631006,4
496,40.459491,-121.303745,4
497,40.241506,-121.186469,6
498,40.091115,-121.533920,4


In [75]:
# Export dataframe 
df.to_csv('dataframe.csv')

## Split dataset into train and validation

In [76]:
print(len(x_values), len(y_values))

500 500


In [77]:
x_train, x_rem, y_train, y_rem = train_test_split(
    np.array(x_values), np.array(y_values), train_size=0.8
)
print(len(x_train),len(y_train))
print(len(x_rem),len(y_rem))

400 400
100 100


In [78]:
x_valid, x_test, y_valid, y_test = train_test_split(
    np.array(x_rem), np.array(y_rem), test_size=0.5
)
print(len(x_valid),len(y_valid))
print(len(x_test),len(y_test))

50 50
50 50


In [79]:
x_valid.shape, x_train.shape, x_test.shape

((50, 2), (400, 2), (50, 2))

In [80]:
y_valid.shape, y_train.shape, y_test.shape

((50,), (400,), (50,))

In [81]:
def to_geojson(t, x, y):
    """Converts the given x, y, and split dataset type (train, test, validate ) to a geojson file
    The geojson file is saved in the current directory with the name label-{t}.geojson
    """

    field_name = "class"
    field_type = ogr.OFTInteger

    # Create the output Driver
    out_driver = ogr.GetDriverByName("GeoJSON")

    geojson_filename = f"label-{t}.geojson"
    # Create the output GeoJSON
    out_datasource = out_driver.CreateDataSource(geojson_filename)
    out_layer = out_datasource.CreateLayer("labels", geom_type=ogr.wkbPolygon)
    id_field = ogr.FieldDefn(field_name, field_type)
    out_layer.CreateField(id_field)
    # Get the output Layer's Feature Definition
    feature_def = out_layer.GetLayerDefn()

    for index, v in enumerate(y):
        point = ogr.Geometry(ogr.wkbPoint)
        point.AddPoint(x[index][0], x[index][1])

        # create a new feature
        out_feature = ogr.Feature(feature_def)

        # Set new geometry
        out_feature.SetGeometry(point)

        out_feature.SetField(field_name, int(v))
        # Add new feature to output Layer
        out_layer.CreateFeature(out_feature)

        # dereference the feature
        out_feature = None

    # Save and close DataSources
    out_datasource = None

In [82]:
to_geojson("train2", x_train, y_train)
# to_geojson("test", x_test, y_test)
# to_geojson("validate", x_valid, y_valid)