In [1]:
import os
from osgeo import gdal, ogr, osr
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import osgeo

gdal.UseExceptions()

In [2]:
# Point PROJ at the projection database shipped with the conda environment.
os.environ["PROJ_DATA"] = "/workspace/.conda/envs/env_labels/share/proj"
# Optional overrides, currently disabled:
# os.environ['GDAL_DATA']='/workspace/.conda/envs/env_labels/share/gdal'
# os.environ['GTIFF_SRS_SOURCE'] = 'EPSG'

In [3]:
import pystac
from pystac import Link, Asset

In [4]:
from pystac.extensions.label import LabelExtension
from pystac.extensions.label import LabelType
from pystac.extensions.label import LabelClasses
from pystac.extensions.label import LabelStatistics

from pystac.extensions.version import ItemVersionExtension

In [5]:
# Start with no dataframe; a later cell checks `df is None` to decide
# whether a new dataframe has to be created.
df = None

In [6]:
def pixel_to_coords(source, x, y):
    """Convert pixel indices of a GDAL dataset to geographic (long, lat).

    The pixel position is mapped to the dataset's own coordinate system with
    the full affine geotransform (including the rotation/skew terms) and then
    transformed to the geographic coordinate system underlying the dataset's
    projection, as returned by ``CloneGeogCS``. For a WGS84-based projection
    this corresponds to EPSG:4326.

    Args:
        source: open GDAL dataset providing ``GetGeoTransform()`` and
            ``GetProjection()``.
        x: pixel column offset.
        y: pixel row offset.

    Returns:
        Tuple ``(long, lat)``. No half-pixel offset is added, so integer
        indices are not shifted to the pixel centre.
    """
    origin_x, x_size, x_skew, origin_y, y_skew, y_size = source.GetGeoTransform()

    # Apply the complete affine transform. The skew terms are 0 for north-up
    # rasters, in which case this reduces to ``x * x_size + origin_x`` and
    # ``y * y_size + origin_y``.
    px = x * x_size + y * x_skew + origin_x
    py = x * y_skew + y * y_size + origin_y

    srs = osr.SpatialReference()

    # GDAL 3 changes axis order: https://github.com/OSGeo/gdal/issues/1546
    # Parse the whole major component so a two-digit major version is
    # handled correctly as well.
    gdal_major = int(osgeo.__version__.split(".")[0])
    if gdal_major >= 3:
        srs.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)

    srs.ImportFromWkt(source.GetProjection())

    # Geographic counterpart of the dataset's projection is the target CRS.
    srs_4326 = srs.CloneGeogCS()
    ct = osr.CoordinateTransformation(srs, srs_4326)

    long, lat, _ = ct.TransformPoint(px, py)

    return long, lat

## Read STAC Item

In [7]:
# URL of the STAC Item to load (item S2B_10TFK_20210713_0_L2A of the
# sentinel-s2-l2a-cogs collection).
aws_url = "https://earth-search.aws.element84.com/v0/collections/sentinel-s2-l2a-cogs/items/S2B_10TFK_20210713_0_L2A"
# Read the STAC object behind the URL.
item = pystac.read_file(aws_url)
# Show the item's properties in the cell output.
display(item.properties)

{'datetime': '2021-07-13T19:03:24Z',
 'platform': 'sentinel-2b',
 'constellation': 'sentinel-2',
 'instruments': ['msi'],
 'gsd': 10,
 'view:off_nadir': 0,
 'proj:epsg': 32610,
 'sentinel:utm_zone': 10,
 'sentinel:latitude_band': 'T',
 'sentinel:grid_square': 'FK',
 'sentinel:sequence': '0',
 'sentinel:product_id': 'S2B_MSIL2A_20210713T184919_N0301_R113_T10TFK_20210713T213143',
 'sentinel:data_coverage': 100,
 'eo:cloud_cover': 0,
 'sentinel:valid_cloud_cover': True,
 'created': '2021-07-13T23:57:53.846Z',
 'updated': '2021-07-13T23:57:53.846Z'}

In [8]:
print(f'Available bands: {list(item.assets.keys())}')

Available bands: ['thumbnail', 'overview', 'info', 'metadata', 'visual', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'AOT', 'WVP', 'SCL']


## Extract values of selected band(s) for a defined random number of pixels
Select one of the highest-resolution bands, i.e. `B02`, `B03`, `B04` or `B08`.

In [9]:
# Bands regarded as high resolution in this notebook.
high_res_bands = ['B02', 'B03', 'B04', 'B08']

In [10]:
# Band to inspect; it must be one of the high-resolution bands.
band = 'B04'
# Validate against `high_res_bands` itself so the check and its message
# cannot drift apart.
assert band in high_res_bands, f'choose high res. band, ie {high_res_bands}'

In [11]:
# Look up the asset of the selected band once and report its details
band_asset = item.assets[band]
print('Band:', band)
print(f'- Res: {band_asset.to_dict()["gsd"]}m')
b_href = band_asset.href
print('- href:', b_href)

# Open the asset with GDAL
b_g = gdal.Open(b_href)
# b_rst = b_g.GetRasterBand(1)

# # Get array
# b_arr = b_rst.ReadAsArray()
# print('- Shape:', np.shape(b_arr))
# # n_cl = len(np.unique(b_arr))
# # print(f'- Number of LC classes: {n_cl}')

Band: B04
- Res: 10m
- href: https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/10/T/FK/2021/7/S2B_10TFK_20210713_0_L2A/B04.tif


In [12]:
# Run the function for testing.
# The expected values come out of a reprojection, so compare with a small
# absolute tolerance instead of exact float equality.
np.testing.assert_allclose(
    pixel_to_coords(b_g, 100, 100),
    (-121.8056363767779, 40.63567108385486),
    rtol=0,
    atol=1e-9,
)
# Drop the reference to the GDAL dataset
b_g = None

### Generate (fixed) random sample of image coordinates 

In [13]:
# Sampling configuration
no_pixels = 500  # number of pixel positions to draw
np.random.seed(42)  # keep this fixed

# Draw `no_pixels` pairs of integers from [1, 10980); the upper bound is exclusive
# xy = np.random.randint(1, np.shape(b_arr)[0], size=(no_pixels, 2))
xy = np.random.randint(low=1, high=10980, size=(no_pixels, 2))

xy[:5]

array([[7271,  861],
       [5391, 5192],
       [5735, 6266],
       [ 467, 4427],
       [5579, 8323]])

Note that with **SCL** the sampled `xy` values are different (see below), because that raster has a different shape:

```
Shape: (5490, 5490)

array([[ 861, 5391],
       [5227, 5192],
       [3773, 3093],
       [ 467, 5335],
       [4427, 3445]])
```

### Other band (with different resolution)

In [14]:
# # For B04 
# # Extract band
# b_href2 = item.assets['B04'].href
# print('href:', b_href2)

# # Get gdal object
# b_g2 = gdal.Open(b_href2)
# b_rst2 = b_g2.GetRasterBand(1)

# # Get array
# b_arr2 = b_rst2.ReadAsArray()
# print('Shape:', np.shape(b_arr2))
# n_cl2 = len(np.unique(b_arr2))
# print(f'Number of LC classes: {n_cl2}')

# # Define number of pixels 
# np.random.seed(42) # keep this fixed 

# # Generate array representing the pair of coordinates 
# xy_s = np.random.randint(1, np.shape(b_arr2)[0], size=(no_pixels, 2))
# xy_s[:5]

In [15]:
# bands = ['SCL'] #['SCL', 'B04']
# for band in :
#     # Extract bands
#     b_href = item.assets[band].href

#     # Get gdal object
#     b_g = gdal.Open(b_href)
#     scl_rst = scl_g.GetRasterBand(1)

#     # Info about the image
#     scl_arr = scl_rst.ReadAsArray()
#     print('Shape:', np.shape(scl_arr))
#     n_cl = len(np.unique(scl_arr))
#     print(f'Number of LC classes: {n_cl}')

### Extract values of selected band for each pair of coordinates

In [31]:
# Band processed by the extraction cells that follow; this overrides the
# band chosen earlier in the notebook.
band = 'B08'

In [32]:
print('Working on band:', band)

# Open the band's asset with GDAL, then take its first raster band
band_href = item.assets[band].href
b_g = gdal.Open(band_href)
b_rst = b_g.GetRasterBand(1)

Working on band: B08


In [33]:
print('Extracting values')

# Note: this will become a function whose main parameter is b_g, the gdal
# object of the input tif image.

x_values = []  # one [long, lat] pair per sampled position
y_values = []  # one band value per sampled position

for col, row in xy:
    # Coordinates of the sampled position
    x_values.append(list(pixel_to_coords(b_g, col, row)))

    # Read a 1x1 window at the position and keep its single value
    pixel_window = b_rst.ReadAsArray(
        xoff=int(col), yoff=int(row), win_xsize=1, win_ysize=1
    )
    y_values.append(int(pixel_window[0][0]))

# Drop the reference to the raster band
b_rst = None

print(x_values[:10])
print(y_values[:10])

Extracting values
[[-120.96012778569894, 40.55527781468857], [-121.19251157034941, 40.168990282416786], [-121.15474969268766, 40.0716397686962], [-121.7693750098431, 40.24548034397307], [-121.177963156973, 39.88669153535598], [-121.61937653607472, 40.57323757988562], [-120.58869608519073, 39.999619134722295], [-121.53988808239271, 40.163082567554305], [-121.23516902776696, 40.058929293462704], [-121.68520356464738, 40.232932915840905]]
[1732, 1817, 2655, 1985, 169, 2754, 2078, 2985, 2103, 2746]


## Make Pandas dataframe
**Note**: The *pandas* dataframe will be used as input for the EDA Notebook from the ARSET training.

In [34]:
# Column-oriented dictionary: coordinates plus the values of the current band
data = {
    'long': [lon for lon, _ in x_values],
    'lat': [lat for _, lat in x_values],
    band: y_values,
}
# data

In [55]:
# Dataframe holding the coordinates and values of the current band.
df2 = pd.DataFrame(data)
# Name the index so it can be referred to as 'Index'.
df2.index.name = 'Index'
df2

Unnamed: 0_level_0,long,lat,B08
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-120.960128,40.555278,1732
1,-121.192512,40.168990,1817
2,-121.154750,40.071640,2655
3,-121.769375,40.245480,1985
4,-121.177963,39.886692,169
...,...,...,...
495,-120.577296,40.370247,2016
496,-121.487336,40.461230,2425
497,-120.619159,40.311844,2310
498,-120.740778,39.909446,1752


In [56]:
df

Unnamed: 0_level_0,long,lat,B04
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-120.960128,40.555278,510
1,-121.192512,40.168990,394
2,-121.154750,40.071640,369
3,-121.769375,40.245480,241
4,-121.177963,39.886692,220
...,...,...,...
495,-120.577296,40.370247,1352
496,-121.487336,40.461230,600
497,-120.619159,40.311844,270
498,-120.740778,39.909446,352


In [62]:
# Check dataframes have the same long and lat values.
# `.all()` is required here: `value_counts().values[0]` is simply the count of
# the most frequent outcome, so it would also equal `no_pixels` when *none*
# of the values matched.
assert len(df) == no_pixels
assert df['long'].isin(df2['long']).all()
assert df['lat'].isin(df2['lat']).all()

In [63]:
# Join both frames on the shared index name and coordinate columns
merge_keys = ['Index', 'long', 'lat']
merged_df = pd.merge(left=df, right=df2, on=merge_keys)

merged_df

Unnamed: 0_level_0,long,lat,B04,B08
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-120.960128,40.555278,510,1732
1,-121.192512,40.168990,394,1817
2,-121.154750,40.071640,369,2655
3,-121.769375,40.245480,241,1985
4,-121.177963,39.886692,220,169
...,...,...,...,...
495,-120.577296,40.370247,1352,2016
496,-121.487336,40.461230,600,2425
497,-120.619159,40.311844,270,2310
498,-120.740778,39.909446,352,1752


In [20]:
if df is None:
    print('Creating Dataframe')
    # Create a DataFrame from the dictionary
    df = pd.DataFrame(data)
    df.index.name = 'Index'
elif band in df.columns:
    # Re-running the cell for a band that is already present must not
    # duplicate its column.
    print(f'Band {band} already in Dataframe')
else:
    print('Adding to existing Dataframe')
    # Previously this branch only printed the message. Join the values of
    # the current band onto the existing frame, matching rows on the index
    # and on both coordinate columns.
    band_df = pd.DataFrame(data)
    band_df.index.name = 'Index'
    df = pd.merge(df, band_df, on=['Index', 'long', 'lat'])

print(df)

Creating Dataframe
             long        lat   B04
Index                             
0     -120.960128  40.555278   510
1     -121.192512  40.168990   394
2     -121.154750  40.071640   369
3     -121.769375  40.245480   241
4     -121.177963  39.886692   220
...           ...        ...   ...
495   -120.577296  40.370247  1352
496   -121.487336  40.461230   600
497   -120.619159  40.311844   270
498   -120.740778  39.909446   352
499   -121.627961  40.621445   432

[500 rows x 3 columns]


In [21]:
import copy
# Keep a deep copy of `df` as a backup.
data_bk = copy.deepcopy(df)
data_bk

Unnamed: 0_level_0,long,lat,B04
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-120.960128,40.555278,510
1,-121.192512,40.168990,394
2,-121.154750,40.071640,369
3,-121.769375,40.245480,241
4,-121.177963,39.886692,220
...,...,...,...
495,-120.577296,40.370247,1352
496,-121.487336,40.461230,600
497,-120.619159,40.311844,270
498,-120.740778,39.909446,352


In [37]:
data_bk

Unnamed: 0_level_0,long,lat,B04
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-120.960128,40.555278,510
1,-121.192512,40.168990,394
2,-121.154750,40.071640,369
3,-121.769375,40.245480,241
4,-121.177963,39.886692,220
...,...,...,...
495,-120.577296,40.370247,1352
496,-121.487336,40.461230,600
497,-120.619159,40.311844,270
498,-120.740778,39.909446,352


In [22]:
# Export the dataframe to 'dataframe.csv' (path relative to the working directory)
df.to_csv('dataframe.csv')

## Split dataset into train and validation

In [23]:
x_values[:10]

[[-120.96012778569894, 40.55527781468857],
 [-121.19251157034941, 40.168990282416786],
 [-121.15474969268766, 40.0716397686962],
 [-121.7693750098431, 40.24548034397307],
 [-121.177963156973, 39.88669153535598],
 [-121.61937653607472, 40.57323757988562],
 [-120.58869608519073, 39.999619134722295],
 [-121.53988808239271, 40.163082567554305],
 [-121.23516902776696, 40.058929293462704],
 [-121.68520356464738, 40.232932915840905]]

In [24]:
print(len(x_values), len(y_values))

500 500


In [25]:
# 80% of the samples go to training; the remainder is split again afterwards.
# `random_state` pins the shuffle so that re-running this cell yields the
# same split instead of depending on the current global RNG state.
x_train, x_rem, y_train, y_rem = train_test_split(
    np.array(x_values), np.array(y_values), train_size=0.8, random_state=42
)
print(len(x_train),len(y_train))
print(len(x_rem),len(y_rem))

400 400
100 100


In [26]:
# Split the remainder evenly into validation and test sets.
# `random_state` pins the shuffle so that re-running this cell yields the
# same split instead of depending on the current global RNG state.
x_valid, x_test, y_valid, y_test = train_test_split(
    np.array(x_rem), np.array(y_rem), test_size=0.5, random_state=42
)
print(len(x_valid),len(y_valid))
print(len(x_test),len(y_test))

50 50
50 50


In [27]:
x_valid.shape, x_train.shape, x_test.shape

((50, 2), (400, 2), (50, 2))

In [28]:
y_valid.shape, y_train.shape, y_test.shape

((50,), (400,), (50,))

In [29]:
def to_geojson(t, x, y):
    """Write labelled points to a GeoJSON file.

    The file is saved in the current directory with the name
    ``label-{t}.geojson``. Each feature is a point carrying an integer
    ``class`` field.

    Args:
        t: name of the dataset split (train, test, validate), used in the
            file name.
        x: sequence of coordinate pairs; element 0 is used as the point's x
            and element 1 as its y.
        y: sequence of values, one per coordinate pair; each is stored as
            ``int(value)`` in the ``class`` field.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same length.
        RuntimeError: if the output data source cannot be created.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )

    field_name = "class"
    field_type = ogr.OFTInteger

    # Create the output Driver
    out_driver = ogr.GetDriverByName("GeoJSON")

    geojson_filename = f"label-{t}.geojson"
    # Remove a stale output first so the function can be re-run with the same
    # name (the GeoJSON driver is expected to refuse creating over an
    # existing file).
    if os.path.exists(geojson_filename):
        os.remove(geojson_filename)

    # Create the output GeoJSON
    out_datasource = out_driver.CreateDataSource(geojson_filename)
    if out_datasource is None:
        raise RuntimeError(f"Could not create {geojson_filename}")

    # The features written below are points, so declare the layer as a point
    # layer (it was declared as polygon before).
    out_layer = out_datasource.CreateLayer("labels", geom_type=ogr.wkbPoint)
    out_layer.CreateField(ogr.FieldDefn(field_name, field_type))
    # Get the output Layer's Feature Definition
    feature_def = out_layer.GetLayerDefn()

    for coords, value in zip(x, y):
        point = ogr.Geometry(ogr.wkbPoint)
        # AddPoint_2D keeps the geometry two-dimensional; AddPoint would
        # attach a default Z value of 0 to every point.
        point.AddPoint_2D(float(coords[0]), float(coords[1]))

        # create a new feature
        out_feature = ogr.Feature(feature_def)

        # Set new geometry
        out_feature.SetGeometry(point)

        out_feature.SetField(field_name, int(value))
        # Add new feature to output Layer
        out_layer.CreateFeature(out_feature)

        # dereference the feature
        out_feature = None

    # Save and close DataSources
    out_layer = None
    out_datasource = None

In [30]:
# Write the training split to label-train_{band}_3.geojson
to_geojson(f"train_{band}_3", x_train, y_train)
# Exports of the test and validation splits are currently disabled:
# to_geojson(f"test_{band}", x_test, y_test)
# to_geojson(f"validate_{band}", x_valid, y_valid)