In [1]:
import os
from osgeo import gdal, ogr, osr
import pyproj
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import osgeo
import copy
import json

gdal.UseExceptions()

In [2]:
os.environ['PROJ_DATA']='/workspace/.conda/envs/env_labels/share/proj' 
# os.environ['GDAL_DATA']='/workspace/.conda/envs/env_labels/share/gdal'
# os.environ['GTIFF_SRS_SOURCE'] = 'EPSG'

In [3]:
import pystac
from pystac import Link, Asset
from pystac.extensions.label import LabelExtension
from pystac.extensions.label import LabelType
from pystac.extensions.label import LabelClasses
from pystac.extensions.label import LabelStatistics
from pystac.extensions.version import ItemVersionExtension

In [4]:
# Set dataframe to None 
df = None

In [5]:
# Read GeoJSON file and extract point coordinates
def read_geojson_coordinates(geojson_file):
    with open(geojson_file, 'r') as file:
        geojson_data = json.load(file)
    #print(geojson_data['features'])
    
    points = []
    for feature in geojson_data['features']:
        if feature['geometry']['type'] == 'Point':
            lon, lat, _ = feature['geometry']['coordinates']
            points.append((lon, lat))
    return points

In [32]:
# Function to transform unprojected coordinates to projected coordinates
def transform_coordinates(coordinates, epsg_s, epsg_t):
    source_crs = pyproj.CRS(f'EPSG:{epsg_s}') 
    target_crs = pyproj.CRS(f'EPSG:{epsg_t}')  
    transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)
    transformed_coords = [transformer.transform(lon, lat) for lon, lat in coordinates]
    
    transformed_coords_int = [[int(tc[0]), int(tc[1])] for tc in transformed_coords]
    return transformed_coords_int

In [21]:
# Function to extract pixel values from a GeoTIFF at given coordinates
def extract_pixel_values(b_g, transformed_coords):
    gt = b_g.GetGeoTransform()
    b_rst = b_g.GetRasterBand(1)
    
    values = []

    for lon, lat in transformed_coords:
        px = int((lon - gt[0]) / gt[1])  # Convert longitude to pixel x
        py = int((lat - gt[3]) / gt[5])  # Convert latitude to pixel y

        value = b_rst.ReadAsArray(px, py, 1, 1)[0][0]
        values.append(value)

    return values

# Read ML STAC Item

In [6]:
# Define name of the ML item
ml_item_fname = "ml-aoi-item.json"

In [7]:
ml_item = pystac.read_file(ml_item_fname)
display(ml_item.properties)

{'ml-aoi:split': 'train', 'datetime': '2023-07-26T10:45:52.749205Z'}

## Load S2 scene

In [None]:
s2_href = [l.target for l in ml_item.links if "S2" in l.target][0]
print('href of the S2 scene:', s2_href)

In [None]:
# Read STAC Item
s2item = pystac.read_file(s2_href)
display(s2item.properties)

In [27]:
print(f'Available bands: {list(s2item.assets.keys())}')
epsg_t = s2item.properties['proj:epsg']
print(f'Target EPSG:{epsg_t}')

Available bands: ['thumbnail', 'overview', 'info', 'metadata', 'visual', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'AOT', 'WVP', 'SCL']
Target EPSG:32610


## Load geojson points

In [8]:
geojson_href = ml_item.assets['label'].href
print('href of the geojson file:', geojson_href)

href of the geojson file: label-train.geojson


In [18]:
# Open asset (geojson format) and read all the coordinates within 
coordinates = read_geojson_coordinates(geojson_href)
coordinates[:10]

[(-121.27859163156747, 40.38218701036475),
 (-121.19953674120562, 39.91585786896506),
 (-121.33843034984275, 40.4972903822101),
 (-121.68743345874819, 40.07457719936124),
 (-121.26139230895694, 40.290492371933254),
 (-121.74308928683688, 40.18555755803047),
 (-120.6389470529706, 40.56726326945035),
 (-121.69538478759036, 40.44917135042127),
 (-121.5419727694332, 39.75660703796542),
 (-121.33944402091774, 39.878035273240194)]

In [33]:
# Transofrm coordinates
epsg_s = '4326'
transformed_coords = transform_coordinates(coordinates, epsg_s, epsg_t)
transformed_coords[:10]

[[646120, 4471599],
 [653880, 4419970],
 [640800, 4484280],
 [611920, 4436859],
 [647780, 4461450],
 [607000, 4449109],
 [699869, 4493400],
 [610629, 4478430],
 [624900, 4401760],
 [641999, 4415539]]

## Extract values of selected band(s) for each point in the geojson file
Select a band or a list of bands, ie ```[`B02`, `B03`, `B04`, `B08`,`SCL`]```

In [13]:
bands = ['B02', 'B03', 'B04', 'B08', 'SCL']

In [44]:
band = bands[3]
assert band in list(s2item.assets.keys()), f'Band does not exist'

In [45]:
print('Band:', band)
print(f'- Res: {s2item.assets[band].to_dict()["proj:transform"][0]}m')
# Extract band
b_href = s2item.assets[band].href
print('- href:', b_href)


# Get gdal object
b_g = gdal.Open(b_href)

Band: B08
- Res: 10m
- href: https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/10/T/FK/2022/5/S2A_10TFK_20220524_0_L2A/B08.tif


In [46]:
pixel_values = extract_pixel_values(b_g, transformed_coords)

# print(pixel_values)

# Print the extracted pixel values
for (lon, lat), value in zip(transformed_coords[:10], pixel_values[:10]):
    print(f"Coordinate: {lon}, {lat} - Pixel Value: {value}")

#b_g = None

Coordinate: 646120, 4471599, Pixel Value: 1115
Coordinate: 653880, 4419970, Pixel Value: 1062
Coordinate: 640800, 4484280, Pixel Value: 1534
Coordinate: 611920, 4436859, Pixel Value: 1952
Coordinate: 647780, 4461450, Pixel Value: 1454
Coordinate: 607000, 4449109, Pixel Value: 2100
Coordinate: 699869, 4493400, Pixel Value: 2526
Coordinate: 610629, 4478430, Pixel Value: 2980
Coordinate: 624900, 4401760, Pixel Value: 2556
Coordinate: 641999, 4415539, Pixel Value: 1067


### Extract values of a specific band for each pair of coordinates

In [14]:
band = 'SCL'

In [15]:
print('Working on band:', band)

# Open gdal object and get raster band
b_g = gdal.Open(s2item.assets[band].href)
b_rst = b_g.GetRasterBand(1)

Working on band: SCL


In [16]:
print('Extracting values')
x_values = []
y_values = []

if b_g.GetGeoTransform()[1] == 10:
    print("res = 10m, no need to rescale")
    
    for pos in xy:

        x_values.append([*pixel_to_coords(b_g_10m, pos[0], pos[1])])

        y_values.append(
            int(
                b_rst.ReadAsArray(
                    xoff=int(pos[0]), yoff=int(pos[1]), win_xsize=1, win_ysize=1
                )[0][0]
            )
        )

elif b_g.GetGeoTransform()[1] == 20:
    print("Rescaling to res = 20m")
    
    # Need to find the pixel index of the raster which shape is half the size of the high res band, so need to divide by a factor of 2
    xy2 = np.round(xy/2, 0)
    
    for pos,pos2 in zip(xy,xy2):

        x_values.append([*pixel_to_coords(b_g_10m, pos[0], pos[1])]) # b_g must be the same as the pair of coordinates must be the same for all bands 

        y_values.append(
            int(
                b_rst.ReadAsArray(
                    xoff=int(pos2[0]), yoff=int(pos2[1]), win_xsize=1, win_ysize=1
                )[0][0]
            )
        )

# Empty b_rst
b_rst = None

print(x_values[:10])
print(y_values[:10])

Extracting values
Rescaling to res = 20m
[[-120.96012778569894, 40.55527781468857], [-121.19251157034941, 40.168990282416786], [-121.15474969268766, 40.0716397686962], [-121.7693750098431, 40.24548034397307], [-121.177963156973, 39.88669153535598], [-121.61937653607472, 40.57323757988562], [-120.58869608519073, 39.999619134722295], [-121.53988808239271, 40.163082567554305], [-121.23516902776696, 40.058929293462704], [-121.68520356464738, 40.232932915840905]]
[4, 4, 4, 4, 6, 4, 4, 4, 5, 5]


## Make Pandas dataframe
**Note**: The *pandas* dataframe will be used as input for the EDA Notebook from the ARSET training.

In [17]:
# make dictionary
data = {'long': [x[0] for x in x_values], 
        'lat': [x[1] for x in x_values], 
        band: y_values}
# data

In [18]:
if df is None: 
    print('Creating Dataframe')
    # Create a DataFrame from the dictionary
    df = pd.DataFrame(data)
    df.index.name = 'Index'
    
    # Create backup
    data_bk = copy.deepcopy(df)

else: 
    print('Adding to existing Dataframe')
    
    # Create temp dataframe
    df2 = pd.DataFrame(data)
    df2.index.name = 'Index'
    
    # Assert the two dataframes have the same long and lat values
    assert df['long'].isin(df2['long']).value_counts().values[0] == no_pixels
    assert df['lat'].isin(df2['lat']).value_counts().values[0] == no_pixels
    
    # Merge temp dataframe with original dataframe, based on matching columns
    df = pd.merge(df, df2, on=['Index', 'long', 'lat'])   
    # Empty memory
    df2 = None
    
display(df)

Creating Dataframe


Unnamed: 0_level_0,long,lat,SCL
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-120.960128,40.555278,4
1,-121.192512,40.168990,4
2,-121.154750,40.071640,4
3,-121.769375,40.245480,4
4,-121.177963,39.886692,6
...,...,...,...
495,-120.577296,40.370247,5
496,-121.487336,40.461230,5
497,-120.619159,40.311844,4
498,-120.740778,39.909446,4


In [80]:
# # Export dataframe 
# df.to_csv('dataframe_multiband.csv')

## Split dataset into train and validation

In [19]:
def to_geojson(t, x, y):
    """Converts the given x, y, and split dataset type (train, test, validate ) to a geojson file
    The geojson file is saved in the current directory with the name label-{t}.geojson
    """

    field_name = "class"
    field_type = ogr.OFTInteger

    # Create the output Driver
    out_driver = ogr.GetDriverByName("GeoJSON")

    geojson_filename = f"label-{t}.geojson"
    # Create the output GeoJSON
    out_datasource = out_driver.CreateDataSource(geojson_filename)
    out_layer = out_datasource.CreateLayer("labels", geom_type=ogr.wkbPolygon)
    id_field = ogr.FieldDefn(field_name, field_type)
    out_layer.CreateField(id_field)
    # Get the output Layer's Feature Definition
    feature_def = out_layer.GetLayerDefn()

    for index, v in enumerate(y):
        point = ogr.Geometry(ogr.wkbPoint)
        point.AddPoint(x[index][0], x[index][1])

        # create a new feature
        out_feature = ogr.Feature(feature_def)

        # Set new geometry
        out_feature.SetGeometry(point)

        out_feature.SetField(field_name, int(v))
        # Add new feature to output Layer
        out_layer.CreateFeature(out_feature)

        # dereference the feature
        out_feature = None

    # Save and close DataSources
    out_datasource = None

### Test with 1 variable 

In [20]:
x_values[:10]

[[-120.96012778569894, 40.55527781468857],
 [-121.19251157034941, 40.168990282416786],
 [-121.15474969268766, 40.0716397686962],
 [-121.7693750098431, 40.24548034397307],
 [-121.177963156973, 39.88669153535598],
 [-121.61937653607472, 40.57323757988562],
 [-120.58869608519073, 39.999619134722295],
 [-121.53988808239271, 40.163082567554305],
 [-121.23516902776696, 40.058929293462704],
 [-121.68520356464738, 40.232932915840905]]

In [21]:
y_values[:10]

[4, 4, 4, 4, 6, 4, 4, 4, 5, 5]

In [22]:
assert len(x_values) == len(y_values)
print(len(x_values), len(y_values))

500 500


In [23]:
x_train, x_rem, y_train, y_rem = train_test_split(
    np.array(x_values), np.array(y_values), train_size=0.8
)
print(len(x_train),len(y_train))
print(len(x_rem),len(y_rem))

400 400
100 100


In [24]:
x_valid, x_test, y_valid, y_test = train_test_split(
    np.array(x_rem), np.array(y_rem), test_size=0.5
)
print(len(x_valid),len(y_valid))
print(len(x_test),len(y_test))

50 50
50 50


In [25]:
x_valid.shape, x_train.shape, x_test.shape

((50, 2), (400, 2), (50, 2))

In [26]:
y_valid.shape, y_train.shape, y_test.shape

((50,), (400,), (50,))

In [27]:
to_geojson(f"train", x_train, y_train)
# to_geojson(f"test", x_test, y_test)
# to_geojson(f"validate", x_valid, y_valid)

### Test with multiple variables (one geojson per variable)

In [19]:
# Read CSV file and make pandas dataframe
df_pandas = pd.read_csv('dataframe_multiband.csv')
df_pandas

Unnamed: 0,Index,long,lat,B04,B02,SCL,B03,B08
0,0,-120.960128,40.555278,510,291,4,452,1732
1,1,-121.192512,40.168990,394,250,4,378,1817
2,2,-121.154750,40.071640,369,230,4,521,2655
3,3,-121.769375,40.245480,241,183,4,295,1985
4,4,-121.177963,39.886692,220,258,6,258,169
...,...,...,...,...,...,...,...,...
495,495,-120.577296,40.370247,1352,819,5,1046,2016
496,496,-121.487336,40.461230,600,384,5,642,2425
497,497,-120.619159,40.311844,270,235,4,365,2310
498,498,-120.740778,39.909446,352,108,4,346,1752


In [42]:
y_names = ['B02', 'B03', 'B04', 'B08', 'SCL']

In [53]:
for y_name in y_names:
    print('Working on:', y_name)
    
    y_values = df_pandas[y_name].values
    assert len(x_values) == len(y_values)
    print('Total # of values:', len(x_values), len(y_values))
    
    # Training 
    x_train, x_rem, y_train, y_rem = train_test_split(
        np.array(x_values), np.array(y_values), train_size=0.8
    )
    print('# used for training:', len(x_train),len(y_train))
    print('# residuals:', len(x_rem),len(y_rem))
    
    # Testing and Validation
    x_valid, x_test, y_valid, y_test = train_test_split(
        np.array(x_rem), np.array(y_rem), test_size=0.5
    )
    print('# used for validation:', len(x_valid),len(y_valid))
    print('# used for testing:', len(x_test),len(y_test))
    
    print('x_shapes:', x_valid.shape, x_train.shape, x_test.shape)
    print('y_shapes:', y_valid.shape, y_train.shape, y_test.shape)
    
    # Now creating the geojson files
    to_geojson(f"train_{y_name}", x_train, y_train)
    # to_geojson(f"test_{y_name}", x_test, y_test)
    # to_geojson(f"validate_{y_name}", x_valid, y_valid)
    print()

Working on: B02
Total # of values: 500 500
# used for training: 400 400
# residuals: 100 100
# used for validation: 50 50
# used for testing: 50 50
x_shapes: (50, 2) (400, 2) (50, 2)
y_shapes: (50,) (400,) (50,)

Working on: B03
Total # of values: 500 500
# used for training: 400 400
# residuals: 100 100
# used for validation: 50 50
# used for testing: 50 50
x_shapes: (50, 2) (400, 2) (50, 2)
y_shapes: (50,) (400,) (50,)

Working on: B04
Total # of values: 500 500
# used for training: 400 400
# residuals: 100 100
# used for validation: 50 50
# used for testing: 50 50
x_shapes: (50, 2) (400, 2) (50, 2)
y_shapes: (50,) (400,) (50,)

Working on: B08
Total # of values: 500 500
# used for training: 400 400
# residuals: 100 100
# used for validation: 50 50
# used for testing: 50 50
x_shapes: (50, 2) (400, 2) (50, 2)
y_shapes: (50,) (400,) (50,)

Working on: SCL
Total # of values: 500 500
# used for training: 400 400
# residuals: 100 100
# used for validation: 50 50
# used for testing: 50 50


### Test with multiple variables in unique geojson

In [45]:
df_pandas

Unnamed: 0,Index,long,lat,B04,B02,SCL,B03,B08
0,0,-120.960128,40.555278,510,291,4,452,1732
1,1,-121.192512,40.168990,394,250,4,378,1817
2,2,-121.154750,40.071640,369,230,4,521,2655
3,3,-121.769375,40.245480,241,183,4,295,1985
4,4,-121.177963,39.886692,220,258,6,258,169
...,...,...,...,...,...,...,...,...
495,495,-120.577296,40.370247,1352,819,5,1046,2016
496,496,-121.487336,40.461230,600,384,5,642,2425
497,497,-120.619159,40.311844,270,235,4,365,2310
498,498,-120.740778,39.909446,352,108,4,346,1752


In [63]:
y_val = df_pandas['SCL']#.values

In [64]:
X = df_pandas[['long', 'lat', 'B02', 'B03', 'B04', 'B08']]

In [78]:
# Training (directly from dataframe)

# x_train, x_rem, y_train, y_rem = train_test_split(
#         np.array(x_values), np.array(y_values), train_size=0.8
#     )

long_train, long_rem, lat_train, lat_rem, B02_train, B02_rem, B03_train, B03_rem, B04_train, B04_rem, B08_train, B08_rem, SCL_train, SCL_rem = train_test_split(X['long'], X['lat'], X['B02'], X['B03'], X['B04'], X['B08'], y_val, train_size=0.8, random_state=42)
print('# used for training:', len(long_train))
print('# residuals:', len(long_rem))

# used for training: 400
# residuals: 100


In [88]:
# Testing and Validation

# x_valid, x_test, y_valid, y_test = train_test_split(
#     np.array(x_rem), np.array(y_rem), test_size=0.5
# )

long_valid, long_test, lat_valid, lat_test, B02_valid, B02_test, B03_valid, B03_test, B04_valid, B04_test, B08_valid, B08_test, SCL_valid, SCL_test = train_test_split(long_rem, lat_rem, B02_rem, B03_rem, B04_rem, B08_rem, SCL_rem, train_size=0.5)

print('# used for validation:', len(long_valid))
print('# used for testing:', len(long_test))

# used for validation: 50
# used for testing: 50
