In [1]:
import os
from osgeo import gdal, ogr, osr
import pyproj
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import osgeo
import copy
import json

gdal.UseExceptions()

In [2]:
# Tell PROJ where to find its resource files (the `share/proj` directory of
# the conda environment this notebook runs in).
os.environ["PROJ_DATA"] = "/workspace/.conda/envs/env_labels/share/proj"
# Optional GDAL-side overrides, currently disabled:
# os.environ['GDAL_DATA']='/workspace/.conda/envs/env_labels/share/gdal'
# os.environ['GTIFF_SRS_SOURCE'] = 'EPSG'

In [3]:
import pystac
from pystac import Link, Asset
from pystac.extensions.label import LabelExtension
from pystac.extensions.label import LabelType
from pystac.extensions.label import LabelClasses
from pystac.extensions.label import LabelStatistics
from pystac.extensions.version import ItemVersionExtension

In [4]:
# Reset the accumulated dataframe.
# The "Make Pandas dataframe" cell further down creates `df` when it is None
# and merges one more band column into it otherwise, so re-running this cell
# discards every band collected so far.
df = None

In [5]:
def read_geojson_coordinates(geojson_file):
    """Return the (lon, lat) pair of every Point feature in a GeoJSON file.

    Parameters
    ----------
    geojson_file : str or path-like
        Path to a GeoJSON document with a top-level ``features`` list.

    Returns
    -------
    list of tuple
        One ``(lon, lat)`` tuple per Point feature, in file order. Features
        with another geometry type, or with a null geometry, are skipped.
        A third (elevation) coordinate, when present, is dropped.
    """
    # GeoJSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(geojson_file, 'r', encoding='utf-8') as file:
        geojson_data = json.load(file)

    points = []
    for feature in geojson_data['features']:
        geometry = feature.get('geometry')
        # "geometry": null is legal GeoJSON for unlocated features.
        if not geometry or geometry.get('type') != 'Point':
            continue
        # A position is [lon, lat] or [lon, lat, elevation]; accept both.
        lon, lat = geometry['coordinates'][:2]
        points.append((lon, lat))
    return points

In [32]:
def transform_coordinates(coordinates, epsg_s, epsg_t):
    """Reproject (x, y) pairs from one EPSG CRS to another, as integers.

    Parameters
    ----------
    coordinates : iterable of (x, y)
        Input pairs in x/y (lon/lat) order, expressed in ``EPSG:<epsg_s>``.
    epsg_s, epsg_t : int or str
        EPSG codes of the source and target coordinate reference systems.

    Returns
    -------
    list of [int, int]
        Reprojected pairs in ``EPSG:<epsg_t>``; each component is passed
        through ``int()``, i.e. truncated toward zero.
    """
    crs_from = pyproj.CRS(f'EPSG:{epsg_s}')
    crs_to = pyproj.CRS(f'EPSG:{epsg_t}')
    # always_xy=True keeps (x, y) / (lon, lat) axis order on both sides.
    reproject = pyproj.Transformer.from_crs(crs_from, crs_to, always_xy=True).transform

    projected = []
    for x_in, y_in in coordinates:
        x_out, y_out = reproject(x_in, y_in)
        projected.append([int(x_out), int(y_out)])
    return projected

In [49]:
def extract_pixel_values(b_g, transformed_coords):
    """Sample band 1 of an open GDAL dataset at projected (x, y) positions.

    Parameters
    ----------
    b_g : gdal.Dataset
        Open dataset; only its geotransform and first raster band are used.
    transformed_coords : iterable of (x, y)
        Positions expressed in the dataset's own coordinate reference system.

    Returns
    -------
    list
        One pixel value per input position, in input order.

    Raises
    ------
    ValueError
        If a position falls outside the raster extent.

    Notes
    -----
    Only the origin and pixel-size terms of the geotransform are used
    (gt[0], gt[1], gt[3], gt[5]); the rotation terms are ignored, so the
    raster is assumed to be north-up.
    """
    gt = b_g.GetGeoTransform()
    b_rst = b_g.GetRasterBand(1)
    n_cols, n_rows = b_rst.XSize, b_rst.YSize

    values = []
    for x, y in transformed_coords:
        # Floor rather than int(): int() truncates toward zero, which would
        # map a point lying up to one pixel left of / above the raster onto
        # column / row 0 instead of flagging it as outside.
        px = int(np.floor((x - gt[0]) / gt[1]))
        py = int(np.floor((y - gt[3]) / gt[5]))

        if not (0 <= px < n_cols and 0 <= py < n_rows):
            raise ValueError(
                f'Coordinate ({x}, {y}) maps to pixel ({px}, {py}), '
                f'outside the {n_cols}x{n_rows} raster'
            )

        # 1x1 window read; [0][0] unwraps the single cell.
        values.append(b_rst.ReadAsArray(px, py, 1, 1)[0][0])

    return values

# Read ML STAC Item

In [6]:
# File name of the ML STAC Item (JSON) to read; given as a relative path.
ml_item_fname = "ml-aoi-item.json"

In [7]:
# Load the ML STAC Item and show its properties.
ml_item = pystac.read_file(ml_item_fname)
display(ml_item.properties)

{'ml-aoi:split': 'train', 'datetime': '2023-07-26T10:45:52.749205Z'}

## Load S2 scene

In [47]:
# Take the first link of the ML item whose target mentions "S2", i.e. the
# Sentinel-2 scene the labels refer to. A link target may already be a
# resolved STAC object rather than a string, so only string targets are tested.
s2_href = next(
    (
        link.target
        for link in ml_item.links
        if isinstance(link.target, str) and "S2" in link.target
    ),
    None,
)
if s2_href is None:
    raise ValueError(f'No link to an S2 scene found in item {ml_item.id!r}')
print('href of the S2 scene:', s2_href)

href of the S2 scene: https://earth-search.aws.element84.com/v0/collections/sentinel-s2-l2a-cogs/items/S2A_10TFK_20220524_0_L2A


In [48]:
# Read the Sentinel-2 STAC Item from the href found above and show its
# properties (these include `proj:epsg`, used later as the target CRS).
s2item = pystac.read_file(s2_href)
display(s2item.properties)

{'datetime': '2022-05-24T19:03:29Z',
 'platform': 'sentinel-2a',
 'constellation': 'sentinel-2',
 'instruments': ['msi'],
 'gsd': 10,
 'view:off_nadir': 0,
 'proj:epsg': 32610,
 'sentinel:utm_zone': 10,
 'sentinel:latitude_band': 'T',
 'sentinel:grid_square': 'FK',
 'sentinel:sequence': '0',
 'sentinel:product_id': 'S2A_MSIL2A_20220524T184921_N0400_R113_T10TFK_20220525T004817',
 'sentinel:data_coverage': 100,
 'eo:cloud_cover': 0.24,
 'sentinel:valid_cloud_cover': True,
 'sentinel:processing_baseline': '04.00',
 'sentinel:boa_offset_applied': True,
 'created': '2022-05-25T03:20:43.295Z',
 'updated': '2022-05-25T03:20:43.295Z'}

In [27]:
# List the asset keys of the scene, then read the EPSG code of its projected
# CRS; the label coordinates are reprojected into this CRS later on.
print(f'Available bands: {[*s2item.assets]}')
epsg_t = s2item.properties['proj:epsg']
print(f'Target EPSG:{epsg_t}')

Available bands: ['thumbnail', 'overview', 'info', 'metadata', 'visual', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'AOT', 'WVP', 'SCL']
Target EPSG:32610


## Load geojson points

In [8]:
# href of the item's 'label' asset (the GeoJSON file holding the label points).
# NOTE(review): the href is used as-is; if it is relative it is resolved
# against the working directory when opened — confirm that is intended.
geojson_href = ml_item.assets['label'].href
print('href of the geojson file:', geojson_href)

href of the geojson file: label-train.geojson


In [18]:
# Read the (lon, lat) pair of every Point feature in the label GeoJSON
# and preview the first ten.
coordinates = read_geojson_coordinates(geojson_href)
coordinates[:10]

[(-121.27859163156747, 40.38218701036475),
 (-121.19953674120562, 39.91585786896506),
 (-121.33843034984275, 40.4972903822101),
 (-121.68743345874819, 40.07457719936124),
 (-121.26139230895694, 40.290492371933254),
 (-121.74308928683688, 40.18555755803047),
 (-120.6389470529706, 40.56726326945035),
 (-121.69538478759036, 40.44917135042127),
 (-121.5419727694332, 39.75660703796542),
 (-121.33944402091774, 39.878035273240194)]

In [33]:
# Transform the label coordinates from EPSG:4326 into the CRS of the S2 scene
# (`epsg_t`, read from the scene's `proj:epsg` property) and preview the
# first ten integer pairs.
epsg_s = '4326'
transformed_coords = transform_coordinates(coordinates, epsg_s, epsg_t)
transformed_coords[:10]

[[646120, 4471599],
 [653880, 4419970],
 [640800, 4484280],
 [611920, 4436859],
 [647780, 4461450],
 [607000, 4449109],
 [699869, 4493400],
 [610629, 4478430],
 [624900, 4401760],
 [641999, 4415539]]

## Extract values of selected band(s) for each point in the geojson file
Select a band or a list of bands, e.g. `['B02', 'B03', 'B04', 'B08', 'SCL']`.

In [13]:
# Asset keys of the S2 item that may be sampled; one entry is picked per run
# of the cells below.
bands = ['B02', 'B03', 'B04', 'B08', 'SCL']

In [75]:
# Band processed on this pass (index 3 -> 'B08'). Change the index and re-run
# the cells below to add another band to the dataframe.
band = bands[3]
assert band in s2item.assets, f'Band {band!r} does not exist in the S2 item assets'

In [None]:
# Report the selected band: its name, the first element of the asset's
# `proj:transform` (printed as the resolution) and its href.
print('Band:', band)
band_asset = s2item.assets[band]
pixel_size = band_asset.to_dict()["proj:transform"][0]
print(f'- Res: {pixel_size}m')
b_href = band_asset.href
print('- href:', b_href)

# Open the band with GDAL straight from its href
b_g = gdal.Open(b_href)

Band: B08
- Res: 10m
- href: https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/10/T/FK/2022/5/S2A_10TFK_20220524_0_L2A/B08.tif


In [None]:
# Sample the opened band at every projected label position
pixel_values = extract_pixel_values(b_g, transformed_coords)

# Preview up to ten samples next to the coordinates they were read at
for idx in range(min(10, len(transformed_coords), len(pixel_values))):
    x, y = transformed_coords[idx]
    print(f"Coordinate: {x}, {y} - Pixel Value: {pixel_values[idx]}")

# The dataset stays open here; assign `b_g = None` to release it.
#b_g = None

Coordinate: 646120, 4471599 - Pixel Value: 1115
Coordinate: 653880, 4419970 - Pixel Value: 1062
Coordinate: 640800, 4484280 - Pixel Value: 1534
Coordinate: 611920, 4436859 - Pixel Value: 1952
Coordinate: 647780, 4461450 - Pixel Value: 1454
Coordinate: 607000, 4449109 - Pixel Value: 2100
Coordinate: 699869, 4493400 - Pixel Value: 2526
Coordinate: 610629, 4478430 - Pixel Value: 2980
Coordinate: 624900, 4401760 - Pixel Value: 2556
Coordinate: 641999, 4415539 - Pixel Value: 1067


## Make Pandas dataframe
**Note**: The *pandas* dataframe will be used as input for the EDA Notebook from the ARSET training.

In [None]:
# Column-oriented table for this pass: the projected x / y of each point
# (stored under 'long' / 'lat') plus the values sampled from the current band.
data = dict(
    long=[coord[0] for coord in transformed_coords],
    lat=[coord[1] for coord in transformed_coords],
)
data[band] = pixel_values
# data

In [83]:
# Table holding the band sampled on this pass
band_df = pd.DataFrame(data)
band_df.index.name = 'Index'

if df is None:
    print('Creating Dataframe')
    df = band_df

else:
    print('Adding to existing Dataframe')

    # The merge below is an inner join on (Index, long, lat): any row whose
    # keys differ would be dropped silently, so require exact row-by-row
    # agreement of the coordinates first.
    for col in ('long', 'lat'):
        assert np.array_equal(df[col].to_numpy(), band_df[col].to_numpy()), (
            f"'{col}' values differ between the existing dataframe and band {band}"
        )

    # Re-running this cell for a band that is already present would otherwise
    # produce '<band>_x' / '<band>_y' columns; replace the old column instead.
    if band in df.columns:
        df = df.drop(columns=[band])

    # Merge the new band column into the accumulated dataframe
    df = pd.merge(df, band_df, on=['Index', 'long', 'lat'])

display(df)

Adding to existing Dataframe


Unnamed: 0_level_0,long,lat,SCL,B04,B08
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,646120,4471599,5,754,1115
1,653880,4419970,5,738,1062
2,640800,4484280,5,872,1534
3,611920,4436859,4,533,1952
4,647780,4461450,5,1044,1454
...,...,...,...,...,...
395,668920,4432610,2,29,61
396,679400,4444339,4,710,2927
397,672460,4433990,5,891,1265
398,635849,4404110,4,443,2766


In [80]:
# Export the accumulated dataframe to CSV.
# NOTE(review): the saved execution count of this cell (80) precedes that of
# the merge cell (83), so the file on disk may lack the last band — re-run
# this cell after the final merge.
df.to_csv('df_extractedpixels.csv')

In [50]:
# Marker showing that execution reached the end of the notebook
print('END')

END
