In [1]:
import ee
print(ee.__version__)
import geemap
from pprint import pprint 
from google.cloud.bigquery import Client

project = "g4g-eaas"
# Set the credentials and project
ee.Initialize(project=project,
                  opt_url="https://earthengine-highvolume.googleapis.com"
              )


1.5.14


return embeddings record count by ESA Grid ID from the EarthGenome embeddings BigQuery table

In [2]:
# equivalent fc.aggregate_histogram('tile').getInfo() which errors out on mem in EE
query = """
SELECT 
    tile, 
    COUNT(*) AS row_count
FROM 
    `g4g-eaas.embeddings_sea.earthgenome`
GROUP BY 
    tile
ORDER BY 
    row_count DESC
"""
client = Client()
query_job = client.query(query)
records_tile = {}
for row in query_job:
    print(row)
    records_tile[row['tile']] = row['row_count']
print("record count by tile:")
pprint(records_tile)

Row(('47PRP', 234973), {'tile': 0, 'row_count': 1})
Row(('47PRQ', 234942), {'tile': 0, 'row_count': 1})
record count by tile:
{'47PRP': 234973, '47PRQ': 234942}


return chunks of the BQ table as a FC, do the same processing, but then no need to filter or limit or slice the FC upon export

In [None]:
chunk_size = 1000
iter_offset = 0

query = f"""
SELECT 
eg.id as id, 
eg.tile as tile,
eg.geometry as geometry,
FROM `g4g-eaas.embeddings_sea.earthgenome` as eg
ORDER BY
id
LIMIT {chunk_size}
OFFSET {iter_offset}
"""

client = Client()
query_job = client.query(query)
ids = []
for row in query_job:
    ids.append(row['id'])
# this will be problematic if after pulling a chunk of records from BQ we need to check if any of the records fall in EFM image coverage
cambodia = ee.FeatureCollection("FAO/GAUL/2015/level0").filterMetadata('ADM0_NAME',"equals",'Cambodia')
eg_fc = ee.FeatureCollection.runBigQuery(
   query=query,
   geometryColumn='geometry'
).filterBounds(cambodia)
ids_ee_bq = eg_fc.aggregate_array('id').sort().getInfo()
# pprint(eg_bq_chunk_fc.aggregate_array('id').sort().getInfo())

print("ids from BigQuery:")
pprint(ids)
print("ids from FeatureCollection.runBigQuery()")
print(ids_ee_bq)
print(ids_ee_bq == ids)
# guess it doesn't matter that they match.. just want to make sure that runBigQuery() is honnoring the variables injected with the f-string..

ids from BigQuery:
[31736517531612187,
 31736517555744346,
 31736517563380488,
 31736517566271837,
 31736517711983886,
 31736517716358495,
 31736517729001551,
 31736517733212490,
 31736517736279882,
 31736517737089308,
 31736517740493339,
 31736517741300749,
 31736517744440153,
 31736517746644045,
 31736517748651593,
 31736517750887752,
 31736517753125643,
 31736517753932893,
 31736517757336414,
 31736517758143832,
 31736517761283868,
 31736517763661849,
 31736517765669388,
 31736517767872777,
 31736517780055066,
 31736517784265995,
 31736517796898142,
 31736517801111630,
 31736517804464201,
 31736517806276175,
 31736517808675097,
 31736517811823197,
 31736517814059356,
 31736517816066904,
 31736517818270797,
 31736517818875679,
 31736517821307916,
 31736517823130138,
 31736517825518685,
 31736517828666952,
 31736517833655581,
 31736517926217735,
 31736517933584963,
 31736517958499926,
 31736517961259328,
 31736517966504708,
 31736517968643073,
 31736518059875519,
 31736518060667579,
 

In [4]:
s2_tiles = ee.FeatureCollection("projects/g4g-eaas/assets/esa_tiles_sea")
s2_tiles = s2_tiles.filterBounds(eg_fc.bounds())

In [5]:
m = geemap.Map()
m.addLayer(eg_fc, {}, "EG embeddings")
m.addLayer(s2_tiles, {}, "S2 tiles")
m.centerObject(eg_fc.first(), 6)
m

Map(center=[11.6607045897346, 101.75310433669502], controls=(WidgetControl(options=['position', 'transparent_bâ€¦

In [6]:
# grid = s2_tiles.geometry().coveringGrid(proj="EPSG:4326", scale=320)
# m.addLayer(grid, {}, "grid")
def point_to_patch(point):
    """Convert points to a grid."""
    
    patch = point.geometry().buffer(160).bounds()
    return ee.Feature(patch).copyProperties(point)
# eg_patches = eg_fc.map(lambda f: ee.Feature(f.geometry()).buffer(1000).bounds()).copyProperties(f)
eg_patches = eg_fc.map(point_to_patch)
m.addLayer(eg_patches, {}, "buffered grid")

In [7]:
pprint(eg_patches.first().getInfo())

{'geometry': {'coordinates': [[[101.75164614408118, 11.659265324490415],
                               [101.75456715987515, 11.659265324490415],
                               [101.75456715987515, 11.662144408286865],
                               [101.75164614408118, 11.662144408286865],
                               [101.75164614408118, 11.659265324490415]]],
              'geodesic': False,
              'type': 'Polygon'},
 'id': '_bq_auto_fc2a8563',
 'properties': {'id': 31736517531612188, 'tile': '47PRP'},
 'type': 'Feature'}


In [13]:
# load Google EFM embedding image over EG data coverage
efm = ee.ImageCollection('projects/mldp-partners/assets/preview/efm_v2_preview')
efm = (efm
       .filterDate('2024-01-01', '2024-12-31') # EG embeddings are from 2024
       # .filterBounds(eg_fc.bounds())
)
m.addLayer(efm, {}, "EFM")

Aggregate G EFM embedding imagery to EarthGenome embedding Point locations at 320m resolution

In [9]:
def reduce_nested(img, 
                   fc, 
                   reducer:ee.Reducer,
                   scale:int, 
                   crs:str, 
                   crs_transform:ee.List,
                   best_effort:bool, 
                   maxPixels:int, 
                   tileScale:int
                   ):
    def reduce(f):
        reduced = img.reduceRegion(reducer, 
                               f.geometry(), 
                               scale, 
                               crs, 
                               crs_transform, 
                               best_effort, 
                               maxPixels, 
                               tileScale
                               )	
        return f.set(reduced)
    all_reduced = fc.map(reduce)
    return all_reduced

# EG patch embeddings are produced from 32x32 px patches of Sentinel-2 (so 320m^2 image footprints), 
efm_patch_embed = reduce_nested(efm.mosaic(),
                           eg_patches,
                           reducer=ee.Reducer.mean(),
                           scale=10,
                           crs='EPSG:4326',
                           crs_transform=None,
                           best_effort=True,
                           maxPixels=1e13,
                           tileScale=16
                          )

pprint(efm_patch_embed.first().getInfo()['properties'])

{'id': 31736517531612188, 'tile': '47PRP'}


In [10]:
# convert band-wise embedding properties returned from img.reduceRegion() to one embedding array property of type list[float]
def embedding_array_prop(feature):
    # Get all property names
    property_names = feature.propertyNames()
    
    # Filter property names starting with "A"
    matching_properties = property_names.filter(ee.Filter.stringStartsWith('item', 'A'))
    
    # Get the values of the matching properties
    matching_values = matching_properties.map(lambda prop: feature.get(prop))
    
    # Add a new property with the list of matching values
    return feature.set('embedding_efm', matching_values)

# Apply the function to the feature collection
updated_fc = efm_patch_embed.map(embedding_array_prop)
updated_fc = updated_fc.select(['id','tile','geometry', 'embedding_efm'])
# Print the first feature to verify
pprint(updated_fc.first().getInfo())

{'geometry': {'coordinates': [[[101.75164614408118, 11.659265324490415],
                               [101.75456715987515, 11.659265324490415],
                               [101.75456715987515, 11.662144408286865],
                               [101.75164614408118, 11.662144408286865],
                               [101.75164614408118, 11.659265324490415]]],
              'geodesic': False,
              'type': 'Polygon'},
 'id': '_bq_auto_fc2a8563',
 'properties': {'embedding_efm': [], 'id': 31736517531612188, 'tile': '47PRP'},
 'type': 'Feature'}


In [11]:
# can't export 'embedding' property as list[float]... so took the suggestion in the export error to encode it to a string.. we'll convert it back in BQ
def convert_embedding_to_string(feature):
    embedding = feature.get('embedding_efm')
    # Convert the list to a string
    embedding_str = ee.String.encodeJSON(embedding)
    return feature.set('embedding_efm', embedding_str)

updated_fc = updated_fc.map(convert_embedding_to_string)
pprint(updated_fc.first().get('embedding_efm').getInfo())

'[]'


In [12]:
task = ee.batch.Export.table.toAsset(
    collection=updated_fc,
    description=f'efm_patch_embed_bq_chunks_{chunk_size}_{iter_offset}',
    assetId='projects/g4g-eaas/assets/efm_patch_embed_bq_chunks'
)
# task.start()