In [46]:
import ee
print(ee.__version__)
import geemap
from pprint import pprint 
from google.cloud.bigquery import Client

project = "g4g-eaas"
# Set the credentials and project
ee.Initialize(project=project,
                  opt_url="https://earthengine-highvolume.googleapis.com"
              )


1.5.14


In [None]:
# equivalent fc.aggregate_histogram('tile').getInfo() which errors out on mem in EE
query = """
SELECT 
    tile, 
    COUNT(*) AS row_count
FROM 
    `g4g-eaas.embeddings_sea.earthgenome`
GROUP BY 
    tile
ORDER BY 
    row_count DESC
"""
client = Client()
query_job = client.query(query)
records_tile = {}
for row in query_job:
    print(row)
    records_tile[row['tile']] = row['row_count']
print("record count by tile:")
pprint(records_tile)

Row(('47PRP', 234973), {'tile': 0, 'row_count': 1})
Row(('47PRQ', 234942), {'tile': 0, 'row_count': 1})
record count by tile:
{'47PRP': 234973, '47PRQ': 234942}


In [28]:
# load the earthgenome embeddings data from bq
query = """
SELECT 
eg.id as id, 
eg.tile as tile,
eg.geometry as geometry,
eg.embedding as embedding
FROM `g4g-eaas.embeddings_sea.earthgenome` as eg
WHERE eg.tile = '47PRQ'
"""
cambodia = ee.FeatureCollection("FAO/GAUL/2015/level0").filterMetadata('ADM0_NAME',"equals",'Cambodia')
eg_fc = ee.FeatureCollection.runBigQuery(
   query=query,
   geometryColumn='geometry'
)#.filterBounds(cambodia)
pprint(eg_fc.first().getInfo())

{'geometry': {'coordinates': [102.227687464647, 13.086492701779],
              'type': 'Point'},
 'id': '_bq_auto_fc2a8563',
 'properties': {'embedding': {'list': [{'element': 4.781926155090332},
                                       {'element': -0.6935403943061829},
                                       {'element': 4.532716751098633},
                                       {'element': 0.28692296147346497},
                                       {'element': 0.3915715217590332},
                                       {'element': 1.1426082849502563},
                                       {'element': 1.2646913528442383},
                                       {'element': -0.7714712619781494},
                                       {'element': 3.0937726497650146},
                                       {'element': 1.0353734493255615},
                                       {'element': 1.3869132995605469},
                                       {'element': -3.7336509227752686},
        

In [29]:
s2_tiles = ee.FeatureCollection("projects/g4g-eaas/assets/esa_tiles_sea")
s2_tiles = s2_tiles.filterBounds(eg_fc.bounds())

In [30]:
m = geemap.Map()
m.addLayer(eg_fc, {}, "EG embeddings")
m.addLayer(s2_tiles, {}, "S2 tiles")
m.centerObject(eg_fc.first(), 6)
m

Map(center=[13.086492701779, 102.227687464647], controls=(WidgetControl(options=['position', 'transparent_bg']â€¦

In [36]:
# grid = s2_tiles.geometry().coveringGrid(proj="EPSG:4326", scale=320)
# m.addLayer(grid, {}, "grid")
def point_to_patch(point):
    """Convert points to a grid."""
    
    patch = point.geometry().buffer(160).bounds()
    return ee.Feature(patch).copyProperties(point)
# eg_patches = eg_fc.map(lambda f: ee.Feature(f.geometry()).buffer(1000).bounds()).copyProperties(f)
eg_patches = eg_fc.map(point_to_patch)
m.addLayer(eg_patches, {}, "buffered grid")

In [38]:
print(eg_patches.first().getInfo())

{'type': 'Feature', 'geometry': {'geodesic': False, 'type': 'Polygon', 'coordinates': [[[102.22622128933631, 13.085053436534452], [102.22915829609246, 13.085053436534452], [102.22915829609246, 13.087932520331266], [102.22622128933631, 13.087932520331266], [102.22622128933631, 13.085053436534452]]]}, 'id': '_bq_auto_fc2a8563', 'properties': {'embedding': {'list': [{'element': 4.781926155090332}, {'element': -0.6935403943061829}, {'element': 4.532716751098633}, {'element': 0.28692296147346497}, {'element': 0.3915715217590332}, {'element': 1.1426082849502563}, {'element': 1.2646913528442383}, {'element': -0.7714712619781494}, {'element': 3.0937726497650146}, {'element': 1.0353734493255615}, {'element': 1.3869132995605469}, {'element': -3.7336509227752686}, {'element': 1.0351009368896484}, {'element': -0.11400630325078964}, {'element': 0.7370259761810303}, {'element': 0.6315008401870728}, {'element': -0.3531653583049774}, {'element': 1.9662549495697021}, {'element': -0.08766700327396393}, 

In [39]:
# load Google EFM embedding image over EG data coverage
efm = ee.ImageCollection('projects/mldp-partners/assets/preview/efm_v2_preview')
efm = (efm
       .filterDate('2024-01-01', '2024-12-31')
       .filterBounds(eg_fc.bounds())
)
m.addLayer(efm, {}, "EFM")

Aggregate G EFM embedding imagery to EarthGenome embedding Point locations at 320m resolution

In [40]:
def reduce_nested(img, 
                   fc, 
                   reducer:ee.Reducer,
                   scale:int, 
                   crs:str, 
                   crs_transform:ee.List,
                   best_effort:bool, 
                   maxPixels:int, 
                   tileScale:int
                   ):
    def reduce(f):
        reduced = img.reduceRegion(reducer, 
                               f.geometry(), 
                               scale, 
                               crs, 
                               crs_transform, 
                               best_effort, 
                               maxPixels, 
                               tileScale
                               )	
        return f.set(reduced)
    all_reduced = fc.map(reduce)
    return all_reduced

# EG patch embeddings are produced from 32x32 px patches of Sentinel-2 (so 320m^2 image footprints), 
efm_patch_embed = reduce_nested(efm.mosaic(),
                           eg_patches,
                           reducer=ee.Reducer.mean(),
                           scale=10,
                           crs='EPSG:4326',
                           crs_transform=None,
                           best_effort=True,
                           maxPixels=1e13,
                           tileScale=16
                          )
pprint(efm_patch_embed.first().getInfo()['properties'])

{'A00': -0.09589244357506375,
 'A01': 0.020284815629647974,
 'A02': 0.25316772258079623,
 'A03': -0.12266790583546784,
 'A04': 0.11663409863342016,
 'A05': 0.009284601711464734,
 'A06': -0.08787574803108589,
 'A07': -0.2163538821850765,
 'A08': 0.23511399505304373,
 'A09': -0.0770182178336104,
 'A10': 0.020882212964414382,
 'A11': 0.09514047978957305,
 'A12': 0.053084823706730254,
 'A13': -0.03624917264744753,
 'A14': -0.1674137166324829,
 'A15': 0.24574710335890526,
 'A16': -0.11576087539177206,
 'A17': -0.09184697651475733,
 'A18': -0.1334280282715771,
 'A19': 0.09227426803669218,
 'A20': -0.08425396319293091,
 'A21': -0.17057193238418566,
 'A22': 0.13526795147820336,
 'A23': -0.043167294699910784,
 'A24': 0.012711469243570363,
 'A25': -0.03582048261049095,
 'A26': 0.09508179273527369,
 'A27': 0.054695972362788656,
 'A28': 0.12488155290758521,
 'A29': -0.08784246707534632,
 'A30': 0.1483954156718905,
 'A31': -0.0936398762862009,
 'A32': 0.11698218416454442,
 'A33': 0.0645421625809645

In [42]:
# convert band-wise embedding properties returned from img.reduceRegion() to one embedding array property of type list[float]
def embedding_array_prop(feature):
    # Get all property names
    property_names = feature.propertyNames()
    
    # Filter property names starting with "A"
    matching_properties = property_names.filter(ee.Filter.stringStartsWith('item', 'A'))
    
    # Get the values of the matching properties
    matching_values = matching_properties.map(lambda prop: feature.get(prop))
    
    # Add a new property with the list of matching values
    return feature.set('embedding_efm', matching_values)

# Apply the function to the feature collection
updated_fc = efm_patch_embed.map(embedding_array_prop)
updated_fc = updated_fc.select(['id','tile','geometry', 'embedding_efm'])

# Print the first feature to verify
pprint(updated_fc.first().getInfo())

{'geometry': {'coordinates': [[[102.22622128933631, 13.085053436534452],
                               [102.22915829609246, 13.085053436534452],
                               [102.22915829609246, 13.087932520331266],
                               [102.22622128933631, 13.087932520331266],
                               [102.22622128933631, 13.085053436534452]]],
              'geodesic': False,
              'type': 'Polygon'},
 'id': '_bq_auto_fc2a8563',
 'properties': {'embedding_efm': [-0.09589244357506375,
                                  0.020284815629647974,
                                  0.25316772258079623,
                                  -0.12266790583546784,
                                  0.11663409863342016,
                                  0.009284601711464734,
                                  -0.08787574803108589,
                                  -0.2163538821850765,
                                  0.23511399505304373,
                                  -0.0

In [43]:
# can't export 'embedding' property as list[float]... so took the suggestion in the export error to encode it to a string.. we'll convert it back in BQ
def convert_embedding_to_string(feature):
    embedding = feature.get('embedding_efm')
    # Convert the list to a string
    embedding_str = ee.String.encodeJSON(embedding)
    return feature.set('embedding_efm', embedding_str)
updated_fc = updated_fc.map(convert_embedding_to_string)

In [None]:
import math
chunk_size = 1000
for tile in records_tile.keys():
    record_count = records_tile[tile]
    chunks = math.ceil(record_count / chunk_size)
    step = math.ceil(records_tile[tile] / chunks)
    print(record_count, chunks, step)
    
    tile_list = updated_fc.toList(record_count)

    for i in range(chunks):
        start = i * step
        end = (i + 1) * step
        print(start, end)
        record_chunk = tile_list.slice(i*step, (i+1)*step)

        task = ee.batch.Export.table.toBigQuery(
            collection=record_chunk,
            description=f"EG_embeddings_efm_{start}_{end}",
            table='g4g-eaas.embeddings_sea.earthgenome_efm_chunked',
            append=True,
            selectors=['id', 'tile', 'geometry', 'embedding_efm']
        )
        task.start()
    break
# chunk_size = 1000 
# step = 
# float_list = [i * step for i in range(chunks)]
# print(float_list)

# for i in float_list:
#     start = i
#     end = i + step
#     print(f"Exporting {start} to {end}")
#     filtered_fc = for_export.filter(ee.Filter.rangeContains('random', start, end))
#     task = ee.batch.Export.table.toBigQuery(
#         collection=filtered_fc,
#         description=f"EG_embeddings_efm_{start}_{end}",
#         table='g4g-eaas.embeddings_sea.earthgenome_efm',
#         append=True,
#         selectors=['id', 'tile', 'geometry', 'embedding_efm']
#     )
#     task.start()
#     break

234973 235 1000
0 1000
1000 2000
2000 3000
3000 4000
4000 5000
5000 6000
6000 7000
7000 8000
8000 9000
9000 10000
10000 11000
11000 12000
12000 13000
13000 14000
14000 15000
15000 16000
16000 17000
17000 18000
18000 19000
19000 20000
20000 21000
21000 22000
22000 23000
23000 24000
24000 25000
25000 26000
26000 27000
27000 28000
28000 29000
29000 30000
30000 31000
31000 32000
32000 33000
33000 34000
34000 35000
35000 36000
36000 37000
37000 38000
38000 39000
39000 40000
40000 41000
41000 42000
42000 43000
43000 44000
44000 45000
45000 46000
46000 47000
47000 48000
48000 49000
49000 50000
50000 51000
51000 52000
52000 53000
53000 54000
54000 55000
55000 56000
56000 57000
57000 58000
58000 59000
59000 60000
60000 61000
61000 62000
62000 63000
63000 64000
64000 65000
65000 66000
66000 67000
67000 68000
68000 69000
69000 70000
70000 71000
71000 72000
72000 73000
73000 74000
74000 75000
75000 76000
76000 77000
77000 78000
78000 79000
79000 80000
80000 81000
81000 82000
82000 83000
83000 8400

In [None]:
# bq_task = ee.batch.Export.table.toBigQuery(
#     updated_fc,
#     description='efm_agg_embed_on_eg_patches_ones2tile',
#     table='g4g-eaas.embeddings_sea.efm_agg_embed_on_eg_patches_ones2tile',
#     append=True,
#     selectors=['id', 'tile', 'geo', 'embedding_efm'],)
# bq_task.start()

In [None]:
# then with a BQ convert 'embedding' field in the table back to ARRAY<FLOAT64> type saved to a new table
from google.cloud import bigquery

query = """
SELECT
  geo,
  ARRAY(
    SELECT
      CAST(JSON_EXTRACT_SCALAR(value, '$') AS FLOAT64)
    FROM
      UNNEST(JSON_EXTRACT_ARRAY(embedding, '$')) AS value
  ) AS embedding
FROM
  `g4g-eaas`.embeddings_sea.efm_patch_embed_100grids;
  """
# Run the query and save the result to a new table
result_table = 'g4g-eaas.embeddings_sea.efm_patch_embed_100grids_array'
job_config = bigquery.QueryJobConfig(destination=result_table)
client = bigquery.Client()
job = client.query(query, job_config=job_config)
job.result()  # Wait for the job to complete

In [None]:
# Check if the result_table exists
def table_exists(client, table_id):
    try:
        client.get_table(table_id)
        print(f"Table {table_id} exists.")
        return True
    except Exception as e:
        print(f"Table {table_id} does not exist. Error: {e}")
        return False

table_exists(client, result_table)

Table g4g-eaas.embeddings_sea.efm_patch_embed_100grids_array exists.


True

In [None]:
# check the resulting table's schema and data 
query = f"SELECT * FROM `{result_table}` LIMIT 10"
query_job = client.query(query)
# print schema 
schema = query_job.result().schema
for field in schema:
    print(f"{field.name}: {field.field_type}")
for row in query_job:
    print(row)