### We have EarthGenome's Patch Embeddings accessible as GeoParquet files on Source.Coop. 

### In the previous [notebook](src/01_earthgenome_embeddings_bq_vectorsearch.ipynb) we loaded the individual .gpq files into a BigQuery table to enable vector search.

### Now we want to test if pixel-based embedding datasets like Google's EFM can be used the same way for patch-based vector search.



### The compute workflow is a little clunky and requires a few intermediate steps:

1. Load earthgenome embeddings BQ table into EE, convert each record's geometry (currently Point) to a Geometry Box representing EG embeddings' image patch dimensions, export result as FeatureCollection
2. Use the pre-exported EG patch FeatureCollection to average Google EFM 2024 image mosaic over the patch areas, export result to a new BQ table
3. Postprocessing step in BQ to deal with some data type conversion limitations going between EE<>BQ, then join the EFM patch embedding BQ table to the original EG embedding BQ table.

# Step 1

In [17]:
import ee
print(ee.__version__)
import geemap
from pprint import pprint 
from google.cloud.bigquery import Client

# Google Cloud project used for the Earth Engine session.
project = "g4g-eaas"
# Initialize Earth Engine for this project against the high-volume endpoint.
ee.Initialize(
    project=project,
    opt_url="https://earthengine-highvolume.googleapis.com",
)


1.5.14


Return the embeddings record count per ESA Grid ID (tile) from the EarthGenome embeddings BigQuery table.

In [18]:
# Count embedding records per tile directly in BigQuery. The Earth Engine
# equivalent, fc.aggregate_histogram('tile').getInfo(), errors out on memory.
PROJECT_ID = "g4g-eaas"
DATASET_ID = "embeddings_sea"
TABLE_ID = "earthgenome_cambodia_v1"
query = f"""
SELECT 
    tile, 
    COUNT(*) AS row_count
FROM 
    `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`
GROUP BY 
    tile
ORDER BY 
    row_count DESC
"""
client = Client(project=PROJECT_ID)
query_job = client.query(query)
# Map each tile ID to its record count.
records_tile = {row['tile']: row['row_count'] for row in query_job}
print(f"Total tiles in BQ table: {len(records_tile)}")
print("record count by tile:")
pprint(records_tile)

Total tiles in BQ table: 36
record count by tile:
{'47PRP': 234973,
 '47PRQ': 234942,
 '47PRR': 234927,
 '48PTA': 235319,
 '48PTR': 235437,
 '48PTS': 235396,
 '48PTT': 235393,
 '48PTU': 235383,
 '48PTV': 235335,
 '48PUA': 235520,
 '48PUB': 235520,
 '48PUS': 235644,
 '48PUT': 235638,
 '48PUU': 235624,
 '48PUV': 235520,
 '48PVA': 235698,
 '48PVB': 235674,
 '48PVS': 235758,
 '48PVT': 235748,
 '48PVU': 235720,
 '48PVV': 235709,
 '48PWA': 235708,
 '48PWS': 235737,
 '48PWT': 235712,
 '48PWU': 235727,
 '48PWV': 235699,
 '48PXA': 235520,
 '48PXB': 235516,
 '48PXS': 235606,
 '48PXT': 235520,
 '48PXU': 235520,
 '48PXV': 235520,
 '48PYA': 235296,
 '48PYB': 235267,
 '48PYU': 235331,
 '48PYV': 235318}


Load this embeddings table from BQ into an EE FeatureCollection.

In [19]:
# Load the id, tile, and geometry columns of the EarthGenome embeddings table
# from BigQuery; no other columns are selected.
query = f"""
SELECT 
eg.id as id, 
eg.tile as tile,
eg.geometry as geometry,
FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` as eg
"""
# Cambodia's boundary from the FAO GAUL level-0 dataset. filterMetadata() is
# deprecated in Earth Engine, so filter() with ee.Filter.eq() is used instead.
cambodia = ee.FeatureCollection("FAO/GAUL/2015/level0").filter(
    ee.Filter.eq('ADM0_NAME', 'Cambodia')
)
# Run the query as an Earth Engine FeatureCollection, then keep only records
# that intersect Cambodia: some embedding records are outside Google EFM
# processing coverage, i.e. far off the coasts.
eg_fc = ee.FeatureCollection.runBigQuery(
    query=query,
    geometryColumn='geometry',
).filterBounds(cambodia)
# Print the first feature as a quick sanity check of the loaded schema.
pprint(eg_fc.first().getInfo())

{'geometry': {'coordinates': [102.657078342824, 12.5748187422147],
              'type': 'Point'},
 'id': '_bq_auto_fc2a8563',
 'properties': {'id': 31738133314582080, 'tile': '47PRP'},
 'type': 'Feature'}


In [20]:
def point_to_patch(point):
    """Turn a point feature into a bounding-box patch feature.

    The point's geometry is buffered by 160 (presumably meters, Earth
    Engine's default buffer unit -- TODO confirm) and the bounding box of
    that buffer becomes the new geometry. The input feature's properties
    are copied onto the result.
    """
    center = point.geometry()
    patch_box = center.buffer(160).bounds()
    patch_feature = ee.Feature(patch_box)
    return patch_feature.copyProperties(point)


# Apply the conversion to every embedding record.
eg_patches = eg_fc.map(point_to_patch)
# m.addLayer(eg_patches, {}, "buffered grid")

In [21]:
# Print the first mapped feature to check the result of the point-to-patch conversion.
pprint(eg_patches.first().getInfo())

{'geometry': {'coordinates': [[[102.65561514660341, 12.573379476970283],
                               [102.65854618563993, 12.573379476970283],
                               [102.65854618563993, 12.576258560766968],
                               [102.65561514660341, 12.576258560766968],
                               [102.65561514660341, 12.573379476970283]]],
              'geodesic': False,
              'type': 'Polygon'},
 'id': '_bq_auto_fc2a8563',
 'properties': {'id': 31738133314582080, 'tile': '47PRP'},
 'type': 'Feature'}


In [23]:
# First test: check whether the patch grid can be exported as an intermediate
# FeatureCollection asset. The EG embeddings as loaded into BQ have a
# complicated nesting structure that prohibits easy export, so only the id,
# tile, and geometry are kept here.

# The property selection does not depend on the tile, so build it once.
# NOTE(review): 'geometry' is passed to select() as in the original cell;
# confirm whether it is needed, since it may not be a regular property.
eg_patches_slim = eg_patches.select(['id', 'geometry', 'tile'])

# Start one export task per tile found in the BigQuery table.
for tile in records_tile:
    # To try a single tile first, uncomment the two lines below.
    # if tile != '47PRP':
    #     continue
    print("tile:", tile)
    eg_patches_tile = eg_patches_slim.filter(ee.Filter.eq('tile', tile))
    task = ee.batch.Export.table.toAsset(
        collection=eg_patches_tile,
        description=f'eg_pt_to_patch_fc_{tile}',
        assetId=f'projects/g4g-eaas/assets/intmd/eg_pt_to_patch_fc_tile_{tile}',
    )
    task.start()



tile: 47PRP


In [None]:
# Load the exported patch asset for one tile. The path includes the "intmd"
# folder so that it matches the assetId the per-tile export writes to.
eg_patches_asset = (ee.FeatureCollection("projects/g4g-eaas/assets/intmd/eg_pt_to_patch_fc_tile_47PRP")
                    .filterBounds(cambodia))

# Show the country outline and the patches on an interactive map.
m = geemap.Map()
m.addLayer(cambodia, {}, "country")
m.addLayer(eg_patches_asset, {}, "EG patches")
m  # last expression in the cell so the notebook renders the map
