# Postprocess: convert the `embedding_efm` column back to the `ARRAY<FLOAT64>` data type

In [10]:
# Convert the `embedding_efm` column of the source table back to ARRAY<FLOAT64>
# with a BigQuery query, and save the result to a new table.
# NOTE(review): presumably `embedding_efm` holds a JSON-encoded array of numbers
# (it is read with JSON_EXTRACT_ARRAY below) -- confirm against the source schema.
from google.cloud import bigquery
PROJECT_ID = "g4g-eaas"
DATASET_ID = "embeddings_sea"
TABLE_ID = "google_efm_cambodia"
# The query keeps geo/id/tile unchanged. For the embedding it splits the JSON
# array into elements, casts each element to FLOAT64 and re-assembles them
# into an array exposed under the new column name `embedding`.
query = f"""
SELECT
  geo,
  id,
  tile,
  ARRAY(
    SELECT
      CAST(JSON_EXTRACT_SCALAR(value, '$') AS FLOAT64)
    FROM
      UNNEST(JSON_EXTRACT_ARRAY(embedding_efm, '$')) AS value
  ) AS embedding
FROM
  `{PROJECT_ID}`.{DATASET_ID}.{TABLE_ID};
  """
# Run the query and save the result to a new table.
# `result_table` is the fully qualified "project.dataset.table" destination;
# later cells reuse this variable (and `client`).
result_table = f'{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}_conversion_test1'
job_config = bigquery.QueryJobConfig(destination=result_table)
client = bigquery.Client(project=PROJECT_ID)
job = client.query(query, job_config=job_config)
job.result()  # Wait for the job to complete

<google.cloud.bigquery.table.RowIterator at 0x7fc29b9e20d0>

In [11]:
# Check if the result_table exists
def table_exists(client, table_id):
    """Report whether ``client.get_table(table_id)`` succeeds.

    Prints a one-line status message and returns ``True`` when the lookup
    succeeds. Any ``Exception`` raised by the lookup is reported as
    "does not exist" (with the error text appended) and yields ``False``.

    Args:
        client: Object exposing ``get_table(table_id)``.
        table_id: Identifier of the table to look up.

    Returns:
        ``True`` if the lookup succeeded, ``False`` otherwise.
    """
    try:
        client.get_table(table_id)
    except Exception as err:
        print(f"Table {table_id} does not exist. Error: {err}")
        return False
    else:
        print(f"Table {table_id} exists.")
        return True

# Look up the query's destination table; the returned bool is the cell's output.
table_exists(client, result_table)

Table g4g-eaas.embeddings_sea.google_efm_cambodia_conversion_test1 exists.


True

In [12]:
# Check the resulting table's schema and preview up to 10 rows.
query = f"SELECT * FROM `{result_table}` LIMIT 10"
query_job = client.query(query)
# Wait for the job once and keep the row iterator. Iterating `query_job`
# itself would ask the job for its results a second time.
rows = query_job.result()
# Print the schema.
# NOTE: field_type shows only the element type (the embedding column prints
# as FLOAT); whether a column is an array is reported separately by
# `field.mode` (expected to be REPEATED for ARRAY<FLOAT64>).
schema = rows.schema
for field in schema:
    print(f"{field.name}: {field.field_type}")
# Print the preview rows from the same iterator.
for row in rows:
    print(row)

geo: GEOGRAPHY
id: INTEGER
tile: STRING
embedding: FLOAT
Row(('POLYGON((105.051448925516 10.7005940153157, 105.054360720269 10.7005939632297, 105.054360720269 10.7034745880594, 105.051448925516 10.703474588663, 105.051448925516 10.7005940153157))', 31647001323698347, '48PVS', [-0.2313850938233602, -0.05231038920176694, 0.08356363798402106, 0.20064622934840878, 0.003954438262345897, -0.010949231068228236, 0.1946944913612553, -0.3685272875868009, 0.2342286242556888, -0.09714995457329867, -0.0679700103875479, -0.009555064409590398, -0.023743032738178342, 0.23121663442273632, 0.07961704900300105, 0.09225343373022382, -0.02910634693481924, -0.06428530274790685, 0.03139708546473783, -0.03343438733264721, -0.09090136904219503, -0.1594993312423201, 0.24071301675244727, 0.05401874198466279, 0.14401889273919008, -0.04333250976075561, -0.006985321274473042, -0.1333130497986482, -0.012656497215399402, -0.19883739648210536, -0.015142549210852682, -0.18591267376236187, -0.020845815334439025, -0.1633

# Index table for vector search

In [None]:
# Build a vector index on the converted table so VECTOR SEARCH can be tested.
# Drop the leading project id: "project.dataset.table" -> "dataset.table".
in_table = result_table.partition(".")[2]
print(f'indexing {in_table} for vector search')
# The doubled braces are f-string escapes; the SQL receives {"num_lists": 1000}.
query = f"""
CREATE VECTOR INDEX my_index ON {in_table}(embedding)
OPTIONS(distance_type='COSINE', index_type='IVF', ivf_options='{{"num_lists": 1000}}');
"""

# Submit the DDL statement with a fresh client and block until it finishes.
client = bigquery.Client(project=PROJECT_ID)
job = client.query(query)
job.result()

indexing embeddings_sea.earthgenome_cambodia_v1 for vector search


<google.cloud.bigquery.table._EmptyRowIterator at 0x7dd0264afb60>

### Create a test target for vector search (a single embedding whose most similar locations will be searched for)

In [None]:
# Rebind `result_table` to the name of the one-row "target" table.
# NOTE(review): the suffix is appended to whatever `result_table` currently
# holds, so running this cell twice yields "..._test_target_test_target".
result_table = result_table+"_test_target"
# `in_table` is the project-less "dataset.table" name of the indexed table.
# LIMIT 1 without ORDER BY: which row becomes the target is not specified.
query = f"SELECT * FROM {in_table} LIMIT 1"

# Save that single row to `result_table`; the next cell passes it to
# VECTOR_SEARCH as the query table.
job_config = bigquery.QueryJobConfig(destination=result_table)
job = client.query(query,job_config=job_config)
job.result()  # Wait for the job to complete

<google.cloud.bigquery.table.RowIterator at 0x7dd018c62ad0>

In [None]:
import datetime
# Strip the project id from the target table name ("dataset.table" form),
# the same way `in_table` was derived from the base table name.
target_table = '.'.join(result_table.split(".")[1:])
print(target_table)
# Search the base table (`in_table`, column 'embedding') for the nearest
# neighbours of every row in the target table, using cosine distance.
# top_k is 11 while the outer query skips the first hit (OFFSET 1) and keeps
# 10 -- presumably to drop the target's own row, which was copied from the
# base table and should match itself at the smallest distance; confirm.
# The doubled braces are f-string escapes; the SQL receives
# {"fraction_lists_to_search": 0.005}.
query = f"""
SELECT query.id AS target_id,
  query.tile AS target_tile,
  base.id AS base_id,
  base.tile AS base_tile,
  distance
FROM
  VECTOR_SEARCH(
    TABLE {in_table},
    'embedding',
    TABLE {target_table},
    top_k => 11,
    distance_type => 'COSINE',
    options => '{{"fraction_lists_to_search": 0.005}}')
ORDER BY distance
LIMIT 10
OFFSET 1;
"""

# Run the vector search and save the matches to a new results table whose
# name carries the current local timestamp.
client = bigquery.Client(project=PROJECT_ID)
search_result_table = f"{PROJECT_ID}.{DATASET_ID}.vector_search_results_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
job_config = bigquery.QueryJobConfig(destination=search_result_table)
job = client.query(query,job_config=job_config)
job.result()  # Wait for the job to complete

embeddings_sea.earthgenome_cambodia_v1_test_target


<google.cloud.bigquery.table.RowIterator at 0x7dd01ae79810>

In [None]:
# Inspect the saved vector-search results: schema first, then up to 10 rows.
query = f"SELECT * FROM `{search_result_table}` LIMIT 10"
query_job = client.query(query)
# Wait for the job once and keep the row iterator. Iterating `query_job`
# itself would ask the job for its results a second time.
rows = query_job.result()
# Print the schema.
schema = rows.schema
for field in schema:
    print(f"{field.name}: {field.field_type}")
# Print the rows from the same iterator.
for row in rows:
    print(row)

target_id: INTEGER
target_tile: STRING
base_id: INTEGER
base_tile: STRING
distance: FLOAT
Row((31737298129880965, '47PRP', 31737295412923663, '47PRP', 0.018239032724844506), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737298129880965, '47PRP', 31737318005760655, '47PRP', 0.018310001948530275), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737298129880965, '47PRP', 31737318187192539, '47PRP', 0.018820251294408008), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737298129880965, '47PRP', 31738782341388707, '47PRQ', 0.019642342550201808), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737298129880965, '47PRP', 31738794989829102, '47PRQ', 0.019749958040534477), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737298129880965, '47PRP', 31737295417137182, '47PRP', 0.020084470457225834), {