### postprocess A[xx] columns (from Google EFM dataset) into an `ARRAY<FLOAT>` column

In [23]:

from google.cloud import bigquery
PROJECT_ID = "g4g-eaas"
DATASET_ID = "embeddings_sea"
TABLE_ID = "google_efm_cambodia_v2_method_mapRR_tile_chunks"
query = f"""
SELECT
  id,
  tile,
  geo,
  ARRAY[A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31, A32, A33, A34, A35, A36, A37, A38, A39, A40, A41, A42, A43, A44, A45, A46, A47, A48, A49, A50, A51, A52, A53, A54, A55, A56, A57, A58, A59, A60, A61, A62, A63] AS embedding
FROM
  `{PROJECT_ID}`.{DATASET_ID}.{TABLE_ID}
WHERE
A00 IS NOT NULL
"""

# Run the query and save the result to a new table
result_table = f'{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}_conversion_test3'
job_config = bigquery.QueryJobConfig(destination=result_table)
client = bigquery.Client(project=PROJECT_ID)
job = client.query(query, job_config=job_config)
job.result()  # Wait for the job to complete

<google.cloud.bigquery.table.RowIterator at 0x769ca20f4690>

### Check the new result table

In [24]:
# Check if the result_table exists
def table_exists(client, table_id):
    try:
        client.get_table(table_id)
        print(f"Table {table_id} exists.")
        return True
    except Exception as e:
        print(f"Table {table_id} does not exist. Error: {e}")
        return False

table_exists(client, result_table)

Table g4g-eaas.embeddings_sea.google_efm_cambodia_v2_method_mapRR_tile_chunks_conversion_test3 exists.


True

In [26]:
# check the resulting table's schema and data 
query = f"SELECT id,tile,embedding FROM `{result_table}` LIMIT 10"
query_job = client.query(query)
# print schema 
schema = query_job.result().schema
for field in schema:
    print(f"{field.name}: {field.field_type}")
for row in query_job:
    print(row)

id: INTEGER
tile: STRING
embedding: FLOAT
Row((31737383565331173, '47PRP', [-0.036561610688936985, -0.16025379964327124, 0.13390693539578624, -0.04473288683837816, -0.05629424746626376, -0.013727391777642362, 0.10461089438455531, 0.070570715758561, -0.11470668000416206, 0.14097773673187053, -0.090037885754511, 0.10638771898334795, -0.2699486273946963, 0.10987680456459811, -0.01671837890508104, -0.04011806404719555, 0.06877311292521861, 0.019796001151772387, -0.1524633596090132, 0.03965880338042681, -0.2063223455437522, 0.25561377836232874, 0.12228237049259352, -0.029134781904507873, -0.05566234263410125, 0.09437242485469807, 0.1648749672580442, -0.12588496774938795, -0.2333148004429977, 0.22365253533742502, 0.11037936892959559, -0.01292184152515419, -0.010273292132149165, 0.026666728465396344, 0.02162232826566569, -0.02616938037522516, -0.09310000541446456, -0.06514044590824229, 0.04140343026352888, -0.1499608234028665, 0.16300235901778212, -0.12056823337001231, 0.017001528670991382, 0

### Index table for vector search

In [None]:
# test VECTOR SEARCH operations
in_table = '.'.join(result_table.split(".")[1:]) # remove project-id from table ref
print(f'indexing {in_table} for vector search')
query = f"""
CREATE VECTOR INDEX my_index ON {in_table}(embedding)
OPTIONS(distance_type='COSINE', index_type='IVF', ivf_options='{{"num_lists": 1000}}');
"""

# Run the query to create the index
client = bigquery.Client(project=PROJECT_ID)
job = client.query(query)
job.result()  # Wait for the job to complete

table_exists(client, result_table)

indexing embeddings_sea.google_efm_cambodia_v2_method_mapRR_tile_chunks_conversion_test3 for vector search


<google.cloud.bigquery.table._EmptyRowIterator at 0x769ca0fe4590>

### create a test target table for vector search (one embedding record to search for similar records in the whole embedding table)

In [None]:
result_table = result_table+"_test_target"
query = f"SELECT * FROM {in_table} LIMIT 1"

job_config = bigquery.QueryJobConfig(destination=result_table)
job = client.query(query,job_config=job_config)
job.result()  # Wait for the job to complete

table_exists(client, result_table)

<google.cloud.bigquery.table.RowIterator at 0x769ca20f5310>

In [None]:
import datetime
target_table = '.'.join(result_table.split(".")[1:])
print(target_table)
query = f"""
SELECT query.id AS target_id,
  query.tile AS target_tile,
  base.id AS base_id,
  base.tile AS base_tile,
  distance
FROM
  VECTOR_SEARCH(
    TABLE {in_table},
    'embedding',
    TABLE {target_table},
    top_k => 11,
    distance_type => 'COSINE',
    options => '{{"fraction_lists_to_search": 0.005}}')
ORDER BY distance
LIMIT 10
OFFSET 1;
"""

# Run the query to create the index
client = bigquery.Client(project=PROJECT_ID)
search_result_table = f"{PROJECT_ID}.{DATASET_ID}.vector_search_results_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
job_config = bigquery.QueryJobConfig(destination=search_result_table)
job = client.query(query,job_config=job_config)
job.result()  # Wait for the job to complete

table_exists(client, search_result_table)

embeddings_sea.google_efm_cambodia_v2_method_mapRR_tile_chunks_conversion_test3_test_target


<google.cloud.bigquery.table.RowIterator at 0x769ca20f51d0>

In [31]:
query = f"SELECT * FROM `{search_result_table}` LIMIT 10"
query_job = client.query(query)
# print schema 
schema = query_job.result().schema
for field in schema:
    print(f"{field.name}: {field.field_type}")
for row in query_job:
    print(row)

target_id: INTEGER
target_tile: STRING
base_id: INTEGER
base_tile: STRING
distance: FLOAT
Row((31737383565331173, '47PRP', 31737383565331173, '48PTU', 0.0), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737383565331173, '47PRP', 31737383565331173, '47PRQ', 0.0), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737383565331173, '47PRP', 31737383565331173, '48PTV', 0.0), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737383565331173, '47PRP', 31737383565331173, '47PRP', 0.0), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737383565331173, '47PRP', 31737383565331173, '48PTV', 0.0), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737383565331173, '47PRP', 31737383565331173, '48PTV', 0.0), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((31737383565331173