In [None]:
import os
import sys

# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is what lets the later `from scripts import ...`
# and `from env_setup import ...` cells resolve — confirm against the repo layout.
module_path = os.path.abspath(os.pardir)

# Re-running this cell must not keep growing sys.path with duplicate entries.
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
# Google Cloud project ID; re-exported as `project_id` in the [GCP] section below.
PROJECT_ID = "uk-bh-experiments-argolis"

# ---------------------------------------------------------------------------
# [CONFIG]
# ---------------------------------------------------------------------------
# Embedding backend. Options: 'vertex' or 'vertex-lang'
embedding_model = "vertex"
# Model used for descriptions.
# Options: 'gemini-1.5-pro-001', 'gemini-1.5-pro', 'text-bison-32k'
description_model = "gemini-1.5-pro-001"
# Where the user data lives. Options: 'bigquery' and 'cloudsql-pg'
data_source = "bigquery"
# Where embeddings are stored. Options: 'bigquery-vector', 'cloudsql-pgvector'
vector_store = "bigquery-vector"
# Feature switches (True or False).
# NOTE: the name `logging` is also the name of the stdlib logging module; do not
# `import logging` in this notebook without renaming one of the two.
logging = True
kgq_examples = True

# ---------------------------------------------------------------------------
# [GCP]
# ---------------------------------------------------------------------------
project_id = PROJECT_ID

# ---------------------------------------------------------------------------
# [PGCLOUDSQL]
# Fill these in only when PostgreSQL (Cloud SQL) is used as the source.
# Avoid committing a real password with the notebook.
# ---------------------------------------------------------------------------
pg_region = ""
pg_instance = ""
pg_database = ""
pg_user = ""
pg_password = ""
pg_schema = ""

# ---------------------------------------------------------------------------
# [BIGQUERY]
# Fill these in only when BigQuery is used as the source.
# ---------------------------------------------------------------------------
bq_dataset_region = "us-east1"
bq_dataset_name = "breedr"

# Name of the BQ dataset created for bigquery-vector and/or logging,
# and of the log table inside it.
bq_opendataqna_dataset_name = "opendataqna"
bq_log_table_name = "audit_log_table"

# Either None or a list of table names, e.g. ['reviews', 'ratings']
# (for this dataset: ['animals', 'activities', 'activity_types']).
bq_table_list = None

# Decode region and user database based on the configured source.
# Previously the BigQuery values were used unconditionally, so a 'cloudsql-pg'
# source ended up with the BigQuery region/dataset instead of its own settings.
if data_source == 'cloudsql-pg':
    dataset_region = pg_region
    user_database = pg_database
else:
    # 'bigquery' (and, as before, any value not yet rejected by validation)
    dataset_region = bq_dataset_region
    user_database = bq_dataset_name

In [None]:

# Validate the values entered in the configuration cell.

# Source: only the two supported backends are accepted.
assert data_source in {'bigquery', 'cloudsql-pg'}, "⚠️ Invalid DATA_SOURCE. Must be 'bigquery' or 'cloudsql-pg'"

# Vector store: likewise restricted to the supported options.
assert vector_store in {'bigquery-vector', 'cloudsql-pgvector'}, "⚠️ Invalid VECTOR_STORE. Must be 'bigquery-vector' or 'cloudsql-pgvector'"

# A log table name is mandatory only when logging is switched on.
assert not logging or bq_log_table_name, "⚠️ Please provide a name for your log table if you want to use logging"

# Pair every setting the selected source requires with the message shown when
# that setting is empty; the pairs are checked in the listed order.
required_settings = ()
if data_source == 'bigquery':
    required_settings = (
        (bq_dataset_region, "⚠️ Please provide the Data Set Region"),
        (bq_dataset_name, "⚠️ Please provide the name of the dataset on Bigquery"),
    )
elif data_source == 'cloudsql-pg':
    required_settings = (
        (pg_region, "⚠️ Please provide Region of the Cloud SQL Instance"),
        (pg_instance, "⚠️ Please provide the name of the Cloud SQL Instance"),
        (pg_database, "⚠️ Please provide the name of the PostgreSQL Database on the Cloud SQL Instance"),
        (pg_user, "⚠️ Please provide a username for the Cloud SQL Instance"),
        (pg_password, "⚠️ Please provide the Password for the PG_USER"),
    )

for setting_value, failure_message in required_settings:
    assert setting_value, failure_message

In [None]:
from scripts import save_config

# Hand every setting from the configuration cell to save_config, positionally
# and in the order the call has always used.
# NOTE(review): presumably save_config persists these for the env_setup helpers
# used in later cells — confirm in scripts.save_config.
save_config(
    # [CONFIG]
    embedding_model,
    description_model,
    data_source,
    vector_store,
    logging,
    kgq_examples,
    # [GCP]
    PROJECT_ID,
    # [PGCLOUDSQL]
    pg_region,
    pg_instance,
    pg_database,
    pg_user,
    pg_password,
    pg_schema,
    # [BIGQUERY]
    bq_dataset_region,
    bq_dataset_name,
    bq_opendataqna_dataset_name,
    bq_log_table_name,
    bq_table_list,
)

In [None]:
from env_setup import create_vector_store
# Set up the vector store for embeddings.
# create_vector_store (imported from env_setup) is called with no arguments and
# its return value is not captured.
# NOTE(review): presumably it reads the settings written by save_config and
# provisions the store selected in `vector_store` — confirm in env_setup.
create_vector_store()


In [None]:
from env_setup import get_embeddings

# Generate embeddings for tables and columns.
# get_embeddings (imported from env_setup) takes no arguments and returns a pair
# that is unpacked into a table-level and a column-level result. Later cells call
# .head() on both, so they are presumably pandas DataFrames — TODO confirm.
table_schema_embeddings, col_schema_embeddings = get_embeddings()


In [None]:
# Preview the table-level embeddings: print a heading, then leave .head() as the
# cell's last expression so the notebook renders its value.
print("Table Descriptions")
table_schema_embeddings.head()

In [None]:
# Preview the column-level embeddings: print a heading, then leave .head() as the
# cell's last expression so the notebook renders its value.
print("Column Descriptions")
col_schema_embeddings.head()

In [None]:
from env_setup import store_embeddings

# Store table/column embeddings (asynchronous)
await(store_embeddings(table_schema_embeddings, col_schema_embeddings)) 

In [None]:
bq_table_list = None #[ 'activities', 'activity_types'] #None # either None or a list of table names in format ['reviews', 'ratings']

from scripts import save_config

save_config(embedding_model, description_model, data_source, vector_store, logging, kgq_examples, PROJECT_ID,
            pg_region, pg_instance, pg_database, pg_user, pg_password, pg_schema, 
            bq_dataset_region, bq_dataset_name, 
            bq_opendataqna_dataset_name, bq_log_table_name, bq_table_list)


from env_setup import get_embeddings

# Generate embeddings for tables and columns
table_schema_embeddings, col_schema_embeddings = get_embeddings()  

print("Table Descriptions")
table_schema_embeddings.head()

print("Column Descriptions")
col_schema_embeddings.head()

from env_setup import store_embeddings

# Store table/column embeddings (asynchronous)
await(store_embeddings(table_schema_embeddings, col_schema_embeddings)) 