In [None]:
# Install pinned versions of the third-party packages this notebook uses.
# `%pip` (rather than `!pip`) targets the environment of the running kernel,
# and `-U` asks pip to upgrade the package to the pinned version if a
# different one is already installed.
%pip install -U opensearch-py==2.3.1
%pip install -U boto3==1.33.2
%pip install -U retrying==1.3.4

In [None]:
# Restart the kernel so the packages installed by the %pip cell are the ones
# that get imported below.
# `HTML` is imported from the public `IPython.display` module rather than the
# internal `IPython.core.display` path.
from IPython.display import HTML

# NOTE(review): this relies on the `Jupyter` JavaScript global provided by the
# notebook frontend. Frontends that do not expose it (JupyterLab appears not
# to) will silently do nothing -- restart the kernel manually there. Confirm
# for the environment this notebook targets.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pprint
import random

import boto3

# Name of the Glue database queried for Snowflake table metadata.
database_name = "snowflake"

# Random integer in [200, 900). Not used in the cells visible here --
# presumably a uniqueness suffix for resource names; verify against later cells.
suffix = random.randrange(200, 900)

# Shared pretty-printer for readable output of nested structures.
pp = pprint.PrettyPrinter(indent=2)

# The default boto3 session tells us which region the notebook is configured for.
boto3_session = boto3.session.Session()
region_name = boto3_session.region_name

# Ask STS who we are in order to obtain the current AWS account id.
sts_client = boto3.client("sts")
account_id = sts_client.get_caller_identity().get("Account")

Here we pull the Snowflake table metadata from the AWS Glue Data Catalog. Be sure to run the Glue crawler first so that the tables exist.

In [None]:
import json

# Read the metadata of every table in the Glue database `database_name`.
glue_client = boto3.client("glue")

# `get_tables` returns results one page at a time, so a single call can miss
# tables when the database holds more than one page worth. Walk all pages
# with the paginator and collect every `TableList` entry.
table_pages = glue_client.get_paginator("get_tables").paginate(
    CatalogId=account_id,
    DatabaseName=database_name,
)
snowflake_tables = []
for response in table_pages:
    snowflake_tables.extend(response["TableList"])

# `default=str` stringifies values json cannot serialise natively.
print(json.dumps(snowflake_tables, indent=4, sort_keys=True, default=str))

Here we set up the vector store.
Enter the hostname of your OpenSearch Serverless endpoint as `aoss_node` in the cell below.

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import time

# Sign requests with the current AWS credentials for the "aoss" service.
credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, region_name, "aoss")

# Hostname of the OpenSearch Serverless endpoint -- replace with your own.
aoss_node = "mey2dta082iatb0w5x43.us-east-1.aoss.amazonaws.com"
index_name = "snowflake"

# Index definition: a k-NN enabled index with one vector field plus two text
# fields for the source text and its metadata.
body_json = {
    "settings": {
        "index.knn": "true",
        "number_of_shards": 1,
        "knn.algo_param.ef_search": 512,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "vector": {
                "type": "knn_vector",
                # NOTE(review): must equal the length of the embedding vectors
                # stored in this field -- confirm against the embedding model.
                "dimension": 1536,
                "method": {
                    "name": "hnsw",
                    "engine": "faiss"
                },
            },
            "text": {
                "type": "text"
            },
            "text-metadata": {
                "type": "text"
            }
        }
    }
}

# Build the OpenSearch client
oss_client = OpenSearch(
    hosts=[{'host': aoss_node, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    timeout=300
)

# Create the index only if it is missing, so that re-running this cell
# (or Restart & Run All) does not fail because the index already exists.
if oss_client.indices.exists(index=index_name):
    print(f"\nIndex '{index_name}' already exists; skipping creation.")
else:
    response = oss_client.indices.create(index=index_name, body=json.dumps(body_json))

    print('\nCreating index:')
    print(response)
    # NOTE(review): the original comment says index creation can take up to a
    # minute, yet only 5 seconds are waited -- lengthen this if later writes
    # to the index fail.
    time.sleep(5)

Next, we take the table metadata, run it through Titan to generate embeddings, and store those embeddings in the vector store.