In [17]:
import json
import os

import boto3
from langchain_aws import BedrockEmbeddings, ChatBedrock
from opensearchpy import OpenSearch, RequestsHttpConnection

In [6]:
# OpenSearch endpoint config.
# Host and credentials can be overridden through environment variables; the
# literals below are only fallbacks, so the notebook behaves as before when
# nothing is set.
HOST = os.environ.get(
    "OPENSEARCH_HOST",
    "search-my-personalized-search-6gaepnh6huyymltmddtdtm4ihi.us-east-1.es.amazonaws.com",
)
PORT = 443
INDEX_NAME = "product-index"
# Basic-auth pair, used when fine-grained access control (FGAC) is enabled.
# The password comes from the environment so it never has to be typed into
# (or saved with) the notebook.
auth = (
    os.environ.get("OPENSEARCH_USER", "admin-user"),
    os.environ.get("OPENSEARCH_PASSWORD", ""),
)

In [7]:
# Build the OpenSearch client: HTTPS with certificate verification, basic
# auth, and the requests-based connection class.
connection_options = {
    "hosts": [{"host": HOST, "port": PORT}],
    "http_auth": auth,
    "use_ssl": True,
    "verify_certs": True,
    "connection_class": RequestsHttpConnection,
}
client = OpenSearch(**connection_options)

In [8]:
# Create the index only when it is missing, so the cell can be re-run safely.
index_already_exists = client.indices.exists(index=INDEX_NAME)
if not index_already_exists:
    client.indices.create(index=INDEX_NAME)
    print(f"Created index: {INDEX_NAME}")

Created index: product-index


In [9]:
# Load sample data.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of sample_products.json before running.
with open(
    "/Users/aktiwary/new-workspace/opensearch-test/sample_products.json",
    "r",
    encoding="utf-8",
) as f:
    items = json.load(f)
# Display the parsed records as the cell output.
items

[{'item_id': 'item_1',
  'title': 'Wireless Noise Cancelling Headphones',
  'category': 'Electronics',
  'price': 199.99},
 {'item_id': 'item_2',
  'title': 'Bluetooth Speaker',
  'category': 'Electronics',
  'price': 59.99},
 {'item_id': 'item_3',
  'title': 'Fitness Smartwatch',
  'category': 'Wearables',
  'price': 129.99},
 {'item_id': 'item_4',
  'title': 'Electric Toothbrush',
  'category': 'Personal Care',
  'price': 39.99},
 {'item_id': 'item_5',
  'title': 'Running Shoes - Men',
  'category': 'Footwear',
  'price': 89.99},
 {'item_id': 'item_6',
  'title': 'Gaming Mouse',
  'category': 'Electronics',
  'price': 49.99},
 {'item_id': 'item_7',
  'title': 'E-book Reader',
  'category': 'Books',
  'price': 99.99},
 {'item_id': 'item_8',
  'title': 'Laptop Stand',
  'category': 'Office',
  'price': 25.99},
 {'item_id': 'item_9',
  'title': '4K Action Camera',
  'category': 'Electronics',
  'price': 149.99},
 {'item_id': 'item_10',
  'title': 'Wireless Charger',
  'category': 'Mobil

In [10]:
# Ingest each item, using its item_id as the document id.
for item in items:
    client.index(index=INDEX_NAME, id=item["item_id"], body=item)
# Newly indexed documents only become searchable after a refresh. Force one so
# the search cells that follow see every item even when the notebook is run
# top to bottom without pauses.
client.indices.refresh(index=INDEX_NAME)
print(f"Ingested {len(items)} items into index '{INDEX_NAME}'.")

Ingested 10 items into index 'product-index'.


In [12]:
# Lexical query: full-text match of the search phrase against the "title" field.
title_match = {"title": "wireless headphones"}
query = {"query": {"match": title_match}}

In [13]:
# Run the lexical query against the index configured earlier. Using INDEX_NAME
# instead of repeating the literal keeps this cell in sync with the cells that
# create and populate the index.
response = client.search(
    index=INDEX_NAME,
    body=query,
)

In [14]:
# Show the raw search response (timing, shard summary, and the scored hits).
response

{'took': 833,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 1.4877305,
  'hits': [{'_index': 'product-index',
    '_id': 'item_10',
    '_score': 1.4877305,
    '_source': {'item_id': 'item_10',
     'title': 'Wireless Charger',
     'category': 'Mobile Accessories',
     'price': 19.99}},
   {'_index': 'product-index',
    '_id': 'item_1',
    '_score': 0.5753642,
    '_source': {'item_id': 'item_1',
     'title': 'Wireless Noise Cancelling Headphones',
     'category': 'Electronics',
     'price': 199.99}}]}}

In [15]:
# Print every hit's title next to its relevance score, in the order returned.
lexical_hits = response["hits"]["hits"]
for hit in lexical_hits:
    hit_title = hit["_source"]["title"]
    hit_score = hit["_score"]
    print(f"{hit_title} - Score: {hit_score}")

Wireless Charger - Score: 1.4877305
Wireless Noise Cancelling Headphones - Score: 0.5753642


In [18]:
# Initialize the Bedrock runtime client.
# No access keys are passed: boto3 then resolves credentials through its
# default provider chain (environment variables, shared config/credentials
# files, or an attached IAM role). Passing empty-string keys would instead be
# treated as explicit (and invalid) credentials, and pasting real keys here
# would leak them with the notebook.
bedrock_client = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
)

# Initialize the embedding model on top of that client.
embed_model = BedrockEmbeddings(
    client=bedrock_client,
    model_id="amazon.titan-embed-text-v1",
)

In [20]:
# Sanity check: embed one sentence and report the length of the resulting vector.
sample_text = "This is a sample text to embed."
embedding = embed_model.embed_query(sample_text)
embedding_dim = len(embedding)

print(f"Embedding dimension: {embedding_dim}")

Embedding dimension: 1536


In [19]:
# Thin convenience wrapper around the notebook's embedding model.

def generate_embedding(text):
    """Return the embedding vector that ``embed_model`` produces for ``text``."""
    vector = embed_model.embed_query(text)
    return vector

In [27]:
index_name = "hybrid-product-index"

# Vector field: HNSW method on the nmslib engine with cosine similarity.
# The dimension has to equal the length of the vectors stored in this field.
embedding_field = {
    "type": "knn_vector",
    "dimension": 1536,
    "method": {
        "name": "hnsw",
        "engine": "nmslib",
        "space_type": "cosinesimil",
    },
}

# Index definition: k-NN enabled, the regular product fields, plus the vector field.
index_body = {
    "settings": {"index": {"knn": True}},
    "mappings": {
        "properties": {
            "item_id": {"type": "keyword"},
            "title": {"type": "text"},
            "category": {"type": "keyword"},
            "price": {"type": "float"},
            "embedding": embedding_field,
        }
    },
}

In [28]:
# Create the hybrid index with the mapping above, unless it already exists.
hybrid_index_exists = client.indices.exists(index=index_name)
if not hybrid_index_exists:
    client.indices.create(index=index_name, body=index_body)
    print(f"Created hybrid index: {index_name}")

Created hybrid index: hybrid-product-index


In [29]:
# Re-display the loaded product records that the next cell embeds and ingests.
items

[{'item_id': 'item_1',
  'title': 'Wireless Noise Cancelling Headphones',
  'category': 'Electronics',
  'price': 199.99},
 {'item_id': 'item_2',
  'title': 'Bluetooth Speaker',
  'category': 'Electronics',
  'price': 59.99},
 {'item_id': 'item_3',
  'title': 'Fitness Smartwatch',
  'category': 'Wearables',
  'price': 129.99},
 {'item_id': 'item_4',
  'title': 'Electric Toothbrush',
  'category': 'Personal Care',
  'price': 39.99},
 {'item_id': 'item_5',
  'title': 'Running Shoes - Men',
  'category': 'Footwear',
  'price': 89.99},
 {'item_id': 'item_6',
  'title': 'Gaming Mouse',
  'category': 'Electronics',
  'price': 49.99},
 {'item_id': 'item_7',
  'title': 'E-book Reader',
  'category': 'Books',
  'price': 99.99},
 {'item_id': 'item_8',
  'title': 'Laptop Stand',
  'category': 'Office',
  'price': 25.99},
 {'item_id': 'item_9',
  'title': '4K Action Camera',
  'category': 'Electronics',
  'price': 149.99},
 {'item_id': 'item_10',
  'title': 'Wireless Charger',
  'category': 'Mobil

In [31]:
# Ingest each item together with an embedding of its title.
for item in items:
    title = item["title"]
    # The vector length must match the knn_vector "dimension" in the index mapping.
    embedding = embed_model.embed_query(title)
    doc = {
        "item_id": item["item_id"],
        "title": title,
        "category": item["category"],
        "price": item["price"],
        "embedding": embedding,
    }
    client.index(index=index_name, id=item["item_id"], body=doc)
# Newly indexed documents only become searchable after a refresh. Force one so
# the hybrid search below sees every item even when the notebook is run top to
# bottom without pauses.
client.indices.refresh(index=index_name)
print(f"Ingested {len(items)} items into hybrid index.")

Ingested 10 items into hybrid index.


In [32]:
query_text = "wireless headphones"
# Embed the query with the same model used at ingest time. This model yields
# 1536-d vectors (see the dimension check earlier in this notebook), which is
# what the index's knn_vector mapping declares.
q_embedding = embed_model.embed_query(query_text)

# Lexical clause: full-text match on the title.
lexical_clause = {
    "match": {
        "title": {
            "query": query_text,
            "boost": 1.0,  # Boost lexical score
        }
    }
}

# Semantic clause: k-NN search over the stored title embeddings.
semantic_clause = {
    "knn": {
        "embedding": {
            "vector": q_embedding,
            "k": 5,
            "boost": 2.0,  # Boost semantic score
        }
    }
}

# Hybrid query: both clauses sit under bool/should, so a document can match
# either one, and the boosts weight each clause's contribution to the score.
# NOTE(review): the two clause scores are presumably on different scales and
# are not normalized here — confirm the boosts give the intended balance.
query_body = {
    "size": 5,
    "query": {
        "bool": {
            "should": [lexical_clause, semantic_clause]
        }
    },
}

In [33]:
# Execute the hybrid query and list the returned titles with their scores.
results = client.search(index=index_name, body=query_body)
hybrid_hits = results["hits"]["hits"]
for hit in hybrid_hits:
    source = hit["_source"]
    print(f"{source['title']} - Score: {hit['_score']}")

Wireless Charger - Score: 2.8612733
Wireless Noise Cancelling Headphones - Score: 2.0828602
Bluetooth Speaker - Score: 1.3651588
Fitness Smartwatch - Score: 1.2136363
Gaming Mouse - Score: 1.1607678


In [39]:
# Same hybrid result list, now prefixed with each document's item_id.
for hit in results["hits"]["hits"]:
    product = hit["_source"]
    print(f"{product['item_id']} - {product['title']} - Score: {hit['_score']}")

item_10 - Wireless Charger - Score: 2.8612733
item_1 - Wireless Noise Cancelling Headphones - Score: 2.0828602
item_2 - Bluetooth Speaker - Score: 1.3651588
item_3 - Fitness Smartwatch - Score: 1.2136363
item_6 - Gaming Mouse - Score: 1.1607678
