In [None]:
%pip install weaviate-client --quiet

<mark>Weaviate documentation for query<br></mark>
https://weaviate.io/developers/weaviate/tutorials/query

<mark>Weaviate documentation on Search API<br></mark>
https://weaviate.io/developers/weaviate/search

In [None]:
import ast
import json
import pandas as pd
import weaviate

<mark>Set the load balancer endpoint (host name only, without the <code>http://</code> scheme) of the Weaviate instance</mark>

In [None]:
# Load balancer endpoint of the Weaviate instance; fill this in before running.
elb_endpoint = ""

In [None]:
# Fail fast with a clear message when the endpoint placeholder was never filled in;
# otherwise the client would be created against the bare URL "http://".
if not elb_endpoint.strip():
    raise ValueError(
        "elb_endpoint is empty: set it to the host name of the Weaviate load balancer"
    )

# Connect to Weaviate over plain HTTP through the load balancer.
wv_client = weaviate.Client(url=f"http://{elb_endpoint}")

<h1>Basic Search</h1>

In [None]:
# Fetch a single "Manual" object, returning the listed properties.
response = (
    wv_client.query.get(
        "Manual",
        [
            "model_names",
            "key_features",
            "company_address",
            "document_summary",
            "file",
            "stylus",
        ],
    )
    .with_limit(1)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Same single-object query, additionally returning the vector associated with the object.
response = (
    wv_client.query.get(
        "Manual",
        [
            "model_names",
            "key_features",
            "company_address",
            "document_summary",
            "file",
            "stylus",
        ],
    )
    .with_additional("vector")
    .with_limit(1)
    .do()
)

print(json.dumps(response, indent=2))

<h1>Filtering based on metadata</h1>

In [None]:
# Keep only objects whose boolean "stylus" property equals True.
where_filter = dict(path=["stylus"], operator="Equal", valueBoolean=True)

result = (
    wv_client.query.get("Manual", ["model_names", "key_features", "stylus"])
    .with_where(where_filter)
    .with_limit(2)
    .do()
)

print(json.dumps(result, indent=2))

<h1>Aggregate data</h1>

In [None]:
# Ask for the meta count of the "Manual" class.
response = wv_client.query.aggregate("Manual").with_meta_count().do()

print(json.dumps(response, indent=2))

In [None]:
# Metadata filtering combined with aggregation: meta count of the "Manual"
# objects whose "model_names" matches "Galaxy" under the Like operator.
where_filter = dict(path=["model_names"], operator="Like", valueText="Galaxy")

response = (
    wv_client.query.aggregate("Manual")
    .with_where(where_filter)
    .with_meta_count()
    .do()
)

print(json.dumps(response, indent=2))

<h1>Similarity / Vector Search</h1>

In [None]:
# Near-text search for the concept "rugged design", limited to two results,
# with the distance returned for each one.
nearText = dict(concepts=["rugged design"])

response = (
    wv_client.query.get("Manual", ["model_names", "key_features", "stylus"])
    .with_near_text(nearText)
    .with_limit(2)
    .with_additional(["distance"])
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Near-text search for "fast charging", restricted by a metadata filter on
# "model_names" (Like "Galaxy").
where_filter = dict(path=["model_names"], operator="Like", valueText="Galaxy")
nearText = dict(concepts=["fast charging"])

result = (
    wv_client.query.get("Manual", ["model_names", "key_features", "stylus"])
    .with_where(where_filter)
    .with_near_text(nearText)
    .with_limit(2)
    .with_additional(["distance"])
    .do()
)

print(json.dumps(result, indent=2))

In [None]:
# Same filtered near-text search, but bounded by a distance threshold
# instead of a result limit.
where_filter = dict(path=["model_names"], operator="Like", valueText="Galaxy")

max_distance = 0.4
nearText = dict(concepts=["fast charging"], distance=max_distance)

result = (
    wv_client.query.get("Manual", ["model_names", "key_features", "stylus"])
    .with_where(where_filter)
    .with_near_text(nearText)
    .with_additional(["distance"])
    .do()
)

print(json.dumps(result, indent=2))

<h1>BM25 / Keyword Search</h1>

In [None]:
# Plain BM25 keyword search for "biometric", limited to three results.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_bm25(query="biometric")
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Same BM25 search, also returning the score of each result.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_bm25(query="biometric")
    .with_limit(3)
    .with_additional("score")
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# BM25 search restricted to specific fields of the object.
# When `properties` is not provided, all text fields are searched.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_bm25(query="front camera", properties=["key_features"])
    .with_limit(3)
    .with_additional("score")
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Keyword search with a boosted attribute: "key_features" carries the "^2"
# boost suffix, "document_summary" does not.
# When `properties` is not provided, all text fields are searched.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_bm25(
        query="front camera",
        properties=["key_features^2", "document_summary"],
    )
    .with_limit(3)
    .with_additional("score")
    .do()
)

print(json.dumps(response, indent=2))

<h1>Hybrid (BM25 + Semantic Vector) Search</h1>

In [None]:
# Plain hybrid search for "biometric", limited to three results.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_hybrid(query="biometric")
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Hybrid search that also returns each result's score and its explanation.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_hybrid(query="biometric")
    .with_additional(["score", "explainScore"])
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Weight the keyword and vector components of the hybrid score.
# An alpha of 1 is a pure vector search and 0 is a pure keyword search.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_hybrid(query="large screen", alpha=0.25)
    .with_additional(["score", "explainScore"])
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Metadata filtering can be combined with all of these search types; here it
# is applied to a hybrid search.
where_filter = dict(path=["model_names"], operator="Like", valueText="Galaxy")

# An alpha of 1 is a pure vector search and 0 is a pure keyword search.
response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_hybrid(query="large screen", alpha=0.25)
    .with_additional(["score"])
    .with_where(where_filter)
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

<h1>Reranking Search</h1>

In [None]:
# Reranking vector search results.
# Goal: find phones with a large screen, then push those with fast charging
# towards the top.
#
# Step one is a plain near-text search around the concept "large screen".
nearText = dict(concepts=["large screen"])

response = (
    wv_client.query.get(
        "Manual",
        ["model_names", "key_features", "document_summary"],
    )
    .with_near_text(nearText)
    .with_additional("distance")
    .with_limit(10)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Rerank the "large screen" near-text results so that objects whose
# "key_features" property best matches the re-rank query "fast charging"
# are sorted towards the top.

nearText = {"concepts": ["large screen"]}

response = (
    wv_client.query
    .get("Manual", ['model_names','key_features','document_summary'])
    .with_near_text(nearText)
    .with_additional('rerank(property: "key_features" query: "fast charging") { score }')
    # Use the same limit of 10 as the plain near-text query so the reordering is
    # visible; with a limit of 1 there is only a single object to rerank.
    .with_limit(10)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Rerank BM25 search results: keyword search for "security", then reorder the
# returned objects by how well "key_features" matches "fast charging".
response = (
    wv_client.query
    .get("Manual", ['model_names','key_features','document_summary'])
    .with_bm25(query='security')
    .with_additional('rerank(property: "key_features" query: "fast charging") { score }')
    # Return several candidates; with a limit of 1 there is only a single
    # object to rerank, so the reranker cannot change the ordering.
    .with_limit(10)
    .do()
)

print(json.dumps(response, indent=2))

<h1>Searching for information on a specific device - We need a better pattern</h1>

In [None]:
# Near-text search over "ManualContent" for a natural-language question,
# restricted to objects whose "model_names" equals "Galaxy S22".
nearText = dict(concepts=["how do I unlock the screen?"])
where_filter = dict(path=["model_names"], operator="Equal", valueText="Galaxy S22")

response = (
    wv_client.query.get("ManualContent", ["content", "model_names", "file"])
    .with_near_text(nearText)
    .with_where(where_filter)
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))