In [None]:
!pip install weaviate-client --quiet

<mark>Weaviate documentation for query<br></mark>
https://weaviate.io/developers/weaviate/tutorials/query

<mark>Weaviate documentation on Search API<br></mark>
https://weaviate.io/developers/weaviate/search

In [None]:
import ast
import json
import pandas as pd
import weaviate

In [None]:
# Weaviate class names queried throughout this notebook:
# the first is used with near-text queries, the second with near-vector queries only.
article_class_name, article_no_vector_class_name = (
    "Article",
    "ArticleNoTransformer",
)

<mark>Set the ELB endpoint below (hostname only, without the <code>http://</code> prefix, which is added when the client is created)<br></mark>

In [None]:
# Placeholder: fill in the ELB hostname (the "http://" scheme is prepended when the client is created)
elb_endpoint = str()

In [None]:
# Instantiate the client.
# Fail fast with a clear message if the placeholder endpoint was never filled in,
# rather than handing the client the malformed URL "http://".
if not elb_endpoint.strip():
    raise ValueError(
        "elb_endpoint is empty: set it to the ELB hostname before creating the client"
    )

wv_client = weaviate.Client(url=f"http://{elb_endpoint.strip()}")

<h1>Basic Search</h1>

In [None]:
# Fetch a single object of the Article class with its main properties.
query_builder = wv_client.query.get(
    article_class_name,
    ["title", "content", "url", "custom_tags"],
)
response = query_builder.with_limit(1).do()

print(json.dumps(response, indent=2))

In [None]:
# Same query as before, but also return the vector associated with the object.
query_builder = wv_client.query.get(
    article_class_name,
    ["title", "content", "url", "custom_tags"],
)
query_builder = query_builder.with_additional("vector")
response = query_builder.with_limit(1).do()

print(json.dumps(response, indent=2))

<h1>Filtering based on metadata</h1>

In [None]:
# Filter on the custom_tags property with the "Equal" operator and the text value "france".
where_filter = dict(
    path=["custom_tags"],
    operator="Equal",
    valueText="france",
)

query_builder = wv_client.query.get(
    article_class_name,
    ["title", "content", "url", "custom_tags"],
)
query_builder = query_builder.with_where(where_filter)
result = query_builder.with_limit(2).do()

print(json.dumps(result, indent=2))

<h1>Aggregate data</h1>

In [None]:
# Aggregate over the Article class and request the meta count.
aggregate_builder = wv_client.query.aggregate(article_class_name)
response = aggregate_builder.with_meta_count().do()

print(json.dumps(response, indent=2))

In [None]:
# Metadata filtering and aggregation can be combined:
# request the meta count for the objects matching the filter only.
where_filter = dict(
    path=["custom_tags"],
    operator="Equal",
    valueText="car",
)

aggregate_builder = wv_client.query.aggregate(article_class_name)
aggregate_builder = aggregate_builder.with_where(where_filter)
response = aggregate_builder.with_meta_count().do()

print(json.dumps(response, indent=2))

<h1>Similarity / Vector Search</h1>

In [None]:
# Search by text concept and return the distance of each hit.
nearText = dict(concepts=["modern art in Europe"])

query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_near_text(nearText)
query_builder = query_builder.with_limit(2)
response = query_builder.with_additional(["distance"]).do()

print(json.dumps(response, indent=2))

In [None]:
# NOTE: "with_near_text" cannot be used on the class without a transformer,
# because it is unknown how to embed the text. Use "with_near_vector" instead.

# Collect a single stored vector to use as the query vector.
query_builder = wv_client.query.get(article_no_vector_class_name)
query_builder = query_builder.with_additional("vector")
result = query_builder.with_limit(1).do()

first_object = result["data"]["Get"][article_no_vector_class_name][0]
vector = first_object["_additional"]["vector"]

In [None]:
# Show the vector's length and a short preview instead of printing every component;
# the length matters for the near-vector queries that follow.
len(vector), vector[:5]

In [None]:
# Look up the objects closest to the vector collected above.
nearVector = dict(vector=vector)

query_builder = wv_client.query.get(article_no_vector_class_name, ["title", "url"])
query_builder = query_builder.with_near_vector(nearVector)
query_builder = query_builder.with_limit(2)
response = query_builder.with_additional(["distance"]).do()

print(json.dumps(response, indent=2))

In [None]:
# IMPORTANT! The model used to embed your query must be the same as the model used to embed
# the underlying database (i.e. same vector length).
# To demonstrate, query with a deliberately truncated copy of the vector. The copy gets its
# own name so `vector` keeps its full length and the earlier cells stay re-runnable.
truncated_vector = vector[:10]
nearVector = {"vector": truncated_vector}

response = (
    wv_client.query
    .get(article_no_vector_class_name, ['title','url'])
    .with_near_vector(nearVector)
    .with_limit(2)
    .with_additional(["distance"])
    .do()
)

# NOTE(review): the output here is expected to report a vector-length problem rather than hits
print(json.dumps(response, indent=2))

In [None]:
# NOTE: "with_near_vector" also works on the class where Weaviate creates the embeddings.

# Step 1: collect a single stored vector from the Article class.
query_builder = wv_client.query.get(article_class_name)
query_builder = query_builder.with_additional("vector")
result = query_builder.with_limit(1).do()

first_object = result["data"]["Get"][article_class_name][0]
vector = first_object["_additional"]["vector"]

# Step 2: query for the objects near that vector.
nearVector = dict(vector=vector)

query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_near_vector(nearVector)
query_builder = query_builder.with_limit(2)
response = query_builder.with_additional(["distance"]).do()

print(json.dumps(response, indent=2))

In [None]:
# Metadata filtering and semantic search can be combined in one query.
where_filter = dict(
    path=["custom_tags"],
    operator="Equal",
    valueText="car",
)
nearText = dict(concepts=["modern art in Europe"])

query_builder = wv_client.query.get(article_class_name, ["title", "content", "url"])
query_builder = query_builder.with_where(where_filter)
query_builder = query_builder.with_near_text(nearText)
query_builder = query_builder.with_limit(2)
result = query_builder.with_additional(["distance"]).do()

print(json.dumps(result, indent=2))

In [None]:
# ... and a distance threshold can be set on the semantic part of the query.
where_filter = dict(
    path=["custom_tags"],
    operator="Equal",
    valueText="car",
)

max_distance = 0.5
nearText = dict(
    concepts=["modern art in Europe"],
    distance=max_distance,
)

query_builder = wv_client.query.get(article_class_name, ["title", "content", "url"])
query_builder = query_builder.with_where(where_filter)
query_builder = query_builder.with_near_text(nearText)
result = query_builder.with_additional(["distance"]).do()

print(json.dumps(result, indent=2))

<h1>BM25 / Keyword Search</h1>

In [None]:
# Basic BM25 keyword search for "food".
query_builder = wv_client.query.get(
    article_class_name,
    ["title", "content", "url", "custom_tags"],
)
query_builder = query_builder.with_bm25(query="food")
response = query_builder.with_limit(3).do()

print(json.dumps(response, indent=2))

In [None]:
# Also return the score computed by the BM25 algorithm.
query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_bm25(query="food")
query_builder = query_builder.with_limit(3)
response = query_builder.with_additional("score").do()

print(json.dumps(response, indent=2))

In [None]:
# Restrict the BM25 search to specific properties of the object.
# If `properties` is not provided, all text fields are searched.
searched_properties = ["url", "content"]

query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_bm25(query="food", properties=searched_properties)
query_builder = query_builder.with_limit(3)
response = query_builder.with_additional("score").do()

print(json.dumps(response, indent=2))

In [None]:
# Boost the importance of specific properties: "url^2" weights the url property.
# If `properties` is not provided, all text fields are searched.
boosted_properties = ["url^2", "content"]

query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_bm25(query="food", properties=boosted_properties)
query_builder = query_builder.with_limit(3)
response = query_builder.with_additional("score").do()

print(json.dumps(response, indent=2))

<h1>Hybrid (BM25 + Semantic Vector) Search</h1>

In [None]:
# Basic hybrid search.
# Returns the same properties as the other hybrid examples in this notebook
# ('custom_tags' replaces 'c_access', which no other query here requests).
response = (
    wv_client.query
    .get(article_class_name, ['title','url','custom_tags'])
    .with_hybrid(query="food")
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

In [None]:
# Pass specific / boosted properties for the BM25 part of the hybrid search.
boosted_properties = ["url^2", "content"]

query_builder = wv_client.query.get(article_class_name, ["title", "url", "custom_tags"])
query_builder = query_builder.with_hybrid(query="food", properties=boosted_properties)
response = query_builder.with_limit(3).do()

print(json.dumps(response, indent=2))

In [None]:
# Return the hybrid search score together with its explanation.
query_builder = wv_client.query.get(article_class_name, ["title", "url", "custom_tags"])
query_builder = query_builder.with_hybrid(query="food")
query_builder = query_builder.with_additional(["score", "explainScore"])
response = query_builder.with_limit(3).do()

print(json.dumps(response, indent=2))

In [None]:
# Weight the BM25 and vector scores against each other.
# An alpha of 1 is for a pure vector search and 0 is for a pure keyword search.
query_builder = wv_client.query.get(article_class_name, ["title", "url", "custom_tags"])
query_builder = query_builder.with_hybrid(query="food", alpha=0.25)
query_builder = query_builder.with_additional(["score"])
response = query_builder.with_limit(3).do()

print(json.dumps(response, indent=2))

In [None]:
# Note: metadata filtering can be combined with all of these search types.
where_filter = dict(
    path=["custom_tags"],
    operator="Equal",
    valueText="france",
)

# An alpha of 1 is for a pure vector search and 0 is for a pure keyword search.
query_builder = wv_client.query.get(article_class_name, ["title", "url", "custom_tags"])
query_builder = query_builder.with_hybrid(query="food", alpha=0.25)
query_builder = query_builder.with_additional(["score"])
query_builder = query_builder.with_where(where_filter)
response = query_builder.with_limit(3).do()

print(json.dumps(response, indent=2))

<h1>Reranking Search</h1>

In [None]:
# Reranking vector search results.
# Goal: find articles about Europe, then move those about capitals of countries to the top.

# Start with a plain search near the concept of Europe.
nearText = dict(concepts=["Europe"])

query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_near_text(nearText)
query_builder = query_builder.with_additional("distance")
response = query_builder.with_limit(10).do()

print(json.dumps(response, indent=2))

In [None]:
# Then sort to the top those objects whose "content" field matches the re-rank query "capital".
nearText = dict(concepts=["Europe"])
rerank_clause = 'rerank(property: "content" query: "capital") { score }'

query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_near_text(nearText)
query_builder = query_builder.with_additional(rerank_clause)
response = query_builder.with_limit(10).do()

print(json.dumps(response, indent=2))

In [None]:
# Rerank the results of a BM25 search for "Europe" with the same re-rank query.
rerank_clause = 'rerank(property: "content" query: "capital") { score }'

query_builder = wv_client.query.get(article_class_name, ["title", "url"])
query_builder = query_builder.with_bm25(query="Europe")
query_builder = query_builder.with_additional(rerank_clause)
response = query_builder.with_limit(10).do()

print(json.dumps(response, indent=2))