In [1]:
import requests
import json

# Download the data
resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')
data = json.loads(resp.text)  # Load data
    
def json_print(data):
    print(json.dumps(data, indent=2))
    
json_print(data)

[
  {
    "Category": "SCIENCE",
    "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
    "Answer": "Liver"
  },
  {
    "Category": "ANIMALS",
    "Question": "It's the only living mammal in the order Proboseidea",
    "Answer": "Elephant"
  },
  {
    "Category": "ANIMALS",
    "Question": "The gavial looks very much like a crocodile except for this bodily feature",
    "Answer": "the nose or snout"
  },
  {
    "Category": "ANIMALS",
    "Question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
    "Answer": "Antelope"
  },
  {
    "Category": "ANIMALS",
    "Question": "Heaviest of all poisonous snakes is this North American rattlesnake",
    "Answer": "the diamondback rattler"
  },
  {
    "Category": "SCIENCE",
    "Question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
    "Answer": "species"
  },
  {
    "Category": "SCIENCE",
   

In [2]:
import weaviate
from weaviate import EmbeddedOptions
import os

client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051, skip_init_checks=True)

# Check that Weaviate is up and live
if client.is_live():
    print("Weaviate is live!")
else:
    print("Weaviate is not reachable.")



Weaviate is live!


In [4]:
if client.collections.exists("Question"):
    client.collections.delete("Question")

In [5]:
from weaviate.classes.config import Configure, Property, DataType, VectorDistances

# Define the collection name and properties
collection_name = "Question"

# Define properties with correct field names
properties = [
    Property(name="question", data_type=DataType.TEXT),
    Property(name="answer", data_type=DataType.TEXT),
    Property(name="category", data_type=DataType.TEXT)
]

# Create the collection with properly configured vectorizer and vector index
client.collections.create(
    name=collection_name,
    properties=properties,
    vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
    vector_index_config=Configure.VectorIndex.hnsw(
        distance_metric=VectorDistances.COSINE
    )
)


<weaviate.collections.collection.sync.Collection at 0x25a04e39400>

In [6]:
#Insert the data into Weaviate
collection = client.collections.get("Question")
print(client.is_live())
with client.batch.dynamic() as batch:
    for o in data:
        obj_body = {
            'question': o["Question"],
            'answer': o["Answer"],
            'round': o["Category"]
        }
        collection.data.insert(properties=obj_body)
        

True


In [7]:
response = collection.aggregate.over_all(total_count=True)
print(response.total_count)

10


### Lets perform vector search for the concept of "animal"

### Now, lets perform keyword search

In [10]:
from weaviate.classes.query import MetadataQuery, Filter
response = collection.query.near_text(query="animal", limit=3, return_metadata=MetadataQuery(distance=True))

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)
    
# print(json.dumps(response, indent=2))

{'round': 'ANIMALS', 'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': None}
0.42083919048309326
{'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'answer': 'Antelope', 'round': 'ANIMALS', 'category': None}
0.5061103105545044
{'round': 'ANIMALS', 'answer': 'the nose or snout', 'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'category': None}
0.5605587959289551


### Why do we only get one match here? We know there are more animal related objects!

### Lets combine keyword and vector search - called hybrid search!

In [11]:
from weaviate.classes.query import MetadataQuery, Filter
response = collection.query.bm25(query="animal", limit=3, return_metadata=MetadataQuery(distance=True))

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)
    
# print(json.dumps(response, indent=2))

{'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'answer': 'Antelope', 'round': 'ANIMALS', 'category': None}
None


In [12]:
from weaviate.classes.query import MetadataQuery, Filter
response = collection.query.hybrid(query="animal", limit=3, return_metadata=MetadataQuery(distance=True))

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)
    
# print(json.dumps(response, indent=2))

{'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'answer': 'Antelope', 'round': 'ANIMALS', 'category': None}
None
{'round': 'ANIMALS', 'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': None}
None
{'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'answer': 'the nose or snout', 'round': 'ANIMALS', 'category': None}
None


### Notice the order of the returned results!