In [1]:
import requests
import json

# Download the tiny Jeopardy dataset used throughout this notebook.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 5xx, ...) immediately instead of
# letting an error page reach the JSON parser.
resp = requests.get(
    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',
    timeout=30,
)
resp.raise_for_status()
data = resp.json()  # Parsed JSON payload (a list of question records)
    
def json_print(data):
    """Pretty-print ``data`` to stdout as JSON, indented by two spaces."""
    rendered = json.dumps(data, indent=2)
    print(rendered)
    
# Display the parsed records to confirm the download and parsing worked.
json_print(data)

[
  {
    "Category": "SCIENCE",
    "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
    "Answer": "Liver"
  },
  {
    "Category": "ANIMALS",
    "Question": "It's the only living mammal in the order Proboseidea",
    "Answer": "Elephant"
  },
  {
    "Category": "ANIMALS",
    "Question": "The gavial looks very much like a crocodile except for this bodily feature",
    "Answer": "the nose or snout"
  },
  {
    "Category": "ANIMALS",
    "Question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
    "Answer": "Antelope"
  },
  {
    "Category": "ANIMALS",
    "Question": "Heaviest of all poisonous snakes is this North American rattlesnake",
    "Answer": "the diamondback rattler"
  },
  {
    "Category": "SCIENCE",
    "Question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
    "Answer": "species"
  },
  {
    "Category": "SCIENCE",
   

In [2]:
import weaviate
from weaviate import EmbeddedOptions
import os

# The OpenAI key is read from the OPENAI_API_KEY environment variable (a missing
# variable raises KeyError here) and handed to the client as an extra header.
# Keep the key in the environment rather than pasting it into the notebook.
openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}

# Client backed by an embedded Weaviate instance.
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers=openai_headers,
)

embedded weaviate is already listening on port 6666


In [3]:
# Remove a leftover "Question" class, if any, so the cells below start from a clean schema.
question_class_exists = client.schema.exists("Question")
if question_class_exists:
    client.schema.delete_class("Question")

In [4]:
# Definition of the "Question" class.
# The vectorizer could be any other "text2vec-*" module; if it is set to "none",
# you must always provide vectors yourself.
class_obj = {"class": "Question", "vectorizer": "text2vec-openai"}

client.schema.create_class(class_obj)

In [5]:
# Batch-import every downloaded record into the "Question" class.
with client.batch.configure() as batch:
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")

        # Property names used here are the lowercase form of the source JSON keys.
        properties = {
            "answer": record["Answer"],
            "question": record["Question"],
            "category": record["Category"],
        }
        batch.add_data_object(data_object=properties, class_name="Question")

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [6]:
# Sanity check: ask Weaviate how many "Question" objects it now holds.
count_result = client.query.aggregate('Question').with_meta_count().do()
json_print(count_result)

{
  "data": {
    "Aggregate": {
      "Question": [
        {
          "meta": {
            "count": 10
          }
        }
      ]
    }
  }
}


### Let's run a vector search to see what comes back

RAG flow: search the vector DB by concept (the vector DB is a collection of curated local knowledge, i.e. docs) -> pass the initial query + augmented info (the retrieved docs) -> pass to GPT -> accurate results

In [7]:
# Vector search: fetch the "answer" of the two objects nearest to the concept "animals".
animal_search = (
    client.query
    .get("Question", "answer")
    .with_near_text({"concepts":"animals"})
    .with_limit(2)
)
response = animal_search.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Elephant"
        },
        {
          "answer": "the nose or snout"
        }
      ]
    }
  }
}


### Now we want to pass each of these objects to an LLM individually to use when answering a prompt!

In [8]:
# Prompt template for the single-prompt generation below: the {answer} placeholder
# stands for the "answer" property of each object returned by the search.

prompt = "Tell me a story about this animal {answer} flying!"

In [9]:
# Retrieval-augmented generation: retrieve the two nearest objects, then run
# `prompt` against each returned object individually.
rag_query = client.query.get("Question","answer")
rag_query = rag_query.with_near_text({"concepts":"animals"})  # retrieval step
rag_query = rag_query.with_generate(single_prompt=prompt)  # generation step
rag_query = rag_query.with_limit(2)
response = rag_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "generate": {
              "error": null,
              "singleResult": "Once upon a time, in a lush and magical forest, there lived a young elephant named Ella. Ella was no ordinary elephant; she possessed a unique ability that set her apart from her fellow elephants. She could fly!\n\nElla's story began when she was just a baby elephant. One sunny day, while playing near a sparkling river, she stumbled upon a mysterious glowing flower. Intrigued by its beauty, Ella reached out her trunk and gently touched the petals. In an instant, a burst of magical energy enveloped her, and she felt lighter than air.\n\nWith her newfound ability, Ella soared through the sky, her large ears acting as wings, carrying her gracefully through the clouds. The other animals in the forest were amazed and delighted by this extraordinary sight. Birds chirped in awe, monkeys swung from tree to tree, and even the w

### Let's extract all the categories

In [10]:
# Fetch the "category" of up to ten objects, ordered by closeness to the concept "animals".
category_search = client.query.get("Question", 'category')
category_search = category_search.with_near_text({'concepts':'animals'})
category_search = category_search.with_limit(10)
response = category_search.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "category": "ANIMALS"
        },
        {
          "category": "ANIMALS"
        },
        {
          "category": "ANIMALS"
        },
        {
          "category": "SCIENCE"
        },
        {
          "category": "ANIMALS"
        },
        {
          "category": "SCIENCE"
        },
        {
          "category": "SCIENCE"
        },
        {
          "category": "SCIENCE"
        },
        {
          "category": "SCIENCE"
        },
        {
          "category": "SCIENCE"
        }
      ]
    }
  }
}


### Now we'll pass all of these in at the same time for an LLM to generate a grouped answer.

In [11]:
# Prompt for the grouped generation below; it needs information from all returned
# objects at once rather than from a single object.
# NOTE(review): confirm that the {category} placeholder is substituted for grouped
# tasks as well; it may be sent to the model as literal text.


prompt = "which of these subjects {category} does a zoologist specialize in?"

In [12]:
# Grouped generation: retrieve up to ten objects and run `prompt` once as a
# single task over the whole result set.
grouped_query = (
    client.query
    .get("Question", 'category')
    .with_near_text({'concepts':'animals'})
    .with_generate(grouped_task=prompt)
    .with_limit(10)
)
response = grouped_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "generate": {
              "error": null,
              "groupedResult": "A zoologist specializes in the subject category of ANIMALS."
            }
          },
          "category": "ANIMALS"
        },
        {
          "_additional": {
            "generate": null
          },
          "category": "ANIMALS"
        },
        {
          "_additional": {
            "generate": null
          },
          "category": "ANIMALS"
        },
        {
          "_additional": {
            "generate": null
          },
          "category": "SCIENCE"
        },
        {
          "_additional": {
            "generate": null
          },
          "category": "ANIMALS"
        },
        {
          "_additional": {
            "generate": null
          },
          "category": "SCIENCE"
        },
        {
          "_additional": {
            "generate": null
          },
         