In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel (mode 2 = reload all modules
# before each cell execution).
%load_ext autoreload
%autoreload 2

In [2]:
import json

In [3]:
import requests 

# Download the course documents JSON and flatten it into a single list of
# records, tagging every record with the name of the course it came from.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error (404, 5xx, rate limiting) instead of trying to
# JSON-decode an error page and getting a misleading decode exception.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

# documents_raw is iterated as a list of {'course': ..., 'documents': [...]}
# entries; each inner document is a dict.
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # NOTE: this tags the dict in place, so the same objects inside
        # documents_raw also gain the 'course' key.
        doc['course'] = course_name
        documents.append(doc)

In [4]:
import minsearch

## Standard index

In [5]:
# Build the search index over the flattened documents: `course` is registered
# as a keyword field, the remaining fields as text fields.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

# Kept as the cell's last expression so the returned object's repr is shown.
index.fit(documents)

<minsearch.minsearch.Index at 0x1f7fdb1c8f0>

In [6]:
# Run one example query against the index and pretty-print every hit.
query = "Can I join the course if it has already started?"

# Per-field options handed to the search call: a boost value of 3 for the
# `question` field, and a filter on the `course` keyword field.
boost_dict = {"question": 3}
filter_dict = {"course": "data-engineering-zoomcamp"}

results = index.search(
    query,
    filter_dict=filter_dict,
    boost_dict=boost_dict,
    num_results=5,
)

# One indented JSON document per result.
for result in results:
    print(json.dumps(result, indent=2))

{
  "text": "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  "section": "General course-related questions",
  "question": "Course - Can I still join the course after the start date?",
  "course": "data-engineering-zoomcamp"
}
{
  "text": "Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.",
  "section": "General course-related questions",
  "question": "Course - Can I follow the course after it finishes?",
  "course": "data-engineering-zoomcamp"
}
{
  "text": "Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel fi

## Appendable index

In [7]:
# Rebuild `index` using the appendable index class, configured with the same
# field layout: `course` as a keyword field, the rest as text fields.
index = minsearch.AppendableIndex(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

# Kept as the cell's last expression so the returned object's repr is shown.
index.fit(documents)

<minsearch.append.AppendableIndex at 0x1f7db661ac0>

In [8]:
# Run the example query against the current `index` and pretty-print every hit.
query = "Can I join the course if it has already started?"

# Per-field options handed to the search call: a boost value of 3 for the
# `question` field, and a filter on the `course` keyword field.
boost_dict = {"question": 3}
filter_dict = {"course": "data-engineering-zoomcamp"}

results = index.search(
    query,
    filter_dict=filter_dict,
    boost_dict=boost_dict,
    num_results=5,
)

# One indented JSON document per result.
for result in results:
    print(json.dumps(result, indent=2))

{
  "text": "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  "section": "General course-related questions",
  "question": "Course - Can I still join the course after the start date?",
  "course": "data-engineering-zoomcamp"
}
{
  "text": "Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don\u2019t rely on its answers 100%, it is pretty good though.",
  "section": "General course-related questions",
  "question": "Course - Can I get support if I take the course in the self-paced mode?",
  "course": "data-engineering-zoomcamp"
}
{
  "text": "No, you can only get a