In [1]:
from openai import OpenAI
import os

In [2]:
# No credentials are passed explicitly here; presumably the client picks up the API key
# from the environment (e.g. OPENAI_API_KEY) — confirm your environment is configured.
client = OpenAI()

In [None]:
!pip install minsearch

!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json

In [6]:
import minsearch
import json

In [8]:
# The FAQ text contains non-ASCII characters (e.g. curly quotes), so decode explicitly as UTF-8
# instead of relying on the platform's default text encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [9]:
# Each entry of docs_raw describes one course: it has a 'course' name and a 'documents' list.
# An individual FAQ entry only carries 'text', 'section' and 'question' — the course name lives on
# the enclosing course block. Flatten everything into one list and copy the course name onto
# every entry. A new dict is built per entry so the raw data in docs_raw is not modified as a
# side effect.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [10]:
# Build the search index once, up front. This costs some preprocessing time and memory, but each
# query can then be answered from the index instead of scanning every document again — the gain
# grows as the number of documents grows.
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])

index.fit(documents)

<minsearch.minsearch.Index at 0x7b88bf418bf0>

In [None]:
def search(query, course='data-engineering-zoomcamp', num_results=3):
    """Look up FAQ entries matching ``query`` in the module-level ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the 'course' field is filtered on. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Number of results requested from the index (default 3,
            the value that used to be hard-coded).

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Boost weights handed to the index: 'question' gets 3.0, 'section' gets 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [13]:
def build_prompt(query, search_results):
    """Render the RAG prompt: instructions, the user's question, then the retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts providing 'section', 'question' and 'text' keys.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # One "section / question / answer" entry per hit, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

In [14]:
# Run retrieval and prompt building end to end for one sample question.
query = "is it too late to join the course?"
search_results = search(query=query)
rag_prompt = build_prompt(query=query, search_results=search_results)

In [17]:
# Inspect the raw hits that search() returned for the sample query.
search_results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [19]:
# Show the final prompt text (question + retrieved context) that gets sent to the model.
print(rag_prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: is it too late to join the course?

CONTEXT: 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course T

In [15]:
# Baseline: ask the model the question directly, without any retrieved context (no RAG).
# Reuse `query` instead of repeating the literal so this baseline always asks exactly the same
# question as the RAG run.
#
# Note: if this raises RateLimitError, the OpenAI account may need credit (at least $5) added.
response = client.chat.completions.create(
    model='gpt-5-mini',
    messages=[{"role": "user", "content": query}],
)
response.choices[0].message.content

'I can help — but I need one detail first: which course (school/platform and course name or code)? Policies vary a lot by institution and by whether it’s a university semester course, a short workshop, or an online MOOC.\n\nMeanwhile, here’s quick, actionable guidance you can use right away.\n\nTypical rules (very roughly)\n- University semester courses: add/drop or late-enroll windows are usually in the first 1–3 weeks. After that you often need instructor permission and/or registrar approval.\n- Labs, seminars, group projects, clinicals: these fill up or require permission sooner than lectures.\n- MOOCs (Coursera/edX/etc.): many are open to join but graded deadlines may already have passed — you can usually audit content anytime.\n- Short workshops or certificate courses: may close once they start or cap capacity.\n\nImmediate steps to take\n1. Check the course page/registration portal for deadlines and capacity.\n2. Find the syllabus/LMS (Canvas/Blackboard) and note missed assignmen

In [16]:
# RAG run: same model, but the user message is the prompt that embeds the retrieved FAQ context.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": rag_prompt}],
    model='gpt-5-mini',
)
response.choices[0].message.content

"No — you can still join after the start. Even if you don't register, you're still eligible to submit homeworks. Keep in mind there are deadlines for final projects, so don't leave everything to the last minute. After the course finishes, materials remain available so you can follow at your own pace."