In [1]:
import requests 

# Download the FAQ dataset: a JSON list of courses, each with its own list of documents.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever if the host does not respond.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of documents.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Tag every document with the course it belongs to (the dict is updated in place).
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
from minsearch import AppendableIndex

In [4]:
# Build the search index over the flattened FAQ documents.
# "question", "text" and "section" are registered as text fields and "course" as a
# keyword field; search() passes a filter on "course" through filter_dict.
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Last expression of the cell: the displayed output is the repr of the index object.
index.fit(documents)

<minsearch.append.AppendableIndex at 0x7274cacdb440>

In [5]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the FAQ index for documents matching ``query``.

    Args:
        query: Free-text search query.
        course: Value the ``course`` field must match. Defaults to the course
            that used to be hard-coded, so existing ``search(query)`` calls
            behave exactly as before.
        num_results: Maximum number of results requested from the index.

    Returns:
        The result of ``index.search``. IDs are requested via ``output_ids=True``;
        other code in this notebook reads ``_id``, ``section``, ``question`` and
        ``text`` from each returned item.
    """
    # Matches in the question field weigh more, matches in the section field less.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
        output_ids=True
    )

In [6]:
question = 'How do I join the course?'

In [7]:
search_results = search(question)

In [8]:
# Prompt for the plain RAG flow. It lives under its own name because later cells
# rebind `prompt_template` to templates with different placeholders; build_prompt()
# must keep using this one no matter which cells have been executed since.
rag_prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

# Original name kept so code that reads `prompt_template` right after this cell still works.
prompt_template = rag_prompt_template


def build_prompt(query, search_results):
    """Render the RAG prompt for ``query`` from the retrieved documents.

    Args:
        query: The student's question, inserted into the QUESTION section.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each becomes one entry of the CONTEXT section.

    Returns:
        The formatted prompt with surrounding whitespace stripped.

    Raises:
        KeyError: If a result lacks one of the expected keys.
    """
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # Format with the dedicated template, not the rebindable global `prompt_template`.
    return rag_prompt_template.format(question=query, context=context).strip()

In [9]:
prompt = build_prompt(question, search_results)

In [13]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
How do I join the course?
</QUESTION>

<CONTEXT>
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the cour

In [10]:
from openai import OpenAI
client = OpenAI()

In [11]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt text, sent as the only message of the conversation.
        model: Name of the chat model to call. Defaults to the model that used
            to be hard-coded, so existing ``llm(prompt)`` calls are unchanged.

    Returns:
        The content of the first choice in the chat completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [20]:
answer = llm(prompt)

In [12]:
def rag(query):
    """Answer ``query`` in one pass: retrieve documents, build the prompt, ask the LLM.

    Returns the text produced by ``llm`` for the prompt built from the search hits.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [23]:
rag('how do I run kafka in Docker?')

'To run Kafka in Docker, you need to ensure that your Kafka broker Docker container is up and running. Follow these steps:\n\n1. **Check if your Kafka broker is running**: Use the command `docker ps` to confirm that the Kafka broker container is active.\n   \n2. **Start the containers**: In the folder where your Docker Compose YAML file is located, run the command:\n   ```\n   docker compose up -d\n   ```\n   This command will start all the instances defined in your Docker Compose configuration.\n\nMake sure all your Docker images are up and running to avoid issues with Kafka.'

In [25]:
rag('How do I pacth KDE under FreeBSD?')

"I'm sorry, but there is no information provided in the context regarding how to patch KDE under FreeBSD. Please refer to specific FreeBSD documentation or KDE support resources for guidance on this topic."

## Agentic RAG

In [13]:
# Prompt for the single-step agentic RAG flow.
# Placeholders: {question} and {context}. The doubled braces are literal braces
# once the template is formatted, so the model sees plain JSON output templates.
# The model is asked to reply with a JSON object whose "action" is either
# "SEARCH" or "ANSWER"; callers pass the string "EMPTY" as the initial context.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()

In [14]:
question = "how can I join the course?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how can I join the course?
</QUESTION>

<CONTEXT> 
EMPTY
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}

If you can answer the QUESTION using CONTEXT, use this template:

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}

If the context doesn't contain the answer, use your own knowledge to answer the question

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}


In [31]:
answer = llm(prompt)
print(answer)

{
"action": "SEARCH",
"reasoning": "The context is empty, and I need to find information about how a student can join the course."
}


In [32]:
search_results = search(question)

In [15]:
def build_context(search_results):
    """Render search hits as the text placed in the CONTEXT section of a prompt.

    Each hit contributes a ``section`` / ``question`` / ``answer`` entry and
    entries are separated by a blank line. Whitespace at both ends of the
    combined text is removed; no hits yield an empty string.
    """
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    return "".join(entries).strip()

In [16]:
context = build_context(search_results)

In [17]:
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how can I join the course?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course star

In [39]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To join the course, you can register before the start date using the provided registration link. Even if you miss the registration, you can still participate by submitting homework assignments. However, make sure you are aware of the deadlines for the final projects, as they will still apply.",
"source": "CONTEXT"
}


In [18]:
import json

In [19]:
def agentic_rag(question):
    """Answer ``question``, letting the model decide whether an FAQ search is needed.

    The model is first asked with the context set to "EMPTY". If its parsed
    reply has action "SEARCH", the FAQ is searched with the original question
    and the model is asked again with the retrieved context.

    Returns:
        The parsed JSON reply (a dict) of the last model call.
    """
    def ask(context):
        # One round trip: fill the template, call the model, parse its JSON reply.
        filled = prompt_template.format(question=question, context=context)
        return json.loads(llm(filled))

    decision = ask("EMPTY")
    if decision['action'] != 'SEARCH':
        return decision

    print('preforming search...')
    return ask(build_context(search(question)))

In [46]:
agentic_rag('how do I join the course?')

preforming search...


{'action': 'ANSWER',
 'answer': "To join the course, you need to register before the course starts. You can do this using the registration link provided in the course materials. Additionally, make sure to join the course Telegram channel for announcements and register in DataTalks.Club's Slack to stay updated. The course will begin on January 15, 2024, at 17:00 with the first 'Office Hours' live session.",
 'source': 'CONTEXT'}

Breaks:

- 10:30 - 11:00 - Coffee break
- 12:30 - 13:30 - Lunch break
- 15:00 - 15:30 - Coffee break

## Agentic Search

In [33]:
def dedup(seq):
    """Return the items of ``seq`` with duplicate ``_id`` values removed.

    The first item seen for each ``_id`` is kept and the original order is
    preserved. Every item must provide an ``_id`` key.
    """
    first_by_id = {}
    for item in seq:
        # setdefault only stores the item when this _id has not been seen yet.
        first_by_id.setdefault(item['_id'], item)
    return list(first_by_id.values())

In [34]:
# Prompt for the iterative agentic-search loop.
# Placeholders: {question}, {context}, {search_queries}, {previous_actions},
# {max_iterations} and {iteration_number}. Doubled braces are literal braces
# once the template is formatted.
# NOTE(review): this rebinds the global `prompt_template`, which agentic_rag()
# formats with only `question` and `context`; calling agentic_rag() after this
# cell has run would raise KeyError for the extra fields. Consider a distinct name.
# NOTE(review): the answer-from-context template uses action "ANSWER_CONTEXT"
# while the own-knowledge template uses "ANSWER" - confirm this is intended.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [36]:
# Agentic search loop: repeatedly ask the model what to do next, run the searches
# it requests, and feed the accumulated results back in as context.
question = 'how do I run kafka in Docker?'

iteration_number = 0
max_iterations = 3

# State carried across iterations.
search_queries = []    # every search query issued so far
search_results = []    # accumulated search hits, de-duplicated after each iteration
previous_actions = []  # JSON dumps of the SEARCH actions already taken

while True:
    print(f'iteration {iteration_number}...')
    # Rebuild the context from everything retrieved so far (empty on the first pass).
    context = build_context(search_results)

    prompt = prompt_template.format(
        question=question,
        iteration_number=iteration_number,
        max_iterations=max_iterations,
        search_queries='\n'.join(search_queries),
        previous_actions='\n'.join(previous_actions),
        context=context
    )

    # The model's reply is parsed as JSON; json.loads raises if it is not valid JSON.
    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(answer)

    # Any action other than SEARCH is treated as the final answer.
    if answer['action'] != 'SEARCH':
        break

    previous_actions.append(json.dumps(answer))
    
    keywords = answer['keywords']
    search_queries.extend(keywords)

    # Run one search per requested query and pool all hits.
    for kw in keywords:
        sr = search(kw)
        search_results.extend(sr)

    # Different queries can return the same document; keep the first hit per _id.
    search_results = dedup(search_results)
    iteration_number = iteration_number + 1

    # NOTE(review): if the model still returns SEARCH on the last allowed iteration,
    # the loop stops here with `answer` holding that SEARCH action, and the results
    # just retrieved are never shown to the model - confirm this is acceptable.
    if iteration_number >= max_iterations:
        break    

iteration 0...
{'action': 'SEARCH', 'reasoning': 'I need to gather specific information on running Kafka in Docker since the current context is empty and the student needs guidance on this topic.', 'keywords': ['run Kafka in Docker', 'Kafka Docker setup', 'Kafka Docker installation']}
iteration 1...
{'action': 'SEARCH', 'reasoning': 'I need to gather more specific and detailed information on running Kafka in Docker to provide the student with comprehensive guidance on the topic, as the previous search did not yield relevant results that are directly applicable to running Kafka in Docker.', 'keywords': ['Kafka Docker configuration', 'Kafka Docker run example', 'Docker Compose Kafka setup']}
iteration 2...
{'action': 'ANSWER', 'answer': 'To run Kafka in Docker, you typically use Docker Compose to simplify the setup. Here is a quick guide:\n\n1. **Create a Docker Compose file**: You need to create a `docker-compose.yml` file. A basic configuration would look something like this:\n   ```ya