In [11]:
import json
import os
from dotenv import load_dotenv
from minsearch import AppendableIndex

In [3]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [4]:
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x11280ecf0>

In [5]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results

In [6]:
results = search('I just discovered the course. Can I join now?')
print(results[0]['text'])

Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


In [12]:
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY= environment variable is not set.")

In [13]:
from openai import OpenAI
client = OpenAI()

In [14]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [15]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [16]:
rag('I just discovered the course. Can I join now?')

"Yes, you can still join the course now. Even if you don't register, you are eligible to submit the homeworks. However, keep in mind that there will be deadlines for turning in the final projects, so it's best not to leave everything until the last minute."

In [17]:
rag('how do I run docker on gentoo?')

"The provided context does not contain specific information about running Docker on Gentoo. Therefore, I can't provide a direct answer to your question regarding running Docker on Gentoo based solely on the available information. You may want to refer to Gentoo's official documentation or community forums for guidance on this topic."

## Making RAG more agentic

First, we'll take the prompt we have so far and make it 
a little more "agentic":

- Tell the LLM that it can answer the question directly or look up context
- Provide output templates
- Show clearly what's the source of the answer

In [18]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()

In [19]:
question = "how do I run docker on gentoo?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I run docker on gentoo?
</QUESTION>

<CONTEXT> 
EMPTY
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}

If you can answer the QUESTION using CONTEXT, use this template:

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}

If the context doesn't contain the answer, use your own knowledge to answer the question

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}


In [20]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To run Docker on Gentoo, you first need to install it. You can do this using Portage, Gentoo’s package management system. Here are the steps: 1. Ensure your system is up-to-date by running `emerge --sync` and `emerge --update --deep --newuse @world`. 2. Install Docker by executing `emerge app-emulation/docker`. 3. After the installation, you need to add your user to the 'docker' group with `usermod -aG docker your_username`. 4. Start the Docker service with `rc-service docker start` and make it start on boot using `rc-update add docker default`. 5. Finally, confirm that Docker is running by executing `docker run hello-world`. This command should pull a test image from Docker Hub and run it, confirming that your Docker installation is working correctly.",
"source": "OWN_KNOWLEDGE"
}


In [22]:
question = "how do I join the course?"
# context = "EMPTY"
context = ""

prompt = prompt_template.format(question=question, context=context)
answer = llm(prompt)
print(answer)

{
"action": "SEARCH",
"reasoning": "The question about how to join the course is not answered in the current context, so I'll refer to the FAQ database to find the relevant information."
}


In [23]:
def build_context(search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return context.strip()

In [24]:
search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I join the course?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course start

In [25]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To join the course, you need to register before the course starts using the provided link. The course starts on 15th January 2024 at 17h00. Additionally, it's beneficial to join the course's Telegram channel and register in DataTalks.Club’s Slack for announcements and communication.",
"source": "CONTEXT"
}


In [26]:
def agentic_rag_v1(question):
    context = "EMPTY"
    prompt = prompt_template.format(question=question, context=context)
    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(answer)

    if answer['action'] == 'SEARCH':
        print('need to perform search...')
        search_results = search(question)
        context = build_context(search_results)
        
        prompt = prompt_template.format(question=question, context=context)
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(answer)

    return answer

In [27]:
agentic_rag_v1('how do I join the course?')

{'action': 'SEARCH', 'reasoning': 'The context is empty, so I need to search the FAQ database to find out how to join the course.'}
need to perform search...
{'action': 'ANSWER', 'answer': "To join the course, you need to register using the provided link before the course starts. It's important to do this as soon as possible, especially since the course will begin on January 15th, 2024 at 17:00. Even if you miss the registration deadline, you can still submit homework but you'll need to adhere to the final project deadlines. Don't forget to also join the course Telegram channel for announcements and DataTalks.Club's Slack channel for communication.", 'source': 'CONTEXT'}


{'action': 'ANSWER',
 'answer': "To join the course, you need to register using the provided link before the course starts. It's important to do this as soon as possible, especially since the course will begin on January 15th, 2024 at 17:00. Even if you miss the registration deadline, you can still submit homework but you'll need to adhere to the final project deadlines. Don't forget to also join the course Telegram channel for announcements and DataTalks.Club's Slack channel for communication.",
 'source': 'CONTEXT'}

In [28]:
agentic_rag_v1('how patch KDE under FreeBSD?')

{'action': 'ANSWER', 'answer': 'To patch KDE under FreeBSD, you generally need to follow these steps: \n\n1. **Install the necessary tools**: Ensure you have the `ports` collection and other development tools installed, such as `patch`, `make`, and the KDE source code you want to patch. \n\n2. **Download the KDE source**: You can fetch the source code for KDE from the FreeBSD ports tree or the official KDE repository.\n\n3. **Create the patch**: If you are modifying the source code, use the `diff` command to create a patch file. For example, if you have made changes to a file called `example.cpp`, you can create a patch with:\n   ```\n   diff -u original/example.cpp modified/example.cpp > my_patch.patch\n   ```\n\n4. **Apply the patch**: Navigate to the directory where the KDE source code is located and use the `patch` command to apply your patch:\n   ```\n   patch -p1 < path/to/my_patch.patch\n   ```\n\n5. **Compile and install**: After applying the patch, compile the code using `make

{'action': 'ANSWER',
 'answer': 'To patch KDE under FreeBSD, you generally need to follow these steps: \n\n1. **Install the necessary tools**: Ensure you have the `ports` collection and other development tools installed, such as `patch`, `make`, and the KDE source code you want to patch. \n\n2. **Download the KDE source**: You can fetch the source code for KDE from the FreeBSD ports tree or the official KDE repository.\n\n3. **Create the patch**: If you are modifying the source code, use the `diff` command to create a patch file. For example, if you have made changes to a file called `example.cpp`, you can create a patch with:\n   ```\n   diff -u original/example.cpp modified/example.cpp > my_patch.patch\n   ```\n\n4. **Apply the patch**: Navigate to the directory where the KDE source code is located and use the `patch` command to apply your patch:\n   ```\n   patch -p1 < path/to/my_patch.patch\n   ```\n\n5. **Compile and install**: After applying the patch, compile the code using `mak

## Part 2: Agentic search

So far we had two actions only: search and answer.

But we can let our "agent" formulate one or more 
search queries - and do it for a few iterations until
we found an answer


Let's build a prompt:

- List available actions:
    - Search in FAQ
    - Answer using own knowledge
    - Answer using information extracted from FAQ 
- Provide access to the previous actions
- Have clear stop criteria (no more than X iterations)
- We also specify the output format, so it's easier to parse it

In [29]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.


Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}


<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [30]:
question = "how do I join the course?"

search_queries = []
search_results = []
previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=1
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration numbe

In [31]:
answer_json = llm(prompt)
answer = json.loads(answer_json)

In [32]:
print(json.dumps(answer, indent=2))

{
  "action": "SEARCH",
  "reasoning": "I need to find information about the process or requirements for joining the course as the CONTEXT is currently empty.",
  "keywords": [
    "how to join the course",
    "course enrollment process",
    "course registration"
  ]
}


In [34]:
previous_actions.append(answer)

In [35]:
keywords = answer['keywords']
search_queries.extend(keywords)

In [36]:
for k in keywords:
    res = search(k)
    search_results.extend(res)

In [37]:
def dedup(seq):
    seen = set()
    result = []
    for el in seq:
        _id = el['_id']
        if _id in seen:
            continue
        seen.add(_id)
        result.append(el)
    return result

In [38]:
search_results = dedup(search_results)

In [39]:
# question = "how do I join the course?"

# search_queries = []
# search_results = []
# previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=2
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration numbe

In [40]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
print(json.dumps(answer, indent=2))

{
  "action": "ANSWER",
  "answer": "To join the course, you can start by registering before the course begins. You can find the registration link mentioned in the course materials. The course is set to start on January 15, 2024, at 17:00. Registration is not mandatory to participate, as you can start learning and submitting homework even without it, but it's recommended to gauge interest. Additionally, it is important to subscribe to the course's public Google Calendar and join the Telegram channel for announcements. Just keep in mind there will still be deadlines for project submissions even if you register late.",
  "source": "OWN_KNOWLEDGE"
}


In [41]:
question = "what do I need to do to be successful at module 1?"

search_queries = []
search_results = []
previous_actions = []


iteration = 0

while True:
    print(f'ITERATION #{iteration}...')

    context = build_context(search_results)
    prompt = prompt_template.format(
        question=question,
        context=context,
        search_queries="\n".join(search_queries),
        previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
        max_iterations=3,
        iteration_number=iteration
    )

    print(prompt)

    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    action = answer['action']
    if action != 'SEARCH':
        break

    keywords = answer['keywords']
    search_queries = list(set(search_queries) | set(keywords))
    
    for k in keywords:
        res = search(k)
        search_results.extend(res)

    search_results = dedup(search_results)
    
    iteration = iteration + 1
    if iteration >= 4:
        break

    print()


ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

In [42]:
def agentic_search(question):
    search_queries = []
    search_results = []
    previous_actions = []

    iteration = 0
    
    while True:
        print(f'ITERATION #{iteration}...')
    
        context = build_context(search_results)
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
            max_iterations=3,
            iteration_number=iteration
        )
    
        print(prompt)
    
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(json.dumps(answer, indent=2))

        previous_actions.append(answer)
    
        action = answer['action']
        if action != 'SEARCH':
            break
    
        keywords = answer['keywords']
        search_queries = list(set(search_queries) | set(keywords))

        for k in keywords:
            res = search(k)
            search_results.extend(res)
    
        search_results = dedup(search_results)
        
        iteration = iteration + 1
        if iteration >= 4:
            break
    
        print()

    return answer

In [43]:
agentic_search('how do I prepare for the course?')

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

{'action': 'ANSWER',
 'answer': "To prepare for the course effectively, consider the following strategies:\n\n1. **Familiarize Yourself with Tools:** Ensure that you are comfortable using Git and GitHub, as they are essential for collaborating and managing course materials. Review tutorials on cloning repositories and setting up your own notes repository.\n\n2. **Join Relevant Communities:** Engage with your peers and instructors by joining communication platforms like Slack and Telegram where announcements and support are provided.\n\n3. **Create a Study Schedule:** Allocate regular study times in your calendar to ensure you stay consistent with the course material.\n\n4. **Review Course Materials:** Even before the course officially begins, reviewing any recommended resources can enhance your understanding. You can access materials even after the course to revise concepts at your own pace.\n\n5. **Practice Hands-On:** If the course involves practical components or coding, practice ha

In [44]:
print(_['answer'])

To prepare for the course effectively, consider the following strategies:

1. **Familiarize Yourself with Tools:** Ensure that you are comfortable using Git and GitHub, as they are essential for collaborating and managing course materials. Review tutorials on cloning repositories and setting up your own notes repository.

2. **Join Relevant Communities:** Engage with your peers and instructors by joining communication platforms like Slack and Telegram where announcements and support are provided.

3. **Create a Study Schedule:** Allocate regular study times in your calendar to ensure you stay consistent with the course material.

4. **Review Course Materials:** Even before the course officially begins, reviewing any recommended resources can enhance your understanding. You can access materials even after the course to revise concepts at your own pace.

5. **Practice Hands-On:** If the course involves practical components or coding, practice hands-on with the tools you will be using, su

## Tools (function calling)

https://platform.openai.com/docs/guides/function-calling

    def search(query):
        boost = {'question': 3.0, 'section': 0.5}
    
        results = index.search(
            query=query,
            filter_dict={'course': 'data-engineering-zoomcamp'},
            boost_dict=boost,
            num_results=5,
            output_ids=True
        )
    
        return results


In [45]:
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }

}

In [46]:
tools = [search_tool]

In [47]:
question = "How do I do well in module 1?"

In [48]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [49]:
response

Response(id='resp_687551f4badc819badf42f388e5c69480262e67ea3ec636e', created_at=1752519156.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"module 1 tips"}', call_id='call_sWUZ5pIKoakLRl2z6lGblwwC', name='search', type='function_call', id='fc_687551f5ed70819b802c038173fafc630262e67ea3ec636e', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='search', parameters={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search query text to look up in the course FAQ.'}}, 'required': ['query'], 'additionalProperties': False}, strict=True, type='function', description='Search the FAQ database')], top_p=1.0, background=False, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, reasoning=Reasoning(effort=None, generate_summary=None, summary=None), servic

In [50]:
response.output

[ResponseFunctionToolCall(arguments='{"query":"module 1 tips"}', call_id='call_sWUZ5pIKoakLRl2z6lGblwwC', name='search', type='function_call', id='fc_687551f5ed70819b802c038173fafc630262e67ea3ec636e', status='completed')]

In [51]:
# response.choices[0].message.content
calls = response.output

In [52]:
call = calls[0]
call

ResponseFunctionToolCall(arguments='{"query":"module 1 tips"}', call_id='call_sWUZ5pIKoakLRl2z6lGblwwC', name='search', type='function_call', id='fc_687551f5ed70819b802c038173fafc630262e67ea3ec636e', status='completed')

In [53]:
call.call_id

'call_sWUZ5pIKoakLRl2z6lGblwwC'

In [54]:
f_name = call.name
f_name

'search'

In [55]:
arguments = json.loads(call.arguments)
arguments

{'query': 'module 1 tips'}

In [56]:
f = locals()[f_name]

In [57]:
results = f(**arguments)

In [58]:
search_results = json.dumps(results, indent=2)
print(search_results)

[
  {
    "text": "Following dbt with BigQuery on Docker readme.md, after `docker-compose build` and `docker-compose run dbt-bq-dtc init`, encountered error `ModuleNotFoundError: No module named 'pytz'`\nSolution:\nAdd `RUN python -m pip install --no-cache pytz` in the Dockerfile under `FROM --platform=$build_for python:3.9.9-slim-bullseye as base`",
    "section": "Module 4: analytics engineering with dbt",
    "question": "DBT - Error: No module named 'pytz' while setting up dbt with docker",
    "course": "data-engineering-zoomcamp",
    "_id": 299
  },
  {
    "text": "Issue:\ne\u2026\nSolution:\npip install psycopg2-binary\nIf you already have it, you might need to update it:\npip install psycopg2-binary --upgrade\nOther methods, if the above fails:\nif you are getting the \u201c ModuleNotFoundError: No module named 'psycopg2' \u201c error even after the above installation, then try updating conda using the command conda update -n base -c defaults conda. Or if you are using pip, t

In [59]:
chat_messages.append(call)

In [60]:
chat_messages.append({
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": search_results,
})

In [61]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [62]:
r = response.output[0]

In [63]:
print(r.content[0].text)

To excel in Module 1 of your course, here are some tips:

1. **Familiarize Yourself with Docker and Terraform**:
   - Ensure you understand the basics of Docker and how to set up containers. Follow the official documentation and start with simple examples.
   - Learn about Terraform for infrastructure management. Understanding its syntax and common commands is crucial.

2. **Practice Hands-On**:
   - Set up a local environment where you can experiment with Docker and Terraform. 
   - Complete all assigned exercises and try additional practice projects to solidify your understanding.

3. **Resolve Common Errors**:
   - Be prepared for issues like `ModuleNotFoundError` for packages like `psycopg2`. Know how to install packages using pip or conda, and practice troubleshooting common errors.
   - Familiarize yourself with error resolution tips, such as ensuring the correct installation of dependencies.

4. **Engage with Learning Materials**:
   - Watch any available video lectures, and rea

In [64]:
r.type

'message'

In [65]:
call.type

'function_call'

### Multiple calls

In [67]:
def do_call(tool_call_response):
    function_name = tool_call_response.name
    arguments = json.loads(tool_call_response.arguments)

    f = globals()[function_name]
    result = f(**arguments)

    return {
        "type": "function_call_output",
        "call_id": tool_call_response.call_id,
        "output": json.dumps(result, indent=2),
    }

In [66]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [68]:
for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)

    if entry.type == 'function_call':      
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        print(entry.text) 

function_call
function_call
function_call


In [69]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)
    print()

    if entry.type == 'function_call':      
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        print(entry.content[0].text) 

message

To do well in Module 1, here are some strategies and tips based on common issues and resources:

1. **Understand the Basics**:
   - Review the fundamental concepts of Docker and Terraform that are necessary for this module. Make sure you're comfortable with commands and configurations.

2. **Practice with Examples**:
   - Follow along with practical examples. Hands-on practice is essential for mastering Docker and Terraform.

3. **Check Dependencies**:
   - Ensure all required modules are installed:
     - For PostgreSQL, if you encounter errors such as `ModuleNotFoundError: No module named 'psycopg2'`, you can install it with:
       ```bash
       pip install psycopg2-binary
       ```
     - Make sure to update it if needed:
       ```bash
       pip install psycopg2-binary --upgrade
       ```

4. **Resolve Common Errors**:
   - If you run into specific errors like:
     - **TypeError: 'module' object is not callable**: Make sure to define your connection string correctly:

### Putting it all together

Have two loops:

- First is the main Q&A loop - ask question, get back the answer
- Second is the request loop - send requests until there's a message reply from API

In [70]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

In [71]:
while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True: # request-response loop - query API till get a message
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False
        
        for entry in response.output:
            chat_messages.append(entry)
        
            if entry.type == 'function_call':      
                print('function_call:', entry)
                print()
                result = do_call(entry)
                chat_messages.append(result)
            elif entry.type == 'message':
                print(entry.content[0].text)
                print()
                has_messages = True

        if has_messages:
            break

It seems like your question might be unclear or lacking context. Could you provide more details or specify what you're asking about? This way, I can assist you better!

function_call: ResponseFunctionToolCall(arguments='{"query":"first module topic"}', call_id='call_21WGCEqdD7NhrxrTmX9ankiT', name='search', type='function_call', id='fc_6875537b6a44819b9ca88a5f88fbae340407d8167b9dd9c9', status='completed')

The topic of the first module is **"Docker and Terraform."** 

This module likely covers the essentials of using Docker for containerization and Terraform for infrastructure as code, both crucial skills in data engineering.

If you have specific questions about this module or need more details, feel free to ask! What particular aspect of Docker and Terraform are you most interested in?

It seems like you didn't type anything. If you have a question or need further information about the first module or any other topic, please let me know!

If you need to exit the conversation, that's 

KeyboardInterrupt: Interrupted by user

Same using widgets

In [74]:
from IPython.display import display, HTML
import markdown # pip install markdown

In [75]:
    

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

# Chat loop
while True:
    
    if question.strip().lower() == 'stop':
        print("Chat ended.")
        break
    print()

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True:  # inner request loop
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False

        for entry in response.output:
            chat_messages.append(entry)

            if entry.type == "function_call":
                result = do_call(entry)
                chat_messages.append(result)
                display_function_call(entry, result)

            elif entry.type == "message":
                display_response(entry)
                has_messages = True

        if has_messages:
            break




NameError: name 'display_response' is not defined

## Adding more tools