In [None]:
import minsearch
from openai import OpenAI
from dotenv import load_dotenv
import json

# Load variables from a local .env file into the process environment before
# the client is created, because OpenAI() is constructed here without any
# explicit credentials.
# NOTE(review): presumably the client reads OPENAI_API_KEY from the
# environment — confirm against the deployment setup.
load_dotenv()

# Shared OpenAI client; llm() below sends its chat completion requests through it.
client = OpenAI()

In [18]:
# Read the raw FAQ dump. The encoding is pinned to UTF-8 so the file decodes
# the same way on every platform instead of relying on the locale default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [19]:
# Flatten the per-course structure into a single list of FAQ entries, tagging
# every entry with the course it came from. Each entry is a shallow copy, so
# docs_raw keeps mirroring the file contents and this cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [87]:
# Build the search index over the flattened FAQ entries.
# "question", "text" and "section" are registered as text fields; "course" is
# registered as a keyword field and is what search() passes in its filter_dict.
# NOTE(review): presumably text fields are matched by relevance and keyword
# fields by exact value — confirm against the minsearch documentation.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=['course']
)
# Fit on the in-memory documents; as the last expression of the cell, the
# value returned by fit() is what the notebook displays.
index.fit(documents)

<minsearch.minsearch.Index at 0x12c02f110>

In [88]:
def search(query: str, course: str = 'data-engineering-zoomcamp', num_results: int = 5):
    """Look up FAQ entries relevant to ``query`` in the module-level ``index``.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field must match. Defaults to the
            course that was previously hard-coded.
        num_results: Maximum number of entries requested from the index.
            Defaults to the previously hard-coded 5.

    Returns:
        Whatever ``index.search`` returns; build_prompt() iterates over it and
        reads the ``section``, ``question`` and ``text`` keys of each entry.
    """
    # Per-field weights passed to the index: "question" is weighted 3.0 and
    # "section" 0.5. "text" is not listed.
    # NOTE(review): presumably unlisted fields keep the library default weight — confirm.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [34]:
def build_prompt(query: str, search_results: list) -> str:
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dict-like entries; each must provide the
            ``section``, ``question`` and ``text`` keys.

    Returns:
        The prompt text with surrounding whitespace stripped. With no search
        results the prompt ends right after the ``CONTEXT:`` label.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    # The template is assembled from explicit lines so that no source-code
    # indentation leaks into the prompt, and it consistently uses the word
    # CONTEXT, matching the label of the section it refers to.
    prompt_template = (
        "You're a course teaching assistant. "
        "Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One blank-line-terminated record per retrieved entry.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [35]:
def llm(prompt: str, model: str = 'gpt-4o'):
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Full prompt text, sent as the only message with role "user".
        model: Name of the chat model to call. Defaults to the previously
            hard-coded ``'gpt-4o'``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    # Only the first choice is read; no other choices are inspected.
    return response.choices[0].message.content

In [91]:
def rag(query: str):
    """Answer ``query`` by chaining retrieval, prompt construction and the LLM call.

    The entries returned by search() are rendered into a prompt by
    build_prompt(), and that prompt is passed to llm(); its result is returned.
    """
    retrieved = search(query=query)
    return llm(build_prompt(query=query, search_results=retrieved))

In [92]:
# Example run: pass one question through the whole pipeline; as the last
# expression of the cell, the answer returned by rag() is displayed.
query = 'How do I run Kafka'
rag(query)

"To run Kafka, there are different approaches based on whether you're using Java or Python:\n\n1. **Java Kafka**:\n   - In the project directory, execute the following command to run a producer or similar Kafka components:\n     ```bash\n     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n     ```\n\n2. **Python Kafka**:\n   - First, ensure you have a virtual environment set up and activate it. You can create and activate a virtual environment as follows:\n     ```bash\n     python -m venv env\n     source env/bin/activate  # For MacOS/Linux\n     # or \n     env\\Scripts\\activate  # For Windows\n     ```\n   - Install the necessary dependencies:\n     ```bash\n     pip install -r ../requirements.txt\n     ```\n   - Ensure all Docker images are up and running before executing the Python scripts. \n\nRemember that these steps need to be performed in the appropriate directories mentioned in your project setup."

In [93]:
# Second example run, this time passing the question by keyword.
rag(query='The course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. However, keep in mind that there will be deadlines for submitting the final projects, so it's important not to leave everything until the last minute. You are also eligible to submit the homework assignments."