In [1]:
import RetrievalAndSearch.minsearch as minsearch
import json
from openai import OpenAI

In [2]:
def read_documents(doc_path):
    """Load the documents JSON file and flatten it into a single list.

    The file is expected to hold a list of course entries, each a dict with
    a ``'course'`` value and a ``'documents'`` list of dicts.

    Args:
        doc_path: Path to the JSON file.

    Returns:
        A list with every document dict from every course entry, in file
        order, each tagged with a ``'course'`` key copied from its entry.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a course entry lacks ``'course'`` or ``'documents'``.
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # which can mis-decode non-ASCII text in the JSON on some systems.
    with open(doc_path, 'rt', encoding='utf-8') as f_in:
        docs_raw = json.load(f_in)

    documents = []

    for course_dict in docs_raw:
        course_name = course_dict['course']
        for doc in course_dict['documents']:
            # Tag each document with its course so it can be filtered later.
            doc['course'] = course_name
            documents.append(doc)

    return documents

In [3]:
def search(query, documents, course='data-engineering-zoomcamp', num_results=10):
    """Index ``documents`` with minsearch and return the best matches for ``query``.

    A fresh index is built and fitted on every call, so repeated calls
    re-index the same documents each time.

    Args:
        query: Text to search for.
        documents: List of document dicts carrying the ``question``,
            ``text``, ``section`` and ``course`` fields.
        course: Value the ``course`` keyword field must match. Defaults to
            ``'data-engineering-zoomcamp'``, the value that used to be
            hard-coded.
        num_results: Maximum number of results requested from the index.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        Whatever ``minsearch.Index.search`` returns for these arguments.
    """
    index = minsearch.Index(
        text_fields=["question", "text", "section"],
        keyword_fields=["course"]
    )

    index.fit(documents)

    # Per-field weights passed to the index: 'question' gets the largest
    # weight and 'section' the smallest; 'text' is not listed here.
    boost = {'question': 3, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

    return results

In [4]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Each entry of ``search_results`` must provide the ``section``,
    ``question`` and ``text`` keys; they are rendered one after another as
    the CONTEXT part of the prompt. Returns the whitespace-stripped prompt.
    """
    # One rendered entry per document; each ends with a blank line.
    context = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contain the answer, output NONE.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()

    filled = template.format(question=query, context=context)
    # The final strip removes the trailing blank line left by the last entry.
    return filled.strip()

In [5]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the content of one ``user`` message.
        model: Name of the chat model to request. Defaults to ``'gpt-4o'``,
            the value that used to be hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # A new client is created on every call, with no explicit arguments.
    client = OpenAI()

    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}]
    )

    return response.choices[0].message.content

In [6]:
def rag(query, doc_path):
    """Answer ``query`` from the documents stored at ``doc_path``.

    Runs the whole pipeline: load the documents, search them for the query,
    build a prompt from the hits, and return what the LLM replies.
    """
    hits = search(query, read_documents(doc_path))
    return llm(build_prompt(query, hits))

In [7]:
# Relative path to the documents JSON file that rag() loads.
doc_path = 'RetrievalAndSearch/documents.json'
# Example question to run through the pipeline.
query = "How do I run Kafka?"
# Left as the cell's last expression so the notebook displays the answer.
rag(query, doc_path)

"To run Kafka, you should refer to the specific commands provided for your use case:\n\n1. **For Java Kafka (running producer/consumer/kstreams/etc in terminal):**\n   In your project directory, run:\n   ```\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **For Python Kafka (module not found or virtual environment setup):**\n   a. Create a virtual environment and run `requirements.txt` and the Python files in that environment.\n   ```bash\n   python -m venv env\n   source env/bin/activate\n   pip install -r ../requirements.txt\n   ```\n   b. To activate it (every time you need the virtual env):\n   ```bash\n   source env/bin/activate\n   ```\n   c. To deactivate it:\n   ```bash\n   deactivate\n   ```\n\n3. **For Python Kafka (resolving `NoBrokersAvailable` error):**\n   If you encounter this error, it is likely because your Kafka broker Docker container is not running. Check by running:\n   ```bash\n   docker ps\n   ```\