In [1]:
import json
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
from openai import OpenAI

import minsearch


# Shared OpenAI client used by llm() below. No credentials are passed here;
# presumably they are picked up from the environment (e.g. OPENAI_API_KEY) —
# TODO confirm.
client = OpenAI()

In [3]:
# Load the raw FAQ export. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of depending on the locale's
# default encoding.
with open('documents.json', encoding='utf-8') as f:
    docs_raw = json.load(f)


# Flatten the per-course structure into one list of documents, tagging each
# document with the course it belongs to. New dicts are built so that
# `docs_raw` is left untouched and re-running this cell is side-effect free.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [4]:
# Build the in-memory minsearch index over the flattened FAQ documents.
# "question", "text" and "section" are registered as text fields and "course"
# as a keyword field (search() below filters on it).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

# Fit the index on all documents; the call's return value is the cell output.
index.fit(documents)

<minsearch.Index at 0x72362fcd6500>

In [18]:
def search(query, num_results=5):
    """Query the minsearch index for FAQ entries matching ``query``.

    The search is restricted to the "data-engineering-zoomcamp" course and
    weights the "question" field by 3.0 and the "section" field by 0.5.

    Args:
        query: Free-text question to search for.
        num_results: Maximum number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    return index.search(
        query=query,
        filter_dict={"course": "data-engineering-zoomcamp"},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=num_results,
    )

In [12]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Args:
        query: The student's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with "section", "question" and
            "text" keys; each one becomes an entry in the CONTEXT part.

    Returns:
        The prompt string, with no leading/trailing whitespace. When
        ``search_results`` is empty the prompt ends with ``CONTEXT:``.

    Raises:
        KeyError: If a search result lacks one of the expected keys.
    """
    # The template is spelled out line by line (rather than as an indented
    # triple-quoted string) so that the source-code indentation of this
    # function does not leak into the text sent to the model.
    prompt_template = (
        "You're a course assistant helping a student with a question about the course. "
        "Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the information provided in the CONTEXT.\n"
        "If the CONTEXT does not contain answer, output NONE.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One entry per retrieved document, separated by a blank line.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [13]:
def llm(prompt, model="gpt-4o"):
    """Send ``prompt`` to the chat completions API and return the reply text.

    The prompt is sent as a single user message through the module-level
    ``client``.

    Args:
        prompt: Full prompt text to send.
        model: Name of the chat model to use. Defaults to "gpt-4o", the model
            this notebook was written against.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt},
        ],
    )

    return response.choices[0].message.content

In [20]:
def rag(query):
    """Answer ``query`` by retrieving FAQ entries and asking the LLM.

    Requests 10 results from search(), turns them into a prompt with
    build_prompt(), and returns what llm() produces for that prompt.
    """
    retrieved_docs = search(query, 10)
    return llm(build_prompt(query, retrieved_docs))

In [21]:
# Run the whole pipeline (search -> prompt -> LLM) on a sample question; the
# returned answer is the cell output.
query = "how do I run kafka?"
rag(query)

'To run Kafka producer/consumer/kstreams/etc in the terminal, you can use the following command in the project directory:\n\n```sh\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nMake sure to replace `<jar_name>` with the actual name of your jar file.\n\nIf the CONTEXT does not contain answer, output NONE.'

In [6]:
# Client for the Elasticsearch node at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

In [7]:
# Request node/cluster information and display it as the cell output.
es_client.info()

ObjectApiResponse({'name': '1355fe02f538', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4k6FCamiScqjdVU2PR7yMQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
# Index configuration: one shard and no replicas. "text", "section" and
# "question" are mapped as full-text fields; "course" is mapped as a keyword
# field.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed
# (Restart & Run All) without failing because the index already exists, and so
# that re-indexing the documents afterwards does not create duplicates.
# NOTE: this deletes all documents previously stored in `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
# Send each document to the Elasticsearch index in its own request; tqdm
# renders a progress bar over the loop.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [11]:
# Sample question, presumably meant for elastic_search(); note that this
# rebinds the `query` name used by the earlier Kafka example.
query = "I just discovered the course. Can I still join it?"

In [None]:
def elastic_search(query, num_results=5):
    """Search the Elasticsearch index for FAQ entries matching ``query``.

    Issues a ``multi_match`` ("best_fields") query over the "question"
    (boosted with ``^3``), "text" and "section" fields, filtered to documents
    whose "course" term is "data-engineering-zoomcamp".

    Args:
        query: Free-text question to search for.
        num_results: Value sent as the request's ``size``.

    Returns:
        A list with the ``_source`` of every hit in the response, in the
        order the hits were returned.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    course_filter = {
        "term": {
            "course": "data-engineering-zoomcamp"
        }
    }
    request_body = {
        "size": num_results,
        "query": {
            "bool": {
                "must": text_match,
                "filter": course_filter
            }
        }
    }

    es_response = es_client.search(index=index_name, body=request_body)

    return [hit["_source"] for hit in es_response["hits"]["hits"]]