In [None]:
import json
from typing import Any

from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from openai import OpenAI
from tqdm.auto import tqdm

load_dotenv()

class ElasticRag:
    """Small retrieval-augmented generation (RAG) pipeline over a course FAQ.

    FAQ entries are loaded from a JSON file, indexed into Elasticsearch, and
    retrieved per question; the retrieved entries are placed into a prompt
    that is sent to an OpenAI chat model.

    Attributes:
        client: OpenAI client used for chat completions.
        documents: Flat list of FAQ entries, each tagged with its course.
        es_client: Elasticsearch client used for indexing and searching.
        index_settings: Settings and mappings used when creating the index.
        index_name: Name of the Elasticsearch index this instance works on.
    """

    # Course used to filter search hits when the caller does not pass one.
    DEFAULT_COURSE = 'data-engineering-zoomcamp'

    client: OpenAI
    documents: list
    es_client: Elasticsearch
    index_settings: dict[str, Any]
    index_name: str

    def __init__(
        self,
        index_name: str,
        es_url: str = 'http://localhost:9200',
        documents_path: str = 'documents.json',
    ):
        """Create the clients and load the FAQ documents.

        Args:
            index_name: Elasticsearch index to create, fill, search and delete.
            es_url: URL of the Elasticsearch node to connect to.
            documents_path: JSON file holding a list of
                ``{"course": ..., "documents": [...]}`` objects.
        """
        self.client = OpenAI()
        self.es_client = Elasticsearch(es_url)

        # Explicit encoding so that loading does not depend on the platform's
        # default locale.
        with open(documents_path, 'rt', encoding='utf-8') as f_in:
            docs_raw = json.load(f_in)

        # Flatten the per-course grouping: every entry carries its own course
        # so that it can be filtered on at search time.
        self.documents = [
            {**doc, 'course': course_dict['course']}
            for course_dict in docs_raw
            for doc in course_dict['documents']
        ]
        self.index_settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            },
            "mappings": {
                "properties": {
                    "text": {"type": "text"},
                    "section": {"type": "text"},
                    "question": {"type": "text"},
                    # keyword (not text) so the course can be matched exactly
                    # by the term filter used in _elastic_search.
                    "course": {"type": "keyword"}
                }
            }
        }
        self.index_name = index_name

    def create_index(self):
        """Create the index ``index_name`` using ``index_settings``."""
        self.es_client.indices.create(index=self.index_name, body=self.index_settings)

    def destroy_index(self):
        """Delete the index ``index_name``.

        A missing index is ignored, so this can be called before the index
        has ever been created (e.g. on the first run of a notebook).
        """
        self.es_client.indices.delete(index=self.index_name, ignore_unavailable=True)

    def index_data(self):
        """Index every loaded document, one request per document."""
        for doc in tqdm(self.documents):
            self.es_client.index(index=self.index_name, document=doc)

    def _build_prompt(self, query: str, search_results: list):
        """Build the LLM prompt from the question and the retrieved entries.

        Args:
            query: The user's question.
            search_results: Entries with ``section``, ``question`` and
                ``text`` keys.

        Returns:
            The prompt string, stripped of leading and trailing whitespace.
        """
        prompt_template = """
        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
        Use only the facts from the CONTEXT when answering the QUESTION.

        QUESTION: {question}

        CONTEXT:
        {context}
        """.strip()

        context = "".join(
            f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
            for doc in search_results
        )

        return prompt_template.format(question=query, context=context).strip()

    def _llm(self, prompt: str):
        """Send ``prompt`` as a single user message and return the reply text."""
        response = self.client.chat.completions.create(
            model='gpt-4o',
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

    def _elastic_search(self, query: str, course: str = DEFAULT_COURSE):
        """Return the sources of the top five hits for ``query``.

        Args:
            query: Free-text query matched against question, text and section
                (question is boosted by a factor of 3).
            course: Only entries whose ``course`` equals this value are
                returned.

        Returns:
            A list with the ``_source`` dict of each hit.
        """
        search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["question^3", "text", "section"],
                            "type": "best_fields"
                        }
                    },
                    "filter": {
                        "term": {
                            "course": course
                        }
                    }
                }
            }
        }
        response = self.es_client.search(index=self.index_name, body=search_query)
        return [hit['_source'] for hit in response['hits']['hits']]

    def rag(self, query: str, course: str = DEFAULT_COURSE):
        """Answer ``query`` from the FAQ entries of ``course``.

        Retrieves matching entries, builds the prompt from them and returns
        the model's answer text.
        """
        results = self._elastic_search(query=query, course=course)
        prompt = self._build_prompt(query=query, search_results=results)
        return self._llm(prompt)

In [19]:
# Instantiate the ElasticRag class, bound to the 'course-questions' index.
elastic_rag = ElasticRag(index_name='course-questions')

In [20]:
# Delete the existing index so that it can be recreated from scratch below.
elastic_rag.destroy_index()

In [21]:
# Create a new index under the configured index name, then feed the loaded
# documents into it (index_data shows a tqdm progress bar while indexing).
elastic_rag.create_index()
elastic_rag.index_data()

100%|██████████| 948/948 [00:02<00:00, 320.77it/s]


In [22]:
# Ask a sample question; as the last expression of the cell, the answer
# string returned by rag() is displayed by the notebook.
query = 'I just discovered the course can I still join it?'
elastic_rag.rag(query=query)

'Yes, even if you just discovered the course, you can still join it. You are eligible to submit the homework without registering. However, be aware that there are deadlines for turning in the final projects, so try not to leave everything for the last minute.'