In [None]:
from elasticsearch import Elasticsearch

In [None]:
# Create the Elasticsearch client pointed at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

In [None]:
# Ask the server for its info; a successful response doubles as a connectivity check.
es_client.info()

# Import our Documents

In [None]:
import json

In [None]:
# Load the raw course documents. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [None]:
# Flatten the per-course structure into one list of documents, tagging each
# document with the course it belongs to. Copies are built rather than
# assigning into the original dicts, so `docs_raw` is left untouched and the
# cell gives the same result every time it is re-run.
documents = [
    {**doc, 'course': course_dic['course']}
    for course_dic in docs_raw
    for doc in course_dic['documents']
]

In [None]:
# Preview only the first few documents; displaying the whole list floods the
# cell output and bloats the saved notebook.
documents[:3]

# Create an Index for our Documents

In [None]:
from tqdm.auto import tqdm

In [None]:
# Index definition: one primary shard, zero replicas, and explicit field mappings.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Free-text fields that are searched with full-text matching.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Mapped as keyword because the queries below filter on it with an exact `term` match.
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Create the index only when it does not exist yet, so the cell can be re-run.
# The previous bare `except: pass` also hid real failures (unreachable server,
# invalid settings, ...); with an explicit existence check those now surface.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# Index every document under a deterministic id (its position in `documents`).
# With auto-generated ids, each re-run of this cell adds another copy of every
# document to the existing index and searches start returning duplicates; with
# an explicit id a re-run overwrites the same entries instead.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), document=doc)

# Query

In [None]:
# Sample user question reused by the search and RAG cells below.
query = "I just discovered the course. Can I still join it?"

In [None]:
# Elasticsearch request body for the sample question.
search_query = {
    # Ask for at most 5 hits.
    "size": 5,
    "query": {
        "bool": {
            # Full-text part: match the question text against three fields.
            "must": {
                "multi_match": {
                    "query": query,
                    # `question^3` is Elasticsearch field-boost syntax: the
                    # `question` field is weighted 3x relative to the others.
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            # Exact-match restriction to a single course.
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [None]:
# Run the search against the index and keep the raw response for inspection.
response = es_client.search(index=index_name, body=search_query)

In [None]:
# Display the raw response to see its structure (hits live under ['hits']['hits']).
response

In [None]:
# Keep only the stored document (`_source`) of every hit, in the order returned.
result_docs = [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# Show the documents extracted from the hits.
result_docs

# Clean Up: Wrap the Search in a Reusable Function

In [None]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the course-questions index and return the matching documents.

    Args:
        query: Free-text question. It is matched against the ``question``
            (boosted x3), ``text`` and ``section`` fields.
        course: Exact course value used as a ``term`` filter. The default is
            the course that was previously hard-coded, so existing calls
            behave as before.
        size: Maximum number of hits to request. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    # Relies on the module-level `es_client` and `index_name` defined earlier.
    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# Try the helper on the sample question.
elastic_search(query)

# Combine it with the previous RAG functions we created

In [None]:
from openai import OpenAI

In [None]:
# Create the OpenAI client with no explicit arguments.
# NOTE(review): presumably the API key is picked up from the environment — confirm.
client = OpenAI()

In [None]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    # The indentation and trailing spaces inside the template are part of the
    # prompt text that is sent to the model, so they are kept as-is.
    prompt_template = """
    You're a course teaching asssistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
    Use only the facts from the CONTEXT when answering the question.
    If the CONTEXT doesn't contain the answer, output NONE
    
    QUESTION : {question}
    CONTEXT: 
    {context}
    """.strip()

    # One text chunk per retrieved document, each terminated by a blank line.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    return prompt_template.format(question=query, context=context).strip()

In [None]:
def llm(prompt, model="gpt-4o"):
    """Send ``prompt`` to the chat-completions API and return the reply text.

    Args:
        prompt: Full prompt text, sent as a single ``user`` message.
        model: Name of the chat model to use. Defaults to ``"gpt-4o"``, the
            value that was previously hard-coded, so existing calls behave
            as before.

    Returns:
        The ``content`` of the first choice in the response.
    """
    # Relies on the module-level `client` created earlier.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [None]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieval goes through ``elastic_search`` (Elasticsearch results, not the
    minsearch results); the hits are turned into a prompt by ``build_prompt``
    and the prompt is sent to ``llm``, whose reply is returned.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [None]:
# Run the full pipeline (search -> prompt -> LLM) on the sample question.
rag(query)