In [1]:
import minsearch
import json

In [2]:
# Read the FAQ dump. The encoding is pinned to UTF-8 so that non-ASCII
# characters (e.g. typographic apostrophes) decode the same way on every
# platform instead of depending on the locale's default codec.
with open('documents.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [3]:
# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry with the course it belongs to. Every entry is copied into a new dict,
# so `docs_raw` is left untouched and the cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [4]:
documents[0]

{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while weâ€™re still accepting submissions.',
 'section': 'General course-related questions',
 'question': 'I just discovered the course. Can I still join?',
 'course': 'llm-zoomcamp'}

In [5]:
# `question`, `text` and `section` are the free-text fields handed to minsearch;
# `course` is the keyword field that the search helper later filters on.
index = minsearch.Index(text_fields=['question', 'text', 'section'], keyword_fields=['course'])

In [6]:
q = 'the course has already started, can I still enroll?'


In [7]:
# Fit the minsearch index on the flattened FAQ entries; the search helper
# defined later queries this fitted `index` object.
index.fit(documents)

<minsearch.minsearch.Index at 0x77641e66a660>

In [8]:
from dotenv import load_dotenv
import os

# load_dotenv() comes from python-dotenv; presumably it loads variables from a
# local .env file into the process environment -- confirm the file location.
load_dotenv()

# Read the key from the environment so it is never hard-coded in the notebook.
# os.getenv returns None when the variable is not set.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

In [9]:
import cohere
# Cohere v2 client created with the key read from the environment.
co = cohere.ClientV2(COHERE_API_KEY)

# Baseline: send the raw question as a single user message, with no retrieved context.
response = co.chat(model="command-r", messages=[{"role": "user", "content": q}])

In [10]:
response.message.content[0].text

"It's often best to check directly with the course provider about their enrollment policies. Many institutions and organizations have their own guidelines regarding late enrollment. Some courses may allow late enrollment, especially if they are delivered online and have a flexible structure. Others may have strict deadlines for enrollment, especially if there are physical classes or in-person attendance requirements. \n\nIt's recommended to reach out to the relevant course coordinator or the institution's admissions office to inquire about the possibilities of late enrollment. They can provide the most up-to-date information on the availability and requirements for joining the course. You may also be able to find the answer on their website or through their online student portals. \n\nIf the course is full or has closed applications, they may also be able to advise you on similar courses or future intake dates. Don't hesitate to ask, as many places will be used to handling such inquiri

### Modularizing the Code

In [None]:
# Search Query to pick top 5 results, based on tfidf from 'minsearch'
def search(query, course='llm-zoomcamp', num_results=5):
    """Query the fitted minsearch index for FAQ entries matching `query`.

    Args:
        query: Free-text question to search for.
        course: Value the `course` keyword field must equal. Defaults to
            'llm-zoomcamp', the value this helper previously hard-coded.
        num_results: Maximum number of entries requested from the index
            (default 5, the previously hard-coded value).

    Returns:
        Whatever `index.search` returns for these arguments (presumably a
        list of document dicts, best match first -- confirm against minsearch).
    """
    # Per-field boosts passed to minsearch: `question` 3.0, `section` 0.5.
    # `text` is not listed, so it presumably keeps minsearch's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    # `index` is the module-level object prepared with index.fit(documents).
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [None]:
# Build a simple prompt from the text of the search results returned above.
# The results supply the context the LLM needs to answer the question.
# Only the relevant (top-ranked) entries are included, not the whole FAQ database.
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user's question and the retrieved FAQ entries.

    Each entry in `search_results` must provide the keys 'section',
    'question' and 'text'. The result is the filled-in template with leading
    and trailing whitespace removed.
    """
    # One text block per retrieved entry, concatenated in order.
    context = "".join(
        f"section: {entry['section']} \n question: {entry['question']} \n answer: {entry['text']} \n \n"
        for entry in search_results
    )

    # The indentation after each newline is part of the prompt text and is kept as is.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "        Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "        QUESTION: {question}\n"
        "\n"
        "        CONTEXT: \n"
        "        {context}"
    )

    return template.format(question=query, context=context).strip()

In [None]:
# Simple LLM API call based on the CONTEXTUALIZED prompt from above
def llm(prompt):
    """Send `prompt` to the Cohere chat API as a single user message and return the reply text.

    Uses the module-level client `co` and the "command-r" model. Only the
    text of the first content item of the reply is returned.
    """
    user_message = {"role": "user", "content": prompt}
    reply = co.chat(model="command-r", messages=[user_message])

    first_item = reply.message.content[0]
    return first_item.text

In [14]:
query = 'What AI will we be using?'

def rag(query):
    """Answer `query` end to end: retrieve FAQ entries, build the prompt, ask the LLM."""
    # Evaluated inside-out: search -> build_prompt -> llm.
    return llm(build_prompt(query, search(query)))

rag(query)

'In this course, you will be using the Open AI API, as well as SaturnCloud.'

In [15]:
rag('the course has already started, can I still enroll?')

'Yes, you can still enroll and join the course even though it has already started. However, to receive a certificate, you must submit your project before the submission window closes.'

### Using ElasticSearch

The same RAG pipeline as above, but with Elasticsearch's search functionality replacing minsearch for retrieval.
1. Elasticsearch must be running first: start it with Docker, or with `sudo systemctl start elasticsearch` if it is installed as a system service.

In [19]:
from elasticsearch import Elasticsearch

In [None]:
# Client for the Elasticsearch instance at localhost:9200; every Elasticsearch
# call in this notebook goes through it.
es_client = Elasticsearch('http://localhost:9200/')

In [32]:
index_name = "course-question"

# One primary shard and no replicas; the mapping mirrors the FAQ entry fields.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Fields mapped as analysed text.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Mapped as keyword; the search helper filters on it with a term query.
            "course": {"type": "keyword"},
        }
    },
}

# Create the index only when it is missing; otherwise just report that it is there.
if es_client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists.")
else:
    es_client.indices.create(index=index_name, body=index_settings)

Index 'course-question' already exists.


  if not es_client.indices.exists(index=index_name):


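The next two cells work around errors that can block indexing: the first overrides the cluster's disk watermark thresholds, and the second resets the index's `read_only_allow_delete` block.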

In [33]:
# Override the cluster's disk-watermark thresholds and the disk-info refresh interval.
# The values are sent under "transient"; the recorded response shows them applied
# there and 'persistent' left empty.
# NOTE(review): with absolute sizes these thresholds presumably mean *free* disk
# space remaining -- confirm against the Elasticsearch disk-allocation docs.
es_client.cluster.put_settings(
    body={
        "transient": {
            "cluster.routing.allocation.disk.watermark.low": "100gb",
            "cluster.routing.allocation.disk.watermark.high": "50gb",
            "cluster.routing.allocation.disk.watermark.flood_stage": "10gb",
            "cluster.info.update.interval": "1m"
        }
    }
)

  es_client.cluster.put_settings(


ObjectApiResponse({'acknowledged': True, 'persistent': {}, 'transient': {'cluster': {'routing': {'allocation': {'disk': {'watermark': {'low': '100gb', 'flood_stage': '10gb', 'high': '50gb'}}}}, 'info': {'update': {'interval': '1m'}}}}})

In [34]:
# Send None (JSON null) for index.blocks.read_only_allow_delete on this index.
# NOTE(review): presumably this resets the setting and lifts a read-only block
# on the index -- confirm against the Elasticsearch index-blocks documentation.
es_client.indices.put_settings(
    index=index_name,
    body={"index.blocks.read_only_allow_delete": None}
)

  es_client.indices.put_settings(


ObjectApiResponse({'acknowledged': True})

In [35]:
from tqdm.auto import tqdm

# Index every FAQ entry under a deterministic id (its position in `documents`).
# Without an explicit id Elasticsearch assigns a fresh one on every call, so
# re-running this cell against the already-existing index would store each
# entry again and searches would return duplicates. With a fixed id a re-run
# overwrites the same documents instead.
# Note: entries indexed earlier under auto-generated ids are not removed by this.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), document=doc)

  0%|          | 0/86 [00:00<?, ?it/s]

  es_client.index(index=index_name, document=doc)


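The search and RAG functions below follow the same logic as the minsearch version above, with Elasticsearch doing the retrieval.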

In [36]:
# Sample question for the Elasticsearch-backed pipeline. The misspelling
# "disovered" is fixed so the term can match the FAQ wording lexically.
query = 'I just discovered the course. Can I still join it?'

In [None]:
# Get the top 5 results and extract each hit's stored document via '_source'

def elastic_search(query, course="llm-zoomcamp", num_results=5):
    """Search the Elasticsearch FAQ index and return the matching documents.

    Args:
        query: Free-text question to match against the indexed fields.
        course: Exact value the `course` field must have. Defaults to
            "llm-zoomcamp", the value this helper previously hard-coded.
        num_results: Maximum number of hits requested via "size"
            (default 5, the previously hard-coded value).

    Returns:
        A list with the '_source' of every hit, in the order the hits
        appear in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in `question` relative to the other fields.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict results to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    # `es_client` and `index_name` are the module-level client and index name.
    response = es_client.search(index=index_name, body=search_query)

    # Each hit wraps the stored document under '_source'.
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def rag(query):
    """Answer `query` with the RAG pipeline, retrieving context through Elasticsearch.

    Redefines the earlier `rag`; the only difference is the retrieval step.
    """
    retrieved_docs = elastic_search(query)  # Elasticsearch-backed retrieval
    contextual_prompt = build_prompt(query, retrieved_docs)
    return llm(contextual_prompt)

In [49]:
rag(query)

  response = es_client.search(index=index_name, body=search_query)


'Yes, you can still join the course. However, if you want to receive a certificate, you need to submit your project while the submission period is still open.'