In [None]:
#download the minsearch library
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [None]:
import minsearch

In [None]:
#download documents
#!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

In [None]:
import json

In [None]:
# Load the raw FAQ data from the downloaded JSON file into `docs_raw`.
# The encoding is pinned to UTF-8 (the encoding JSON files are expected to use) so the
# file is not decoded with a platform-dependent default such as cp1252 on Windows.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)
    

In [None]:
# Display the parsed JSON to inspect its structure.
docs_raw

In [None]:
# Flatten the per-course structure into a single list of documents, tagging each
# document with the course it belongs to.
# Every entry is a new dict, so `docs_raw` is left untouched and re-running this
# cell always yields the same result.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]
        

In [None]:
# Show the first flattened document as a sanity check.
documents[0]

In [None]:
# Create the minsearch index: "question", "text" and "section" are registered as
# text fields, "course" as a keyword field (used by `search` in its filter_dict).
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])



In [None]:
# Fit the index on the flattened list of documents.
index.fit(documents)

In [None]:
#the Ollama server must be running before the client can be used; start it from a terminal ->
#ollama serve

#to download a model ->
#ollama pull name_model
import ollama


In [None]:
#search function

def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the minsearch index for the documents most relevant to a query.

    Args:
        query: Free-text search string.
        course: Value the "course" field must match. Defaults to
            'data-engineering-zoomcamp'.
        num_results: Maximum number of documents to return. Defaults to 5.

    Returns:
        Whatever ``index.search`` returns for the query (the matching documents).
    """
    # When one field matters more than the others it can be boosted.
    # "question" gets a boost of 3.0 and "section" a boost of 0.5; fields without
    # an entry (here "text") are left at the index's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    # A larger num_results hands more (but less relevant) documents to the caller.
    return index.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
        filter_dict={'course': course},
    )


In [14]:
def build_prompt(query, search_results):
    """Build the LLM prompt for a question from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}`` in the template.
        search_results: Iterable of dicts, each with ``'section'``, ``'question'``
            and ``'text'`` keys.

    Returns:
        The prompt string: the instructions, the question, and one
        section/question/answer entry per search result as the context.

    Raises:
        KeyError: If a search result lacks one of the required keys.
    """
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contain the answer, output NONE.

    QUESTION: {question}

    CONTEXT:
    {context}
    """

    # Assemble the context in a single pass instead of growing a string in a loop.
    context = "".join(
        f"section: {doc['section']}\n\nquestion: {doc['question']}\n\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context)

In [13]:
def llm(prompt, model="gemma:2b"):
    """Send a prompt to a local Ollama model and return the raw response.

    Uses the ``ollama`` module imported at notebook level.

    Args:
        prompt: The full prompt text to send to the model.
        model: Name of the Ollama model to use. Defaults to "gemma:2b".

    Returns:
        The object returned by ``ollama.generate``; callers read the generated
        text from its ``'response'`` key.
    """
    return ollama.generate(model=model, prompt=prompt)

In [None]:
# NOTE(review): `response` is not assigned at notebook top level in any visible cell
# (it only exists as a local variable inside functions), so this raises NameError on a
# fresh kernel — presumably a leftover from an earlier `response = llm(prompt)` cell.
print(response['response'])

In [17]:
# Example user question passed to the search and RAG calls in the cells below.
query = "the course has already started, can I still enroll?"


In [None]:
def rag(query, search_fn=None):
    """Answer a question with retrieval-augmented generation.

    Retrieves documents for the query, builds a prompt from them with
    ``build_prompt``, and sends the prompt to the model through ``llm``.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the retrieved
            documents. Defaults to ``search_elastic``, which is looked up at
            call time and therefore must be defined before ``rag`` is called
            without this argument. Pass ``search`` to use the minsearch index.

    Returns:
        The generated answer text (the ``'response'`` entry of the LLM result).
    """
    if search_fn is None:
        search_fn = search_elastic

    results = search_fn(query)
    prompt = build_prompt(query, results)
    response = llm(prompt)
    return response['response']

In [None]:
# Run the full pipeline for the example question and print the answer.
# NOTE: `rag` calls `search_elastic`, which together with `es_client` and `index_name`
# is defined in later cells — those cells must be run before this one.
answer = rag(query)
print(answer)

In [None]:
# DO NOT RUN THIS IN JUPYTER — it is a shell command, not Python, so it is kept
# commented out here.
# Before using Elasticsearch, start the server from a command line ->
#
# docker run -it \
#     --rm \
#     --name elasticsearch \
#     -m 4GB \
#     -p 9200:9200 \
#     -p 9300:9300 \
#     -e "discovery.type=single-node" \
#     -e "xpack.security.enabled=false" \
#     docker.elastic.co/elasticsearch/elasticsearch:8.4.3

In [20]:
from elasticsearch import Elasticsearch
# Create the client pointed at the local Elasticsearch HTTP endpoint (port 9200).
es_client = Elasticsearch('http://localhost:9200')



In [12]:
#"filter" keeps only the documents whose "course" matches
#"question^3" means that the question field is 3 times more important than the others
#"size" is the number of documents retrieved


def search_elastic(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch index for the documents most relevant to a query.

    Uses the notebook-level ``es_client`` and ``index_name``, which must be
    defined before this function is called.

    Args:
        query: Free-text search string.
        course: Value the "course" field must match exactly. Defaults to
            "data-engineering-zoomcamp".
        num_results: Maximum number of hits to return ("size"). Defaults to 5.

    Returns:
        A list with the ``_source`` document of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Full-text match across the three text fields; "question^3"
                # boosts the question field by a factor of 3.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                # Restrict the hits to a single course.
                "filter": {
                    "term": {
                        "course": course,
                    }
                },
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [21]:
# Try the Elasticsearch search on the example question.
# NOTE(review): the recorded output is an empty list — presumably this was run before
# the documents were indexed (the indexing cells come later); verify.
search_elastic(query)

[]

In [None]:
# CREATING THE INDEX
# The index is kept by Elasticsearch once created, and creating an index that already
# exists raises an error. The existence check makes this cell safe to re-run.

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)


In [None]:
# Index every document under a deterministic id (its position in `documents`).
# Without an explicit id Elasticsearch generates a new one on every call, so re-running
# this cell used to store each document again; with a fixed id a re-run overwrites the
# existing entry instead of duplicating it.
# To start from scratch, delete the index first ->
#es_client.indices.delete(index=index_name)
from tqdm.auto import tqdm
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), document=doc)