## Retrieval

In [1]:
# Fetch a fresh copy of minsearch.py into the working directory: delete any
# existing copy first, then download the file from the minsearch GitHub repo.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-06-30 22:06:32--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Résolution de raw.githubusercontent.com (raw.githubusercontent.com)… 2606:50c0:8003::154, 2606:50c0:8000::154, 2606:50c0:8002::154, ...
Connexion à raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443… connecté.
requête HTTP transmise, en attente de la réponse… 200 OK
Taille : 3832 (3,7K) [text/plain]
Enregistre : ‘minsearch.py’


2024-06-30 22:06:32 (30,5 MB/s) - ‘minsearch.py’ enregistré [3832/3832]



In [2]:
import minsearch
import requests
from openai import OpenAI

In [3]:
# OpenAI-compatible client aimed at a server on localhost:11434 instead of the
# hosted OpenAI API.
# NOTE(review): presumably a local Ollama instance, with 'ollama' acting as a
# placeholder key rather than a real secret -- confirm.
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

In [4]:
# Location of the course FAQ dump (JSON) in the llm-zoomcamp repository.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# A timeout keeps the cell from hanging forever if the server never responds.
docs_response = requests.get(docs_url, timeout=30)

# Fail loudly on an HTTP error (404, 5xx, ...) instead of trying to JSON-decode
# an error page and getting a misleading parse failure.
docs_response.raise_for_status()

documents_raw = docs_response.json()

# Flatten the per-course groups of `documents_raw` into a single list, tagging
# every entry with the name of the course it came from.
documents = []

for course_block in documents_raw:
    course_tag = course_block['course']

    for entry in course_block['documents']:
        # The entry is tagged in place, so the very same dict object ends up
        # in `documents` (and `documents_raw` sees the new key as well).
        entry['course'] = course_tag
        documents.append(entry)

# Build the FAQ index: "question", "text" and "section" are registered as text
# fields, "course" as a keyword field (the `search` helper filters on it).
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])

# Kept as the last expression of the cell so the notebook shows fit()'s result.
index.fit(documents)

<minsearch.Index at 0x72dd4531e5f0>

In [5]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries matching `query` in the module-level `index`.

    Args:
        query: Free-text question to search for.
        course: Value the `course` field of a hit must match. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Maximum number of hits requested from the index.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        Whatever `index.search` returns; `build_prompt` iterates over it and
        reads the 'section', 'question' and 'text' keys of each item.
    """
    # Per-field boost factors handed to the index.
    # NOTE(review): presumably these weight matches in `question` above the
    # default and matches in `section` below it -- confirm against minsearch.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )


In [6]:
def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for QUESTION in the template.
        search_results: Iterable of dict-like entries, each providing the
            'section', 'question' and 'text' keys.

    Returns:
        The filled-in template with surrounding whitespace stripped. With no
        search results the prompt therefore ends in "CONTEXT:".
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # One block per hit, each followed by a blank line; the trailing blank
    # line of the last block is removed by the final strip().
    context = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    return template.format(question=query, context=context).strip()

In [7]:
def llm(prompt, model='phi3'):
    """Send `prompt` as a single user message and return the model's reply.

    Args:
        prompt: Text placed in the one and only user message.
        model: Name of the model to request from the module-level `client`.
            Defaults to 'phi3', the value that used to be hard-coded.

    Returns:
        The `content` of the first choice in the chat-completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is used.
    return response.choices[0].message.content

In [8]:
def rag(query):
    """Answer `query` end to end: retrieve, build the prompt, ask the LLM.

    Args:
        query: The user's question; used both for retrieval and in the prompt.

    Returns:
        Whatever `llm` returns for the prompt built from the search results.
    """
    # Composed inside-out: search -> build_prompt -> llm.
    return llm(build_prompt(query, search(query)))

In [9]:
# Smoke test: call the model directly (no retrieval step) to confirm that the
# client and model respond. The returned string is displayed as the cell output.
llm('write that this is a test')

' Absolutely! Here\'s the text:\n\n"This statement serves as an initial verification to ensure that my system for generating content and responses operates correctly."'

In [10]:
# `_` is IPython's "most recent output" variable; printing it renders the
# escaped newlines and quotes of that output as plain text.
# NOTE(review): this depends on which cell produced the last output, so it is
# fragile under re-ordering or partial re-runs -- consider binding the result
# to a named variable in the producing cell and printing that instead.
print(_)

 Absolutely! Here's the text:

"This statement serves as an initial verification to ensure that my system for generating content and responses operates correctly."
