In [1]:
# Fetch a fresh copy of the single-file minsearch module so `import minsearch` works in the cells below.
# NOTE(review): the URL points at the `main` branch, so the downloaded code can change between runs.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-01 13:27:52--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-01 13:27:53 (44.3 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [3]:
from openai import OpenAI
import requests 
import minsearch

In [3]:
def setup_index():
    """Download the course FAQ documents and build a search index over them.

    Fetches ``documents.json`` from the llm-zoomcamp repository, flattens the
    per-course document lists into a single list (tagging each document with
    the name of the course it belongs to), and fits a ``minsearch.Index`` on it.

    Returns:
        The fitted ``minsearch.Index``, configured with ``question``, ``text``
        and ``section`` as text fields and ``course`` as a keyword field.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    docs_response = requests.get(docs_url, timeout=30)
    # Fail loudly on an error status instead of trying to JSON-decode an error page.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    # Flatten {course, documents: [...]} records into one list of documents,
    # each carrying its course name so it can be filtered on later.
    documents = [
        {**doc, 'course': course['course']}
        for course in documents_raw
        for doc in course['documents']
    ]

    index = minsearch.Index(
        text_fields=["question", "text", "section"],
        keyword_fields=["course"]
    )

    index.fit(documents)

    return index

In [4]:
def search(query, index, course='data-engineering-zoomcamp', num_results=5, verbose=True):
    """Query the index for the documents most relevant to ``query``.

    Args:
        query: The user's question, passed to ``index.search`` as ``query``.
        index: An object exposing ``search(query=..., filter_dict=...,
            boost_dict=..., num_results=...)`` (here, the index built by
            ``setup_index``).
        course: Value used for the ``course`` entry of ``filter_dict``.
            Defaults to the course that was previously hard-coded.
        num_results: Passed through as ``num_results``; defaults to 5.
        verbose: When true (the default, matching the original behaviour),
            print the raw results before returning them.

    Returns:
        Whatever ``index.search`` returns for the given arguments.
    """
    # Weight matches in the question field above the default and matches in
    # the section field below it.
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

    if verbose:
        print(results)

    return results

In [5]:
def build_prompt(query, search_results, verbose=True):
    """Assemble the LLM prompt from the question and the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each with ``section``,
            ``question`` and ``text`` keys.
        verbose: When true (the default, matching the original behaviour),
            print the finished prompt before returning it.

    Returns:
        The prompt string, with leading/trailing whitespace stripped.

    Raises:
        KeyError: if a search result lacks one of the expected keys.
    """
    # Built from explicit lines rather than an indented triple-quoted string,
    # so the function body's indentation does not leak into the prompt text.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )

    # One "section / question / answer" stanza per retrieved document.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # strip() drops the blank lines left after the last stanza.
    prompt = prompt_template.format(question=query, context=context).strip()

    if verbose:
        print(prompt)

    return prompt

In [6]:
def send_to_llm(prompt, model="phi3", base_url="http://localhost:11434/v1"):
    """Send ``prompt`` as a single user message to an OpenAI-compatible endpoint.

    Args:
        prompt: Text sent as the content of the one ``user`` message.
        model: Model name passed to the chat completions call. Defaults to
            ``"phi3"``, the value that was previously hard-coded.
        base_url: Base URL of the OpenAI-compatible server. Defaults to the
            local endpoint that was previously hard-coded.

    Returns:
        The completion object returned by ``chat.completions.create``
        (the full response, not just the message text).
    """
    # NOTE(review): "ollama" looks like a placeholder key for a local server
    # rather than a real credential; confirm before pointing this elsewhere.
    llm_client = OpenAI(base_url=base_url, api_key="ollama")

    # temperature 0.0 is kept from the original call.
    return llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )

In [7]:
def rag(query, index=None):
    """Answer ``query`` by retrieving FAQ entries and asking the LLM.

    Runs the pipeline end to end: search the index, build the prompt from
    the hits, and send that prompt to the model.

    Args:
        query: The user's question.
        index: Optional prebuilt index to search. When omitted, a new one is
            built with ``setup_index()`` (the original behaviour), which
            downloads the documents again; pass an index to avoid that cost
            when asking several questions.

    Returns:
        The completion object returned by ``send_to_llm``.
    """
    if index is None:
        index = setup_index()

    search_results = search(query, index)
    prompt = build_prompt(query, search_results)
    return send_to_llm(prompt)

In [8]:
# Ask a question unrelated to the course FAQ to see how the context-restricted prompt handles it.
answer = rag("What's the formula for energy?")

answer

[{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites', 'section': 'General course-related questions', 'question': 'Course - What are the prerequisites for this course?', 'course': 'data-engineering-zoomcamp'}, {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning in a public link you get one point.\n(https://datatalks-club.slack.com/archives/C01FABYF2RG/p1706846846359379?thread_ts=1706825019.546229&cid=C01FABYF2RG)', 'section': 'General course-related questions', 'question': 'Homework and Leaderboard - what is the system for points in the course management platform?', 'course': '

ChatCompletion(id='chatcmpl-711', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The formula for energy is not mentioned in the context, so I cannot answer this question from the provided context.', role='assistant', function_call=None, tool_calls=None))], created=1719815293, model='gemma:2b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=23, prompt_tokens=0, total_tokens=23))

In [1]:
def send_to_llm_no_context(prompt):
    """Send ``prompt`` to the LLM as-is, without any retrieved FAQ context.

    This used to be a line-for-line copy of ``send_to_llm``; it now delegates
    to it so the client setup and model settings live in one place.

    Args:
        prompt: Text sent as the content of the one ``user`` message.

    Returns:
        The completion object returned by ``send_to_llm``.
    """
    return send_to_llm(prompt)

In [4]:
# Same question sent straight to the model, bypassing the FAQ prompt produced by build_prompt.
answer = send_to_llm_no_context("What's the formula for energy?")

answer

ChatCompletion(id='chatcmpl-160', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=" The most common formula to calculate the energy of an object is derived from two fundamental principles in physics:\n\n1. Kinetic Energy (KE): This represents the energy possessed by a moving object due to its motion. It can be calculated using the following formula:\n\n   KE = 0.5 * m * v^2\n\n   where 'm' is the mass of the object in kilograms, and 'v' is the velocity or speed of the object in meters per second (m/s).\n\n2. Potential Energy (PE): This represents the energy possessed by an object due to its position relative to other objects within a force field, such as gravity. The most common formula for gravitational potential energy is:\n\n   PE = m * g * h\n\n   where 'm' is the mass of the object in kilograms, 'g' is the acceleration due to gravity (approximately 9.81 m/s^2 on Earth), and 'h' is the height or vertical distance above a reference