In [15]:
import minsearch
import json
from openai import OpenAI
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm


#### Q1. Running Elastic

Answer: the `version.build_hash` value is `42f05b9372a9a4a470db3b52817899b99a76ee73`

In [7]:
# Sanity check: request the root endpoint of the local Elasticsearch node (port 9200) to confirm it responds.
!curl localhost:9200


{
  "name" : "211abcd912f0",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "a7DlRcjfQY-iKGaJZBw_xQ",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


##### Getting the data

In [16]:
import requests

# Raw FAQ dump: a list of courses, each carrying its own list of documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the wait and fail loudly on HTTP errors, so a bad download surfaces here
# instead of as a confusing JSON decoding error on the next line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each entry with the course it
# belongs to. New dicts are built so `documents_raw` stays exactly as downloaded.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

#### Q2. Indexing the data


In [19]:
# Index definition: one shard, no replicas. The free-text fields are mapped as
# `text`; `course` is mapped as `keyword` and is used in `term` filters later on.
search_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}
index_name = "course-questions"
es_client = Elasticsearch('http://localhost:9200')

# Start from a clean index so that re-running this cell neither fails with
# "resource already exists" nor stores every document a second time.
# `ignore_unavailable=True` makes the delete a no-op on the very first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=search_settings)

for doc in tqdm(documents):
    es_client.index(index=index_name, body=doc)

# Make the freshly indexed documents searchable right away instead of waiting
# for the next periodic refresh; the search cells below run immediately after.
es_client.indices.refresh(index=index_name)
    


100%|██████████| 948/948 [00:11<00:00, 85.07it/s] 


#### Q3. Searching

Top ranking result score: 84.050095

In [37]:
# Free-text match over two fields; `question` carries a boost of 4 relative to `text`.
multi_match_clause = {
    "query": "How do I execute a command in a running docker container?",
    "fields": ["question^4", "text"],
    "type": "best_fields",
}
query = {"query": {"multi_match": multi_match_clause}}

# Run the search against the FAQ index
response = es_client.search(index="course-questions", body=query)

# The first hit is the top-ranking result; report its relevance score
ranked_hits = response['hits']['hits']
top_score = ranked_hits[0]['_score']
print(f"Top ranking result score: {top_score}")


Top ranking result score: 84.050095


#### Q4. Filtering: 3rd document

'How do I copy files from a different folder into docker container’s working directory?'

In [60]:
# Define the query
query = {
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": "How do I execute a command in a running docker container?",
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    },
    "size": 3
}

# Execute the search query
response = es_client.search(index="course-questions", body=query)  

# Get the third question returned by the search engine
third_question = response['hits']['hits'][2]["_source"]["question"]
third_question

'How do I copy files from a different folder into docker container’s working directory?'

#### Q5. Building a prompt
Length of the resulting prompt: 1462

In [69]:
# The question the prompt is built for
question = "How do I execute a command in a running docker container?"

# Layout of a single retrieved FAQ entry inside the context
context_template = """
Q: {question}
A: {text}
""".strip()

# Layout of the complete prompt
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Render every hit of the most recent search, one entry per hit,
# and separate the entries with a blank line
context_entries = []
for record in response['hits']['hits']:
    faq_entry = record["_source"]
    context_entries.append(
        context_template.format(question=faq_entry["question"], text=faq_entry["text"])
    )
context = "\n\n".join(context_entries)

# Fill the question and the rendered context into the prompt layout
prompt = prompt_template.format(question=question, context=context)

# Report the prompt size in characters
prompt_length = len(prompt)
print(f"Length of the resulting prompt: {prompt_length}")


Length of the resulting prompt: 1462


#### Q6. Tokens
Number of tokens in the prompt: 322


In [77]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 2.7 MB/s eta 0:00:00
Collecting regex>=2022.1.18
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 775.1/775.1 kB 3.3 MB/s eta 0:00:00
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.5.15 tiktoken-0.7.0

[notice] A new release of pip is available: 23.0.1 -> 24.1.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [81]:
import tiktoken

# Look up the tokenizer that tiktoken associates with the gpt-4o model
encoding = tiktoken.encoding_for_model("gpt-4o")

# Encode the prompt once, keep the token ids, and report how many there are
tokens = encoding.encode(prompt)
num_tokens = len(tokens)
print(f"Number of tokens in the prompt: {num_tokens}")


Number of tokens in the prompt: 322


### Bonus: generating the answer (ungraded)


In [None]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for `query` from retrieved FAQ entries.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts, each providing the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The prompt as a string with no leading or trailing whitespace. The
        entries are rendered under ``CONTEXT:`` in the order given, separated
        by a blank line. With no entries the prompt ends at ``CONTEXT:``.

    Raises:
        KeyError: If an entry lacks one of the three expected keys.
    """
    # Spelled out line by line on purpose: an indented triple-quoted literal
    # would carry the source indentation into every line of the prompt.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    # strip() drops the dangling newline left behind when the context is empty
    return prompt_template.format(question=query, context=context).strip()

In [None]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the FAQ index for `query` and return the matching documents.

    Uses the module-level `es_client` and `index_name`.

    Args:
        query: Free-text search string, matched against the `question`
            (boost 3), `text` and `section` fields.
        course: Value the `course` field must equal. Defaults to
            "data-engineering-zoomcamp", which was previously hard-coded.
        size: Maximum number of hits requested. Defaults to 5, which was
            previously hard-coded.

    Returns:
        A list with the `_source` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def llm(prompt, model='gpt-4o'):
    """Send `prompt` as a single user message to a chat model and return its reply.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Name of the chat model. Defaults to 'gpt-4o', which was
            previously hard-coded.

    Returns:
        The message content of the first choice in the response.

    NOTE(review): this relies on a module-level `client` that exposes
    `chat.completions.create`. The cells shown here import the `OpenAI`
    class but never create such an object, so something like
    `client = OpenAI()` presumably has to run first - confirm.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [None]:
def rag(query):
    """Answer `query` end to end: retrieve documents, build the prompt, ask the LLM.

    Chains `elastic_search`, `build_prompt` and `llm`, and returns whatever
    `llm` returns for the assembled prompt.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))