In [2]:
from elasticsearch import Elasticsearch

from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage


import json

import os
from dotenv import load_dotenv

from tqdm.auto import tqdm

# Load variables from a local .env file into the process environment, then read
# the key later handed to MistralClient in llm(). os.getenv returns None when the
# `api_key` variable is not set.
load_dotenv()
api_key = os.getenv('api_key')

  from .autonotebook import tqdm as notebook_tqdm


# Q1. Running Elastic

### Run Elasticsearch 8.4.3 and get the cluster information. If you run it on localhost, this is how you do it:

In [3]:
!curl http://localhost:9200

{
  "name" : "a03f82748fbf",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "IczZxbR-S1qZeGAgFn4oaA",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


In [6]:
# Connect to the local single-node cluster and show the build hash it reports.
client = Elasticsearch('http://localhost:9200')
cluster_info = client.info()
cluster_info['version']['build_hash']

'42f05b9372a9a4a470db3b52817899b99a76ee73'

## What's the version.build_hash value? 
> 42f05b9372a9a4a470db3b52817899b99a76ee73

In [8]:
index_name = 'course-questions'
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Full-text fields: analyzed, searchable with multi_match.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword = stored verbatim, so the `term` filters used later match exactly.
            "course": {"type": "keyword"}
        }
    }
}

# Drop any index left over from a previous run so that re-executing this cell
# neither fails because the index already exists nor leaves old documents that the
# indexing loop would then duplicate. ignore_unavailable makes the delete a no-op
# when the index is not there yet.
client.indices.delete(index=index_name, ignore_unavailable=True)
client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
# documents.json is read as a list of groups, each with a "course" name and a
# "documents" list. JSON is UTF-8 and the FAQ text contains non-ASCII characters,
# so the encoding is pinned rather than left to the platform's locale default.
with open('documents.json', 'r', encoding='utf-8') as fp:
    docs_raw = json.load(fp)

# Flatten to one list of FAQ entries, tagging each entry with its course name.
document = []
for course_group in docs_raw:
    for doc in course_group['documents']:
        doc['course'] = course_group['course']
        document.append(doc)

# Q2. Indexing the data

### Which function do you use for adding your data to elastic? 

> `index` (i.e. `client.index(...)`)

In [10]:
# Send the flattened FAQ entries to Elasticsearch one at a time; tqdm shows progress.
for faq_entry in tqdm(document):
    client.index(index=index_name, document=faq_entry)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:19<00:00, 49.75it/s]



# Q3. Searching

Now let's search in our index.

We will execute a query "How do I execute a command in a running docker container?".

Use only question and text fields and give question a boost of 4, and use "type": "best_fields".

## What's the score for the top ranking result?

> 84.050095

In [12]:

query = 'How do I execute a command in a running docker container?'

# Search `question` and `text`, boosting `question` by 4 as the exercise asks.
question_text_match = {
    "query": query,
    "fields": ["question^4", "text"],
    "type": "best_fields",
}
search_query = {
    "size": 5,
    "query": {"bool": {"must": {"multi_match": question_text_match}}},
}

# Score of the top-ranked hit.
search_hits = client.search(index=index_name, body=search_query)['hits']
search_hits['max_score']

84.050095

# Q4. Filtering

Now let's only limit the questions to machine-learning-zoomcamp.

Return 3 results. What's the 3rd question returned by the search engine?

> How do I copy files from a different folder into docker container’s working directory?

In [14]:
# Q4: same multi_match as in Q3, but restricted to the machine-learning-zoomcamp
# course and capped at 3 hits. The 3rd hit answers the question.
ml_course_filter = {"term": {"course": "machine-learning-zoomcamp"}}
ml_question_match = {
    "multi_match": {
        "query": query,
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
}
search_query = {
    "size": 3,
    "query": {"bool": {"must": ml_question_match, "filter": ml_course_filter}},
}

filtered_result = client.search(index=index_name, body=search_query)
response = filtered_result['hits']['hits']

In [15]:
# Last of the (up to) 3 hits requested above, i.e. the 3rd-ranked result.
response[-1]

{'_index': 'course-questions',
 '_id': 'JKARR5ABepppZvMlbUAo',
 '_score': 49.938507,
 '_source': {'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\nCOPY ["src/predict.py", "models/xgb_model.bin", "./"]\t\t\t\t\t\t\t\t\t\t\tGopakumar Gopinathan',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I copy files from a different folder into docker container’s working directory?',
  'course': 'machine-learning-zoomcamp'}}

# Q5. Building a prompt

What's the length of the resulting prompt? (use the len function)
> 1462

In [21]:
# Outer prompt: {question} receives the user query and {context} the rendered
# FAQ entries. strip() removes the newlines adjacent to the triple quotes.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [17]:
# One FAQ entry inside the context; filled from a hit's `question` and `text` fields.
context_template = """
Q: {question}
A: {text}
""".strip()

In [18]:
# Render every Q4 hit as a "Q: ... / A: ..." entry. Each entry, including the
# last one, is followed by a blank line.
context = ''.join(
    context_template.format(
        question=hit['_source']['question'],
        text=hit['_source']['text'],
    ) + '\n\n'
    for hit in response
)

In [19]:
# The context ends with the "\n\n" appended after its last entry; the final
# strip() removes it so it does not count towards the prompt length.
prompt = prompt_template.format(question=query, context=context).strip()

In [20]:
# Character count of the final prompt (the Q5 answer).
len(prompt)

1462

# Q6. Tokens

In [22]:
!pip install tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [23]:
import tiktoken

In [24]:
# Look up the tokenizer that tiktoken associates with the "gpt-4o" model name.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [25]:
# Number of tokens the Q5 prompt encodes to with that tokenizer.
len(encoding.encode(prompt))

322

# Bonus: generating the answer (ungraded)

In [26]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the FAQ index and return the matching documents.

    The query is matched against `question` (boosted by 3), `text` and
    `section`; results are restricted to a single course.

    Args:
        query: Free-text question to search for.
        course: Exact value of the `course` field to filter on. Defaults to
            the course that was previously hard-coded.
        size: Maximum number of hits to return. Defaults to 5, as before.

    Returns:
        A list with the `_source` dict of every hit, in ranking order.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    # Uses the module-level Elasticsearch `client` and `index_name`.
    response = client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [27]:
def build_prompt(query, search_result):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Args:
        query: The user's question; substituted for {question}.
        search_result: Iterable of FAQ entries. Each entry is either a raw
            Elasticsearch hit (a dict with a '_source' key) or the bare source
            dict itself; both must provide 'question' and 'text'.

    Returns:
        The prompt string, with entries separated by a blank line and no
        leading or trailing whitespace.
    """
    # The templates are assembled line by line rather than as indented
    # triple-quoted strings: inside a function body the latter carry the
    # source indentation into every interior line of the prompt.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )
    context_template = "Q: {question}\nA: {text}"

    entries = []
    for doc in search_result:
        # Accept raw hits as well as the bare `_source` dicts.
        source = doc.get('_source', doc)
        entries.append(
            context_template.format(question=source['question'], text=source['text'])
        )
    context = '\n\n'.join(entries)

    return prompt_template.format(question=query, context=context).strip()


In [28]:
def llm(prompt):
    """Send `prompt` to Mistral as a single user message and return the reply text.

    A fresh MistralClient is created from the module-level `api_key` on every
    call; the model is fixed to "mistral-small-latest".
    """
    mistral = MistralClient(api_key=api_key)
    user_message = ChatMessage(role="user", content=prompt)
    completion = mistral.chat(
        model="mistral-small-latest",
        messages=[user_message],
    )
    # Only the first choice's message content is returned.
    return completion.choices[0].message.content

In [31]:
def rag(query):
    """Answer `query` with retrieval-augmented generation.

    Retrieves FAQ entries for the query with elastic_search, builds a prompt
    from them with build_prompt, and returns the text produced by llm.
    """
    search_result = elastic_search(query)
    # Bug fix: the prompt used to be built from the notebook-level `response`
    # (the hits of an earlier, unrelated search cell), so the retrieval done
    # here was ignored. elastic_search returns bare `_source` dicts, while
    # build_prompt accepts raw hits, so each one is re-wrapped in the hit shape.
    hits = [{'_source': source} for source in search_result]
    prompt = build_prompt(query, hits)
    return llm(prompt)

In [32]:
# End-to-end run of the pipeline on the same question used in Q3.
query = 'How do I execute a command in a running docker container?'
print(rag(query))

To execute a command in a running Docker container, you first need to identify the container ID using the `docker ps` command. Once you have the container ID, you can execute a command in the specific container using the `docker exec -it <container-id> bash` command. For example, if your container ID was 1234, you would execute `docker exec -it 1234 bash` to start a bash shell in the container.
