In [43]:
import json
from pprint import pprint

from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
import tiktoken
from openai import OpenAI

In [3]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [4]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [5]:
es_client = Elasticsearch('http://localhost:9200') 

In [7]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-homework"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-homework'})

In [29]:
from typing import TypedDict, Optional


class Source(TypedDict):
    text: str
    section: str
    question: str
    course: str


class SearchResult(TypedDict):
    _index: str
    _id: str
    _score: int
    _source: Source

In [30]:
def elastic_search(
    query: str, size: int, course: Optional[str] = None
) -> list[SearchResult]:
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields",
                    }
                },
            }
        },
    }

    if course:
        search_query["query"]["bool"]["filter"] = {"term": {"course": course}}

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit)

    return result_docs

### Q2

In [8]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1006 [00:00<?, ?it/s]

### Q3

In [22]:
query = "How do I execute a command in a running docker container?"

In [36]:
elastic_search(
    query,
    size=5,
)

[{'_index': 'course-homework',
  '_id': 'uZh-NZAB9TstJc-P8aiC',
  '_score': 83.48419,
  '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'course-homework',
  '_id': 'xph-NZAB9TstJc-P6aYk',
  '_score': 74.8831,
  '_source': {'text': 'In case running pgcli  locally causes issues or you do not want to install it locally you can use it running in a Docker container instead.\nBelow the usage with values used in the videos of the course for:\nnetwork name (docker network)\npostgres related variables for pgcli\nHostname\nUsername\nPort\nData

### Q4

In [37]:
q4_result = elastic_search(
    query,
    size=3,
    course="machine-learning-zoomcamp",
)

pprint(q4_result)

[{'_id': 'uZh-NZAB9TstJc-P8aiC',
  '_index': 'course-homework',
  '_score': 83.48419,
  '_source': {'course': 'machine-learning-zoomcamp',
              'question': 'How do I debug a docker container?',
              'section': '5. Deploying Machine Learning Models',
              'text': 'Launch the container image in interactive mode and '
                      'overriding the entrypoint, so that it starts a bash '
                      'command.\n'
                      'docker run -it --entrypoint bash <image>\n'
                      'If the container is already running, execute a command '
                      'in the specific container:\n'
                      'docker ps (find the container-id)\n'
                      'docker exec -it <container-id> bash\n'
                      '(Marcos MJD)'}},
 {'_id': '2Jh-NZAB9TstJc-P8aj5',
  '_index': 'course-homework',
  '_score': 50.681328,
  '_source': {'course': 'machine-learning-zoomcamp',
              'question': 'How do I copy f

### Q5

In [35]:
context_template = """
Q: {question}
A: {text}
""".strip()

context = "\n\n".join(
    [
        context_template.format(
            question=result["_source"]["question"], text=result["_source"]["text"]
        )
        for result in q4_result
    ]
)

# pprint(context)

In [38]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

prompt = prompt_template.format(question=query, context=context)
len(prompt)

1462

### Q6

In [42]:
encoding = tiktoken.encoding_for_model("gpt-4o")

len(encoding.encode(prompt))

322

### Bonus

In [44]:
client = OpenAI()

response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[{"role": "user", "content": prompt}]
    )
    
pprint(response.choices[0].message.content)

('To execute a command in a running Docker container, first find the container '
 'ID using the command "docker ps", then use the command "docker exec -it '
 '<container-id> bash" to execute the desired command in the specific '
 'container.')
