In [48]:
def search(query):
    """Return the top 5 data-engineering-zoomcamp FAQ entries matching ``query``.

    Delegates to the notebook-level ``index`` object, boosting matches in the
    question field and restricting results to a single course.

    Args:
        query: free-text question to search for.

    Returns:
        Whatever ``index.search`` returns for the request.
    """
    # The caller's query is used as-is; it was previously overwritten by a
    # hard-coded sample question, so every call searched for the same text.

    # NOTE(review): the boost key is 'sections', while the documents handled
    # elsewhere in this notebook expose a 'section' field. If the index has no
    # 'sections' field this boost has no effect; confirm and rename if so.
    boost = {'question': 3.0, 'sections': 0.5}

    return index.search(
        query=query,
        boost_dict=boost,
        num_results=5,
        filter_dict={'course': 'data-engineering-zoomcamp'},
    )

In [49]:
def build_prompt(q, results):
    """Render the teaching-assistant prompt for question ``q``.

    Each entry of ``results`` is read through its 'section', 'question' and
    'text' keys and appended to the CONTEXT part of the prompt.

    Returns the filled template with surrounding whitespace stripped.
    """
    # Template spelled out line by line; the leading spaces on the
    # continuation lines are part of the prompt text.
    template = (
        "You are a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. Use only the facts from the context\n"
        "    when answering the question.\n"
        "    IF the CONTEXT doesn't contain the answer, output NONE\n"
        "    \n"
        "    QUESTION: {question}\n"
        "    \n"
        "    CONTEXT:\n"
        "    {context}"
    )

    # One "section / question / answer" paragraph per retrieved document.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in results
    ]

    rendered = template.format(question=q, context="".join(entries))
    return rendered.strip()

In [50]:

def llm(prompt):
    """Send ``prompt`` as a single user message to gpt-4o and return the reply text.

    Uses the notebook-level ``client`` and returns the content of the first
    choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [74]:
query = "How do I run kafka?"

def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    Chains the notebook's ``elastic_search``, ``build_prompt`` and ``llm``
    functions and returns whatever ``llm`` returns.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [75]:
# Run the full pipeline (search -> prompt -> LLM) for the sample `query`.
rag(query)

'To run Kafka, here are the relevant instructions based on the context provided:\n\nIn the project directory, you need to run:\n```shell\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nReplace `<jar_name>` with the actual name of your jar file.'

In [76]:
# Same pipeline, this time with an enrollment question passed inline.
rag("the course has already started, can I still enroll")

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [54]:
from elasticsearch import Elasticsearch

In [55]:
# Client for the Elasticsearch node at localhost:9200 (plain HTTP, no credentials passed).
es_client = Elasticsearch('http://localhost:9200')

In [56]:
# Fetch the node/cluster metadata as a quick check that the server is reachable.
es_client.info()

{'name': '65e463efa695',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'TAYV6CgFRxynhQyrYAHyNw',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [57]:
# Index layout: one primary shard, no replicas.
# `course` is mapped as a keyword so it can be matched exactly by term filters;
# the other three fields are full-text searchable.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed;
# otherwise create() fails once the index already exists.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'course-questions'}

In [59]:
from tqdm.auto import tqdm

# Index every FAQ entry one request at a time; tqdm (imported above but
# previously unused) reports progress while the loop runs.
for doc in tqdm(documents):
    es_client.index(index=index_name, body=doc)

In [60]:
# Sample question used to try out the search; the missing word "course" is restored.
query = "I just discovered the course. Can I still join it ?"

In [73]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the FAQ index for ``query`` and return the matching documents.

    Runs a ``multi_match`` query over the question, text and section fields
    (question weighted x3) against the notebook-level ``index_name`` using
    ``es_client``, restricted to one course through a term filter.

    Args:
        query: free-text question to search for.
        course: value the ``course`` field must equal. Defaults to the
            previously hard-coded "data-engineering-zoomcamp".
        size: maximum number of hits to return. Defaults to the previously
            hard-coded 5.

    Returns:
        A list with the ``_source`` dict of every hit, in the order returned.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(body=search_query, index=index_name)
    # Keep only the stored documents; scores and metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

## HOMEWORK 1

In [77]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait and fail loudly on an HTTP error instead of trying to parse
# an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list, tagging each entry with its
# course name. A copy is built so documents_raw stays as downloaded and the
# cell gives the same result when re-run.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        documents.append({**doc, 'course': course_name})

In [78]:
# Index layout for the homework: one primary shard, no replicas.
# `course` is mapped as a keyword so it can be matched exactly by term filters;
# the other three fields are full-text searchable.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "hw-questions"

# Drop any index left over from a previous run so this cell can be re-executed;
# otherwise create() fails once the index already exists.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'hw-questions'}

In [79]:
# Send every flattened FAQ entry to the index named by `index_name`, one request per document.
# NOTE(review): no explicit document id is passed, so re-running this cell
# presumably adds duplicates rather than overwriting; confirm before re-executing.
for doc in documents:
    es_client.index(index=index_name, body=doc)

In [97]:
query = "How do I execute a command in a running docker container?"
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Search the FAQ index for ``query`` and return the matching documents.

    Runs a ``multi_match`` query over the question and text fields (question
    weighted x4) against the notebook-level ``index_name`` using
    ``es_client``, restricted to one course through a term filter.

    NOTE(review): this redefines the ``elastic_search`` declared earlier in the
    notebook with different fields, boost and defaults; callers such as
    ``rag`` pick up this version once the cell has run.

    Args:
        query: free-text question to search for.
        course: value the ``course`` field must equal. Defaults to the
            previously hard-coded "machine-learning-zoomcamp".
        size: maximum number of hits to return. Defaults to the previously
            hard-coded 3.

    Returns:
        A list with the ``_source`` dict of every hit, in the order returned.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(body=search_query, index=index_name)
    # Keep only the stored documents; scores and metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [110]:
# Retrieve the hits for `query` with the currently defined elastic_search.
results = elastic_search(query)

In [111]:
# Dump the raw hit dictionaries for inspection.
print(results)

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}, {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\nHrithik Kumar Advani", 'section': '5. Deploying Machine Learning Models', 'question': 'How do I copy files from my local machine to docker container?', 'course': 'm

In [112]:
# Question inserted into the prompt; same text as the `query` used for retrieval.
question = "How do I execute a command in a running docker container?"

In [118]:
def build_context(question, results):
    """Format search hits as a block of Q/A pairs for the prompt context.

    Each entry of ``results`` contributes its 'question' and 'text' values;
    entries are separated by a blank line and the whole block is stripped.
    ``question`` is accepted but not used by the body.
    """
    # The four spaces before "A:" are part of the rendered text.
    entry_format = "Q: {question}\n    A: {text}"

    rendered = [
        entry_format.format(question=hit['question'], text=hit['text'])
        for hit in results
    ]
    return "\n\n".join(rendered).strip()

In [119]:
def build_prompt(question, context):
    """Fill the teaching-assistant prompt with ``question`` and ``context``.

    ``context`` is inserted verbatim after the CONTEXT header; the result is
    returned with surrounding whitespace stripped.

    NOTE(review): this redefinition shadows the earlier
    ``build_prompt(q, results)`` in this notebook, while ``rag`` still passes
    a list of search results as the second argument; confirm which is intended.
    """
    # Template spelled out line by line; the leading spaces on the
    # continuation lines are part of the prompt text.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "    Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "        \n"
        "    QUESTION: {question}\n"
        "        \n"
        "    CONTEXT:\n"
        "    {context}"
    )
    filled = template.format(question=question, context=context)
    return filled.strip()


In [120]:
# Assemble the final prompt: format the hits as Q/A context, then fill the prompt template.
prompt = build_prompt(question, build_context(question, results))

In [121]:
# Inspect the rendered prompt text.
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
        
    QUESTION: How do I execute a command in a running docker container?
        
    CONTEXT:
    Q: How do I debug a docker container?
    A: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.
docker run -it --entrypoint bash <image>
If the container is already running, execute a command in the specific container:
docker ps (find the container-id)
docker exec -it <container-id> bash
(Marcos MJD)

Q: How do I copy files from my local machine to docker container?
    A: You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:
To copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:
docker cp /path/

In [122]:
# Prompt length in characters (not tokens).
len(prompt)

1506

In [124]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.5.15-cp39-cp39-macosx_10_9_x86_64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading tiktoken-0.7.0-cp39-cp39-macosx_10_9_x86_64.whl (961 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m961.8/961.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading regex-2024.5.15-cp39-cp39-macosx_10_9_x86_64.whl (281 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.7/281.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m[31m6.4 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: regex, tiktoken
Successfully installed regex-2024.5.15 tiktoken-0.7.0


In [127]:
import tiktoken
# Look up the tokenizer tiktoken associates with the "gpt-4o" model name.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [128]:
# Count how many tokens the prompt encodes to with the selected encoding.
num_tokens = len(encoding.encode(prompt))
print(num_tokens)

331
