In [2]:
import os

# Test OpenAI

In [1]:
import openai

In [2]:
from openai import OpenAI

In [3]:
# Prefer the variable name this notebook originally used, then fall back to
# OPENAI_API_KEY. Hyphens are not valid in POSIX shell variable names, so
# 'OPENAI-API-KEYS' cannot be set with a plain `export`; OPENAI_API_KEY is the
# name the OpenAI client itself looks for by default.
client = OpenAI(
    api_key=os.environ.get('OPENAI-API-KEYS') or os.environ['OPENAI_API_KEY']
)

In [13]:
# Smoke test: send one user message to gpt-4o without any retrieved context.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": "is it too late to join the course",
        },
    ],
)

In [None]:
response

# Test Mistral

In [8]:
# !pip install mistralai

In [3]:
import os
from mistralai import Mistral

In [5]:
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-large-latest"

In [6]:
client = Mistral(api_key=api_key)

In [21]:
# Smoke test for the Mistral client: one user message, no retrieved context.
chat_response = client.chat.complete(
    model=model,
    messages=[{"role": "user", "content": "is it too late to join the course?"}],
)
# Only the text of the first choice is of interest here.
print(chat_response.choices[0].message.content)

Whether it's too late to join a course depends on several factors, including the specific course, the institution or platform offering it, and their policies. Here are some steps you can take to determine if it's too late:

1. **Check the Course Website or Platform**: Look for information on enrollment deadlines, late registration policies, or any notes about joining the course after it has started.

2. **Contact the Instructor or Administrator**: Reach out to the course instructor or the administrative office of the institution. They can provide the most accurate and up-to-date information.

3. **Review the Syllabus or Course Outline**: If available, check the syllabus or course outline for any mention of late enrollment policies.

4. **Look for Online Forums or Communities**: Sometimes, other students or participants may have asked similar questions in online forums, social media groups, or discussion boards related to the course.

5. **Check for Self-Paced Options**: If the course i

# RAG Intro

In [2]:
# !pip install minsearch

In [3]:
import minsearch
import json

## Getting data

In [6]:
# !pip install requests

In [7]:
import requests 

# Raw FAQ export; `?raw=1` makes GitHub serve the JSON file itself rather
# than the HTML page around it.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Without a timeout a stalled connection would block the cell indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [8]:
# Flatten the per-course structure into a single list, tagging every FAQ
# entry with the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Append a copy rather than writing into the entry, so documents_raw
        # stays exactly as downloaded and this cell is safe to re-run.
        documents.append({**doc, 'course': course_name})

In [9]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## minsearch

In [13]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [17]:
# ??minsearch.Index

In [19]:
index.fit(documents)

<minsearch.minsearch.Index at 0x71e4f05226f0>

In [22]:
query = 'the course has already started, can I still enroll?'

In [21]:
boost = {'question': 3.0, 'section': 0.5}

In [25]:
search_results = index.search(
    query=query,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5
)
search_results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [27]:
# Render every search hit as a "section / question / answer" paragraph,
# with a blank line between consecutive hits.
context = ""
for doc in search_results:
    context += (
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['text']}\n\n"
    )

context

"section: General course-related questions\nquestion: Course - Can I still join the course after the start date?\nanswer: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nsection: General course-related questions\nquestion: Course - Can I follow the course after it finishes?\nanswer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.\n\nsection: General course-related questions\nquestion: Course - When will the course start?\nanswer: The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course 

In [28]:
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start wit

## Function

In [10]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for ``query`` in the notebook-level minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            the course that was previously hard-coded here.
        num_results: Maximum number of entries to request from the index.

    Returns:
        The value returned by ``index.search``; in this notebook's recorded
        output that is a list of document dicts.
    """
    # Boost values handed to index.search for the question and section fields.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [11]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each becomes one paragraph under ``CONTEXT:``.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" is visible.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    rendered_entries = "".join(
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    )

    filled = template.format(question=query, context=rendered_entries)
    return filled.strip()

In [12]:
def llm(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Relies on the notebook-level ``client`` and ``model``.
    NOTE(review): ``client`` is assigned twice in this notebook (OpenAI, then
    Mistral); ``chat.complete`` is the call used with the Mistral client.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.complete(model=model, messages=[user_message])

    # Only the first choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

In [13]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query``: retrieve with ``search``, build the prompt, ask the LLM."""
    hits = search(query)
    return llm(build_prompt(query, hits))

In [15]:
rag(query)

In [51]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. Even if you don't register, you're still eligible to submit the homeworks."

## ElasticSearch

In [54]:
# !pip freeze 

In [20]:
from elasticsearch import Elasticsearch

In [21]:
es_client = Elasticsearch('http://localhost:9200') 

In [22]:
es_client

<Elasticsearch(['http://localhost:9200'])>

In [23]:
index_name = "course-questions"

# Index definition: one shard, no replicas. The three free-text fields are
# mapped as "text"; "course" is mapped as "keyword" and is the field the
# search functions in this notebook filter on with a term query.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

In [24]:
# Creating an index that already exists raises an error, which breaks
# "Restart & Run All". Drop any index left over from a previous run first;
# ignore_unavailable makes the delete a no-op when there is nothing to drop.
# The documents are re-indexed by the loop further down.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [29]:
# !pip install tqdm

In [25]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████| 948/948 [00:03<00:00, 247.61it/s]


In [27]:
query = 'I just disovered the course. Can I still join it?'

In [28]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the Elasticsearch FAQ index and return the matching documents.

    Args:
        query: Free-text question to match against the indexed fields.
        course: Exact value the ``course`` field must have. Defaults to the
            course that was previously hard-coded here.
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [29]:
def rag(query):
    """Answer ``query`` using ``elastic_search`` for retrieval, then the LLM."""
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [30]:
rag(query)

"Yes, you can still join the course even after the start date. You're eligible to submit the homeworks even if you don't register. However, be aware that there will be deadlines for turning in the final projects, so it's important not to leave everything for the last minute."

# Homework-1

## Q1

In [30]:
!curl localhost:9200

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
{
  "name" : "cf4119d995bf",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "TRSFiVgXSpGbFu0wTzwMEg",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


## Q2

In [None]:
# es_client.index(index=index_name, document=doc)

## Q3

In [31]:
query = 'How do execute a command on a Kubernetes pod?'

In [32]:
def elastic_search(query):
    """Homework Q3 variant: top 4 hits over ``question`` and ``text``, all courses.

    Compared with the first ``elastic_search`` in this notebook, the
    ``section`` field and the course filter are left out.

    NOTE: this redefinition shadows that earlier function and returns a tuple,
    whereas ``rag`` passes the result of ``elastic_search`` straight to
    ``build_prompt``.

    Returns:
        ``(response, result_docs)``: the raw search response and the list of
        ``_source`` dicts taken from its hits.
    """
    multi_match = {
        "query": query,
        # "^3" boosts matches in the question field.
        "fields": ["question^3", "text"],
        "type": "best_fields",
    }
    search_query = {
        "size": 4,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    response = es_client.search(index=index_name, body=search_query)
    result_docs = [hit['_source'] for hit in response['hits']['hits']]

    return response, result_docs

In [33]:
response, result_docs = elastic_search(query)

In [34]:
response['hits']['hits'][0]['_score']

33.379173

## Q4

In [35]:
query = 'How do copy a file to a Docker container?'

In [36]:
def elastic_search(query):
    """Homework Q4 variant: top 3 hits restricted to machine-learning-zoomcamp.

    Matches ``query`` against ``question`` (boosted with "^3") and ``text``;
    the ``section`` field is not searched.

    Returns:
        A list with the ``_source`` dict of every hit.
    """
    course_filter = {"term": {"course": "machine-learning-zoomcamp"}}
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text"],
            "type": "best_fields",
        }
    }
    search_query = {
        "size": 3,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [37]:
result_docs = elastic_search(query)
result_docs[2]['question']

'How do I copy files from a different folder into docker container’s working directory?'

## Q5

In [38]:
query = 'How do I execute a command in a running docker container?'

In [39]:
res_prompt = build_prompt(query, result_docs)

In [40]:
res_prompt

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: How do I execute a command in a running docker container?\n\nCONTEXT: \nsection: 5. Deploying Machine Learning Models\nquestion: How do I debug a docker container?\nanswer: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)\n\nsection: 5. Deploying Machine Learning Models\nquestion: How do I copy files from my local machine to docker container?\nanswer: You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nTo copy a file or directory from your local machine into a running Docke

In [41]:
len(res_prompt)

1637

## Q6

In [61]:
# !pip install tiktoken

In [43]:
import tiktoken

In [44]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [49]:
len(encoding.encode(res_prompt))

356

### Error with Mistral Tokenizer

In [70]:
# NOTE(review): in the recorded run this cell raised TokenizerException
# ("Unrecognized tokenizer file ... mistral_instruct_tokenizer_240323.model.v3"),
# so no Mistral token count was produced. Presumably an issue with the
# installed mistral_common version or its bundled data files; TODO confirm.
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
tokenizer = MistralTokenizer.v3()

TokenizerException: Unrecognized tokenizer file: /usr/local/python/3.12.1/lib/python3.12/site-packages/mistral_common/data/mistral_instruct_tokenizer_240323.model.v3