<!-- +----------------+-----------------------------+--------------------------+
| Tool           | Description                 | Best Use Case            |
+================+=============================+==========================+
| Anaconda       | Full Python data distro     | Beginner-friendly setup  |
+----------------+-----------------------------+--------------------------+
| Miniconda      | Minimal Conda installer     | Custom, lightweight envs |
+----------------+-----------------------------+--------------------------+
| Micromamba     | Fast Conda alternative      | CI/CD, containers        |
+----------------+-----------------------------+--------------------------+ -->

| Tool        | Description                                           | Best Use Case             | Size (initial)        | Download Link                                                                 |
|-------------|-------------------------------------------------------|---------------------------|------------------------|--------------------------------------------------------------------------------|
| Anaconda    | Full Python data distro                               | Beginner-friendly setup   | 🚛 Huge (~3–5 GB)      | [Archive 📦](https://repo.anaconda.com/archive/)                               |
| Miniconda   | Minimal Conda installer (Python + conda)              | Custom, lightweight envs  | 📦 Small (~70 MB)      | [Miniconda Releases 🌐](https://repo.anaconda.com/miniconda/)                 |
| Micromamba  | Fast, lightweight Conda-compatible CLI (C++ binary)   | CI/CD, containers         | 🪶 Tiny (~2 MB binary) | [Micromamba GitHub 🚀](https://github.com/mamba-org/micromamba-releases/tags) |



In [None]:
# !conda init

## API keys: OpenAI and Hugging Face

- [platform.openai.com](https://platform.openai.com/docs/overview)
- [huggingface.co](https://huggingface.co/)

In [1]:
# !pip install tqdm notebook jupyter ipywidgets

In [2]:
# !pip install openai transformers tensorflow tf-keras elasticsearch

In [3]:
# !pip install huggingface_hub python-dotenv

In [4]:
# import getpass

# Securely prompt user for Hugging Face token (input will be hidden)
# token = getpass.getpass("🔐 Enter your Hugging Face token: ").strip()

# Validate token is not empty
# if not token:
#     raise ValueError("⚠️ Token input was empty. Please enter a valid token.")

In [5]:
import os
from dotenv import load_dotenv, find_dotenv

# Load the variables defined in the .env file.
load_dotenv()

# Look up the Hugging Face access token; `token` is None when the
# HUGGINGFACE_TOKEN variable is not set.
token = os.environ.get("HUGGINGFACE_TOKEN")

In [6]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token read from the environment.
# NOTE(review): `token` is None when HUGGINGFACE_TOKEN is unset — confirm how
# login() behaves in that case before relying on Restart & Run All.
login(token=token)

# print("✅ Successfully logged in to Hugging Face Hub.")

In [7]:
import requests 

# Location of the course FAQ data (a JSON list of courses, each carrying its
# own list of FAQ documents).
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting them surface
# as a confusing JSON decoding failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten into a single list of documents, tagging each one with the name of
# the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# Display the first flattened record as a sanity check.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
# from transformers import pipeline

# pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta")
# messages = [
#     # {"role": "user", "content": "Who are you?"},
#     documents[0]
# ]
# pipe(messages)

# print(result)

In [9]:
from transformers import pipeline

# Build a question-answering pipeline with the deepset/roberta-base-squad2 model.
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Pass the inputs through the `question` / `context` keyword arguments rather
# than as a positional dict.
# NOTE(review): recent transformers releases are believed to warn on
# positional inputs — confirm against the installed version.
result = qa(
    question=documents[0]['question'],
    context=documents[0]['text'],
)

# The pipeline result is a mapping; show only the extracted answer text.
print(result["answer"])


2025-07-01 14:01:46.803530: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-01 14:01:46.823176: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-01 14:01:47.145312: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-01 14:01:47.271588: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751378507.586537   42325 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751378507.67

15th Jan 2024


In [None]:
from elasticsearch import Elasticsearch
# Create a client pointed at the Elasticsearch instance on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Request basic server information; the response is shown as the cell output.
es_client.info()

In [None]:
# Index definition: one primary shard, no replicas, and an explicit mapping
# for every FAQ field. `course` is mapped as a keyword so it can be used in
# exact-match term filters; the other fields are full-text.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed.
# Without this, a second run fails because the index already exists, and the
# indexing cell further down would keep adding duplicate documents.
# ignore_unavailable=True makes the call a no-op when the index is absent.
# WARNING: this deletes all documents currently stored in `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings)

In [None]:
from tqdm.auto import tqdm 

In [None]:
# Store every FAQ document in the index, one index() call per document,
# with tqdm reporting progress over the list.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

In [None]:
# Free-text question to look up in the FAQ index.
query = "How do execute a command on a Kubernetes pod?"

# Match the query against the question and the answer text; `question^4`
# applies a boost of 4 to the question field.
multi_match_clause = {
    "multi_match": {
        "query": query,
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
}

# Ask for at most 5 hits.
search_query = {
    "size": 5,
    "query": {"bool": {"must": multi_match_clause}},
}

search_results = es_client.search(index=index_name, body=search_query)

In [None]:
# Relevance score of the first hit in the most recent search results.
search_results['hits']['hits'][0]['_score']

In [None]:
# Free-text question to look up in the FAQ index.
query = "How do copy a file to a Docker container?"

# Match the query against the question and the answer text; `question^4`
# applies a boost of 4 to the question field.
multi_match_clause = {
    "multi_match": {
        "query": query,
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
}

# Keep only documents whose `course` value is exactly this course name.
course_filter = {
    "term": {
        "course": "machine-learning-zoomcamp",
    }
}

# Ask for at most 3 hits.
search_query = {
    "size": 3,
    "query": {
        "bool": {
            "must": multi_match_clause,
            "filter": course_filter,
        }
    },
}

search_results = es_client.search(index=index_name, body=search_query)

In [None]:
# List of hits returned by the most recent search.
search_results['hits']['hits']

In [None]:
# Snippet for a single FAQ entry: the question on one line, its answer on the next.
context_template = "Q: {question}\nA: {text}"

# Full prompt: instructions first, then the user's question, then the
# retrieved FAQ context, with a blank line separating the sections.
prompt_template = "\n".join([
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
    "Use only the facts from the CONTEXT when answering the QUESTION.",
    "",
    "QUESTION: {question}",
    "",
    "CONTEXT:",
    "{context}",
])

In [None]:
# Render the stored document of every search hit through the Q/A template.
context_pieces = [
    context_template.format(**hit['_source'])
    for hit in search_results['hits']['hits']
]

# Separate the rendered entries with a blank line.
context = '\n\n'.join(context_pieces)

In [None]:
# Fill the prompt template with the current query and the assembled FAQ context.
prompt = prompt_template.format(question=query, context=context)

In [None]:
# Prompt length in characters.
len(prompt)

In [None]:
import tiktoken

In [None]:
# Preview the first 100 characters of the prompt.
print(prompt[:100])

In [None]:
# Tokenizer encoding that tiktoken associates with the "gpt-4o" model name.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [None]:
# Prompt length in tokens under this encoding.
len(encoding.encode(prompt))

In [None]:
# Keep the first 10 tokens of the encoded prompt and display them.
tokens = encoding.encode(prompt)[:10]
tokens

In [None]:
# Decode the token at index 5 of `tokens` back to its raw bytes.
encoding.decode_single_token_bytes(tokens[5])