In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
from elasticsearch import Elasticsearch

In [3]:
es = Elasticsearch('http://127.0.0.1:9200/')
es.info()


ObjectApiResponse({'name': '66fdaa2378e5', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'YvL_dWOXSmKijcFbmfCdwQ', 'version': {'number': '8.17.6', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'dbcbbbd0bc4924cfeb28929dc05d82d662c527b7', 'build_date': '2025-04-30T14:07:12.231372970Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
query = {
    "query": {
        "multi_match": {
            "query": "How do execute a command on a Kubernetes pod?",
            "fields": ["question^4", "text"],
            "type": "best_fields"
        }
    }
}

response = es.search(index="faq", body=query)

top_hit = response["hits"]["hits"][0]
print("Top Question:", top_hit["_source"]["question"])
print("Score:", round(top_hit["_score"], 2))


Top Question: How do I debug a docker container?
Score: 44.51


In [5]:
{
  "size": 3,
  "query": {
    "bool": {
      "must": {
        "multi_match": {
          "query": "How do copy a file to a Docker container?",
          "fields": ["question^4", "text"],
          "type": "best_fields"
        }
      },
      "filter": {
        "term": {
          "course.keyword": "machine-learning-zoomcamp"
        }
      }
    }
  }
}


{'size': 3,
 'query': {'bool': {'must': {'multi_match': {'query': 'How do copy a file to a Docker container?',
     'fields': ['question^4', 'text'],
     'type': 'best_fields'}},
   'filter': {'term': {'course.keyword': 'machine-learning-zoomcamp'}}}}}

In [6]:
from elasticsearch import Elasticsearch

# Connect to local ElasticSearch
es = Elasticsearch("http://localhost:9200")

# Your index name (use your actual index)
index_name = "faq"

# Construct query
query = {
    "size": 3,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": "How do copy a file to a Docker container?",
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course.keyword": "machine-learning-zoomcamp"
                }
            }
        }
    }
}

# Run the search
response = es.search(index=index_name, body=query)

# Extract the top 3 questions
questions = [hit['_source']['question'] for hit in response['hits']['hits']]

print("Top 3 questions:")
for i, q in enumerate(questions, start=1):
    print(f"{i}. {q}")

# To get the 3rd question:
print("\n3rd question:", questions[2] if len(questions) >= 3 else "Less than 3 results")


Top 3 questions:

3rd question: Less than 3 results


In [14]:
import requests

docs_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json'
res = requests.get(docs_url)
docs_raw = res.json()

# Check how many top-level courses
print("Total Courses:", len(docs_raw))

# List all course names
for course in docs_raw:
    print(course['course'], "->", len(course['documents']), "docs")


Total Courses: 3
data-engineering-zoomcamp -> 435 docs
machine-learning-zoomcamp -> 375 docs
mlops-zoomcamp -> 138 docs


In [15]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

# Delete index if it exists
if es.indices.exists(index="faq"):
    es.indices.delete(index="faq")

# Define schema
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

# Recreate index
es.indices.create(index="faq", body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'faq'})

In [16]:
from tqdm.auto import tqdm

# Now index all 900+ docs
for doc in tqdm(documents):
    es.index(index="faq", document=doc)


  0%|          | 0/948 [00:00<?, ?it/s]

In [17]:
from elasticsearch.helpers import scan

docs_ml = list(scan(es, index="faq", query={
    "query": {
        "term": {
            "course.keyword": "machine-learning-zoomcamp"
        }
    }
}))

print("ML Zoomcamp Docs (in Elasticsearch):", len(docs_ml))


ML Zoomcamp Docs (in Elasticsearch): 0
