**Connect to Elasticsearch**

In [1]:
from pprint import pprint 
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on localhost:9200.
es = Elasticsearch("http://localhost:9200")

# Calling info() makes a request to the cluster, so reaching the print below
# means the node answered.
client_info = es.info()

print("Successfully Connected to Elasticsearch!")

# .body is the plain response payload (cluster name, version, ...).
pprint(client_info.body)

Successfully Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'T1HeaWnRTOqX_BBgREVVbA',
 'name': '64c49e436740',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2025-10-21T10:06:21.288851013Z',
             'build_flavor': 'default',
             'build_hash': '25d88452371273dd27356c98598287b669a03eae',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.3.1',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.19.0',
             'number': '9.2.0'}}


**Preparing the index**

In [2]:
# Start from a clean slate so the cell can be re-run: drop the index if it
# exists (ignore_unavailable=True keeps a missing index from being an error).
es.indices.delete(index="my_index", ignore_unavailable=True)
# Recreate the index with a single explicitly mapped field. No `dims` is given
# for the dense_vector here; the mapping printed later in the notebook shows
# the dimension that ends up on the field.
es.indices.create(
    index="my_index",
    mappings={
        "properties": {
            'embedding': {"type": "dense_vector"}
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

**Embedding Model**

In [3]:
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-MiniLM-L6-v2" sentence-embedding model by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Display the model's module stack as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

**Select the device (CUDA vs. CPU)**

In [4]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

device(type='cpu')

**Move the model to the device**

In [5]:
# Place the model on the device chosen above and rebind the name to the result.
model = model.to(device)
# Display the model again as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

**Load Documents**

In [9]:
import json 

# Read the sample corpus. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding avoids depending on the platform
# default when decoding the JSON text.
with open("data/astronomy.json", encoding="utf-8") as documents_file:
    documents = json.load(documents_file)

# Peek at the first record to see the document schema.
documents[0]

{'id': 1,
 'title': 'The Solar System',
 'content': 'The Solar System consists of the Sun and the objects that orbit it, including eight planets, their moons, dwarf planets, and countless small bodies like asteroids and comets.'}

**Embed the documents**

In [10]:
from tqdm import tqdm

def get_embedding(text):
    """Encode ``text`` with the notebook-level ``model`` and return the result.

    Args:
        text: The input handed unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``text`` (presumably the
        embedding vector for the sentence -- confirm against the
        sentence-transformers documentation).
    """
    embedding = model.encode(text)
    return embedding

# Build the bulk payload as alternating entries: an action line followed by the
# document source that the action applies to.
operations = []

for document in tqdm(documents, total=len(documents)):
    operations.append({"index": {"_index": "my_index"}})  # action
    operations.append({
        **document,  # keep the original fields (id, title, content)
        "embedding": get_embedding(document["content"]),  # add the vector
    })

# bulk api
# refresh="wait_for" makes the call return only once the new documents are
# visible to search, so the following cells see them even when the notebook is
# executed top to bottom without pauses (Restart & Run All).
response = es.bulk(operations=operations, refresh="wait_for")

pprint(response.body)

# The Bulk API reports per-item failures in the response body instead of
# raising, so fail loudly here rather than searching a partially filled index.
if response["errors"]:
    raise RuntimeError(
        "Bulk indexing reported errors; inspect response['items'] for details."
    )

100%|██████████| 10/10 [00:01<00:00,  6.20it/s]


{'errors': False,
 'items': [{'index': {'_id': '23VDRpoBkw1BrtJkdqYo',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '3HVDRpoBkw1BrtJkdqYp',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '3XVDRpoBkw1BrtJkdqYp',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed': 0, 'successful': 1, '

We indexed all documents with an additional `embedding` field. Let's retrieve the documents to verify that the text was converted to a dense vector.

In [12]:
# Request every document and explicitly ask for the `embedding` field alongside
# the original fields so the stored vector can be inspected.
match_all_request = {
    "_source": ["id", "title", "content", "embedding"],
    "query": {"match_all": {}},
}
response = es.search(index="my_index", body=match_all_request)

pprint(response["hits"]["hits"])

[{'_id': '23VDRpoBkw1BrtJkdqYo',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'content': 'The Solar System consists of the Sun and the objects '
                         'that orbit it, including eight planets, their moons, '
                         'dwarf planets, and countless small bodies like '
                         'asteroids and comets.',
              'embedding': [0.040633388,
                            -0.0025618025,
                            0.05483473,
                            0.009171056,
                            0.03121994,
                            -0.014487917,
                            0.029695725,
                            0.016134942,
                            0.06336765,
                            0.04546153,
                            -0.0039837,
                            -0.054472603,
                            0.048416253,
                            -0.08368916,
                            0.07394563,
                          

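Awesome! We successfully inserted the documents with the additional embedding field. We did not set `dims` when creating the index, so Elasticsearch inferred it from the first vector it indexed. Now, let’s check the mapping to confirm that the dimension of the dense vector is 384, matching the model's embedding size.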

In [13]:
# Fetch the current mapping of the index; it includes the fields that were
# added dynamically while indexing as well as the final dense_vector settings.
response = es.indices.get_mapping(index="my_index")
pprint(response.body)

{'my_index': {'mappings': {'properties': {'content': {'fields': {'keyword': {'ignore_above': 256,
                                                                             'type': 'keyword'}},
                                                      'type': 'text'},
                                          'embedding': {'dims': 384,
                                                        'index': True,
                                                        'index_options': {'ef_construction': 100,
                                                                          'm': 16,
                                                                          'rescore_vector': {'oversample': 3.0},
                                                                          'type': 'bbq_hnsw'},
                                                        'similarity': 'cosine',
                                                        'type': 'dense_vector'},
                                          '

**KNN Search**

**Query #1**

In [19]:
query = "I heard space is getting bigger everyday in my physics class!"
# Embed the query with the same model used for the documents so both live in
# the same vector space.
embedded_query = get_embedding(query)

# kNN search: return the k=3 nearest documents, chosen from
# num_candidates=5 candidate neighbours.
response = es.search(
    index="my_index",
    knn={
        "field": "embedding",
        "query_vector": embedded_query,
        "num_candidates": 5,
        "k": 3,
    }
)

pprint(response["hits"]["hits"])

[{'_id': '43VDRpoBkw1BrtJkdqYp',
  '_index': 'my_index',
  '_score': 0.721279,
  '_source': {'content': 'The universe has been expanding since the Big Bang. '
                         'Observations of distant galaxies show that they are '
                         'moving away from us, which supports the idea of an '
                         'expanding universe.',
              'id': 9,
              'title': 'The Expanding Universe'}},
 {'_id': '3nVDRpoBkw1BrtJkdqYp',
  '_index': 'my_index',
  '_score': 0.6749128,
  '_source': {'content': 'The Big Bang Theory is the leading explanation about '
                         'how the universe began. It suggests that the '
                         'universe was once in an extremely hot and dense '
                         'state and has been expanding ever since.',
              'id': 4,
              'title': 'The Big Bang Theory'}},
 {'_id': '5HVDRpoBkw1BrtJkdqYp',
  '_index': 'my_index',
  '_score': 0.6579334,
  '_source': {'content': 'Spac

In [28]:
hits = response.body["hits"]["hits"]
divider = "*" * 100

# Print a short, human-readable summary per hit; "Match" is the raw `_score`
# multiplied by 100.
for hit in hits:
    source = hit["_source"]
    print(f"Title  : {source['title']}")
    print(f"Content: {source['content']}")
    print(f"Match  : {hit['_score'] * 100}%")
    print(divider)

Title  : The Expanding Universe
Content: The universe has been expanding since the Big Bang. Observations of distant galaxies show that they are moving away from us, which supports the idea of an expanding universe.
Match  : 72.1279%
****************************************************************************************************
Title  : The Big Bang Theory
Content: The Big Bang Theory is the leading explanation about how the universe began. It suggests that the universe was once in an extremely hot and dense state and has been expanding ever since.
Match  : 67.49128%
****************************************************************************************************
Title  : Space Exploration
Content: Space exploration involves the use of space technology to explore outer space. It includes missions to planets, moons, and other celestial bodies, aiming to discover more about the universe.
Match  : 65.79334%
**************************************************************************

**Query #2**

In [30]:
query = "How do we find exoplanets?"
# Embed the query with the same model used for the documents.
embedded_query = get_embedding(query)

# kNN search: return only the single nearest document (k=1), chosen from
# num_candidates=5 candidate neighbours.
response = es.search(
    index="my_index",
    knn={
        "field": "embedding",
        "query_vector": embedded_query,
        "num_candidates": 5,
        "k": 1,
    }
)

pprint(response["hits"]["hits"])

[{'_id': '33VDRpoBkw1BrtJkdqYp',
  '_index': 'my_index',
  '_score': 0.85617316,
  '_source': {'content': 'Exoplanets, or extrasolar planets, are planets that '
                         'exist outside our solar system. They vary greatly in '
                         'size and composition and are often found using '
                         'methods like the transit method and radial velocity.',
              'id': 5,
              'title': 'Exoplanets'}}]


In [31]:
hits = response.body["hits"]["hits"]
divider = "*" * 100

# Print a short, human-readable summary per hit; "Match" is the raw `_score`
# multiplied by 100.
for hit in hits:
    source = hit["_source"]
    print(f"Title  : {source['title']}")
    print(f"Content: {source['content']}")
    print(f"Match  : {hit['_score'] * 100}%")
    print(divider)

Title  : Exoplanets
Content: Exoplanets, or extrasolar planets, are planets that exist outside our solar system. They vary greatly in size and composition and are often found using methods like the transit method and radial velocity.
Match  : 85.617316%
****************************************************************************************************


We observe that, in both examples, the document with the highest score is the one most relevant to the query. Additionally, the other results returned by the k-nearest neighbors (k-NN) search are also relevant. To further refine the results, you can set a minimum-score threshold (for example, with the `similarity` option of the kNN search) so that only documents that meet the specified score are returned and unrelated documents are excluded.