**Connect to Elasticsearch**

In [3]:
from pprint import pprint 
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on http://localhost:9200.
es = Elasticsearch("http://localhost:9200")
# info() issues a request to the cluster, so reaching the next line means the
# node answered; the message below is only printed after that succeeds.
client_info = es.info()
print("Connected to Elasticsearch!")

# .body holds the response payload (cluster name, version details, ...).
pprint(client_info.body)

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'T1HeaWnRTOqX_BBgREVVbA',
 'name': '64c49e436740',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2025-10-21T10:06:21.288851013Z',
             'build_flavor': 'default',
             'build_hash': '25d88452371273dd27356c98598287b669a03eae',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.3.1',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.19.0',
             'number': '9.2.0'}}


**Create Index**

In [28]:
# Start from a clean slate so the cell can be re-run: drop "my_index" if it
# exists (ignore_unavailable=True avoids an error when it does not), then
# create it again with default settings and dynamic mappings.
es.indices.delete(index="my_index", ignore_unavailable=True)
es.indices.create(index="my_index")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

**Index documents**

Let's use the APOD dataset in this notebook.

In [29]:
import json 
from tqdm import tqdm 


# Read the dataset as UTF-8 explicitly: without an encoding argument, open()
# falls back to the platform's locale encoding, which can corrupt or reject
# non-ASCII text in the JSON file on some systems.
with open("data/apod.json", encoding="utf-8") as f:
    documents = json.load(f)

# Bulk payload: alternating action line / document source.
operations = []

# NOTE: the progress bar only tracks building the payload in memory; the
# actual indexing happens in the single es.bulk() call below.
for document in tqdm(documents, total=len(documents), desc="Indexing documents..."):
    # Derive a numeric "year" field from the "YYYY-MM-DD" date string so the
    # search results can later be collapsed on it.
    year = document["date"].split("-")[0]
    document["year"] = int(year)
    operations.append({"index": {"_index": "my_index"}})  # action
    operations.append(document)

response = es.bulk(operations=operations)

pprint(response.body)

Indexing documents...: 100%|██████████| 3333/3333 [00:00<00:00, 359290.02it/s]


{'errors': False,
 'items': [{'index': {'_id': 'AaucbpoBfo9L6JQ6LD5X',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'AqucbpoBfo9L6JQ6LD5Y',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'A6ucbpoBfo9L6JQ6LD5Y',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed': 0, 'successful': 1, '

If the indexing is successful, you should see response["errors"] as False.

In [22]:
# Top-level "errors" flag of the bulk response from the previous cell.
response["errors"]

False

**Collapse search results**

Without collapsing, the search results will return all documents that match the query.

In [37]:
# Baseline search: a plain match query on the title, with no "collapse"
# clause, so every matching document is eligible to be returned.
search_body = {
    "query": {"match": {"title": "Andromeda galaxy"}},
    "size": 10_000,
}
response_no_collapsing = es.search(index="my_index", body=search_body)

# Total number of matches reported by Elasticsearch.
hits_section = response_no_collapsing["hits"]
total_hits = hits_section["total"]["value"]
print(f"Total hits before collapsing: {total_hits}")

# Number of documents actually present in this response.
total_returned_hits = len(hits_section["hits"])
print(f"Total returned hits before collapsing: {total_returned_hits}")

Total hits before collapsing: 270
Total returned hits before collapsing: 270


In [38]:
from elastic_transport import ObjectApiResponse


def get_hits_per_year(response: "ObjectApiResponse | dict") -> dict:
    """Count the hits of a search response, grouped by their ``year`` field.

    Args:
        response: Any mapping-like object exposing ``response["hits"]["hits"]``
            as a sequence of hits, each carrying ``_source["year"]``. This is
            either a full search response or a plain ``inner_hits`` entry
            (a ``dict``), both of which are passed in by this notebook.

    Returns:
        A plain ``dict`` mapping each year to the number of hits for that
        year, with keys in order of first appearance. Empty when there are
        no hits.

    Raises:
        KeyError: If ``hits.hits`` is missing or a hit has no ``year`` in
            its ``_source``.
    """
    hits_per_year_count = {}
    for hit in response["hits"]["hits"]:
        year = hit["_source"]["year"]
        # dict.get supplies the starting count of 0 for a year seen for the
        # first time, replacing the explicit "initialise if missing" branch.
        hits_per_year_count[year] = hits_per_year_count.get(year, 0) + 1
    return hits_per_year_count

# Per-year breakdown of the hits returned by the search without collapsing.
print("Hits per year count:")
pprint(get_hits_per_year(response_no_collapsing))

Hits per year count:
{2015: 28,
 2016: 29,
 2017: 31,
 2018: 19,
 2019: 32,
 2020: 25,
 2021: 24,
 2022: 30,
 2023: 32,
 2024: 20}


Collapsing search results by year will return only one document per year that matches the query. That returned document will be the one with the highest `_score` for that year.

In [32]:
# Same match query as before, now with a "collapse" clause on "year" so that
# at most one hit per distinct year is returned.
collapse_body = {
    "query": {"match": {"title": "Andromeda galaxy"}},
    "collapse": {"field": "year"},
    "size": 10_000,
}
response_collapsing = es.search(index="my_index", body=collapse_body)

# hits.total still reports the number of matching documents, which is why the
# label below says "before collapsing".
collapsed_section = response_collapsing["hits"]
total_hits = collapsed_section["total"]["value"]
print(f"Total hits before collapsing: {total_hits}")

# Number of hits actually returned, i.e. one per collapsed group.
total_returned_hits = len(collapsed_section["hits"])
print(f"Total returned hits after collapsing: {total_returned_hits}")

Total hits before collapsing: 270
Total returned hits after collapsing: 10


In [33]:
# Per-year breakdown of the collapsed response.
print("Hits per year count:")
pprint(get_hits_per_year(response_collapsing))

Hits per year count:
{2015: 1,
 2016: 1,
 2017: 1,
 2018: 1,
 2019: 1,
 2020: 1,
 2021: 1,
 2022: 1,
 2023: 1,
 2024: 1}


Let's verify if the document in year 2024 is the one with the highest _score.

From the response with collapsing, we can see that the document in year 2024 has a _score of 7.789091.

In [35]:
# Walk the collapsed hits and show the first one whose year is 2024.
for hit in response_collapsing["hits"]["hits"]:
    year = hit["_source"]["year"]
    if year != 2024:
        continue

    score = hit["_score"]
    print(f"Document with a score of {score} for year {year}:")
    pprint(hit["_source"])
    # Only the first 2024 hit is printed, so stop scanning here.
    break

Document with a score of 7.789091 for year 2024:
{'authors': 'Subaru, Hubble, Mayall, R. Gendler, R. Croman\n',
 'date': '2024-09-08',
 'explanation': 'Explanation: The most distant object easily visible to the '
                'unaided eye is M31, the great Andromeda Galaxy. Even at some '
                'two and a half million light-years distant, this immense '
                'spiral galaxy -- spanning over 200,000 light years -- is '
                'visible, although as a faint, nebulous cloud in the '
                'constellation Andromeda. A bright yellow nucleus, dark '
                'winding dust lanes, and expansive spiral arms dotted with '
                'blue star clusters and red nebulae, are recorded in this '
                'stunning telescopic image which combines data from orbiting '
                'Hubble with ground-based images from Subaru and Mayall. In '
                'only about 5 billion years, the Andromeda galaxy may be even '
                'eas

And in the response without collapsing, we confirm that the first hit from 2024 has a _score of 7.789091, which is the same as the one in the response with collapsing.

In [39]:
# Print every 2024 hit from the non-collapsed response, in the order returned,
# so the first one can be compared with the collapsed result above.
for hit in response_no_collapsing["hits"]["hits"]:
    year = hit["_source"]["year"]
    if year != 2024:
        continue

    score = hit["_score"]
    print(f"Score {score}:")
    pprint(hit["_source"])
    print("-" * 50)

Score 7.789091:
{'authors': 'Subaru, Hubble, Mayall, R. Gendler, R. Croman\n',
 'date': '2024-09-08',
 'explanation': 'Explanation: The most distant object easily visible to the '
                'unaided eye is M31, the great Andromeda Galaxy. Even at some '
                'two and a half million light-years distant, this immense '
                'spiral galaxy -- spanning over 200,000 light years -- is '
                'visible, although as a faint, nebulous cloud in the '
                'constellation Andromeda. A bright yellow nucleus, dark '
                'winding dust lanes, and expansive spiral arms dotted with '
                'blue star clusters and red nebulae, are recorded in this '
                'stunning telescopic image which combines data from orbiting '
                'Hubble with ground-based images from Subaru and Mayall. In '
                'only about 5 billion years, the Andromeda galaxy may be even '
                'easier to see -- as it will likely s

**Expand collapsed results**

Expanding collapsed results allows you to retrieve more than one document per year that matches the query. You can also control how documents are sorted within each collapsed group, among other options.

In [43]:
# Collapse on "year" again, but ask for up to 3 documents of each group via
# "inner_hits". "most_recent" is only the label under which the expanded hits
# are returned; this request does not specify a sort for them.
collapse_clause = {
    "field": "year",
    "inner_hits": {"name": "most_recent", "size": 3},
}
response_collapsing = es.search(
    index="my_index",
    body={
        "query": {"match": {"title": "Andromeda galaxy"}},
        "size": 10_000,
        "collapse": collapse_clause,
    },
)

expanded_section = response_collapsing["hits"]

total_hits = expanded_section["total"]["value"]
print(f"Total hits before collapsing: {total_hits}")

total_returned_hits = len(expanded_section["hits"])
print(f"Total returned hits after collapsing: {total_returned_hits}")

# Expanded documents of the FIRST collapsed group only.
inner_hits = expanded_section["hits"][0]["inner_hits"]["most_recent"]
total_returned_hits_after_expanding = len(inner_hits["hits"]["hits"])
print(f"Total returned hits after expanding: {total_returned_hits_after_expanding}")

Total hits before collapsing: 270
Total returned hits after collapsing: 10
Total returned hits after expanding: 3


After expanding the collapsed results, we can see that we have more than one document per year that matches the query.

In [44]:
# inner_hits is the expanded group of the first collapsed hit only, so the
# breakdown below covers a single year.
print("Hits per year count:")
pprint(get_hits_per_year(inner_hits))

Hits per year count:
{2024: 3}


The documents are sorted by _score within each collapsed group. They also match the scores in the response without collapsing.

In [45]:
# Print the _score of each expanded hit, in the order they were returned.
for hit in inner_hits["hits"]["hits"]:
    score = hit["_score"]
    print(f"Score: {score}")

Score: 7.789091
Score: 3.0710938
Score: 2.792822


**Collapsing with search_after**

When collapsing on a field with a lot of unique values, you can use the search_after parameter to paginate through the results. This is useful when you want to retrieve all collapsed results without missing any.

Note: You can't use the `scroll` API with collapsing. Use `search_after` instead.

In [48]:
# Synthetic dataset with many distinct values of the collapse field:
# two documents for every user ID in [0, number_of_unique_user_ids).
number_of_unique_user_ids = 20_000

documents = [
    {
        "user_id": user_id,
        "title": f"Document {doc_id} for user {user_id}",
        "content": f"This is the content of document {doc_id} for user {user_id}.",
    }
    for user_id in range(number_of_unique_user_ids)
    for doc_id in range(2)
]

In [49]:
# Drop the index that held the APOD documents and create it again, so the
# synthetic documents are indexed into an empty "my_index".
es.indices.delete(index="my_index", ignore_unavailable=True)
es.indices.create(index="my_index")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [52]:
from tqdm import tqdm

# Bulk payload: each document is preceded by its "index" action line.
operations = []

for document in tqdm(documents, total=len(documents), desc="Indexing documents"):
    operations.extend(({"index": {"_index": "my_index"}}, document))

# Send everything in a single bulk request; the response is the cell output.
es.bulk(operations=operations)

Indexing documents: 100%|██████████| 40000/40000 [00:00<00:00, 451765.52it/s]


ObjectApiResponse({'errors': False, 'took': 2002, 'items': [{'index': {'_index': 'my_index', '_id': 'BqtDb5oBfo9L6JQ6MEuA', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'B6tDb5oBfo9L6JQ6MEuA', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'CKtDb5oBfo9L6JQ6MEuA', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'CatDb5oBfo9L6JQ6MEuA', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'CqtDb5oBfo9L6JQ6MEuA', '_version': 1, 'result': 'created', '_shards': {'tot

We indexed 40000 documents, so now we are ready to use search_after to paginate through the collapsed results. Since we have 2 documents per user, we can expect to have 20000 collapsed results.

In [53]:
# Ask Elasticsearch how many documents "my_index" currently holds.
document_count = es.count(index="my_index")
print(f"Total documents indexed: {document_count['count']}")

Total documents indexed: 40000


And we can see that the last user ID in the collapsed results is 19999 and the number of collapsed hits is 20000, which is what we expected.

In [62]:
# Accumulates the collapsed hits of every page.
collapsed_hits = []
# Pagination cursor: the user_id of the last hit on the previous page.
# None means "no page fetched yet".
search_after = None

while True:
    # Collapse on user_id and sort on the same field, so the last hit of a
    # page provides the cursor for the next one.
    body = {
        "query": {
            "match": {
                "content": "document"
            }
        },
        "collapse": {"field": "user_id"},
        "sort": ["user_id"],
        "size": 10_000,
    }

    # Compare against None rather than relying on truthiness: 0 is a valid
    # user_id, and a falsy check would drop that cursor and request the first
    # page again, looping forever if a page ever ends on user 0.
    if search_after is not None:
        body["search_after"] = [search_after]

    response = es.search(
        index="my_index",
        body=body
    )

    hits = response["hits"]["hits"]

    # An empty page means every collapsed group has been consumed.
    if not hits:
        break

    search_after = hits[-1]["_source"]["user_id"]

    print(f"Last user ID: {search_after}")
    collapsed_hits.extend(hits)

print(len(collapsed_hits))
pprint(collapsed_hits[0])

    

Last user ID: 9999
Last user ID: 19999
20000
{'_id': 'BqtDb5oBfo9L6JQ6MEuA',
 '_index': 'my_index',
 '_score': None,
 '_source': {'content': 'This is the content of document 0 for user 0.',
             'title': 'Document 0 for user 0',
             'user_id': 0},
 'fields': {'user_id': [0]},
 'sort': [0]}
