# Part 2: Query Processing & Query Rewrite demos
Show how Elasticsearch rewrites queries/aggregations using real radiology text and DICOM metadata.


In [1]:
# import libraries
import os
import sys

import numpy as np
import pandas as pd
import pydicom
from tqdm import tqdm

import matplotlib.pyplot as plt

from elasticsearch import Elasticsearch, helpers
from pprint import pprint

In [2]:
# import current working dir to sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))


# declare constants
ES_HOST = "http://localhost:9200"

TEXT_INDEX = "radiology_text"
VECTOR_INDEX = "radiology_vector"

TEXT_DATA_PATH = "../../data/text/Radiologists Report.xlsx"
IMAGE_DATA_DIR = "../../data/images/"

In [3]:
# connect to Elasticsearch
es = Elasticsearch(ES_HOST)
if es.ping():
    print("Connected to Elasticsearch!")

Connected to Elasticsearch!


## 1. Setups

In [4]:
# import current working dir to sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))


# declare constants
ES_HOST = "http://localhost:9200"

TEXT_INDEX = "radiology_text"
DOC_INDEX = "radiology_doc"
VECTOR_INDEX = "radiology_vector"

TEXT_DATA_PATH = "../../data/text/Radiologists Report.xlsx"
IMAGE_DATA_DIR = "../../data/images/"

In [5]:
# connect to Elasticsearch
es = Elasticsearch(ES_HOST)
if es.ping():
    print("Connected to Elasticsearch!")

Connected to Elasticsearch!


In [6]:
# ensure radiology_text exists with 2 primary shards and 1 replica
def ensure_text_index():
    if es.indices.exists(index=TEXT_INDEX):
        print(f'Recreating {TEXT_INDEX} to enforce shard/replica settings')
        es.indices.delete(index=TEXT_INDEX)

    settings = {
        'settings': {
            'number_of_shards': 2,
            'number_of_replicas': 1
        },
        "mappings": {
            "properties": {
                "patient_id": {
                    "type": "keyword",
                    "null_value": "NULL_PATIENT"
                },
                "clinician_notes": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "null_value": "NULL_NOTE"
                        }
                    }
                }
            }
        }
    }
    es.indices.create(index=TEXT_INDEX, body=settings)
    print(f'Created {TEXT_INDEX} with 2 shards / 1 replica')

    df = pd.read_excel(TEXT_DATA_PATH)
    actions = []
    for _, row in df.iterrows():
        actions.append({
            '_index': TEXT_INDEX,
            '_id': str(row['Patient ID']),
            '_source': {
                'patient_id': row['Patient ID'],
                'clinician_notes': row["Clinician's Notes"]
            }
        })
    try:
        helpers.bulk(es, actions)
    except helpers.BulkIndexError as e:
        print(e.errors[:2])  # inspect the reason

    print(f'Indexed {len(actions)} text docs')

ensure_text_index()


Recreating radiology_text to enforce shard/replica settings
Created radiology_text with 2 shards / 1 replica
[{'index': {'_index': 'radiology_text', '_id': '18', 'status': 400, 'error': {'type': 'document_parsing_exception', 'reason': '[1:18] failed to parse: [1:39] Non-standard token \'NaN\': enable `JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS` to allow\n at [Source: (byte[])"{"patient_id":18,"clinician_notes":NaN}"; line: 1, column: 39]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': '[1:39] Non-standard token \'NaN\': enable `JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS` to allow\n at [Source: (byte[])"{"patient_id":18,"clinician_notes":NaN}"; line: 1, column: 39]', 'caused_by': {'type': 'json_parse_exception', 'reason': 'Non-standard token \'NaN\': enable `JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS` to allow\n at [Source: (byte[])"{"patient_id":18,"clinician_notes":NaN}"; line: 1, column: 39]'}}}, 'data': {'patient_id': 18, 'clinician_notes': nan}}}, {'index': {'_index': '

In [15]:
# ensure DICOM metadata index exists with up to 1000 docs for range/BKD demo
from itertools import islice
from helpers.ima_loader import load_dicom, get_metadata

def iter_dicom_paths(root):
    for dirpath, _, files in os.walk(root):
        for f in files:
            if f.lower().endswith((".ima", ".dcm")):
                yield os.path.join(dirpath, f)

def ensure_dicom_index(limit=50000):
    if es.indices.exists(index=DOC_INDEX):
        count = es.count(index=DOC_INDEX)['count']
        if count >= limit:
            print(f'{DOC_INDEX} already has {count} docs')
            return
        es.indices.delete(index=DOC_INDEX)

    mapping = {
        'mappings': {
            'properties': {
                'doc_id': {'type': 'keyword'},
                'patient_id': {'type': 'keyword'},
                'series_description': {
                    'type': 'text',
                    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}
                },
                'slice_thickness': {'type': 'double'},
                'spacing_between_slices': {'type': 'double'},
                'rows': {'type': 'integer'},
                'columns': {'type': 'integer'}
            }
        }
    }
    es.indices.create(index=DOC_INDEX, body=mapping)
    print(f'Created {DOC_INDEX}')

    sample_paths = list(islice(iter_dicom_paths(IMAGE_DATA_DIR), limit))
    docs = []
    for path in sample_paths:
        dicom = load_dicom(path)
        meta = get_metadata(dicom)
        doc = {
            'doc_id': f"{meta.get('PatientID', 'unknown')}_{os.path.basename(path)}",
            'patient_id': meta.get('PatientID'),
            'series_description': meta.get('SeriesDescription'),
            'slice_thickness': float(meta.get('SliceThickness', 0) or 0),
            'spacing_between_slices': float(meta.get('SpacingBetweenSlices', 0) or 0),
            'rows': int(meta.get('Rows', 0) or 0),
            'columns': int(meta.get('Columns', 0) or 0)
        }
        docs.append(doc)

    actions = [{'_index': DOC_INDEX, '_id': d['doc_id'], '_source': d} for d in docs]
    if actions:
        helpers.bulk(es, actions)
        print(f'Indexed {len(actions)} DICOM metadata docs')
    else:
        print('No DICOM files found to index')

ensure_dicom_index()


Created radiology_doc
Indexed 48345 DICOM metadata docs


In [16]:
# check number of docs in DOC_INDEX
count = es.count(index=DOC_INDEX)['count']
print(f'{DOC_INDEX} has {count} docs')

radiology_doc has 43351 docs


### Collect doc ids for explain
We'll reuse the indexed docs so explain has concrete IDs.


In [None]:
text_doc_id = None
dicom_doc_id = None

try:
    res_text = es.search(index=TEXT_INDEX, size=1, _source=True)
    text_doc_id = res_text['hits']['hits'][0]['_id']
    print('TEXT_DOC_ID:', text_doc_id)
except Exception as e:
    print('Could not fetch a text doc:', e)

try:
    res_dicom = es.search(index=DOC_INDEX, size=1, _source=True)
    dicom_doc_id = res_dicom['hits']['hits'][0]['_id']
    print('DICOM_DOC_ID:', dicom_doc_id)
except Exception as e:
    print('Could not fetch a DICOM doc:', e)


Could not fetch a text doc: list index out of range
DICOM_DOC_ID: unknown_LOCALIZER_0_0001_001.ima


## 2. Query Rewriting Demo

### 1. Match query rewrite (inverted index)
Explain shows Lucene rewriting match into BooleanQuery of analyzed terms.


In [None]:
match_query = {'query': {'match': {'clinician_notes': 'disc bulge'}}}
match_exp = es.indices.validate_query(index=TEXT_INDEX, body=match_query, explain=True, rewrite=True)
pprint(match_exp['explanations'])

[{'explanation': 'clinician_notes:disc clinician_notes:bulge',
  'index': 'radiology_text',
  'valid': True}]


In [None]:
match_query = {'query': 
                    {'match': 
                        {'clinician_notes': 
                            {
                                'query': 'disc bulge',
                                'operator': 'and'
                            }
                        }
                    }
                }
match_exp = es.indices.validate_query(index=TEXT_INDEX, body=match_query, explain=True, rewrite=True)
pprint(match_exp['explanations'])

[{'explanation': '+clinician_notes:disc +clinician_notes:bulge',
  'index': 'radiology_text',
  'valid': True}]


### 2. Range query rewrite (BKD)
Numeric ranges are served by BKD trees; explain shows the rewritten range clause.


In [None]:
# get min value of slice_thickness
response = es.search(
    index=DOC_INDEX,
    body={
        "size": 0,
        "aggs": {
            "min_slice_thickness": {
                "min": {
                    "field": "slice_thickness"
                }
            }
        }
    }
)
print("Minimum slice_thickness:", response['aggregations']['min_slice_thickness']['value'])

Minimum slice_thickness: 4.0


In [None]:
if dicom_doc_id:
    range_query = {'query': {'range': {'slice_thickness': {'gte': 9.0}}}}
    range_exp = es.indices.validate_query(index=DOC_INDEX, body=range_query, explain=True, rewrite=True)
    pprint(range_exp['explanations'])
else:
    print('No DICOM_DOC_ID available to run explain')


[{'explanation': 'IndexOrDocValuesQuery(indexQuery=slice_thickness:[9.0 TO '
                 'Infinity], dvQuery=slice_thickness:[4621256167635550208 TO '
                 '9218868437227405312])',
  'index': 'radiology_doc',
  'valid': True}]


### 3. Aggregations require keyword/doc_values
Terms agg on a text field errors; switching to `.keyword` succeeds (doc values).


In [None]:
agg_text = {'size': 0, 'aggs': {'by_note': {'terms': {'field': 'clinician_notes'}}}}
try:
    es.search(index=TEXT_INDEX, body=agg_text)
except Exception as e:
    print('Agg on text -> error (needs .keyword):', e)


Agg on text -> error (needs .keyword): BadRequestError(400, 'search_phase_execution_exception', 'Fielddata is disabled on [clinician_notes] in [radiology_text]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [clinician_notes] in order to load field data by uninverting the inverted index. Note that this can use significant memory.', Fielddata is disabled on [clinician_notes] in [radiology_text]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [clinician_notes] in order to load field data by uninverting the inverted index. Note that this can use significant memory.)


In [None]:
agg_keyword = {'size': 0, 'aggs': {'by_note': {'terms': {'field': 'clinician_notes.keyword'}}}}
try:
    agg_res = es.search(index=TEXT_INDEX, body=agg_keyword)
    pprint(agg_res['aggregations']['by_note']['buckets'][:3])
except Exception as e:
    print('Agg on .keyword failed (check mapping):', e)

[]


### 4. Wildcard field expansion
Use validate_query with rewrite to see `clin*` expanded to actual fields.


In [None]:
try:
    validation = es.indices.validate_query(
        index=TEXT_INDEX,
        body={'query': {'multi_match': {'query': 'disc', 'fields': ['clin*']}}},
        explain=True,
        rewrite=True
    )
    pprint(validation.get('explanations', [])[:2])
except Exception as e:
    print('Wildcard validation failed:', e)


[{'explanation': 'MatchNoDocsQuery("unmapped fields []")',
  'index': 'radiology_text',
  'valid': True}]


In [None]:
response = es.indices.get_field_mapping(index=TEXT_INDEX, fields='clin*')
pprint(response['radiology_text']['mappings'].keys())

dict_keys(['clinician_notes.keyword', 'clinician_notes'])


## 2. Distributed execution â€” Query vs Fetch phases
Use the profile API to see which shards answered, how the query was rewritten/executed, and how fetch loads _source.


In [None]:
# check number of primary / replica shards
response = es.indices.get_settings(index=TEXT_INDEX)
pprint(response[TEXT_INDEX]['settings']['index'])

{'creation_date': '1765373383241',
 'number_of_replicas': '1',
 'number_of_shards': '2',
 'provided_name': 'radiology_text',
 'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
 'uuid': 'V95DGmJiTumv1dxH_Z6_aQ',
 'version': {'created': '9039001'}}


In [12]:
# see number of docs in each primary shard
response = es.cat.shards(index=TEXT_INDEX, format='json')
shard_rows = []
for shard in response:
    if shard['prirep'] != 'p':
        continue
    shard_rows.append(
        {'shard': shard['shard'], 'docs': shard['docs'], 'volume_kb': shard['store']}
    )
pd.DataFrame(shard_rows)

Unnamed: 0,shard,docs,volume_kb
0,0,257,76.8kb
1,1,192,62kb


In [11]:
# profile a match query to see shard-level query/fetch phases
profile_res = es.search(
    index=TEXT_INDEX,
    body={
        'profile': True,
        'query': {'match': {'clinician_notes': 'neural canal'}}
    },
    size=3
)

print('Total hits:', profile_res['hits']['total'])
print('Shards involved:', profile_res['_shards'])

import json as _json
print('Profile (truncated to first shard):')
print(_json.dumps(profile_res['profile']['shards'][0], indent=2)[:2000], '...')

Total hits: {'value': 206, 'relation': 'eq'}
Shards involved: {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0}
Profile (truncated to first shard):
{
  "id": "[BZ2TnfIhQXCJr3__UCYqDA][radiology_text][1]",
  "node_id": "BZ2TnfIhQXCJr3__UCYqDA",
  "shard_id": 1,
  "index": "radiology_text",
  "cluster": "(local)",
  "searches": [
    {
      "query": [
        {
          "type": "BooleanQuery",
          "description": "clinician_notes:neural clinician_notes:canal",
          "time_in_nanos": 2174470,
          "breakdown": {
            "set_min_competitive_score_count": 0,
            "match_count": 89,
            "shallow_advance_count": 0,
            "set_min_competitive_score": 0,
            "next_doc": 302432,
            "match": 26034,
            "score_count": 89,
            "next_doc_count": 90,
            "compute_max_score_count": 0,
            "compute_max_score": 0,
            "advance": 0,
            "advance_count": 0,
            "count_weight_count": 0,

  profile_res = es.search(


In [13]:
# summarize query vs fetch per shard
shard_rows = []
for shard in profile_res['profile']['shards']:
    sid = shard['id']
    searches = shard['searches'][0]
    query_time = searches['query'][0]['time_in_nanos']
    fetch_time = shard['fetch']['time_in_nanos']
    shard_rows.append({
        'shard': sid,
        'query_time_ms': query_time / 1e6,
        'fetch_time_ms': fetch_time / 1e6,
        'query_description': searches['query'][0]['description']
    })
pd.DataFrame(shard_rows)


Unnamed: 0,shard,query_time_ms,fetch_time_ms,query_description
0,[BZ2TnfIhQXCJr3__UCYqDA][radiology_text][1],2.17447,1.179043,clinician_notes:neural clinician_notes:canal
1,[K3KnPcSGRIeEfmMhbRH35w][radiology_text][0],1.52026,0.911226,clinician_notes:neural clinician_notes:canal


## 3. Caching

In [None]:
import time, json
import statistics

agg_query = {
    "size": 0,
    "query": {
        "bool": {
            "filter": [
                {"term": {"body_part_examined": "L-SPINE"}},
                {"range": {"slice_thickness": {"gte": 9}}},
                {"range": {"spacing_between_slices": {"gte": 15}}},
                {"exists": {"field": "patient_id"}}
            ]
        }
    },
    "aggs": {
        "by_part": {
            "terms": {"field": "body_part_examined.keyword", "size": 10},
            "aggs": {
                "slice_stats": {"stats": {"field": "slice_thickness"}},
                "slice_percentiles": {"percentiles": {"field": "slice_thickness", "percents": [5, 25, 50, 75, 95]}},
                "top_example": {"top_hits": {"size": 1, "_source": {"includes": ["patient_id", "slice_thickness", "spacing_between_slices"]}}}
            }
        },
        "thickness_hist": {
            "histogram": {"field": "slice_thickness", "interval": 0.1, "min_doc_count": 1},
            "aggs": {"spacing_avg": {"avg": {"field": "spacing_between_slices"}}}
        },
        "spacing_hist": {
            "histogram": {"field": "spacing_between_slices", "interval": 0.5, "min_doc_count": 1},
            "aggs": {"thickness_avg": {"avg": {"field": "slice_thickness"}}}
        },
        "patient_cardinality": {"cardinality": {"field": "patient_id"}},
    }
}
def timed_many(body, n=10, **kwargs):
    times = []
    for _ in range(n):
        t0 = time.time()
        es.search(index=DOC_INDEX, body=body, **kwargs)
        times.append(time.time() - t0)
    return statistics.mean(times), statistics.stdev(times)

# clear all cache of index before experiment
es.indices.clear_cache()

print("=== Request cache demo ===")
mean_uncached, std_uncached = timed_many(agg_query, n=100, request_cache=False)
print("Uncached (mean):", round(mean_uncached, 4), "+-", round(std_uncached, 4), "s")
    
mean_cached, std_cached = timed_many(agg_query, n=100, request_cache=True)
print("Cached (mean):  ", round(mean_cached, 4), "+-", round(std_cached, 4), "s")


=== Request cache demo ===
Uncached (mean): 0.0014 +- 0.0008 s
Cached (mean):   0.001 +- 0.0004 s
