# Part 1: Indexing

## 0. Setting up

In [1]:
# import libraries
import os
import sys
from itertools import islice
from pprint import pprint

import numpy as np
import pandas as pd
import pydicom
from tqdm import tqdm

import matplotlib.pyplot as plt

from elasticsearch import Elasticsearch, helpers

In [30]:
# make the directory two levels above the current working dir importable
# (note: this is the grandparent of the cwd, not the cwd itself)
two_levels_up = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(two_levels_up)


# declare constants
# -- Elasticsearch endpoint and index names
ES_HOST = "http://localhost:9200"
TEXT_INDEX = "radiology_text"
VECTOR_INDEX = "radiology_vector"

# -- input data locations (relative paths)
TEXT_DATA_PATH = "../../data/text/Radiologists Report.xlsx"
IMAGE_DATA_DIR = "../../data/images/"

In [9]:
# connect to Elasticsearch and fail fast when the node does not answer:
# every later cell talks to `es`, so a silent failure here would only
# surface later as a less obvious error.
es = Elasticsearch(ES_HOST)
if not es.ping():
    raise ConnectionError(f"Could not reach Elasticsearch at {ES_HOST}")
print("Connected to Elasticsearch!")

Connected to Elasticsearch!


In [10]:
# start from a clean slate: drop the text index if it already exists, then recreate it
text_index_exists = es.indices.exists(index=TEXT_INDEX)
if text_index_exists:
    es.indices.delete(index=TEXT_INDEX)
    print(f"Deleted existing index: {TEXT_INDEX}")

es.indices.create(index=TEXT_INDEX)
print(f"Created new index: {TEXT_INDEX}")

Deleted existing index: radiology_text
Created new index: radiology_text


## 1. Inverted Index for Text Data

- An inverted index is a data structure used for fast full-text search. Instead of mapping documents -> words (forward index), it maps words -> documents containing those words.

In [11]:
# load the whole report spreadsheet and preview its first 3 rows
# (column width is left unlimited so long notes are displayed in full)
pd.set_option('display.max_colwidth', None)
text_data = pd.read_excel(TEXT_DATA_PATH)
text_data.head(3)

Unnamed: 0,Patient ID,Clinician's Notes
0,1,"L4-5: degenerative annular disc bulge is noted more to the left side compressing thecal sac, compressing left nerve root and narrowing right neural foramen. // Evidence of hyperintense signal within the annulus fibrosus at left paramedian/posterolateral area which probably represents a torn annulus."
1,2,No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted.
2,3,LSS MRI\nFeatures of muscle spasm.\nsmall central disc protrusion noted at L5-S1 level abutting the thecal sac.\nno significant thecal sac or nerve root compression noted.


In [14]:
# demo es analyzer: show how the chosen analyzer tokenizes the first report
body = text_data.loc[0, "Clinician's Notes"]
analyzer = "whitespace"    # can try "whitespace", "simple", "keyword"

analyze_request = {"analyzer": analyzer, "text": body}
analyze_result = es.indices.analyze(index=TEXT_INDEX, body=analyze_request)

# keep only the token text from each entry of the analyze response
analyzer_tokens = [entry['token'] for entry in analyze_result['tokens']]
print(f"First 10 tokens using '{analyzer}' analyzer:")
for token_text in analyzer_tokens[:10]:
    print(f"- {token_text}")

First 10 tokens using 'whitespace' analyzer:
- L4-5:
- degenerative
- annular
- disc
- bulge
- is
- noted
- more
- to
- the


- A small example of an inverted index for our radiology reports might look like this:

    | Term         | Document IDs |
    |--------------|--------------|
    | disc         | [1, 2, 3]    |
    | thecal       | [1, 2, 3]    |
    | degenerative | [1]          |
    | herniation   | [2]          |
    | protrusion   | [3]          |
    | ...          | ...          |

- Example: Query "disc herniation spasm"
    1. Look up "disc" in the inverted index -> [1, 2, 3]
    2. Look up "herniation" in the inverted index -> [2]
    3. Look up "spasm" in the inverted index -> [3]
    4. Intersect the document ID lists for "disc" and "herniation" -> [2]; adding "spasm" to the intersection leaves no document, because no report contains all three terms.
    5. Result: Document 2 contains both terms "disc" and "herniation".

In [33]:
# demo of scoring when querying

# 1. index 3 first documents into index
for _, row in text_data.head(3).iterrows():
    doc = {
        "patient_id": row['Patient ID'],
        "clinician_notes": row["Clinician's Notes"]
    }
    # the patient id doubles as the document id, so re-running this cell
    # overwrites the same documents instead of adding duplicates
    es.index(index=TEXT_INDEX, id=row['Patient ID'], body=doc)

# newly indexed documents only become searchable after a refresh; force one
# so the search cell below sees them even when the notebook is run in one go
es.indices.refresh(index=TEXT_INDEX)

In [44]:
# 2. perform a search query: full-text match of the query terms against the notes
query_term = "disc herniation"
match_clause = {"clinician_notes": query_term}
query_body = {"query": {"match": match_clause}}

search_result = es.search(index=TEXT_INDEX, body=query_body)
print(f"Search Results for query '{query_term}':")
# each hit carries the stored document (_source) and its relevance score (_score)
matched_hits = search_result['hits']['hits']
for hit in matched_hits:
    print(f"- Patient ID: {hit['_source']['patient_id']}, Score: {hit['_score']}")

Search Results for query 'disc herniation':
- Patient ID: 2, Score: 1.4093384
- Patient ID: 3, Score: 0.13481398
- Patient ID: 1, Score: 0.10955827


- ES also provides an API (`termvectors`) to inspect how often a token occurs in an indexed document and how many documents in the whole index contain that token. This information is used to calculate the relevance score of a document for a given query.

In [41]:
# demo termvector: per-term statistics for one token in the first report's document
token = "disc"
# token = "annulus"

first_doc_id = str(text_data.loc[0, 'Patient ID'])
termvector = es.termvectors(
    index=TEXT_INDEX,
    id=first_doc_id,
    fields=["clinician_notes"],
    term_statistics=True,
)

terms = termvector['term_vectors']['clinician_notes']['terms']
# nothing is printed when the token does not occur in this document
token_stats = terms.get(token)
if token_stats is not None:
    print(f"Term Vector for token '{token}':")
    pprint(token_stats)

Term Vector for token 'disc':
{'doc_freq': 3,
 'term_freq': 1,
 'tokens': [{'end_offset': 31, 'position': 4, 'start_offset': 27}],
 'ttf': 3}


## 2. BKD Tree for Numeric/Spatial Data
- BKD (block KD) trees keep numeric or spatial values in balanced blocks with bounding boxes, making range queries fast.
- The demo below builds a tiny BKD tree over (x, y) coordinates and runs a rectangular range search.


In [19]:
# toy 2D points with ids: 20 seeded random (x, y) pairs, each tagged with its row number
rng = np.random.RandomState(42)
points = rng.rand(20, 2)
points_with_id = [
    (float(point[0]), float(point[1]), point_id)
    for point_id, point in enumerate(points)
]
pd.DataFrame(points_with_id, columns=['x', 'y', 'id']).head()


Unnamed: 0,x,y,id
0,0.37454,0.950714,0
1,0.731994,0.598658,1
2,0.156019,0.155995,2
3,0.058084,0.866176,3
4,0.601115,0.708073,4


In [16]:
# minimal BKD tree implementation (axis with largest spread is split each level)
class BKDTree:
    """Small in-memory tree over 2D points that supports rectangular range search.

    Points are recursively split at the median along the axis with the largest
    spread until a block holds at most `leaf_size` points. Every node stores the
    bounding box of its points so that whole subtrees can be skipped in a query.

    Args:
        points: sequence of rows whose first two columns are the (x, y)
            coordinates. Extra columns (e.g. an id) are carried along, but the
            whole row is stored as float, so they come back as floats.
        leaf_size: maximum number of points kept in a leaf block (must be >= 1).

    Raises:
        ValueError: if `leaf_size` is smaller than 1.
    """

    def __init__(self, points, leaf_size=4):
        if leaf_size < 1:
            # a non-positive leaf size could never terminate the recursion cleanly
            raise ValueError(f"leaf_size must be >= 1, got {leaf_size}")
        self.leaf_size = leaf_size
        pts = np.array(points, dtype=float)
        # an empty input has no bounding box; represent it as an empty tree
        self.root = self._build(pts) if pts.size else None

    def _build(self, pts):
        """Recursively build the node (a dict) covering the rows in `pts`."""
        bbox_min = pts[:, :2].min(axis=0)
        bbox_max = pts[:, :2].max(axis=0)
        if len(pts) <= self.leaf_size:
            return {'leaf': True, 'points': pts, 'bbox': (bbox_min, bbox_max)}

        # split at the median along the axis where the points are most spread out
        spreads = bbox_max - bbox_min
        axis = int(np.argmax(spreads))
        order = pts[:, axis].argsort()
        pts = pts[order]
        mid = len(pts) // 2

        left = self._build(pts[:mid])
        right = self._build(pts[mid:])
        return {
            'leaf': False,
            'axis': axis,
            'split_value': pts[mid, axis],
            'bbox': (bbox_min, bbox_max),
            'left': left,
            'right': right
        }

    def range_query(self, query_min, query_max):
        """Return all stored rows inside the axis-aligned rectangle (bounds inclusive).

        Args:
            query_min: (x, y) of the lower corner of the rectangle.
            query_max: (x, y) of the upper corner of the rectangle.

        Returns:
            A list of tuples, one per matching row; empty when nothing matches
            or when the tree was built from no points.
        """
        if self.root is None:
            return []
        query_min = np.array(query_min)
        query_max = np.array(query_max)
        return self._range_query(self.root, query_min, query_max)

    def _range_query(self, node, qmin, qmax):
        """Collect the matching rows below `node`, pruning by bounding box."""
        bbox_min, bbox_max = node['bbox']
        # the node's box and the query box do not overlap -> skip the whole subtree
        if np.any(bbox_max < qmin) or np.any(bbox_min > qmax):
            return []

        if node['leaf']:
            pts = node['points']
            mask = np.all((pts[:, :2] >= qmin) & (pts[:, :2] <= qmax), axis=1)
            return [tuple(row) for row in pts[mask]]

        return self._range_query(node['left'], qmin, qmax) + \
               self._range_query(node['right'], qmin, qmax)


In [17]:
# define an axis-aligned query rectangle, build the tree over the toy points, and search it
query_min = np.array([0.2, 0.2])
query_max = np.array([0.8, 0.6])
tree = BKDTree(points_with_id, leaf_size=4)
matches = tree.range_query(query_min, query_max)

print(f'Query bbox x:[{query_min[0]:.2f}, {query_max[0]:.2f}] y:[{query_min[1]:.2f}, {query_max[1]:.2f}]')
print(f'Found {len(matches)} points in range:')
# show the matching rows as a table
pd.DataFrame(matches, columns=['x', 'y', 'id'])


Query bbox x:[0.20, 0.80] y:[0.20, 0.60]
Found 5 points in range:


Unnamed: 0,x,y,id
0,0.292145,0.366362,11.0
1,0.304242,0.524756,8.0
2,0.431945,0.291229,9.0
3,0.684233,0.440152,19.0
4,0.731994,0.598658,1.0


## 3. Doc values
- We'll index DICOM metadata, then show that sorting works for numeric fields, fails for text fields, and also fails if doc_values is disabled on a numeric field.
- The helper `get_metadata` pulls many tags; we pick a handful to keep the mapping simple.


In [38]:
# collect some DICOM files
LIMIT = 8  # maximum number of sample files to load

def iter_dicom_paths(root, limit=LIMIT):
    """Iterate over paths of DICOM files found anywhere below `root`.

    A file counts as DICOM when its name ends with ".ima" or ".dcm"
    (case-insensitive). Nested directories are walked as well.

    Args:
        root: directory to search.
        limit: maximum number of paths to produce; None means no cap.

    Returns:
        A lazy iterator of file paths, in the order `os.walk` reports them.
    """
    def _walk():
        for dirpath, _, files in os.walk(root):
            for f in files:
                if f.lower().endswith((".ima", ".dcm")):
                    yield os.path.join(dirpath, f)

    # islice stops the walk as soon as `limit` paths have been produced
    return islice(_walk(), limit)

sample_paths = list(iter_dicom_paths(IMAGE_DATA_DIR, limit=LIMIT))
print(f'Load {len(sample_paths)} sample DICOM files')

# NOTE(review): `load_dicom` and `get_metadata` are not defined or imported in
# the visible part of this notebook - confirm the cell that imports them runs
# before this one.
docs = []
for path in sample_paths:
    dicom = load_dicom(path)
    metadata = get_metadata(dicom)
    doc = {
        # fall back to the file name when the PatientID entry is missing or empty
        'patient_id': metadata.get('PatientID') or os.path.basename(path),
        'study_date': metadata.get('StudyDate'),
        'series_description': metadata.get('SeriesDescription'),
        # `or 0` also covers entries that are present but empty/None
        'slice_thickness': float(metadata.get('SliceThickness', 0) or 0),
        'spacing_between_slices': float(metadata.get('SpacingBetweenSlices', 0) or 0),
        'rows': int(metadata.get('Rows', 0) or 0),
        'columns': int(metadata.get('Columns', 0) or 0),
    }
    # patient id + file name, later used as the Elasticsearch document id
    doc['doc_id'] = f"{doc['patient_id']}_{os.path.basename(path)}"
    docs.append(doc)

pd.DataFrame(docs).head()


Load 8 sample DICOM files


Unnamed: 0,patient_id,study_date,series_description,slice_thickness,spacing_between_slices,rows,columns,doc_id
0,LOCALIZER_0_0001_001.ima,,,8.0,12.0,512,512,LOCALIZER_0_0001_001.ima_LOCALIZER_0_0001_001.ima
1,LOCALIZER_0_0001_002.ima,,,8.0,12.0,512,512,LOCALIZER_0_0001_002.ima_LOCALIZER_0_0001_002.ima
2,LOCALIZER_0_0001_003.ima,,,8.0,12.0,512,512,LOCALIZER_0_0001_003.ima_LOCALIZER_0_0001_003.ima
3,LOCALIZER_0_0001_004.ima,,,8.0,32.0,512,512,LOCALIZER_0_0001_004.ima_LOCALIZER_0_0001_004.ima
4,LOCALIZER_0_0001_005.ima,,,8.0,32.0,512,512,LOCALIZER_0_0001_005.ima_LOCALIZER_0_0001_005.ima


In [39]:
# create an index with mapping; duplicate one numeric field with doc_values disabled
DOC_INDEX = 'dicom_metadata_docvalues_demo'

# drop any index left over from a previous run so the mapping below always applies
if es.indices.exists(index=DOC_INDEX):
    es.indices.delete(index=DOC_INDEX)
    print(f'Deleted existing index: {DOC_INDEX}')

mapping = {
    'mappings': {
        'properties': {
            'doc_id': {'type': 'keyword'},
            'patient_id': {'type': 'keyword'},
            'study_date': {'type': 'date', 'format': 'yyyyMMdd'},
            'series_description': {'type': 'text'},
            'slice_thickness': {'type': 'double'},
            'spacing_between_slices': {'type': 'double'},
            'rows': {'type': 'integer'},
            # the documents also carry `columns`; map it explicitly like `rows`
            # instead of leaving it to dynamic mapping
            'columns': {'type': 'integer'},
            # same values as `rows`, but without doc values, to show what breaks
            'rows_no_docvals': {'type': 'integer', 'doc_values': False}
        }
    }
}

es.indices.create(index=DOC_INDEX, body=mapping)
print(f'Created index {DOC_INDEX}')


Created index dicom_metadata_docvalues_demo


In [40]:
# index multiple documents; copy rows into a doc_values-disabled field
actions = [
    {
        '_index': DOC_INDEX,
        '_id': doc['doc_id'],
        # shallow copy of the document plus the duplicated `rows` value
        '_source': {**doc, 'rows_no_docvals': doc['rows']},
    }
    for doc in docs
]

# helpers.bulk returns (number of successfully executed actions, errors);
# report that number rather than how many actions were submitted
indexed_count, _ = helpers.bulk(es, actions)

# make the documents searchable immediately for the sort demos below
es.indices.refresh(index=DOC_INDEX)
print(f'Indexed {indexed_count} documents into', DOC_INDEX)


Indexed 8 documents into dicom_metadata_docvalues_demo


In [52]:
# sort on numeric field works
numeric_sort = [{'slice_thickness': 'asc'}]
response = es.search(index=DOC_INDEX, sort=numeric_sort)
total_hits = response['hits']['total']['value']
print('Sort on numeric slice_thickness -> hits:', total_hits)


Sort on numeric slice_thickness -> hits: 8


In [None]:
# sort on text field fails: Elasticsearch rejects the request because fielddata
# is disabled on text fields by default (see the error message printed below)
try:
    # store the result under `response` so the print below shows THIS search,
    # not the result left over from the previous cell
    response = es.search(index=DOC_INDEX, sort=[{
        'series_description': 'asc'
    }])
    print(response)
except Exception as e:
    # the error is the point of this demo, so show it instead of raising
    print('Sort on text series_description -> error:', e)

Sort on text series_description -> error: BadRequestError(400, 'search_phase_execution_exception', 'Fielddata is disabled on [series_description] in [dicom_metadata_docvalues_demo]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [series_description] in order to load field data by uninverting the inverted index. Note that this can use significant memory.', Fielddata is disabled on [series_description] in [dicom_metadata_docvalues_demo]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [series_description] in order to load field data by uninverting the inverted index. Note that this can use significant memory.)


In [53]:
# sort on numeric field with doc_values disabled fails
docvalues_off_sort = [{'rows_no_docvals': 'asc'}]
try:
    es.search(index=DOC_INDEX, sort=docvalues_off_sort)
except Exception as e:
    # the error is the expected outcome of this demo, so display it
    print('Sort on rows_no_docvals (doc_values: false) -> error:', e)
else:
    print('Sort on rows_no_docvals (doc_values: false) -> success')

Sort on rows_no_docvals (doc_values: false) -> error: BadRequestError(400, 'search_phase_execution_exception', "Can't load fielddata on [rows_no_docvals] because fielddata is unsupported on fields of type [integer]. Use doc values instead.", Can't load fielddata on [rows_no_docvals] because fielddata is unsupported on fields of type [integer]. Use doc values instead.)
