In [4]:
from elasticsearch import Elasticsearch, helpers
import pandas as pd
import json
import urllib3

In [5]:
es = Elasticsearch(
        "http://localhost:9200",
)
if es.ping():
    print("Connected to Elasticsearch")
else:
    print("Could not connect to Elasticsearch")

Connected to Elasticsearch


In [6]:
excel_file = "Radiologists Report.xlsx"
df = pd.read_excel(excel_file)
print(f"Dataset shape: {df.shape}")
print(df.head())

Dataset shape: (575, 2)
   Patient ID                                  Clinician's Notes
0           1  L4-5: degenerative annular disc bulge is noted...
1           2  No evidence of disc herniation.\nNo significan...
2           3  LSS MRI\nFeatures of muscle spasm.\nsmall cent...
3           4  Feature of muscle spasm.\nDiffuse disc bulges ...
4           5  LSS MRI :\nFeature of muscle spasm.\nDiffuse d...


In [7]:
print("Missing values:")
print(df.isnull().sum())

Missing values:
Patient ID            0
Clinician's Notes    60
dtype: int64


In [8]:
df_clean = df.dropna(subset=['Patient ID', "Clinician's Notes"])
print(f"After cleaning: {df_clean.shape}")

After cleaning: (515, 2)


In [9]:
index_name = "radiology_reports"

index_mapping = {
    "mappings": {
        "properties": {
            "patient_id": {
                "type": "keyword"
            },
            "clinicians_notes": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword"
                    },
                    "english": {
                        "type": "text",
                    }
                }
            }
        }
    }
}

# Delete index if it exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Create index
es.indices.create(index=index_name, body=index_mapping)
print(f"Index '{index_name}' created successfully")

Index 'radiology_reports' created successfully


In [10]:
# Prepare documents for indexing 
documents = [] 
for idx, row in df_clean.iterrows(): 
    doc = { 
           "_index": index_name, 
           "_id": idx, 
           "_source": { 
               "patient_id": str(row['Patient ID']), 
               "clinicians_notes": row["Clinician's Notes"] 
            }
    } 
    documents.append(doc)

In [11]:
# Index the documents in bulk
try:
    success, failed = helpers.bulk(es, documents, stats_only=True)
    print(f"Successfully indexed {success} documents. Failed: {failed}")
except Exception as e:
    print(f"Error during bulk indexing: {e}")

# Verify the data was indexed
count = es.count(index=index_name)['count']
print(f"Total documents in index: {count}")

Successfully indexed 515 documents. Failed: 0
Total documents in index: 307


Use Case 1: Find all patients whose notes mention <i>stenosis</i>

In [12]:
query = {
    "query": {
        "match": {
            "clinicians_notes": "disc herniation" 
        }
    }
}

response = es.search(index=index_name, body=query)
print(json.dumps(response.body, indent=2, ensure_ascii=False))

{
  "took": 135,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 488,
      "relation": "eq"
    },
    "max_score": 2.4830642,
    "hits": [
      {
        "_index": "radiology_reports",
        "_id": "1",
        "_score": 2.4830642,
        "_source": {
          "patient_id": "2",
          "clinicians_notes": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted."
        }
      },
      {
        "_index": "radiology_reports",
        "_id": "31",
        "_score": 2.4830642,
        "_source": {
          "patient_id": "32",
          "clinicians_notes": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted.\n"
        }
      },
      {
        "_index": "radiology_reports",
        "_id": "45",
        "_score": 2.4830642,
        "_source": {
          "patient_id": "46",
          "clinicians

Use Case 2: Retrieve notes that contain the exact phase "disc herniation"

In [13]:
query = {
    "query": {
        "match_phrase": {
            "clinicians_notes": "disc herniation" 
        }
    }
}

response = es.search(index=index_name, body=query)
print(json.dumps(response.body, indent=2, ensure_ascii=False))

{
  "took": 19,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 64,
      "relation": "eq"
    },
    "max_score": 2.4830642,
    "hits": [
      {
        "_index": "radiology_reports",
        "_id": "1",
        "_score": 2.4830642,
        "_source": {
          "patient_id": "2",
          "clinicians_notes": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted."
        }
      },
      {
        "_index": "radiology_reports",
        "_id": "31",
        "_score": 2.4830642,
        "_source": {
          "patient_id": "32",
          "clinicians_notes": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted.\n"
        }
      },
      {
        "_index": "radiology_reports",
        "_id": "45",
        "_score": 2.4830642,
        "_source": {
          "patient_id": "46",
          "clinicians_n

Use Case 3: Find patients with <i>herniation</i> but exclude mentioning <i>infection</i>

In [14]:
query = {
    "query": {
        "bool": {
            "must": [
                { "match": { "clinicians_notes": "herniation" } }
            ],
            "must_not": [
                { "match": { "clinicians_notes": "infection" } }
            ]
        }
    }
}

response = es.search(index=index_name, body=query)
print(json.dumps(response.body, indent=2, ensure_ascii=False))

{
  "took": 9,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 77,
      "relation": "eq"
    },
    "max_score": 2.4133468,
    "hits": [
      {
        "_index": "radiology_reports",
        "_id": "1",
        "_score": 2.4133468,
        "_source": {
          "patient_id": "2",
          "clinicians_notes": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted."
        }
      },
      {
        "_index": "radiology_reports",
        "_id": "31",
        "_score": 2.4133468,
        "_source": {
          "patient_id": "32",
          "clinicians_notes": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted.\n"
        }
      },
      {
        "_index": "radiology_reports",
        "_id": "45",
        "_score": 2.4133468,
        "_source": {
          "patient_id": "46",
          "clinicians_no

Use Case 4: Handle typos - search "steniss" and still retrive "stenosis"

In [15]:
query = {
    "query": {
        "fuzzy": {
            "clinicians_notes": {
                "value": "steniss",
                "fuzziness": "AUTO"
            }
        }
    }
}

response = es.search(index=index_name, body=query)
print(json.dumps(response.body, indent=2, ensure_ascii=False))

{
  "took": 34,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 17,
      "relation": "eq"
    },
    "max_score": 2.3621368,
    "hits": [
      {
        "_index": "radiology_reports",
        "_id": "171",
        "_score": 2.3621368,
        "_source": {
          "patient_id": "172",
          "clinicians_notes": "MRI OF THE LUMBOSACRAL SPINE \nFeature of muscle spasm. \nDesiccated disc material . \nModec type II of L2/L3 with narrowed disc space . \nDiffuse disc bulge ,with Rt foraminal disc protrusion noted at L2/L3 level compressing the thecal sac and exit neural canals. \nWith spinal canal stenosis noted. \nDiffuse disc bulge at L3/L4 & L4/L5 levels , mild compressing thecal sac and encroaching exit nerve root , with relatively secondary spinal canal stenosis \n"
        }
      },
      {
        "_index": "radiology_reports",
        "_id": "125",
        "_score": 2.15

Use Case 5: Count the most common terms in notes.

In [16]:
query = {
    "size": 0,
    "aggs": {
        "common_terms": {
            "terms": {
                "field": "clinicians_notes.keyword",
                "size": 10
            }
        }
    }
}

response = es.search(index=index_name, body=query)
print(json.dumps(response.body, indent=2, ensure_ascii=False))

{
  "took": 190,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 515,
      "relation": "eq"
    },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "common_terms": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 478,
      "buckets": [
        {
          "key": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted.\n",
          "doc_count": 11
        },
        {
          "key": "No evidence of disc herniation.\nNo significant thecal sac or nerve root compression noted.\nAdequate spinal canal.",
          "doc_count": 6
        },
        {
          "key": "LSS MRI \nNo evidence of disc herniation. \nNo thecal sac or nerve root compression noted.\n",
          "doc_count": 4
        },
        {
          "key": "LSS MRI\nNo evidence of disc herniation noted.\nNo significant thecal sac or nerve ro

In [None]:
# Close the Elasticsearch connection
es.close()
print("Elasticsearch connection closed")