# FHIR RAG

## Environment Setup

### Authenticate

In [1]:
# Authenticate Notebook

import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Global Variables

In [2]:
# GCP Parameters
PROJECT_ID = "propane-crawler-363311"  # @param {type:"string"}
REGION = "us-central1"  # @param {type: "string"}

# Neo4J Connection Parameters
NEO4J_URL="bolt://localhost:7687" # @param {type:"string"}
NEO4J_USER="neo4j" # @param {type:"string"}
NEO4J_PASSWORD="password" # @param {type:"string"}

# Dimension Vertex PaLM Text Embedding
ME_DIMENSIONS = 768 # @param {type:"integer"} 
ME_DISTANCE_MEASURE_TYPE = "DOT_PRODUCT_DISTANCE" # @param {type:"string"} 

# Update to bigger SHARDS for larger data volumes & performance
# Doc - https://cloud.google.com/vertex-ai/docs/vector-search/create-manage-index
ME_SHARD_SIZE = "SHARD_SIZE_SMALL" # @param ["SHARD_SIZE_SMALL", "SHARD_SIZE_MEDIUM", "SHARD_SIZE_LARGE"] 

# Vertex AI Vector Search (MatchingEngine) Endpoint Parameters
# Doc - https://cloud.google.com/vertex-ai/docs/vector-search/create-manage-index

# The machine types that you can use to deploy your index
ME_ENDPOINT_MACHINE_TYPE = "e2-standard-2" # @param ["n1-standard-16", "n1-standard-32", "e2-standard-2", "e2-standard-16", "e2-highmem-16", "n2d-standard-32"] 

ME_ENDPOINT_MIN_REPLICA_COUNT = 2 # @param {type:"integer"} 
ME_ENDPOINT_MAX_REPLICA_COUNT = 10 # @param {type:"integer"} 

# Vertex AI Vector Search (MatchingEngine) Index Parameters
ME_INDEX_NAME = 'fhir_me_index'  # @param {type: "string"}
ME_EMBEDDING_GCS_DIR = f'{PROJECT_ID}-me-bucket' # @param {type:"string"} 
ME_DESCRIPTION = "Index for FHIR Resources" # @param {type:"string"} 

# Set the LLM to use
VERTEX_AI_MODEL_NAME = 'gemini-1.0-pro-001'
TEXT_EMBEDDING_MODEL_NAME = "textembedding-gecko@003"

### Import Libraries

In [22]:
# Utils
from pprint import pprint
import json
import csv
import os
import re
from datetime import datetime, timezone, timedelta

from rich.console import Console
from rich.markdown import Markdown

from typing import Dict, Optional, Any, List

# Google Libs
import vertexai
vertexai.init(project=PROJECT_ID, location=REGION)

from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
    Namespace,
    NumericNamespace,
)

from pydantic import BaseModel, Field

# Langchain
import langchain
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.chains import RetrievalQA, LLMChain, SequentialChain, TransformChain
from langchain.globals import set_debug, set_verbose
print(f"LangChain version: {langchain.__version__}")

# LangChain Google Libs
from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_google_vertexai import VectorSearchVectorStore

# Custom Utils
## Custom Matching Engine
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils

## Neo4J
from utils.NEO4J_Graph import Graph



LangChain version: 0.1.16


### Establish Neo4J Connection

In [4]:
%env NEO4J_USER={NEO4J_USER}
%env NEO4J_PASSWORD={NEO4J_PASSWORD}

env: NEO4J_USER=neo4j
env: NEO4J_PASSWORD=password


In [5]:
# Check if Docker Container is running
! docker ps -a

CONTAINER ID   IMAGE          COMMAND                  CREATED      STATUS       PORTS                                                                                            NAMES
c11076036c54   neo4j:latest   "tini -g -- /startup…"   6 days ago   Up 4 hours   0.0.0.0:7474->7474/tcp, :::7474->7474/tcp, 7473/tcp, 0.0.0.0:7687->7687/tcp, :::7687->7687/tcp   testneo4j


In [6]:
# Start the Container if it is not running
! docker start testneo4j

testneo4j


In [7]:
# Instantiate & Connect to Neo4J graph
graph = Graph(NEO4J_URL, NEO4J_USER, NEO4J_PASSWORD)

In [8]:
# Test Neo4J Connection
# Get type and number of each FHIR resource in the database
resource_metrics = graph.resource_metrics()
resource_metrics.sort()
pprint(resource_metrics)

[['AllergyIntolerance', 10],
 ['CarePlan', 60],
 ['CareTeam', 60],
 ['Claim', 3486],
 ['Condition', 656],
 ['Device', 39],
 ['DiagnosticReport', 3100],
 ['DocumentReference', 1859],
 ['Encounter', 1859],
 ['ExplanationOfBenefit', 3486],
 ['ImagingStudy', 8],
 ['Immunization', 259],
 ['Medication', 957],
 ['MedicationAdministration', 957],
 ['MedicationRequest', 1627],
 ['Observation', 13501],
 ['Patient', 20],
 ['Procedure', 2966],
 ['SupplyDelivery', 239]]


In [9]:
node_count, relationship_count = graph.database_metrics()
print('Database Metrics:')
print(f'    - Node Count = {node_count}')
print(f'    - Relationship Count = {relationship_count}')

Database Metrics:
    - Node Count = 37884
    - Relationship Count = 190926


### Establish VectorSearch Connection

In [10]:
# Create Text Embedding
text_embedding_model = VertexAIEmbeddings(
    model_name=TEXT_EMBEDDING_MODEL_NAME,
    project=PROJECT_ID,
    location=REGION,
    max_retries=6
)

text_embedding_model

VertexAIEmbeddings(client=<vertexai.language_models.TextEmbeddingModel object at 0x7f82ee9964d0>, project='propane-crawler-363311', location='us-central1', request_parallelism=5, max_retries=6, stop=None, model_name='textembedding-gecko@003', client_preview=None, temperature=None, max_output_tokens=None, top_p=None, top_k=None, credentials=None, n=1, streaming=False, safety_settings=None, api_transport=None, api_endpoint=None, instance={'max_batch_size': 250, 'batch_size': 250, 'min_batch_size': 5, 'min_good_batch_size': 5, 'lock': <unlocked _thread.lock object at 0x7f82fcb8d640>, 'batch_size_validated': False, 'task_executor': <concurrent.futures.thread.ThreadPoolExecutor object at 0x7f82fc1e0340>, 'embeddings_task_type_supported': True, 'get_embeddings_with_retry': <function TextEmbeddingModel.get_embeddings at 0x7f82ee831a20>})

In [11]:
# Get Matching Engine Index id and Endpoint id
me_utils = MatchingEngineUtils(PROJECT_ID, REGION, ME_INDEX_NAME)
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = me_utils.get_index_and_endpoint()

print(f'- ME_INDEX_ID:{ME_INDEX_ID}\n- ME_INDEX_ENDPOINT_ID:{ME_INDEX_ENDPOINT_ID}')

- ME_INDEX_ID:projects/884766917846/locations/us-central1/indexes/7340819014102286336
- ME_INDEX_ENDPOINT_ID:projects/884766917846/locations/us-central1/indexEndpoints/2407125622317907968


In [12]:
vector_store = VectorSearchVectorStore.from_components(
    project_id=PROJECT_ID,
    region=REGION,
    gcs_bucket_name=f"gs://{ME_EMBEDDING_GCS_DIR}".split("/")[2],
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
    stream_update=True,
    embedding=text_embedding_model
)
vector_store

<langchain_google_vertexai.vectorstores.vectorstores.VectorSearchVectorStore at 0x7f82ee881c60>

In [13]:
# Test VectorSearch is Connected
query_text = 'sample Resource'
response = vector_store.similarity_search_with_score(query=query_text, k=1)
response

[(Document(page_content='This is a sample Resource Type', metadata={'fhir_patient_id': 'pid_111111111', 'fhir_resource_id': 'rid_111111111', 'fhir_resource_type': 'Test_Resource_type', 'neo4j_node_id': 'nid_111111111'}),
  0.7039443254470825)]

### Google Vertex AI LLM Setup

In [14]:
llm = VertexAI(model_name=VERTEX_AI_MODEL_NAME)
llm.model_name

'gemini-1.0-pro-001'

<br>***QA Without RAG***

Asking LLM a question without context.

In [15]:
# Ask LLM a question
question = "What allergies does Antone63 have?"

no_rag_response = llm.invoke(question)

print(f'Question: {question}')
print(f'LLM Answer: {no_rag_response}')

Question: What allergies does Antone63 have?
LLM Answer: I do not have any information on any allergies that Antone63 may have.


## Retrieval with RAG

In [26]:
query = "Please provide a summary of Akiko835 Larkin917's Conditions. If there are multiple active conditions, list the most recent one. In the case of multiple conditions with the same most recent date, list all of them. If no active conditions are found, please state 'No active medical conditions found'." 

In [None]:
# Setting Langchain Global Variables
from langchain.globals import set_verbose, set_debug

# Change to False if you do not want debug and execution information
langchain_debug = True
set_debug(langchain_debug)
set_verbose(langchain_debug)

### Step-01: Get Patient Name
***Tip:*** 
- Minimize Cost & Latency - by first trying to extract patient name locally.
- If regex does not help, then use LLM.
- Fallback - Prompt user for Input
- You can use less powerful LLMs for this to save cost. E.g. Gemma(offline) or Smaller LLMs


<br>***Using Regex to extract patient name***

In [17]:
# Get Patient Name Local Function using Python regex
def extract_patient_name_with_custom_function(query: str) -> Optional[Dict[str, str]]:
    """
    Extracts patient's first and last name from the query using a regular expression.

    Args:
        query: The user's question or statement.

    Returns:
        A dictionary containing the extracted first and last names, or None if not found.
    """
    # name_pattern = re.compile(r"(?:Dr\.|Mr\.|Ms\.|Mrs\.)?\s*([A-Z][a-zA-Z0-9']+)\s+([A-Z][a-zA-Z0-9']*)")  
    name_pattern = re.compile(r"(?:Dr\.|Mr\.|Ms\.|Mrs\.)?\s*(\b(?!What\b)[A-Z][a-zA-Z0-9']*\b)(?:\s+([A-Z][a-zA-Z0-9']*)\b)?")
    match = name_pattern.search(query)
    if match:
        first_name = match.group(1)
        last_name = match.group(2) if match.group(2) else None
        patient_name = {"first_name": first_name, "last_name": last_name}
        return patient_name
        # return None
    else:
        return None

In [None]:
print(f'User Query: {query}')
name = extract_patient_name_with_custom_function(query)
print(f'Patient Name = {name}')

<br>***Manulaly get Patient name from user using Input Prompt***

In [18]:
def get_patient_name_from_user():
    while True:
        user_input_name = input("Please enter the patient's full name: ")
        confirmed = input(f"Is '{user_input_name}' correct? (yes/no): ").lower()
        if confirmed == 'yes':
            name_parts = user_input_name.split()
            patient_name = {"first_name": name_parts[0], "last_name": name_parts[-1] if len(name_parts) > 1 else None}
            # print(type(patient_name))
            return patient_name
        elif confirmed == 'no':
            continue
        else:
            print("Invalid input. Please enter 'yes' or 'no'.")

In [None]:
name = get_patient_name_from_user()
print(f'Patient Name = {name}')

<br> ***Use LLM to extract Patient Name***

In [19]:
# Create Patient name Output Parser

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Define your desired data structure.
class PatientName(BaseModel):
    first_name: str = Field(description="extracted first name or partial name")
    last_name: str = Field(description="extracted last name or null")
    
# Set up a parser + inject instructions into the prompt template.
patient_name_parser = JsonOutputParser(pydantic_object=PatientName)
patient_name_parser

JsonOutputParser(pydantic_object=<class '__main__.PatientName'>)

In [20]:
patient_name_prompt_template = """You are a medical assistant tasked with extracting patient names from text.
The text may contain:
1. The patient's full name (first and last)
2. Only the patient's first name
3. A partial name (e.g., a nickname, a last name with a prefix)
4. Some Names will contain numbers and they are part of the name
5. Names will contain special characters (e.g., apostrophes, hyphens)
5. Names from diverse cultures and regions
6. If you detect middle names, combine them into the last_name: last_name = '{{All identified middle names}} {{last name}}' (with a space between middle and last names)

Identify and extract the patient's name information from the text. If you can identify both the first and last name, provide them. If you can only identify the first name or a partial name, provide that information and leave the missing part blank.

Always provide the output in the following JSON format:
{{"first_name": "[extracted first name or partial name]", "last_name": "[extracted last name or null]"}}

Here's the text to analyze:
{query}
"""

patient_name_prompt = ChatPromptTemplate.from_template(patient_name_prompt_template)
patient_name_prompt.partial_variables = {"format_instructions": patient_name_parser.get_format_instructions()}

# print('Output Parser Format Instructions:')
# pprint(patient_name_prompt.partial_variables)

print('\nFormatted Prompt:')
patient_name_prompt.pretty_print()


Formatted Prompt:

You are a medical assistant tasked with extracting patient names from text.
The text may contain:
1. The patient's full name (first and last)
2. Only the patient's first name
3. A partial name (e.g., a nickname, a last name with a prefix)
4. Some Names will contain numbers and they are part of the name
5. Names will contain special characters (e.g., apostrophes, hyphens)
5. Names from diverse cultures and regions
6. If you detect middle names, combine them into the last_name: last_name = '{All identified middle names} {last name}' (with a space between middle and last names)

Identify and extract the patient's name information from the text. If you can identify both the first and last name, provide them. If you can only identify the first name or a partial name, provide that information and leave the missing part blank.

Always provide the output in the following JSON format:
{"first_name": "[extracted first name or partial name]", "last_name": "[extracted last name

In [56]:
# LangChain Debug
debug_on = False
set_debug(debug_on)
set_verbose(debug_on)

patient_name = 'Akiko835 Larkin917'

patient_context = f'For the given patient: {patient_name}, '
query= f"please list all current medications, including the medication name, dosage instructions, and status (e.g., active, completed). Present this informationin a markdown table with the columns ''Medication Name'', ''Dosage'', and ''Status''. If no current medications are found, please state ''No current medications found''."

print(query)

# Query
query_dict={'query': query}
pprint(query_dict)

# Chain
fhir_chain = patient_name_prompt | llm
patient_name_response = fhir_chain.invoke(query_dict)


patient_name_response = json.loads(patient_name_response)
print(patient_name_response)

please list all current medications, including the medication name, dosage instructions, and status (e.g., active, completed). Present this informationin a markdown table with the columns ''Medication Name'', ''Dosage'', and ''Status''. If no current medications are found, please state ''No current medications found''.
{'query': 'please list all current medications, including the medication name, '
          'dosage instructions, and status (e.g., active, completed). Present '
          "this informationin a markdown table with the columns ''Medication "
          "Name'', ''Dosage'', and ''Status''. If no current medications are "
          "found, please state ''No current medications found''."}
{'first_name': None, 'last_name': None}


### Step-02: Get Patient ID

To find the relevant FHIR Patient Resource, even when dealing with potentially incomplete patient names from the user's query or LLM response, we:

- **Construct the Query:** We strategically build our query using the same template as our pre-processed resource text representation. This ensures higher accuracy even with partial names.
- **Perform Similarity Search:** This carefully crafted query is then used to search our VectorSearch Index, with the expectation that the top result is the matching FHIR Patient Resource.
- **Extract ID:** Finally, we retrieve the fhir_patient_id directly from the metadata of the identified document."
    


In [33]:
def get_patient_id(patient_name :dict) -> str:
    patient_vs_query_text = f"""The type of information in this entry is patient. The name use for this patient is official. The name family for this patient is {patient_name["last_name"]}. The name given 0 for this patient is {patient_name["first_name"]}"""
    # Create Retriever
    vs_retirever = vector_store.as_retriever(search_type="similarity")

    # Filter by resource_type = Patient
    vs_filter = [Namespace(name="fhir_resource_type", allow_tokens=["Patient"])]

    # k = 1 - We only want the top 1 result
    vs_retirever.search_kwargs = {"filter": vs_filter, "k":1}
    docs = vs_retirever.invoke(patient_vs_query_text)
    
    # print(f'Vector Search Results:\n{docs}\n')
    
    # Get patient id from Document Metadata
    patient_id = docs[0].metadata['fhir_patient_id'][0]
    return patient_id

In [34]:
patient_id_response = get_patient_id(patient_name_response)

print(f'Patient ID: {patient_id_response}')
# print(f'{type(response)}')

Patient ID: 05c4608d-bd9a-5d04-41d7-a0293da7f5a5


### Step-03: Identify FHIR Resource Type

In [35]:
# Create Resource Type Output Parser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Define your desired data structure.
class ResourceType(BaseModel):
    resource_type: str = Field(description="extracted resource type name")
    
# Set up a parser + inject instructions into the prompt template.
resource_type_parser = JsonOutputParser(pydantic_object=ResourceType)
resource_type_parser

JsonOutputParser(pydantic_object=<class '__main__.ResourceType'>)

In [36]:
FHIR_RESOURCE_LIST_URL = "https://build.fhir.org/resourceguide.html"

resource_type_prompt_template = f"""You are a healthcare specialist with deep knowledge of the FHIR standard.
Your task is to identify the most appropriate FHIR resource type for the given query.
Refer to the official FHIR resource guide at {FHIR_RESOURCE_LIST_URL}. 
If needed, consult the detailed documentation linked from that guide.

Return ONLY the resource type name if there is a clear match. If unsure, return "Unknown".

Always provide the output in the following JSON format:
{{{{"resource_type": "[extracted resource type name]"}}}}

Here's the text to analyze:
{{query}}
"""
resource_type_prompt = ChatPromptTemplate.from_template(resource_type_prompt_template)
resource_type_prompt.partial_variables = {"format_instructions": resource_type_parser.get_format_instructions()}

# print('Output Parser Format Instructions:')
# pprint(resource_type_prompt.partial_variables)

print('\nFormatted Prompt:')
resource_type_prompt.pretty_print()


Formatted Prompt:

You are a healthcare specialist with deep knowledge of the FHIR standard.
Your task is to identify the most appropriate FHIR resource type for the given query.
Refer to the official FHIR resource guide at https://build.fhir.org/resourceguide.html. 
If needed, consult the detailed documentation linked from that guide.

Return ONLY the resource type name if there is a clear match. If unsure, return "Unknown".

Always provide the output in the following JSON format:
{"resource_type": "[extracted resource type name]"}

Here's the text to analyze:
[33;1m[1;3m{query}[0m



In [37]:
# LangChain Debug
debug_on = False
set_debug(debug_on)
set_verbose(debug_on)

# Chain
resource_type_chain = resource_type_prompt | llm | resource_type_parser
resource_type_response = resource_type_chain.invoke({'query': query})

print(f'Query: {query}')
print(f'Resource Type: {resource_type_response["resource_type"]}')

Query: Please provide a summary of Akiko835 Larkin917's Conditions. If there are multiple active conditions, list the most recent one. In the case of multiple conditions with the same most recent date, list all of them. If no active conditions are found, please state 'No active medical conditions found'.
Resource Type: Condition


### Step-04: Vector Search

In this step we perform a Similarity search on VertexAI VectoreSearch Index to retrieve FHIR Reources that match the user query.

**Steps:**
- Perform a Vector Search with Filters based on the retrieved patient_id and resource_type
- Since FHIR Resources reference other resources, we also need to provide the referenced Resources to provide the full context to the LLM to imporve the accuracy of the respone. We do this by querying the Neo4J database to get immediate Neigbour resources for each resource returned by the 


In [119]:
def retrieve_relevant_resources(query: str, k: int, 
                                resource_type_text: str,
                               patient_id: str) -> list[str]:
    
    # Create Retriever
    vs_retirever = vector_store.as_retriever(search_type="similarity")
    
    # Filter by fhir_resource_type and fhir_patient_id to retrieve only relevant FHIR Resources
    vs_filter = [
        Namespace(name="fhir_resource_type", allow_tokens=[resource_type_text]),
        Namespace(name="fhir_patient_id", allow_tokens=[patient_id])
    ]
    
    # print(f'retrieve_relevant_resources resource_type: {resource_type_text}')
    if debug_on:
        print(f'vs_filter:')
        pprint(vs_filter)
        print('\n')
        
    # Retrieve all Resources based on above fitler
    vs_retirever.search_kwargs = {"filter": vs_filter, 'k':k}
    docs = vs_retirever.invoke(query)
    
    
    # print(f'Retrieved Resource Documents:')
    # pprint(docs)

    # retrieved_resource_ids = [doc.metadata["fhir_resource_id"][0] for doc in docs]
    # return retrieved_resource_ids
    return(docs)

In [39]:
debug_on = False
print(query)
docs = retrieve_relevant_resources(query,
                                   k=25,
                                  patient_id=patient_id_response,
                                  resource_type_text=resource_type_response['resource_type'])

retrieved_resource_ids = [doc.metadata['fhir_resource_id'][0] for doc in docs]

# for doc in docs:
#     resource_metadata = doc.metadata
#     patient_id = resource_metadata['fhir_patient_id']
#     resource_id = resource_metadata['fhir_resource_id']
#     resource_type = resource_metadata['fhir_resource_type']    
#     print(f'patient_id: {patient_id}\t resource_id:{resource_id}\t resource_type:{resource_type}')
#     # print(doc.page_content)

print(f'Total Resources: {len(docs)}')
# print(f'ResourcesIds List: {retrieved_resource_ids}')

Please provide a summary of Akiko835 Larkin917's Conditions. If there are multiple active conditions, list the most recent one. In the case of multiple conditions with the same most recent date, list all of them. If no active conditions are found, please state 'No active medical conditions found'.
retrieve_relevant_resources resource_type: Condition
Total Resources: 25


<br> ***With the above retrieved resources as llm context, let us try to ask the LLM user query and check its response***

In [40]:
user_query_prompt='''
System: The context below contains entries about the patient's healthcare. 
Please limit your answer to the information provided in the context. Do not make up facts.
Please limit your answers only about the patient in the user question. If you do not find the patient name in the context.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
If you are asked about the patient's name and one the entries is of type patient, you should look for the first given name and family name and answer with: [given] [family]
----------------
{context}

Here's the text to analyze:
{query}
'''

user_query_prompt = ChatPromptTemplate.from_template(user_query_prompt)
user_query_prompt.pretty_print()



System: The context below contains entries about the patient's healthcare. 
Please limit your answer to the information provided in the context. Do not make up facts.
Please limit your answers only about the patient in the user question. If you do not find the patient name in the context.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
If you are asked about the patient's name and one the entries is of type patient, you should look for the first given name and family name and answer with: [given] [family]
----------------
[33;1m[1;3m{context}[0m

Here's the text to analyze:
[33;1m[1;3m{query}[0m



In [41]:
# LangChain Debug
debug_on = False
set_debug(debug_on)
set_verbose(debug_on)

# get all page_content of docs
docs_page_contents_list = [doc.page_content for doc in docs]
docs_page_contents = '\n\n'.join(docs_page_contents_list)
# print(docs_page_contents)

prompt_inputs = {'query': query, 'context': docs_page_contents}

# Chain
# print(f'Prompt Inputs:')
# pprint(prompt_inputs)

print(f'User Query: {query}')
user_query_chain = user_query_prompt | llm
user_query_response = user_query_chain.invoke(prompt_inputs)
print(f'Response: {user_query_response}')

User Query: Please provide a summary of Akiko835 Larkin917's Conditions. If there are multiple active conditions, list the most recent one. In the case of multiple conditions with the same most recent date, list all of them. If no active conditions are found, please state 'No active medical conditions found'.
Response: Akiko835 Larkin917 has the following active conditions:
- Social isolation (finding) - onset date time: 05/24/2023 at 07:49:17
- Abnormal findings diagnostic imaging heart+coronary circulat (finding) - onset date time: 11/14/1987 at 00:07:02
- Hyperlipidemia - onset date time: 06/30/1965 at 06:56:35
- Has a criminal record (finding) - onset date time: 07/15/1959 at 07:47:50
- Ischemic heart disease (disorder) - onset date time: 11/04/1987 at 06:56:35
- Prediabetes - onset date time: 09/08/1943 at 06:56:35
- Anemia (disorder) - onset date time: 09/08/1943 at 06:56:35
- Osteoporosis (disorder) - onset date time: 10/23/1985 at 06:56:35


### Step-05 Neo4J Query - Enhanced Context

<br>***Fetch referenced resources for additional context for LLM***

To enhance the LLM's accuracy in answering user questions, it's crucial to fetch the text representation of all referenced FHIR resources. For instance, an Observation resource might reference Specimen, Device, Procedure, etc. This provides complete context to the LLM, enabling it to accurately answer queries involving these linked resources.

Additionally, this ensures the inclusion of key information like patient names from the referenced Patient resource, preventing incorrect responses stating that the context lacks information about the patient.

In [124]:
def fetch_enhanced_context(in_resource_ids: list) -> str:
    
    # Fetch relevant text from the graph database
    cipher = f"""
    MATCH (node: resource)
    WHERE node.id IN {in_resource_ids}

    OPTIONAL MATCH (node)-[r]-(neighbor :resource)
    WITH COLLECT(DISTINCT node) + COLLECT(DISTINCT neighbor) AS allNodes
    UNWIND allNodes as uniqueNode
    RETURN uniqueNode.text
    """

    try:
        response = graph.query(cipher)[0]
    except Exception as e:
        raise ValueError(f"Error in Graph Query: {e}")

    relevant_resource_text_list = [resource_id[0] for resource_id in response]
    
    # print(f'Number of resources matching query: {len(relevant_resource_text_list)}')
    # print(f'Enhanced Context Text:')
    # pprint(relevant_resource_text_list)
    
    return relevant_resource_text_list
    

In [43]:
enhanced_context_resources = fetch_enhanced_context(retrieved_resource_ids)
print(type(enhanced_context_resources))
print(f'enchanced_context_resource_text len: {len(enhanced_context_resources)}')
# print(f'enhanced_context_resources: {enhanced_context_resources}')

<class 'list'>
enchanced_context_resource_text len: 93


In [44]:
user_query_prompt='''
System: 
You are a Healthcare AI assistant. Your task is to respond to Doctors queries based on patient information from FHIR (Fast Healthcare Interoperability Resources) data.

**Context Handling:**

1. The context below contains entries about the patient's healthcare in FHIR format.
2. Identify and parse relevant FHIR resources within the context (e.g., Patient, Observation, Encounter).
3. Utilize the standard FHIR terminology and codes (e.g., LOINC for observations) to extract specific information.
4. Limit your answer to the information provided in the context. Do not make up facts.
5. Focus your answers on the patient specified in the user question. 
6. If you don't know the answer, simply state that you don't have enough information.
7. Ensure to look for the correct FHIR resource type in context to answer the query. For Example, to answer questions about claims history, search for FHIR resource of type 'Claim'
8. Utilize the 'CurrentDateTime' value in the context to calculate relative time periods (e.g., "last week") for queries referencing them.

**Date Handling:**

1. Pay very close attention to the dates in the context and user query.
2. Prioritize information from the most recent dates when responding to queries without a specified date.
3. Compare dates in the context and user query to determine the temporal relationship between events.
4. Use the 'CurrentDateTime' value, which is in MM/DD/YYYY format, to calculate relative time periods and filter relevant FHIR resources based on those periods.
5. Note that dates in the context are formatted as MM/DD/YYYY (e.g., 10/22/2015)

**Output Formatting:**

Before printing verify that you have considered the date and time in your response meets date time criteria in the user query (if mentioned).  

1. Respond to the user question with the above in mind.
2. Include the patient's name (given name and family name) in your response.
3. Format your output in markdown for clarity.
4. Make the patients name Bold.
5. Format the data into Markdown table with clear headers for information you think can be better represented in a table.

**Example Output:**
   - For vital signs: "[Patient Name]'s [Observation Name] was [Value] [Unit] on [Date]."
   - For encounters: "[Patient Name] had a [Encounter Type] on [Date] (reason: [Reason if available])."

----------------
Today's date - CurrentDateTime = {CurrentDateTime}

Context about the Patient
{context}

User Question:
{query}
'''

user_query_prompt = ChatPromptTemplate.from_template(user_query_prompt)
user_query_prompt.pretty_print()



System: 
You are a Healthcare AI assistant. Your task is to respond to Doctors queries based on patient information from FHIR (Fast Healthcare Interoperability Resources) data.

**Context Handling:**

1. The context below contains entries about the patient's healthcare in FHIR format.
2. Identify and parse relevant FHIR resources within the context (e.g., Patient, Observation, Encounter).
3. Utilize the standard FHIR terminology and codes (e.g., LOINC for observations) to extract specific information.
4. Limit your answer to the information provided in the context. Do not make up facts.
5. Focus your answers on the patient specified in the user question. 
6. If you don't know the answer, simply state that you don't have enough information.
7. Ensure to look for the correct FHIR resource type in context to answer the query. For Example, to answer questions about claims history, search for FHIR resource of type 'Claim'
8. Utilize the 'CurrentDateTime' value in the context to calculate 

In [45]:
# LangChain Debug
debug_on = False
set_debug(debug_on)
set_verbose(debug_on)

enhanced_context_resources = [res_text for res_text in enhanced_context_resources if res_text is not None]
enchanced_context_resources_text = '\n\n'.join(enhanced_context_resources)
# print(enhanced_context_resources)

current_datetime = datetime.now(timezone.utc).astimezone(timezone(offset=timedelta(hours=5, minutes=30)))
current_datetime_str = current_datetime.strftime("%m/%d/%Y")
    
prompt_inputs = {'query': query, 'CurrentDateTime': current_datetime_str,'context': enchanced_context_resources_text}

# Chain
# print(f'Prompt Inputs:')
# pprint(prompt_inputs)

print(f'User Query: {query}')
user_query_chain = user_query_prompt | llm
user_query_response = user_query_chain.invoke(prompt_inputs)
print(f'Response: {user_query_response}')

User Query: Please provide a summary of Akiko835 Larkin917's Conditions. If there are multiple active conditions, list the most recent one. In the case of multiple conditions with the same most recent date, list all of them. If no active conditions are found, please state 'No active medical conditions found'.
Response: The most recent active condition for Akiko835 Larkin917 is **Social isolation (finding)**, last recorded on 05/24/2023.


## Bringing it all together

In [126]:
debug_on = False
def answer_fhir_query(user_query: str):
    
    # Get Patient Name Chain
    patient_name_chain = patient_name_prompt | llm
    patient_name_response = patient_name_chain.invoke({'query': user_query})
    
    if patient_name_response:
        patient_name = json.loads(patient_name_response)
    else:
        patient_name = get_patient_name_from_user()
        
    
    if debug_on:
        print('Patient Name Info:')
        print(f'patient_name type: {type(patient_name)}')
        print(f'patient_name: {patient_name}')
        print('\n')
        
    # Get Patient_Id
    patient_id = get_patient_id(patient_name)
    if debug_on:
        print('Patient Id Info:')
        print(f'patient_id: {patient_id}')
        print('\n')
        
    # Identify Resource Type
    resource_type_chain = resource_type_prompt | llm | resource_type_parser
    resource_type_response = resource_type_chain.invoke(user_query)
    resource_type_text=resource_type_response['resource_type']

    
    if debug_on:
        print('Resource Type Info:')
        print(f'resource_type: {resource_type_response["resource_type"]}')
        print('\n')
        
    # Vector Search: Get Relevant Resources based on user query
    k=25
    vs_search_resource_docs = retrieve_relevant_resources(user_query, 
                                                          k,
                                                          patient_id=patient_id,
                                                          resource_type_text=resource_type_text)
    
    # for doc in vs_search_resource_docs:
    #     print(doc.metadata['fhir_resource_id'], "-", doc.metadata['fhir_resource_type'])
    
    vs_search_resource_ids = [doc.metadata['fhir_resource_id'][0] for doc in vs_search_resource_docs]
    
    if debug_on:
        print('\n')
        print('Resource Ids retrieved from Vector Search:')
        print(f'vs_search_resource_ids len: {len(vs_search_resource_ids)}')
        # print(f'vs_search_resource_ids: {vs_search_resource_ids}')
        print('\n')
    
    # Get Current Date and Time in format MM/DD/YYYY
    current_datetime = datetime.now(timezone.utc).astimezone(timezone(offset=timedelta(hours=5, minutes=30)))
    current_datetime_str = current_datetime.strftime("%m/%d/%Y")
    
    if resource_type_text == 'Patient':
        context_text = '\n\n'.join([doc.page_content for doc in vs_search_resource_docs])
        prompt_inputs = {'query': user_query, 'CurrentDateTime': current_datetime_str,'context': context_text}
        
    else:
        # Neo4J query - for getting enhanced context
        enhanced_context = fetch_enhanced_context(in_resource_ids=vs_search_resource_ids)
        enhanced_context_text = '\n\n'.join([res_text for res_text in enhanced_context if res_text is not None])
        prompt_inputs = {'query': user_query, 'CurrentDateTime': current_datetime_str,'context': enhanced_context_text}

        if debug_on:
            print('\n')
            print(f'# of Enhanced Context Resources: {len(enhanced_context)}')
            print(f'Enhanced Context Text:')
            pprint(enhanced_context_text)
            print('\n')
        
        
    
    
    # Finally calling the LLM to answer user query
   
    user_query_chain = user_query_prompt | llm
    user_query_response = user_query_chain.invoke(prompt_inputs)
    
    return user_query_response

In [128]:
from rich.console import Console
from rich.markdown import Markdown

console = Console()

langchain_debug = False
set_debug(langchain_debug)
set_verbose(langchain_debug)

user_query = "What is the Body Height of Benjamin360 Hintz995 and when was it measured?"
# user_query = "What is the Body Weight of Benjamin360 Hintz995 and when was it measured?"
# user_query = "Tell me about the last 5 Benjamin360's Procedures?"
# user_query = "Tell me about observations performed by Benjamin360 in the last 2 years?"
# user_query = "What allergies does Benjamin360 have?"

console.print(Markdown('# User Query'))
print(user_query)

console.print(Markdown('# LLM Response'))
llm_user_query_response = (answer_fhir_query(user_query))
console.print(Markdown(llm_user_query_response))


What is the Body Height of Benjamin360 Hintz995 and when was it measured?


In [129]:
output_formatting_instructions = "List the ouput in a Markdown table format."

vital_signs_prompt = """Please provide a summary of latest vital signs. Include the latest value, unit of measurement, and date taken (MM/DD/YYYY) for each of the following vital signs:

*   Body Height (LOINC code 8302-2)
*   Body Weight (LOINC code 29463-7)
*   Body Mass Index (BMI) [Ratio] (LOINC code 39156-5)
*   Body temperature (LOINC code 8310-5)
*   Systolic blood pressure (LOINC code 8480-6)
*   Diastolic blood pressure (LOINC code 8462-4)
*   Heart rate (LOINC code 8867-4)
*   Respiratory rate (LOINC code 9279-1)
*   Oxygen saturation in Arterial blood by Pulse oximetry (LOINC code 59408-5)

If a particular vital sign is not found in the records, please indicate so with 'N/A'. Present the information in a markdown table with the columns 'Vital Sign', 'Value', 'Unit', and 'Date Taken'."""

patient_summary_query = [
    {'Demographics': "please provide the following demographic information, if available in the context: full name, date of birth, gender, primary phone number, and home address."},
    {'Medical History': "Please provide a summary of all active Conditions. List the onset date, status and verification. List the ouput in a Markdown table format. If no active conditions are found, please state 'No active medical conditions found'."},
    {'Medications': "please list all current medications, including the medication name, dosage instructions, and status (e.g., active, completed). Present this informationin a markdown table with the columns 'Medication Name', 'Dosage', and 'Status'. If no current medications are found, please state 'No current medications found'."},
    {'Allergies':"please list all known allergies, including the allergen name and severity. Present this information in a markdown table with the columns 'Allergen' and 'Severity'. If no allergies are found, please state 'No known allergies found'."},
    {'Immunizations':"please list all immunizations for, including the vaccine code, date administered, and status. Present this information in a markdown table with the columns 'Vaccine Code', 'Date Administered', and 'Status'. If no immunizations are found, please state 'No immunizations found'."},
    {'Vital Signs': vital_signs_prompt},
    
]

In [149]:
langchain_debug = False
set_debug(langchain_debug)
set_verbose(langchain_debug)
def generate_patient_summary(patient_name):
    
    report_text = '# Patient Summary\n'
    
    # console.print(Markdown('# Patient Summary'))
    
    patient_context = f'For the given patient: {patient_name}, '
    
    for section in patient_summary_query:
        section_title = list(section.keys())[0]
        print(f'Processing Section: {section_title}')
        
        report_text = report_text + '___\n'
        section_header = f'## {section_title}\n'
        report_text = report_text + section_header + '\n'
        report_text = report_text + '___\n'
        # console.print(Markdown(section_header))
        
        section_question = list(section.values())[0]
        # llm_user_query_response = patient_context + section_question + output_formatting_instructions
        llm_user_query_response = answer_fhir_query(patient_context + section_question + output_formatting_instructions)
        report_text = report_text + llm_user_query_response + '\n'
        
        # console.print(Markdown(llm_user_query_response))
    
    # console.print(Markdown(report_text))
    report_text = report_text + '**END OF REPORT**'
    return report_text
        
#generate_patient_summary('Akiko835 Larkin917')  

In [151]:
patient_summary_name = 'Akiko835 Larkin917'
patient_summary_response = generate_patient_summary(patient_name) 

patient_summary_md_file = f'{patient_summary_name}.md'
with open (patient_summary_md_file, 'w') as f:
    f.write(patient_summary_response)
    


Processing Section: Demographics
Processing Section: Medical History
Processing Section: Medications
Processing Section: Allergies
Processing Section: Immunizations
Processing Section: Vital Signs


Please enter the patient's full name:  Akiko835 Larkin917
Is 'Akiko835 Larkin917' correct? (yes/no):  yes
