In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Unlock FHIR with RAG on Vertex AI

## Run the Notebook

**_NOTE_**: This notebook has been tested in the following environment:

* Python version = 3.10.13

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/adethyaa/unlock-fhir-with-rag-on-vertexai/blob/main/Unlock%20FHIR%20with%20RAG%20on%20Vertex%20AI.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/adethyaa/unlock-fhir-with-rag-on-vertexai/blob/main/Unlock%20FHIR%20with%20RAG%20on%20Vertex%20AI.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/adethyaa/unlock-fhir-with-rag-on-vertexai/main/Unlock%20FHIR%20with%20RAG%20on%20Vertex%20AI.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>

| | |
|-|-|
|Author(s) | [Vikrama Adethyaa](https://github.com/adethyaa) |

---
## 1. Overview

This notebook demonstrates building a natural language interface to complex [FHIR](https://fhir.org/about.html) datasets using [Google Cloud’s Vertex AI](https://cloud.google.com/vertex-ai?hl=en).  Leveraging Retrieval Augmented Generation (RAG), Enterprise Knowledge Graphs, and Vector Search, this solution empowers healthcare professionals to query FHIR data with natural language. This demo is inspired by [Sam Schifman's work](https://medium.com/@samschifman/rag-on-fhir-with-knowledge-graphs-04d8e13ee96e).

---
## 2. Environment Setup

### 2.1. Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

### 2.2. Install Packages and Dependencies
Please install the following packages to run this notebook.

In [None]:
# Install Vertex AI SDK
! pip install --user -r requirements.txt

### 2.3 Download Matching Engine Helper Scripts

- ***Matching Engine*** is now called ***[Vertex AI Vector Search](https://cloud.google.com/vertex-ai/docs/vector-search/overview)***
- The cell below downloads helper functions necessary for the Vertex AI Matching Engine. These functions improve notebook readability. You can find the ***[source code on Github](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/language/use-cases/document-qa/utils).***

In [None]:
import os
import urllib.request

# Local folder that receives the downloaded helper modules
util_folder = "utils"

# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(util_folder, exist_ok=True)

url_prefix = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/use-cases/document-qa/utils"
files = ["__init__.py", "matching_engine.py", "matching_engine_utils.py"]

# Download each helper file into the local utils folder (overwrites existing copies)
for fname in files:
    urllib.request.urlretrieve(f"{url_prefix}/{fname}", filename=os.path.join(util_folder, fname))

***Restart Kernel***

Run the following cell to restart the kernel or use the button to restart the kernel.

<div class="alert alert-block alert-warning">
<b>⚠️ Kindly allow the kernel to finish restarting before continuing. ⚠️</b>
</div>

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
import IPython

# Fetch the running IPython application and ask its kernel to shut down;
# passing True requests a restart instead of a plain shutdown.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

### 2.4. Authenticating your notebook environment

- If you are using **Colab** to run this notebook, run the cell below and continue.
- If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)

In [1]:
import sys

# Only run the Colab authentication flow when the google.colab module is
# loaded; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

**Vertex AI Workbench**
- Open a Terminal in the Jupyter notebook
- Execute the below command and follow the instructions

```bash
gcloud auth login
```

### 2.5. Define Global Variables

In [2]:
# Limit FHIR files loaded for demo purposes.
# Data ingestion into Neo4J and the Vector Search index takes time, so the
# parameter below caps the number of FHIR files that are ingested.
DEMO_FILES_INGEST_LIMIT = 20 # @param {type:"integer"}

# GCP Parameters
# NOTE: replace PROJECT_ID with your own Google Cloud project ID before running.
PROJECT_ID = "propane-crawler-363311"  # @param {type:"string"}
REGION = "us-central1"  # @param {type: "string"}

# Neo4J Connection Parameters
# NOTE(review): demo defaults for the local Docker container. Do not keep a
# hard-coded password for any shared or production deployment.
NEO4J_URL="bolt://localhost:7687" # @param {type:"string"}
NEO4J_USER="neo4j" # @param {type:"string"}
NEO4J_PASSWORD="password" # @param {type:"string"}

# Dimensionality of the embedding vectors stored in the Vector Search index.
# NOTE(review): presumably must match the output size of TEXT_EMBEDDING_MODEL_NAME - confirm when changing the model.
ME_DIMENSIONS = 768 # @param {type:"integer"}
ME_DISTANCE_MEASURE_TYPE = "DOT_PRODUCT_DISTANCE" # @param {type:"string"}

# Update to bigger SHARDS for larger data volumes & performance
# Doc - https://cloud.google.com/vertex-ai/docs/vector-search/create-manage-index
ME_SHARD_SIZE = "SHARD_SIZE_SMALL" # @param ["SHARD_SIZE_SMALL", "SHARD_SIZE_MEDIUM", "SHARD_SIZE_LARGE"]

# Vertex AI Vector Search (MatchingEngine) Endpoint Parameters
# Doc - https://cloud.google.com/vertex-ai/docs/vector-search/create-manage-index

# The machine types that you can use to deploy your index
ME_ENDPOINT_MACHINE_TYPE = "e2-standard-2" # @param ["n1-standard-16", "n1-standard-32", "e2-standard-2", "e2-standard-16", "e2-highmem-16", "n2d-standard-32"]

# Replica bounds passed to the index deployment
ME_ENDPOINT_MIN_REPLICA_COUNT = 2 # @param {type:"integer"}
ME_ENDPOINT_MAX_REPLICA_COUNT = 10 # @param {type:"integer"}

# Vertex AI Vector Search (MatchingEngine) Index Parameters
# ME_EMBEDDING_GCS_DIR is the Cloud Storage bucket name (without the gs:// prefix)
ME_INDEX_NAME = 'fhir_me_index'  # @param {type: "string"}
ME_EMBEDDING_GCS_DIR = f'{PROJECT_ID}-me-bucket' # @param {type:"string"}
ME_DESCRIPTION = "Index for FHIR Resources" # @param {type:"string"}

# Same three settings for the "enhanced context" index, derived from the values above
ME_ENHANCED_CONTEXT_INDEX_NAME = f'{ME_INDEX_NAME}_enhanced' # @param {type:"string"}
ME_ENHANCED_EMBEDDING_GCS_DIR = f'{ME_EMBEDDING_GCS_DIR}_enhanced' # @param {type:"string"}
ME_ENHANCED_DESCRIPTION = f'Enhanced Context {ME_DESCRIPTION}' # @param {type:"string"}



# Set the LLM and the text embedding model to use
VERTEX_AI_MODEL_NAME = 'gemini-1.0-pro-001'
TEXT_EMBEDDING_MODEL_NAME = "textembedding-gecko@003"

### 2.6. Import Libraries

**Colab only:** Run the below cell to initialize the Vertex AI SDK. For Vertex AI Workbench, you don't need to run this.


In [3]:
import vertexai
vertexai.init(project=PROJECT_ID, location=REGION)

<br>*Import Python Libraries*

<div class="alert alert-block alert-warning">
<b>⚠️ Restart the Python kernel if you run into issues while importing LangChain ⚠️</b>
</div>

In [4]:
# Utils
import os
import glob
from pprint import pprint
import json
import csv

import time
import datetime

import uuid
import numpy as np

import warnings

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

from google.protobuf.json_format import MessageToDict

# Google Vertex AI
# from google.cloud import aiplatform
# print(f"Vertex AI SDK version: {aiplatform.__version__}")

# Neo4j Helper scripts
from utils.NEO4J_Graph import Graph
from utils.FHIR_to_string import FHIR_to_string
from utils.FHIR_to_graph import resource_to_node, resource_to_edges, flat_fhir_to_json_str

# Langchain
import langchain
print(f"LangChain version: {langchain.__version__}")

from langchain import PromptTemplate


from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_google_vertexai import VectorSearchVectorStore
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
    Namespace,
    NumericNamespace,
)


# Import libraries
from langchain.chains import RetrievalQA
from langchain_google_vertexai import ChatVertexAI


# Import custom Matching Engine packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils

# Import Custom LangChain Retriever
from utils.FHIRResourcesRetriever import FHIRResourcesRetriever

warnings.filterwarnings("ignore")

LangChain version: 0.1.16


---
## 3. Setup RAG Prerequisites

In this section we will create:
- Neo4J - local deployment for Enterprise Knowledge Graph
- VertexAI VectorSearch - Vector Database

### 3.1. Deploy Neo4J Docker Container

We'll use **Docker** to deploy a local Neo4j instance for demo purposes.  For production environments, we highly recommend Neo4j on Google Cloud Marketplace. [Try Neo4j on Google Cloud Marketplace](https://console.cloud.google.com/marketplace/product/endpoints/prod.n4gcp.neo4j.io?pli=1&mpp=4bfb2414ab973c741b6f067bf06d5575&mpid=%24device%3A18e0c346ea25f9-098a968de268b5-1d525637-384000-18e0c346ea25fa).

The command below launches a local Neo4j database Docker container named ***testneo4j***. Let's break down what it does:

**Port Mapping:**
- 7474: Access the Neo4j web interface through your browser (usually http://localhost:7474).
- 7687: Enables communication with Neo4j using the Bolt protocol (essential for working with the database).

**Data Volumes:** Folders on your machine ($HOME/neo4j/*) store your database, logs, imports, etc., so your data is safe even if the container stops.

**Secure Credentials:** Variables `{NEO4J_USER}` and `{NEO4J_PASSWORD}` set your Neo4j username and password for secure access.

In [5]:
%env NEO4J_USER={NEO4J_USER}
%env NEO4J_PASSWORD={NEO4J_PASSWORD}

env: NEO4J_USER=neo4j
env: NEO4J_PASSWORD=password


In [6]:
! docker run --name testneo4j -p7474:7474 -p7687:7687 -d \
    -v $HOME/neo4j/data:/data \
    -v $HOME/neo4j/logs:/logs \
    -v $HOME/neo4j/import:/var/lib/neo4j/import \
    -v $HOME/neo4j/plugins:/plugins \
    --env NEO4J_AUTH=$NEO4J_USER/$NEO4J_PASSWORD \
    --env='NEO4JLABS_PLUGINS=["apoc"]' \
    neo4j:latest

docker: Error response from daemon: Conflict. The container name "/testneo4j" is already in use by container "c11076036c54a06a362119887aa38eebcfe7f287f7f436f8b5b241f0d98e0012". You have to remove (or rename) that container to be able to reuse that name.
See 'docker run --help'.


In [7]:
# Check if Docker Container is running
! docker ps -a

CONTAINER ID   IMAGE          COMMAND                  CREATED        STATUS        PORTS                                                                                            NAMES
c11076036c54   neo4j:latest   "tini -g -- /startup…"   43 hours ago   Up 10 hours   0.0.0.0:7474->7474/tcp, :::7474->7474/tcp, 7473/tcp, 0.0.0.0:7687->7687/tcp, :::7687->7687/tcp   testneo4j


In [60]:
# Start the Container if it is not running
! docker start testneo4j

testneo4j


#### 3.1.1. Connect to Neo4J Database

This code block creates a Graph object instance, establishing a connection to the Neo4j database.
See **utils/NEO4J_Graph.py** for more information

In [8]:
graph = Graph(NEO4J_URL, NEO4J_USER, NEO4J_PASSWORD)

#### 3.1.2. Neo4j Database Helper Cells

The following three cells contain database management functions. For a new or blank database, you can skip them.

In [9]:
# Get type and number of each FHIR resource in the database
resource_metrics = graph.resource_metrics()
resource_metrics.sort()
pprint(resource_metrics)

[['AllergyIntolerance', 10],
 ['CarePlan', 60],
 ['CareTeam', 60],
 ['Claim', 3486],
 ['Condition', 656],
 ['Device', 39],
 ['DiagnosticReport', 3100],
 ['DocumentReference', 1859],
 ['Encounter', 1859],
 ['ExplanationOfBenefit', 3486],
 ['ImagingStudy', 8],
 ['Immunization', 259],
 ['Medication', 957],
 ['MedicationAdministration', 957],
 ['MedicationRequest', 1627],
 ['Observation', 13501],
 ['Patient', 20],
 ['Procedure', 2966],
 ['SupplyDelivery', 239]]


In [10]:
# metrics for counting nodes and relationships
node_count, relationship_count = graph.database_metrics()
print('Database Metrics:')
print(f'    - Node Count = {node_count}')
print(f'    - Relationship Count = {relationship_count}')

Database Metrics:
    - Node Count = 37884
    - Relationship Count = 190926


### 3.2. Create VertexAI VectorSearch Index

This involves the following steps:
- Create a GCS Bucket for storing text.
- Instantiate embedding model - used for creating vectors of Text.
- Create a VectorSearch Streaming Index
- Create VectorSearch IndexEndpoint & deploy Index
- Create LangChain VectorStore on VertexAI VectorSearch

#### 3.2.1. Create GCS Bucket 

The Google Cloud Storage Bucket will be used by Vector Store Index.
[Create and manage your index](https://cloud.google.com/vertex-ai/docs/vector-search/create-manage-index)

In [11]:
! set -x && gsutil mb -p $PROJECT_ID -l $REGION gs://$ME_EMBEDDING_GCS_DIR

+ gsutil mb -p propane-crawler-363311 -l us-central1 gs://propane-crawler-363311-me-bucket
Creating gs://propane-crawler-363311-me-bucket/...
ServiceException: 409 A Cloud Storage bucket named 'propane-crawler-363311-me-bucket' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


#### 3.2.2. Instantiate embedding model

Load the pre-trained `textembedding-gecko@003` model, which is used to generate the text embeddings.
Learn more about Google's Foundation Models and their capabilities in this [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_model_apis).

In [12]:
# Create Text Embedding
text_embedding_model = VertexAIEmbeddings(
    model_name=TEXT_EMBEDDING_MODEL_NAME,
    project=PROJECT_ID,
    location=REGION,
    max_retries=6
)

text_embedding_model

VertexAIEmbeddings(client=<vertexai.language_models.TextEmbeddingModel object at 0x7f77bbc18a90>, project='propane-crawler-363311', location='us-central1', request_parallelism=5, max_retries=6, stop=None, model_name='textembedding-gecko@003', client_preview=None, temperature=None, max_output_tokens=None, top_p=None, top_k=None, credentials=None, n=1, streaming=False, safety_settings=None, api_transport=None, api_endpoint=None, instance={'max_batch_size': 250, 'batch_size': 250, 'min_batch_size': 5, 'min_good_batch_size': 5, 'lock': <unlocked _thread.lock object at 0x7f77c36a9d40>, 'batch_size_validated': False, 'task_executor': <concurrent.futures.thread.ThreadPoolExecutor object at 0x7f77bbc18970>, 'embeddings_task_type_supported': True, 'get_embeddings_with_retry': <function TextEmbeddingModel.get_embeddings at 0x7f77b94be9e0>})

#### 3.2.3. Create a VectorSearch Streaming Index

In this step, we'll create the Vector Search Index.

For more details, refer to the [Create and manage your index](https://cloud.google.com/vertex-ai/docs/vector-search/create-manage-index) documentation.
Vector Store supports two index types:

- **Batch Updates:** Ideal for less frequent modifications.
- **Streaming Updates:** Allows near-real-time additions and queries (our choice for this example).

<br>*Let's create a dummy embeddings file required to initialize the Vector Search Index.*

In [27]:
# Create Temp folder
! mkdir -p $HOME/temp

# Create Dummy Embdeddng Data
dummy_embedding = {"id": str(uuid.uuid4()), "embedding": list(np.zeros(ME_DIMENSIONS))}

# Write Dummy Embedding Data to a JSON file
with open('../temp/dummy_embedding.json', 'w') as f:
    json.dump(dummy_embedding, f)
    
# Copy the dummy_embedding.json to Cloud Storage Bucket.
! set -x && gsutil cp ../temp/dummy_embedding.json gs://{ME_EMBEDDING_GCS_DIR}/init_index/dummy_embedding.json

+ gsutil cp ../temp/dummy_embedding.json gs://propane-crawler-363311-me-bucket/init_index/dummy_embedding.json
Copying file://../temp/dummy_embedding.json [Content-Type=application/json]...
/ [1 files][  3.8 KiB/  3.8 KiB]                                                
Operation completed over 1 objects/3.8 KiB.                                      


<br>*Let us now create a Streaming Index*

In [13]:
me_utils = MatchingEngineUtils(PROJECT_ID, REGION, ME_INDEX_NAME)

me_index = me_utils.create_index(
    embedding_gcs_uri=f'gs://{ME_EMBEDDING_GCS_DIR}/init_index',
    dimensions=ME_DIMENSIONS,
    index_update_method='streaming',
    index_algorithm='tree-ah',
    shard_size= ME_SHARD_SIZE,
    distance_measure_type=ME_DISTANCE_MEASURE_TYPE,
    description=ME_DESCRIPTION
)

INFO:root:Index fhir_me_index already exists with id projects/884766917846/locations/us-central1/indexes/7340819014102286336


In [29]:
# Get information about the Index
if me_index:
    index_metadata = MessageToDict(me_index._pb)
    print('Index Details:')
    print(f'- Index Name = {index_metadata["name"]}')
    print(f'- Update Method = {index_metadata["indexUpdateMethod"]}')
    print(f'- Dimensions = {index_metadata["metadata"]["config"]["dimensions"]}')
    print(f'- Shard Size = {index_metadata["metadata"]["config"]["shardSize"]}')
    print(f'- Distance Measure Type = {index_metadata["metadata"]["config"]["distanceMeasureType"]}')
    algorithm = list(index_metadata["metadata"]["config"]['algorithmConfig'].keys())[0]
    print(f'- Algorithm = {algorithm}')
    # print(f'- Index Stats = {index_metadata["indexStats"]}')

# index_metadata

Index Details:
- Index Name = projects/884766917846/locations/us-central1/indexes/7340819014102286336
- Update Method = STREAM_UPDATE
- Dimensions = 768.0
- Shard Size = SHARD_SIZE_SMALL
- Distance Measure Type = DOT_PRODUCT_DISTANCE
- Algorithm = treeAhConfig


#### 3.2.4. Create VectorSearch IndexEndpoint & deploy Index

In this step, we'll deploy the Index to a Vector Search Index Endpoint. This endpoint is essential for sending queries to your index. 

We'll use a [Public endpoint](https://cloud.google.com/vertex-ai/docs/vector-search/deploy-index-public) for this example. 

To set up a [Private Endpoint](https://cloud.google.com/vertex-ai/docs/vector-search/deploy-index-vpc), please refer to the documentation.

<div class="alert alert-block alert-warning">
<b>⚠️ Important: Deploying an Index to an Endpoint takes time, typically around 15-25 minutes or more. ⚠️</b>
</div>


In [32]:
me_endpoint = me_utils.deploy_index(
    machine_type=ME_ENDPOINT_MACHINE_TYPE,
    min_replica_count=ME_ENDPOINT_MIN_REPLICA_COUNT,
    max_replica_count=ME_ENDPOINT_MAX_REPLICA_COUNT,
    public_endpoint_enabled=True
)

me_endpoint

INFO:root:Index endpoint fhir_me_index-endpoint already exists with resource name as projects/884766917846/locations/us-central1/indexEndpoints/2407125622317907968 and endpoint domain name as 115788813.us-central1-884766917846.vdb.vertexai.goog
INFO:root:Skipping deploying Index. Index fhir_me_indexalready deployed with id projects/884766917846/locations/us-central1/indexes/7340819014102286336 to the index endpoint fhir_me_index-endpoint


name: "projects/884766917846/locations/us-central1/indexEndpoints/2407125622317907968"
display_name: "fhir_me_index-endpoint"
deployed_indexes {
  id: "fhir_me_index_20240429164231"
  index: "projects/884766917846/locations/us-central1/indexes/7340819014102286336"
  display_name: "fhir_me_index_20240429164231"
  create_time {
    seconds: 1714408951
    nanos: 798613000
  }
  index_sync_time {
    seconds: 1714409921
    nanos: 439565000
  }
  deployment_group: "default"
  dedicated_resources {
    machine_spec {
      machine_type: "e2-standard-2"
    }
    min_replica_count: 2
    max_replica_count: 10
  }
}
etag: "AMEw9yOdCMM98NCY6uFJmdgw9gRbaAZFD3fXhYNcsVEsFZKyfjtxeVjRDxSQbdFuVfw="
create_time {
  seconds: 1714408891
  nanos: 533732000
}
update_time {
  seconds: 1714408892
  nanos: 155730000
}
public_endpoint_domain_name: "115788813.us-central1-884766917846.vdb.vertexai.goog"
encryption_spec {
}

In [33]:
if me_endpoint:
    endpoint_metadata = MessageToDict(me_endpoint._pb)
    print(f'Endpoint Name: {endpoint_metadata["name"]}')
    print(f'Endpoint Public Domain Name: {endpoint_metadata["publicEndpointDomainName"]}')
    print('Deployed indexes on the endpoint:')
    
    for d in me_endpoint.deployed_indexes:
        print(f'    - Deployed Indexe ID = {d.id}')
        print(f'      Machine Type = {d.dedicated_resources.machine_spec.machine_type}')
        print(f'      Min Replica Count = {d.dedicated_resources.min_replica_count}')
        print(f'      Max Replica Count = {d.dedicated_resources.max_replica_count}')
        
# endpoint_metadata

Endpoint Name: projects/884766917846/locations/us-central1/indexEndpoints/2407125622317907968
Endpoint Public Domain Name: 115788813.us-central1-884766917846.vdb.vertexai.goog
Deployed indexes on the endpoint:
    - Deployed Indexe ID = fhir_me_index_20240429164231
      Machine Type = e2-standard-2
      Min Replica Count = 2
      Max Replica Count = 10


In [14]:
# Get Matching Engine Index id and Endpoint id
me_utils = MatchingEngineUtils(PROJECT_ID, REGION, ME_INDEX_NAME)
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = me_utils.get_index_and_endpoint()

print(f'- ME_INDEX_ID:{ME_INDEX_ID}\n- ME_INDEX_ENDPOINT_ID:{ME_INDEX_ENDPOINT_ID}')

- ME_INDEX_ID:projects/884766917846/locations/us-central1/indexes/7340819014102286336
- ME_INDEX_ENDPOINT_ID:projects/884766917846/locations/us-central1/indexEndpoints/2407125622317907968


#### 3.2.5 Create a LangChain VectorStore instance of VertexAI VectorSearch
In this section we create a `VectorSearchVectorStore` object and connect it to the Index & Endpoint we just created.
We will also write and retrieve test data to test the connection.


In [15]:
vector_store = VectorSearchVectorStore.from_components(
    project_id=PROJECT_ID,
    region=REGION,
    gcs_bucket_name=f"gs://{ME_EMBEDDING_GCS_DIR}".split("/")[2],
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
    stream_update=True,
    embedding=text_embedding_model
)
vector_store

<langchain_google_vertexai.vectorstores.vectorstores.VectorSearchVectorStore at 0x7f77b94dbb50>

#### 3.2.6. Testing VertexAI VectorSearch

We use VertexAI VectorSearch to retrieve relevant FHIR data based on the user's query. The search incorporates:
- **Semantic Matching**: Finding Resources with similar text embeddings to the query.
- **Metadata Filters**: Narrowing down results based on patient_id and resource_type to ensure relevance. (Refer to the [Filter vector matches](https://cloud.google.com/vertex-ai/docs/vector-search/filtering) documentation.)

A RAG-based LLM prompt consists of 3 sections:
- **Instructions**: Guidance for the LLM on how to generate an accurate and relevant response based on the retrieved context and the user query.
- **Context**: The FHIR data retrieved from Vector Search for the user's query. We apply filters based on patient_id and resource_type to ensure the retrieved context is relevant to the specific patient and the type of information sought. This is a key step, as the accuracy of the LLM's response directly depends on the relevance of this retrieved context.
- **User Question**: The original query posed by the user.

For Demonstration purpose we will:
- First query VectorSearch without Filter.
- Next query VectorSearch with Filters. Refer to the VectorSearch [Filter vector matches](https://cloud.google.com/vertex-ai/docs/vector-search/filtering) documentation for more information.

<br>*First, let us add sample data to vectorSearch*

In [16]:
# Create Sample Data to Ingest
resource_text_metadatas = [
    {
    "fhir_patient_id":"pid_111111111",
    "fhir_resource_id":"rid_111111111",
    "fhir_resource_type":"Test_Resource_type",
    "neo4j_node_id": "nid_111111111"
    },
    {
    "fhir_patient_id":"pid_222222222",
    "fhir_resource_id":"rid_222222222",
    "fhir_resource_type":"Test_Resource_type",
    "neo4j_node_id": "nid_222222222"
    },
    {
    "fhir_patient_id":"pid_333333333",
    "fhir_resource_id":"rid_333333333",
    "fhir_resource_type":"Test_Resource_type",
    "neo4j_node_id": "nid_333333333"
    }  
]

resource_texts = ['This is a sample Resource Type', 'This is a sample Patient Type', 'This is a sample Car Type']

In [37]:
# Write Sample Data to VectorSearch
ids = vector_store.add_texts(texts=resource_texts, 
                             metadatas=resource_text_metadatas, 
                             is_complete_overwrite=True)
ids

Upserting datapoints MatchingEngineIndex index: projects/884766917846/locations/us-central1/indexes/7340819014102286336


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Upserting datapoints MatchingEngineIndex index: projects/884766917846/locations/us-central1/indexes/7340819014102286336


MatchingEngineIndex index Upserted datapoints. Resource name: projects/884766917846/locations/us-central1/indexes/7340819014102286336


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:MatchingEngineIndex index Upserted datapoints. Resource name: projects/884766917846/locations/us-central1/indexes/7340819014102286336


['18072f8d-bdfb-49b6-b5de-9f2856fd30f3',
 'fc2e1c4b-d1e2-4301-bae1-85043a0355a2',
 '9a2790d3-130c-4182-b1e9-4f60694c4b68']

<br>*Query VectorSearch without Filter*

In [17]:
query_text = 'sample Resource'
response = vector_store.similarity_search_with_score(query=query_text, k=10)

response 

# Note: The response will likely contain all 3 sample data points.
# Although their texts differ (sample Resource Type, sample Patient Type, sample Car Type),
# they share the same meaning around "sample ... Type", which is what our
# semantic search is designed to capture.

[(Document(page_content='This is a sample Resource Type', metadata={'fhir_patient_id': 'pid_111111111', 'fhir_resource_id': 'rid_111111111', 'fhir_resource_type': 'Test_Resource_type', 'neo4j_node_id': 'nid_111111111'}),
  0.7039443254470825),
 (Document(page_content="The type of information in this entry is observation. The status for this observation is final. The category of this observation is Survey. The code for this observation is Protocol for Responding to and Assessing Patients' Assets, Risks, and Experiences [PRAPARE]. This observation was effective date time on 09/01/2015 at 05:25:14. This observation was issued on 09/01/2015 at 05:25:14. This observation contains 21 components. The 1st component's code for this observation is Within the last year, have you been afraid of your partner or ex-partner?. The 1st component's value codeable concept for this observation is No. The 2nd component's code for this observation is Do you feel physically and emotionally safe where you cur

<br>*Query VectorSearch with Filters*

In [41]:
# Retrieving Data from VectorSearch with Filter

vs_filter = [Namespace(name="fhir_patient_id", allow_tokens=['pid_111111111'])]
response = vector_store.similarity_search_with_score(query=query_text, k=10, filter=vs_filter)
response

# Note: We now expect a single data point in the results, as we've applied a filter to retrieve
# only the resource associated with the specific patient ID ('fhir_patient_id=pid_111111111'). 
# This filter ensures we're focusing on information relevant to this particular patient. 

[(Document(page_content='This is a sample Resource Type', metadata={'fhir_patient_id': 'pid_111111111', 'fhir_resource_id': 'rid_111111111', 'fhir_resource_type': 'Test_Resource_type', 'neo4j_node_id': 'nid_111111111'}),
  0.7039443254470825)]

---
## 4. Fetch FHIR Synthetic Data

Now that we have established the prerequisite infrastructure, let us fetch the sample data.

We will be using sample FHIR data from [Synthea](https://synthea.mitre.org/) for the purpose of this demonstration. 
We will be using the pre-generated data available [here](https://synthetichealth.github.io/synthea-sample-data/downloads/latest/synthea_sample_data_fhir_latest.zip). 

### 4.1. Download synthetic FHIR Data from Synthea

In [None]:
! mkdir -p $HOME/working

! curl  \
--url https://synthetichealth.github.io/synthea-sample-data/downloads/latest/synthea_sample_data_fhir_latest.zip \
--output $HOME/working/synthea_sample_data_fhir_latest.zip

<br>*Unzip the synthea_sample_data_fhir_latest.zip*

In [None]:
! unzip $HOME/working/synthea_sample_data_fhir_latest.zip -d $HOME/working/bundles

### 4.2. Taking Inventory of FHIR Files

<br>*Let us look at the attributes of the FHIR Data we are going to ingest.*

In [42]:
# Fetch all FHIR files.
# The bundles are unzipped into $HOME/working/bundles, so resolve that same
# location explicitly instead of relying on the notebook's working directory.
fhir_folder_path = os.path.join(os.path.expanduser("~"), "working", "bundles")
fhir_files_list = sorted(glob.glob(os.path.join(fhir_folder_path, "*.json")))

# Limiting files to ingest (sorted first so the selection is stable across runs)
fhir_files_list = fhir_files_list[:DEMO_FILES_INGEST_LIMIT]

num_of_files = len(fhir_files_list)
print(f'Number of FHIR Files that would be ingested: {num_of_files}')

Number of FHIR Files that would be ingested: 20


In [43]:
# Take inventory of the FHIR bundles selected for ingestion:
#   - total_resources          : number of resources across all files
#   - file_resources_meta_list : one {'file_name', 'resources_count'} dict per file
#   - resource_types_count     : resourceType -> number of occurrences
total_resources = 0
file_resources_meta_list = []
resource_types_count = {}

for fhir_file in fhir_files_list:
    fhir_file_name = os.path.basename(fhir_file)

    # FHIR JSON is UTF-8; state it explicitly instead of relying on the
    # platform's default encoding.
    with open(fhir_file, encoding="utf-8") as raw:
        bundle = json.load(raw)

    # A Bundle without any entries has no 'entry' key; treat it as empty.
    resources_entry_list = bundle.get('entry', [])

    # Resources Counter
    num_of_resources = len(resources_entry_list)
    total_resources += num_of_resources
    file_resources_meta_list.append({
        'file_name': fhir_file_name,
        'resources_count': num_of_resources,
    })

    # Count Individual Resource Types
    for entry in resources_entry_list:
        resource_type = entry['resource']["resourceType"]
        resource_types_count[resource_type] = resource_types_count.get(resource_type, 0) + 1


print('Resource Types Count:')
pprint(resource_types_count)

print('\nFile Resources Meta:')
pprint(file_resources_meta_list)

print('\n### Summary ###')
print(f'- Number of FHIR Files to process = {num_of_files}')
print(f'- Total Resources:{total_resources}\n')



Resource Types Count:
{'AllergyIntolerance': 10,
 'CarePlan': 60,
 'CareTeam': 60,
 'Claim': 3486,
 'Condition': 656,
 'Device': 39,
 'DiagnosticReport': 3100,
 'DocumentReference': 1859,
 'Encounter': 1859,
 'ExplanationOfBenefit': 3486,
 'ImagingStudy': 8,
 'Immunization': 259,
 'Medication': 957,
 'MedicationAdministration': 957,
 'MedicationRequest': 1627,
 'Observation': 13501,
 'Patient': 20,
 'Procedure': 2966,
 'Provenance': 20,
 'SupplyDelivery': 239}

File Resources Meta:
[{'file_name': 'Akiko835_Jacelyn576_Larkin917_05c4608d-bd9a-5d04-41d7-a0293da7f5a5.json',
  'resources_count': 1024},
 {'file_name': 'Anneliese170_Berge125_b2af96cc-588f-c029-3acb-37c9d59bb2b4.json',
  'resources_count': 427},
 {'file_name': 'Annice210_Edris973_McClure239_e1a90b76-c8fb-724a-5053-7355db36519b.json',
  'resources_count': 719},
 {'file_name': 'Anthony633_Renner328_6fabf2a0-6ce7-5cc4-cfd6-2858ef8fdd38.json',
  'resources_count': 443},
 {'file_name': 'Arleen939_Kling921_076a24c5-c7f3-c743-e882-34

## 5. Neo4J Helper Functions

### 5.1. Neo4J Data Loading Helper Functions

In [21]:
# Create FHIR Resource nodes in Neo4j
def create_resource_node(resource:dict) -> str:
    """Create the Neo4j node for one FHIR resource and return its node id.

    The Cypher statement is produced by ``resource_to_node`` and executed
    through the module-level ``graph``. The value in the single column of the
    first result row is returned as the node id.

    Args:
        resource: FHIR resource dictionary to store.

    Returns:
        The value in the only column of the first result row.

    Raises:
        Exception: If the query returns no rows, or if the first row does not
            contain exactly one column.
    """
    node_creation_cypher = resource_to_node(resource)
    # graph.query returns a (rows, runtime) pair; the runtime is not needed here.
    query_result, _runtime = graph.query(node_creation_cypher)

    # An empty result is reported the same way as a malformed row, rather than
    # surfacing as an unrelated IndexError from query_result[0].
    if not query_result or len(query_result[0]) != 1:
        print(query_result)
        raise Exception("Resource Node creation query result does not meet defined format.")

    return query_result[0][0]

In [22]:
# Create Date Nodes and Edges between Resource Nodes and Date Nodes
def create_date_nodes_edges(resource):
    """Create the Date nodes and the edges belonging to one FHIR resource.

    ``resource_to_edges`` supplies the Cypher statements for the edges and the
    dates they refer to. Every distinct date is MERGEd as a ``Date`` node
    (so existing nodes are reused), then each edge statement is executed.
    Edge creation is best-effort: a failing statement is reported and the
    remaining edges are still attempted.

    Args:
        resource: FHIR resource dictionary.

    Returns:
        A ``(number_of_distinct_dates, number_of_edge_statements)`` tuple. The
        edge count includes statements that failed.
    """
    node_edges, node_dates = resource_to_edges(resource)
    edges = list(node_edges)
    dates = set(node_dates)  # a set keeps each date only once

    # Create Date Nodes - 'MERGE' skips creation if the node already exists
    for date in dates:
        cypher = 'MERGE (n:Date {name:"' + date + '", id: "' + date + '"})'
        graph.query(cypher)

    # Connect Resource Nodes and Date Nodes via Edges
    for edge in edges:
        try:
            graph.query(edge)
        except Exception as err:
            # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
            # can still stop a long-running ingestion, and keep the failure reason.
            print(f'Failed to create edge: {edge} ({err!r})')

    return len(dates), len(edges)

### 5.2. FHIR to Neo4J Helper Functions

In [23]:
def load_fhir_neo4j(fhir_files_list):
    """Load every resource of the given FHIR bundle files into Neo4j.

    For each file the bundle is parsed, and for every entry except
    ``Provenance`` resources a resource node is created with
    ``create_resource_node``, followed by its Date nodes and edges with
    ``create_date_nodes_edges``. Progress is printed as the load proceeds.

    Args:
        fhir_files_list: List of paths to FHIR bundle JSON files.
    """
    # Report progress against the list actually passed in, not a notebook global.
    total_files = len(fhir_files_list)

    for file_counter, fhir_file in enumerate(fhir_files_list, start=1):
        fhir_file_name = os.path.basename(fhir_file)

        if file_counter > 1:
            print("\n")

        print(f'File {file_counter} of {total_files}: {fhir_file_name}')

        # Decode explicitly as UTF-8 instead of the platform default, and close
        # the file before the (slow) database work begins.
        with open(fhir_file, encoding='utf-8') as raw:
            bundle = json.load(raw)

        # A bundle without an 'entry' key is treated as having no resources.
        resources_entry_list = bundle.get('entry', [])
        num_of_resources = len(resources_entry_list)
        print(f'    - Number of resources = {num_of_resources}')

        for resource_counter, entry in enumerate(resources_entry_list, start=1):
            resource = entry['resource']
            resource_id = resource["id"]
            resource_type = resource["resourceType"]

            # Skip Provenance Resource
            if resource_type == 'Provenance':
                continue

            # '\r' keeps the progress on a single, continuously rewritten line.
            print(f'    - Processing Resource {resource_counter} of {num_of_resources}: Resource-Type = {resource_type}, Resource-ID = {resource_id}', end="\r", flush=True)

            #### LOAD DATA INTO NEO4J ####

            # Create Resource Nodes
            create_resource_node(resource)

            # Create Date Nodes and Edges to connect Resource Nodes to Date Nodes
            create_date_nodes_edges(resource)

### 5.3 Neo4J Data Retrieval Helper Functions

In [24]:
def get_nodes_in_batches(graph, batch_size=1000):
    """Yield all ``resource`` nodes from Neo4j, one batch of rows at a time.

    Args:
        graph: Object whose ``query(cypher)`` returns a sequence with the
            result rows as its first element.
        batch_size: Maximum number of rows fetched per query.

    Yields:
        The non-empty list of result rows for each page, until a page comes
        back empty.
    """
    skip = 0
    while True:
        # SKIP/LIMIT paging needs an explicit ORDER BY: without it the row
        # order is not guaranteed between queries, so pages could overlap or
        # miss nodes.
        query_string = (
            'MATCH (r:resource) RETURN r '
            f'ORDER BY elementId(r) SKIP {skip} LIMIT {batch_size}'
        )
        nodes = graph.query(query_string)[0]

        if not nodes:
            break  # No more nodes to fetch

        yield nodes  # Hand this batch to the caller
        skip += batch_size

In [25]:
# print report on Resource nodes
def get_neo4j_resourceNodes_stats():
    """Print how many resource nodes of each resource type are stored in Neo4j.

    Walks all nodes delivered by ``get_nodes_in_batches`` for the module-level
    ``graph``, tallies them by their ``resource_type`` property, then
    pretty-prints the tally sorted by type name followed by the overall total.
    """
    type_tally = {}
    total_nodes = 0

    for node_batch in get_nodes_in_batches(graph):
        for record in node_batch:
            # Each row carries the node in its first column.
            kind = record[0]['resource_type']
            type_tally[kind] = type_tally.get(kind, 0) + 1
            total_nodes += 1

    pprint({kind: type_tally[kind] for kind in sorted(type_tally)})
    print(f'Total Resource Nodes = {total_nodes}')
    
get_neo4j_resourceNodes_stats()

{'AllergyIntolerance': 10,
 'CarePlan': 60,
 'CareTeam': 60,
 'Claim': 3486,
 'Condition': 656,
 'Device': 39,
 'DiagnosticReport': 3100,
 'DocumentReference': 1859,
 'Encounter': 1859,
 'ExplanationOfBenefit': 3486,
 'ImagingStudy': 8,
 'Immunization': 259,
 'Medication': 957,
 'MedicationAdministration': 957,
 'MedicationRequest': 1627,
 'Observation': 13501,
 'Patient': 20,
 'Procedure': 2966,
 'SupplyDelivery': 239}
Total Resource Nodes = 35149


---
## 6. Data Ingestion

In [26]:
# Set to True to ingest data. This is a safety switch to prevent accidental triggering of data ingestion.
ingest_data = False

### 6.1. Trigger Neo4J Data Ingestion

<div class="alert alert-block alert-warning">
<b>⚠️ Important: This process may take several hours to complete depending on the number of files and FHIR resources you are ingesting!⚠️</b>
</div>

In [27]:
if ingest_data:
    start_time = time.time()
    load_fhir_neo4j(fhir_files_list)
    end_time = time.time()

    total_duration = str(datetime.timedelta(seconds = end_time - start_time))
    print(f'\nData Loading Completed in {total_duration}')
else:
    print(f'Set ingest_data to True to ingest data. Currently ingest_data={ingest_data}')

Set ingest_data to True to ingest data. Currently ingest_data=False


In [28]:
# Get Report on Ingested Data
get_neo4j_resourceNodes_stats()

{'AllergyIntolerance': 10,
 'CarePlan': 60,
 'CareTeam': 60,
 'Claim': 3486,
 'Condition': 656,
 'Device': 39,
 'DiagnosticReport': 3100,
 'DocumentReference': 1859,
 'Encounter': 1859,
 'ExplanationOfBenefit': 3486,
 'ImagingStudy': 8,
 'Immunization': 259,
 'Medication': 957,
 'MedicationAdministration': 957,
 'MedicationRequest': 1627,
 'Observation': 13501,
 'Patient': 20,
 'Procedure': 2966,
 'SupplyDelivery': 239}
Total Resource Nodes = 35149


---
### 6.2. VectorSearch Data Ingestion Helper Scripts
[Vertex AI Vector Search Docs](https://cloud.google.com/vertex-ai/docs/vector-search/overview)

In [29]:
#### Load Resource Text to Vector Search ####

def add_resource_text_embedding(resource_data):
    """Embed one resource text and add it to the module-level ``vector_store``.

    Args:
        resource_data: Dict providing ``patient_id``, ``resource_id``,
            ``resource_type``, ``neo4j_id`` and ``resource_text``.

    Returns:
        Whatever ``vector_store.add_texts`` returns, or ``None`` when the
        resource has no text or when adding the text fails (the failure is
        printed, not raised).
    """
    # Each metadata value is wrapped in a single-element list.
    metadata = {
        "fhir_patient_id": [resource_data['patient_id']],
        "fhir_resource_id": [resource_data['resource_id']],
        "fhir_resource_type": [resource_data['resource_type']],
        "neo4j_node_id": [resource_data['neo4j_id']]
    }

    text = resource_data['resource_text']
    if text is None:
        return None

    try:
        return vector_store.add_texts(texts=[text], metadatas=[metadata], is_complete_overwrite=True)
    except Exception as err:
        # Best effort: report the failing resource and carry on.
        print(f"\nERROR: Unexpected while adding embeddings {err=}, {type(err)=}")
        print(f'Resource_Text = {text}')
        print(f'Resource Metadata = {metadata}')

    return None

In [30]:
def get_related_resource_node_texts(neo4j_node_id: str) -> str:
    """Build the context text for a resource node from Neo4j.

    The text starts with the node's own ``text`` as "Primary Entry" and appends
    the distinct ``text`` of every directly connected ``resource`` node as a
    "Secondary Entry".

    Args:
        neo4j_node_id: Element id of the resource node. It is interpolated
            into the Cypher text, so it must come from a trusted source (for
            example, ids previously read from the graph).

    Returns:
        The first column of the first row returned by the query.

    Raises:
        IndexError: If the query returns no rows (for example, no ``resource``
            node has the given element id).
    """
    # OPTIONAL MATCH keeps the primary node in the result even when it has no
    # related resource nodes; a plain MATCH would return no rows for such a
    # node and the indexing below would fail.
    contextualize_query = f"""
    MATCH (node :resource)
    WHERE elementId(node)='{neo4j_node_id}'
    OPTIONAL MATCH (node)<-[]->(sc:resource)
    with node.text as self, reduce(s="", item in collect(distinct sc.text) | s + "\n\nSecondary Entry:\n" + item ) as ctxt limit 1
    return "Primary Entry:\n" + self + ctxt as text"""

    # graph.query(...)[0] holds the rows; take the first column of the first row.
    resource_text = graph.query(contextualize_query)[0][0][0]

    return resource_text

In [31]:
# Add Text Embeddings of all Resource Text to Vector Search

def _add_resource_text_to_store(target_store, resource_data):
    """Embed one resource text into an explicitly supplied vector store.

    Uses the same metadata layout and best-effort error reporting as
    ``add_resource_text_embedding``, but writes to ``target_store`` instead of
    the module-level ``vector_store``.

    Returns:
        The ids returned by ``target_store.add_texts``, or ``None`` when the
        resource has no text or the call fails.
    """
    resource_text = resource_data['resource_text']
    if resource_text is None:
        return None

    resource_text_metadata = {
        "fhir_patient_id": [resource_data['patient_id']],
        "fhir_resource_id": [resource_data['resource_id']],
        "fhir_resource_type": [resource_data['resource_type']],
        "neo4j_node_id": [resource_data['neo4j_id']]
    }

    try:
        return target_store.add_texts(texts=[resource_text], metadatas=[resource_text_metadata])
    except Exception as err:
        print(f"\nERROR: Unexpected while adding embeddings {err=}, {type(err)=}")
        print(f'Resource_Text = {resource_text}')
        print(f'Resource Metadata = {resource_text_metadata}')
        return None


def create_embeddings_of_all_resource_text(target_store=None, enhanced_context=False):
    """Embed the text of every resource node and add it to a vector store.

    Nodes are read from Neo4j in batches. For each node the related patient id
    is derived, the text is embedded, and one row is appended to
    ``vector_ids.csv`` (S.NO, FHIR resource id, vector id, patient id). The CSV
    is flushed after every row so progress can be followed with ``tail -f``.

    Args:
        target_store: Optional vector store exposing
            ``add_texts(texts=..., metadatas=...)``. When omitted, the text is
            added through ``add_resource_text_embedding`` as before.
        enhanced_context: When True, embed the text produced by
            ``get_related_resource_node_texts`` (the node's text plus the text
            of its related resources) instead of the node's own text.
    """
    resource_counter = 0
    resources_types_dict = {}
    vector_ids_csv_file = 'vector_ids.csv'

    with open(vector_ids_csv_file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['S.NO','FHIR_RESOURCE_ID','VECTORSEARCH_ID', 'PATIENT_ID'])
        csv_file.flush()

        # Since we have a large number of Resources, we iterate in batches
        for batch in get_nodes_in_batches(graph):
            for node in batch:
                record = node[0]  # the node sits in the first column of each row

                # Count each node exactly once so S.NO and the final total
                # match the number of processed resources.
                resource_counter += 1
                resource_data = {
                    'neo4j_id': record.element_id,
                    'resource_id': record['id'],
                    'resource_type': record['resource_type'],
                }

                # Get related Patient ID of the Resource
                resource_data['patient_id'] = ""
                if record['resource_type'] == 'Patient':
                    resource_data['patient_id'] = record['id']
                elif record['subject_reference']:
                    # The patient id is the part after the last ':' of the reference.
                    resource_data['patient_id'] = record['subject_reference'].split(':')[-1]
                elif record['patient_reference']:
                    resource_data['patient_id'] = record['patient_reference'].split(':')[-1]
                else:
                    print(f"no patient id for resource_id: {resource_data['resource_id']}, resource_type: {resource_data['resource_type']}")

                if enhanced_context:
                    resource_data['resource_text'] = get_related_resource_node_texts(resource_data['neo4j_id'])
                else:
                    resource_data['resource_text'] = record['text']

                print(f"Processing patient_id: {resource_data['patient_id']} node_id: {resource_data['neo4j_id']}, resource_id={resource_data['resource_id']}, resource_type={resource_data['resource_type']}", end='\r', flush=True)

                # Create Embedding & Add to VectorSearch
                if target_store is None:
                    ids = add_resource_text_embedding(resource_data)
                else:
                    ids = _add_resource_text_to_store(target_store, resource_data)
                print(f'Vector Search_ID: {ids}', end='\r', flush=True)

                # Leave the vector id empty when embedding failed or was skipped.
                vector_id = ids[0] if ids is not None else ''
                writer.writerow([resource_counter, resource_data["resource_id"], vector_id, resource_data['patient_id']])
                csv_file.flush()

                resource_type = resource_data['resource_type']
                resources_types_dict[resource_type] = resources_types_dict.get(resource_type, 0) + 1

    print(f'\nProcessed Resources: {resource_counter}')
    print('Resources Types Processed:')
    print(resources_types_dict)


### 6.3 Trigger VectorSearch Data Ingestion
In the Cell below we are creating Embeddings of Resource Text and ingesting it into Vertex AI Vector Search.

<div class="alert alert-block alert-warning">
<b>⚠️ Important: This process may take several hours to complete depending on the number of files and FHIR resources you are ingesting! ⚠️</b>
</div>

To monitor ingest progress, open a new bash terminal and run the below command:
```bash
tail -f unlock-fhir-with-rag-on-vertexai/vector_ids.csv
```

In [32]:
if ingest_data:
    start_time = time.time()
    create_embeddings_of_all_resource_text()
    end_time = time.time()

    total_duration = str(datetime.timedelta(seconds = end_time - start_time))
    print(f'\nData Loading Completed in {total_duration}')

else:
    print(f'Set ingest_data to True to ingest data. Currently ingest_data={ingest_data}')

Set ingest_data to True to ingest data. Currently ingest_data=False


### 6.4 Query VectorSearch to Test Data Ingestion
In the step below we query the Vector Search DB to verify data load is successful.

In [35]:
query_text = "Sample resource type"
vs_response = vector_store.similarity_search_with_score(query=query_text)

In [37]:
len(vs_response)

4

In [50]:
# Filtering by Semantic Search Score
def filter_result_by_score(min_score: float, vs_respone: list) -> list:
    """Keep only the search results whose score is at least ``min_score``.

    Args:
        min_score: Minimum score (inclusive) a result needs to be kept.
        vs_respone: List of ``(document, score)`` pairs. The parameter name is
            kept as-is so existing keyword callers continue to work.

    Returns:
        A new list with the ``(document, score)`` pairs that pass the filter,
        in their original order.
    """
    # Filter the list that was passed in, not the notebook-global search response.
    return [(doc, score) for doc, score in vs_respone if score >= min_score]

    
min_score = 0.7        
filtered_results = filter_result_by_score(min_score, vs_response)
len(filtered_results) # We will now get results with score min_score (0.7) and above.

[(Document(page_content='This is a sample Resource Type', metadata={'fhir_patient_id': 'pid_111111111', 'fhir_resource_id': 'rid_111111111', 'fhir_resource_type': 'Test_Resource_type', 'neo4j_node_id': 'nid_111111111'}),
  0.7533161640167236)]

---
## Preparing LLM Input: Prompts, Questions, Model
This section covers the design of LangChain Prompt Templates, the user questions for the LLM, and the choice of the LLM model for task execution.

### Prompt Design

This cell defines the prompt template used to interact with the LLM. Try different prompts to see how they influence the LLM's output.

In [51]:
# Three alternative prompt templates. Each one expects the placeholders
# {context} (the retrieved text) and {question} (the user's question).
# Only my_prompt_2 is turned into the PromptTemplate at the end of this cell;
# the other two are kept for experimentation.

# Generic question-answering prompt: answer from the context, admit when unknown.
default_prompt='''
System: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}
Human: {question}
'''

# Prompt worded around "primary" and "secondary" entries in the context.
my_prompt='''
System: The following information contains entries about the patient. 
Use the primary entry and then the secondary entries to answer the user's question.
Each entry is its own type of data and secondary entries are supporting data for the primary one. 
You should restrict your answer to using the information in the entries provided. 

If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}
----------------
User: {question}
'''

# Prompt that restricts the answer to the patient named in the question and
# describes how to report the patient's name.
# NOTE(review): the sentence "If you do not find the patient name in the context."
# is incomplete (it has no consequence clause), and "one the entries" looks like
# a typo for "one of the entries" - confirm the intended wording before editing,
# since changing the text changes what the LLM is told.
my_prompt_2='''
System: The context below contains entries about the patient's healthcare. 
Please limit your answer to the information provided in the context. Do not make up facts.
Please limit your answers only about the patient in the user question. If you do not find the patient name in the context.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
If you are asked about the patient's name and one the entries is of type patient, you should look for the first given name and family name and answer with: [given] [family]
----------------
{context}
Human: {question}
'''

# Template used by the RetrievalQA chains in the cells that follow.
prompt = PromptTemplate.from_template(my_prompt_2)

### Define User Questions

In [57]:
question_list = [
    "What can you tell me about Alfonso's claim created on 03/06/1977?",
    "What can you tell me about the medical claim created on 03/06/1977?",
    "Based on this explanation of benefits, how much did it cost and what service was provided?",
    "Based on this explanation of benefits created on July 15, 2016, how much did it cost and what service was provided?",
    "Based on this explanation of benefits created on March 6, 1978, how much did it cost and what service was provided?",
    "Based on this explanation of benefits created on January 11, 2009, how much did it cost and what service was provided?",
    "What was the blood pressure on 2/9/2014?",
    "What was the blood pressure?",
    "Based on this explanation of benefits created on January 18, 2014, how much did it cost and what service was provided?",
    "How much did the colon scan eighteen days after the first of the year 2019 cost?",
    "How much did the medical reconciliation on Dec. 29, 2023 cost?",
    "What can you tell me about Andrea7's claim created on 12/25/2003?",
    "What can you tell me about claim created on 12/25/2003?",
    "What allergies does Antone63 have?"
]

len(question_list)

14

In [58]:
question = question_list[13]
question

'What allergies does Antone63 have?'

---
## Testing Without RAG

The LLM would not be able to answer since it does not have the context. 
<br>Context is the Private User/Organization Data. FHIR Data in this example.

In [59]:
llm = VertexAI(model_name=VERTEX_AI_MODEL_NAME)

# Ask LLM the question
no_rag_answer = llm(question)
print(f'Question: {question}')
print(f'LLM Answer: {no_rag_answer}')

Question: What allergies does Antone63 have?
LLM Answer: I do not have access to personal medical information, including allergies, and cannot provide this information about Antone63.


## Testing with RAG - Ask the LLM with Context

This cell will ask the LLM with the string representation of the resource node that is found by the vector index.

### Testing VertexAI VectorSearch

We use VertexAI VectorSearch to retrieve relevant FHIR data based on the user's query. The search incorporates:
- **Semantic Matching**: Finding Resources with similar text embeddings to the query.
- **Metadata Filters**: Narrowing down results based on patient_id and resource_type to ensure relevance. (Refer to the Filter vector matches: https://cloud.google.com/vertex-ai/docs/vector-search/filtering documentation.)

A RAG-based LLM prompt consists of 3 sections:
- **Instructions**: Guidance for the LLM on how to generate an accurate and relevant response based on the retrieved context and the user query.
- **Context**: The FHIR data retrieved from Vector Search for the user's query. We apply filters based on patient_id and resource_type to ensure the retrieved context is relevant to the specific patient and the type of information sought. This is a key step, as the accuracy of the LLM's response directly depends on the relevance of this retrieved context. 
- **User Question**: The original query posed by the user.

For Demonstration purpose we will:
- First query VectorSearch without Filter.
- Next query VectorSearch with Filters. Refer to the Vector Search [Filter vector matches](https://cloud.google.com/vertex-ai/docs/vector-search/filtering) documentation for more information.

In [None]:
question

In [None]:
# Create chain to answer questions
NUMBER_OF_RESULTS = 20
SEARCH_DISTANCE_THRESHOLD = 0.7

# Expose index to the retriever
retriever = me.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": NUMBER_OF_RESULTS,
        "search_distance": SEARCH_DISTANCE_THRESHOLD,
    },
)

In [None]:
vector_qa = RetrievalQA.from_chain_type(
    llm=ChatVertexAI(model_name=VERTEX_AI_MODEL_NAME),
    chain_type='stuff',
    retriever=retriever,
    # return_source_documents=True,
    verbose=True,
    chain_type_kwargs={"verbose": False, "prompt": prompt}
)

pprint(vector_qa.run(question))

### Cons of this approach
- The Vector Search similarity search fetches all matching Resource Types.
- However, the context does not include the Patient's name for the retrieved Resources.
- Hence the LLM might summarize and extract the information from the context, but the Resources might not belong to a Patient in question.
- Sometimes the LLM responds "The context does not mention any patient".

## Testing with RAG - Ask the LLM with Enhanced Context

***Providing Context to the LLM***

We enrich the LLM's understanding by fetching text from linked resource nodes.

***Steps:***
- *Similarity Search:* Identify a matching resource's Neo4J Node ID in the Vector Search database.
- *Fetch Related Resources:* Query all nodes connected to the matching resource.
- *Concatenate Text:* Combine the text from all retrieved nodes to provide comprehensive context.

### Create a new Vector Search Index

In [None]:
# Create GCS Bucket for the new Enhanced Vector Search Index
! set -x && gsutil mb -p $PROJECT_ID -l $REGION gs://$ME_ENHANCED_EMBEDDING_GCS_DIR

In [None]:
# Create a new Vector Search Index for Enhanced Context
me_utils_enhanced = MatchingEngineUtils(PROJECT_ID, REGION, ME_ENHANCED_CONTEXT_INDEX_NAME)

me_index_enhanced = me_utils_enhanced.create_index(
    embedding_gcs_uri=f'gs://{ME_ENHANCED_EMBEDDING_GCS_DIR}/init_index',
    dimensions=ME_DIMENSIONS,
    index_update_method='streaming',
    index_algorithm='tree-ah',
    shard_size= ME_SHARD_SIZE,
    distance_measure_type=ME_DISTANCE_MEASURE_TYPE,
    description=ME_ENHANCED_DESCRIPTION
)

# me_index_enhanced

In [None]:
# Get information about the Index
if me_index_enhanced:
    index_metadata = MessageToDict(me_index_enhanced._pb)
    print('Index Details:')
    print(f'- Index Name = {index_metadata["name"]}')
    print(f'- Update Method = {index_metadata["indexUpdateMethod"]}')
    print(f'- Dimensions = {index_metadata["metadata"]["config"]["dimensions"]}')
    print(f'- Shard Size = {index_metadata["metadata"]["config"]["shardSize"]}')
    print(f'- Distance Measure Type = {index_metadata["metadata"]["config"]["distanceMeasureType"]}')
    algorithm = list(index_metadata["metadata"]["config"]['algorithmConfig'].keys())[0]
    print(f'- Algorithm = {algorithm}')
    # print(f'- Index Stats = {index_metadata["indexStats"]}')

# index_metadata

In [None]:
me_endpoint_enhanced = me_utils_enhanced.deploy_index(
    machine_type=ME_ENDPOINT_MACHINE_TYPE,
    min_replica_count=ME_ENDPOINT_MIN_REPLICA_COUNT,
    max_replica_count=ME_ENDPOINT_MAX_REPLICA_COUNT,
    public_endpoint_enabled=True
)

In [None]:
# Get Matching Engine Index id and Endpoint id
me_utils_enhanced = MatchingEngineUtils(PROJECT_ID, REGION, ME_ENHANCED_CONTEXT_INDEX_NAME)
ME_ENHANCED_INDEX_ID, ME_ENHANCED_INDEX_ENDPOINT_ID = me_utils_enhanced.get_index_and_endpoint()
print(f"ME_ENHANCED_INDEX_ID={ME_ENHANCED_INDEX_ID}")
print(f"ME_ENHANCED_INDEX_ENDPOINT_ID={ME_ENHANCED_INDEX_ENDPOINT_ID}")

In [None]:
# Initialize vector store
me_enhanced = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=REGION,
    gcs_bucket_name=f"gs://{ME_ENHANCED_EMBEDDING_GCS_DIR}".split("/")[2],
    embedding=embeddings,
    index_id=ME_ENHANCED_INDEX_ID,
    endpoint_id=ME_ENHANCED_INDEX_ENDPOINT_ID,
)

### Ingest Enhanced Context data to Vector Search

In the Cell below we are creating Embeddings of Resource Text and ingesting it into Vertex AI Vector Search.

<div class="alert alert-block alert-warning">
<b>⚠️ Important: The step below will take a few minutes to complete! ⚠️</b>
</div>

In [None]:
# Trigger Enhanced Context Data Ingestion to vector Search
ingest_data = False

if ingest_data:
    create_embeddings_of_all_resource_text(me_enhanced, enhanced_context=True)
else:
    print(f'Set ingest_data to True to ingest data. Currently ingest_data={ingest_data}')

### RAG with Enhanced Context

In [None]:
# question = "What procedure was performed on Antone63 on 2014-04-20?"
question

In [None]:
# Create chain to answer questions
NUMBER_OF_RESULTS = 10
SEARCH_DISTANCE_THRESHOLD = 0.6

# Expose index to the retriever
retriever_enhanced = me_enhanced.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": NUMBER_OF_RESULTS,
        "search_distance": SEARCH_DISTANCE_THRESHOLD,
    },
)

In [None]:
# question = "Based on this explanation of benefits created on February 11, 1999, how much did it cost and what service was provided?"
vector_qa_enhanced = RetrievalQA.from_chain_type(
    llm=ChatVertexAI(model_name=VERTEX_AI_MODEL_NAME),
    chain_type='stuff',
    retriever=retriever_enhanced,
    # return_source_documents=True,
    verbose=True,
    chain_type_kwargs={"verbose": True, "prompt": prompt}
)

# question = 'Tell me about the latest medical reconciliation?'
pprint(vector_qa_enhanced.run(question))

### Cons of this approach
A problem with this approach is the data is repeated multiple times (e.g. Patient Information). This results in increased LLM Token consumption and costs.
An alternate strategy would be:
- First retrieve the Resource in Question from Vector Search
- Use the Resource ID to query Neo4J for related Resources (Text field)
- Remove Duplicate Resource Entries 
- Dynamically construct the Context and pass it to the LLM.

## Testing with RAG - Custom Retriever

The Custom retriever addresses the above problem. It does so by:
- Identifying the Patient Name from the User Query. If the Patient name is not present, prompting the User for the Patient Name
- Identifying the FHIR Resource Type in Question
- Perform a similarity search by narrowing down the results based on Patient ID and Resource Type

Thus increasing the accuracy of the Context provided to the LLM for information extraction.

**Enhancement** - This approach does not address the below use case of getting Related Resources for more context. This can be fixed by combining the Custom Retriever and fetching related Resource text from Neo4J to provide the necessary context to the LLM.

In [None]:
me_utils = MatchingEngineUtils(PROJECT_ID, REGION, ME_INDEX_NAME)

# Get Matching Engine Index id and Endpoint id
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = me_utils.get_index_and_endpoint()

# Create Text Embedding
embeddings = VertexAIEmbeddings(
    model_name="textembedding-gecko@003",
    project=PROJECT_ID,
    location=REGION,
    max_retries=6
)

# Initialize vector store
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=REGION,
    gcs_bucket_name=f"gs://{ME_EMBEDDING_GCS_DIR}".split("/")[2],
    embedding=embeddings,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
)

In [None]:
question

In [None]:


llm = VertexAI(model_name=VERTEX_AI_MODEL_NAME)

graph = Graph(NEO4J_URL, NEO4J_USER, NEO4J_PASSWORD)
retriever = FHIRResourcesRetriever(llm=llm, me=me, neo4j_graph=graph)

vector_qa = RetrievalQA.from_chain_type(
    llm=ChatVertexAI(model_name=VERTEX_AI_MODEL_NAME),
    chain_type='stuff',
    retriever=retriever,
    # return_source_documents=True,
    verbose=True,
    chain_type_kwargs={"verbose": False, "prompt": prompt}
)

# print(vector_qa.__dir__())
print(vector_qa.invoke(question))

## RAG with Enhanced Context and Custom Retriever

In [None]:
question

In [None]:
llm = VertexAI(model_name=VERTEX_AI_MODEL_NAME)

graph = Graph(NEO4J_URL, NEO4J_USER, NEO4J_PASSWORD)
retriever = FHIRResourcesRetriever(llm=llm, me=me_enhanced, neo4j_graph=graph)

vector_qa = RetrievalQA.from_chain_type(
    llm=ChatVertexAI(model_name=VERTEX_AI_MODEL_NAME),
    chain_type='stuff',
    retriever=retriever,
    # return_source_documents=True,
    verbose=True,
    chain_type_kwargs={"verbose": True, "prompt": prompt}
)

print(vector_qa.invoke(question))

-----

In [None]:
def date_for_question(question_to_find_date):
    """Ask the LLM which date a user question refers to.

    The module-level ``text_bison_llm`` is prompted to answer with a single-line
    JSON object of the form ``{"date": "MM/DD/YYYY"}``, or ``{"date": "none"}``
    when the question contains no date.

    Args:
        question_to_find_date: The user's question.

    Returns:
        The value of the ``date`` key in the model's JSON answer.

    Raises:
        json.JSONDecodeError: If the answer contains no parseable JSON object.
        KeyError: If the JSON object has no ``date`` key.
    """
    _llm = text_bison_llm
    _response = _llm(f'''
    system:Given the following question from the user, extract the date the question is asking about.
    Return the answer formatted as JSON only, as a single line.
    Use the form:
    
    {{"date":"[THE DATE IN THE QUESTION]"}}
    
    Use the date format of month/day/year.
    Use two digits for the month and day.
    Use four digits for the year.
    So 3/4/23 should be returned as {{"date":"03/04/2023"}}.
    So 04/14/89 should be returned as {{"date":"04/14/1989"}}.
    
    Please do not include any special formatting characters, like new lines or "\\n".
    Please do not include the word "json".
    Please do not include triple quotes.
    
    If there is no date, do not make one up. 
    If there is no date return the word "none", like: {{"date":"none"}}
    
    user:{question_to_find_date}
    ''')

    # Despite the instructions, a model may wrap the JSON in code fences or add
    # surrounding text. Parse only the outermost {...} span when there is one;
    # otherwise fall back to the raw reply so a malformed answer still raises.
    start = _response.find('{')
    end = _response.rfind('}')
    payload = _response[start:end + 1] if 0 <= start < end else _response

    date_json = json.loads(payload)
    return date_json['date']

date_str = date_for_question(question)
print(date_str)

In [None]:
def get_patient_name(question_to_find_names):
    """Ask the LLM for the patient name mentioned in a user question.

    The module-level ``llm`` is prompted to reply with the first and last name
    found in the question. When the reply contains no name, the user is asked
    to type the patient name and that input is returned instead.

    Args:
        question_to_find_names: The user's question.

    Returns:
        The model's reply when it contains a name, otherwise the name entered
        by the user.
    """
    _response = llm(f'''
    system:Given the following question from the user, identify all potential first and last names within this sentence.
    
    The name might also contain numbers e.g. Andrea7, Jenkins714, Chasity985, Pagac496
    The name might also contain Apostrophe e.g. Andrea's, John's, Johns' James'
    If the nameis in the format Smith, John then first-name = John, last-name = Smith
    
    Return the answer formatted as first-name last-name.
    
    Use the form:
    first-name last name
    
    Please do not include any special formatting characters, like new lines or "\\n".
    Please do not include triple quotes.
    
    If there are no names, do not make one up. 
    If there are no names return an empty string link ""
    
    user:{question_to_find_names}
    ''')
    names = _response

    # The model may signal "no name" with whitespace or a literal pair of
    # quotes, so ignore both when checking for an empty reply.
    if names.strip().strip('"\'') == "":
        # Keep what the user typed; otherwise the prompt has no effect.
        names = input("Please enter the Patient name:")
    return names

# Jenkins714 Andrea7
q = question
patient_name = get_patient_name(q).strip()
print(patient_name)


# While Loading data into Neo4J we created a 'Text' Attribute to convert Resource JSON format to Text.
# We will use the same sentence structure to get an accurate search result from the Vector Store.
patient_name_query=f"The type of information in this entry is patient. The name use for this patient is official. The name family for this patient is {patient_name}. The name given 0 for this patient is {patient_name}."
# print(patient_name_query)

response = me.similarity_search(patient_name_query, k=2)
for doc in response:
    if doc.metadata['score'] > .85:
        print(doc)
        print("\n")

---
## Cleaning Up

<div class="alert alert-block alert-warning">
<b>⚠️ Important: To avoid incurring charges, please delete the Google Cloud resources used in this tutorial. ⚠️</b>
</div>



In [None]:
CLEANUP_RESOURCES = True

### Delete Neo4J Docker

In [None]:
# Wipe Neo4J Database
graph = Graph(NEO4J_URL, NEO4J_USER, NEO4J_PASSWORD)
if CLEANUP_RESOURCES:
    graph.wipe_database()

In [None]:
# DELETE NEO4J CONTAINER
if CLEANUP_RESOURCES:
    ! docker stop testneo4j
    ! docker rm -fv testneo4j
    ! sudo rm -rf $HOME/neo4j

### Delete Vector Search Indexes & Index-Endpoints

- Delete ME Vector Search Index and Endpoints

In [None]:
me_utils = MatchingEngineUtils(PROJECT_ID, REGION, ME_INDEX_NAME)
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = me_utils.get_index_and_endpoint()

# Delete Endpoint
if CLEANUP_RESOURCES and "me_utils" in globals():
    print(
        f"Undeploying all deployed indexes and deleting the index endpoint {ME_INDEX_ENDPOINT_ID}"
    )
    me_utils.delete_index_endpoint()

# Delete Index     
if CLEANUP_RESOURCES and "me_utils" in globals():
    print(f"Deleting the index {ME_INDEX_ID}")
    me_utils.delete_index()    

# Delete Bucket    
if CLEANUP_RESOURCES:
    # Delete contents of the bucket 
    ! gsutil -m rm -r gs://{ME_EMBEDDING_GCS_DIR}
    ! gsutil rb gs://{ME_EMBEDDING_GCS_DIR}

print('Vector Search and GCS Bucket Cleaning complete!')

- Delete ME_ENHANCED Vector Search Index and Endpoints

In [None]:
me_utils_enhanced = MatchingEngineUtils(PROJECT_ID, REGION, ME_ENHANCED_CONTEXT_INDEX_NAME)
ME_ENHANCED_INDEX_ID, ME_ENHANCED_INDEX_ENDPOINT_ID = me_utils_enhanced.get_index_and_endpoint()

# Delete Endpoint
if CLEANUP_RESOURCES and "me_utils_enhanced" in globals():
    print(
        f"Undeploying all deployed indexes and deleting the index endpoint {ME_ENHANCED_INDEX_ENDPOINT_ID}"
    )
    me_utils_enhanced.delete_index_endpoint()

# Delete Index    
if CLEANUP_RESOURCES and "me_utils_enhanced" in globals():
    print(f"Deleting the index {ME_ENHANCED_INDEX_ID}")
    me_utils_enhanced.delete_index()

# Delete Bucket
if CLEANUP_RESOURCES:
    ! gsutil -m rm -r gs://{ME_ENHANCED_EMBEDDING_GCS_DIR}
    ! gsutil rb gs://{ME_ENHANCED_EMBEDDING_GCS_DIR}

# To-Do

- Example MedLM Integration - Query all Patients with high risk of some disease (Stroke)
- Query all Vaccination candidates and, based on timing, send Notifications to them. E.g. Child vaccination message to Parent - Query Neo4J Date Nodes range