In [1]:
import os
import json
import sys
import logging
import openai

# Replace the standard-library sqlite3 module entry with pysqlite3; this must run
# before chromadb is imported further down.
# NOTE(review): presumably needed because the system SQLite is too old for Chroma — confirm.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import chromadb

from typing import Union, Dict
from llama_index import SimpleDirectoryReader, ServiceContext, VectorStoreIndex
from llama_index.storage.storage_context import StorageContext
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.schema import Document
from llama_index import load_index_from_storage

# Route INFO-and-above log records from the root logger to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): presumably raises the openai client's own log verbosity — confirm
# against the installed openai version.
openai.log = "info"

In [2]:
# Project root relative to this notebook; every other path hangs off it.
MAIN_DIR = ".."
DATA_DIR = os.path.join(MAIN_DIR, "data")
DOCUMENT_DIR = os.path.join(DATA_DIR, "document_sources")
EXCLUDE_DICT = os.path.join(DATA_DIR, "exclude_pages.json")

# The OpenAI key is read from a local JSON file instead of being hard-coded here.
api_keys_path = os.path.join(MAIN_DIR, "auth", "api_keys.json")
with open(api_keys_path, "r") as f:
    api_keys = json.load(f)

# Expose the key through the environment and on the openai module.
os.environ["OPENAI_API_KEY"] = openai.api_key = api_keys["OPENAI_API_KEY"]

In [3]:
def convert_prompt_to_string(prompt) -> str:
    """Render ``prompt`` with every template variable filled in by its own name.

    Args:
        prompt: Object exposing ``template_vars`` (iterable of variable names)
            and ``format(**kwargs)``.

    Returns:
        Whatever ``prompt.format`` returns for the name-to-name mapping.
    """
    placeholders = {}
    for var_name in prompt.template_vars:
        placeholders[var_name] = var_name
    return prompt.format(**placeholders)

def generate_query(profile: str, scan: str):
    """Build the two-line query text sent to the query engine.

    Args:
        profile: Free-text patient profile.
        scan: Description of the scan that was ordered.

    Returns:
        A string with the profile on the first line and the scan on the second.
    """
    return f"Patient Profile: {profile}\nScan ordered: {scan}"

def convert_doc_to_dict(doc: Union[Document, Dict]) -> Dict:
    """Convert a document into a ``{"page_content", "metadata"}`` dictionary.

    Args:
        doc: Either a ``Document`` (read through ``.text`` / ``.metadata``) or a
            dict with ``"text"`` and ``"metadata"`` keys. In both cases the
            metadata must contain ``"file_name"`` and ``"page_label"``.

    Returns:
        ``{"page_content": <text>, "metadata": {"source": <file_name>, "page": <page_label>}}``.

    Raises:
        TypeError: If ``doc`` is neither a ``Document`` nor a dict. Previously this
            case fell through both branches and failed with ``UnboundLocalError``.
        KeyError: If a required key is missing.
    """
    if isinstance(doc, Document):
        text, metadata = doc.text, doc.metadata
    elif isinstance(doc, dict):
        text, metadata = doc["text"], doc["metadata"]
    else:
        raise TypeError(
            "doc must be a Document or a dict, got {}".format(type(doc).__name__)
        )

    # Single place that defines the output shape for both input kinds.
    return {
        "page_content": text,
        "metadata": {
            "source": metadata["file_name"],
            "page": metadata["page_label"],
        },
    }

def get_experiment_logs(description: str, log_folder: str):
    """Return a logger that writes INFO+ records to stdout and to ``logfile.log``.

    Args:
        description: Logger name, passed to ``logging.getLogger``.
        log_folder: Directory that holds ``logfile.log``; created if it is missing.

    Returns:
        The configured ``logging.Logger``. Calling the function again with the
        same arguments returns the same logger without attaching a second pair
        of handlers, so re-running a notebook cell no longer duplicates log lines.
    """
    logger = logging.getLogger(description)
    logger.setLevel(logging.INFO)

    # exist_ok=True already tolerates an existing folder; no separate check needed.
    os.makedirs(log_folder, exist_ok=True)
    log_path = os.path.abspath(os.path.join(log_folder, "logfile.log"))

    formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")

    # FileHandler subclasses StreamHandler, so it is excluded explicitly here.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and not isinstance(handler, logging.FileHandler)
        and getattr(handler, "stream", None) is sys.stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

    # FileHandler stores the absolute path of its target in ``baseFilename``.
    has_file_handler = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == log_path
        for handler in logger.handlers
    )
    if not has_file_handler:
        file_handler = logging.FileHandler(filename=log_path)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


In [20]:
# Chunking parameters: passed to the service context and also embedded in the
# on-disk store directory names built in the cells below.
chunk_size=1024
chunk_overlap=20

## Create vector store

In [21]:
# Chat model configured for deterministic output (temperature=0) with a 512-token cap.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=512)
embs = OpenAIEmbedding()

# Bundle the LLM, embedding model and chunking settings into one service context.
# `llm` is now passed explicitly; before, it was created but never handed over,
# so the temperature / max_tokens settings above had no effect.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embs,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

### Simple

In [6]:
# Directory of the persisted simple vector store; the name encodes the current
# chunk_size / chunk_overlap values.
vector_dir = os.path.join(DATA_DIR, "emb_store", "simple",
                          f"openai_{chunk_size}_{chunk_overlap}")

print(vector_dir)

../data/emb_store/simple/openai_512_20


In [7]:
# ## SAVE
# from llama_index.vector_stores import SimpleVectorStore

# vector_store = SimpleVectorStore()
# documents = SimpleDirectoryReader(DOCUMENT_DIR).load_data()
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# vector_index = VectorStoreIndex.from_documents(
#     documents=documents,
#     service_context=service_context,
#     storage_context=storage_context
# )

# vector_index.set_index_id("msk-mri")

# vector_index.storage_context.persist(vector_dir)

In [11]:
from llama_index.vector_stores import SimpleVectorStore
# NOTE(review): `vector_store` is not passed to anything in this cell; the storage
# context below is rebuilt from `persist_dir` alone — confirm this load is still needed.
vector_store = SimpleVectorStore.from_persist_dir(vector_dir)
storage_context = StorageContext.from_defaults(persist_dir = vector_dir)
# An index is selected with `index_id`; the former `index_name` keyword is not the
# selector, so the intended "msk-mri" index was never actually requested.
vector_index = load_index_from_storage(
    storage_context=storage_context, index_id="msk-mri"
    )

INFO:llama_index.indices.loading:Loading all indices.


In [13]:
# Smoke test: run a placeholder query through a default query engine built on
# the index loaded above and print the answer text.
query_engine = vector_index.as_query_engine()
sample_query = "test Query"
response = query_engine.query(sample_query)
print(response.response)

I'm sorry, but I cannot provide an answer to the query as it does not contain a specific question or request for information. Please provide a clear question or request and I will be happy to assist you.


In [43]:
## LOAD
storage_context = StorageContext.from_defaults(persist_dir=vector_dir)
vector_index = load_index_from_storage(storage_context, index_id="vector_index")

query_engine = vector_index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

INFO:llama_index.indices.loading:Loading indices with ids: ['vector_index']


### FAISS

In [27]:
# Directory of the persisted FAISS-backed store; the name encodes the current
# chunk_size / chunk_overlap values.
PERSIST_DIR = os.path.join(
    DATA_DIR, "emb_store", "faiss", f"openai_{chunk_size}_{chunk_overlap}"
    )

print(PERSIST_DIR)

../data/emb_store/faiss/openai_1024_20


In [28]:
# # SAVE
# from llama_index.vector_stores import FaissVectorStore
# import faiss

# d = 1536
# faiss_index = faiss.IndexFlatL2(d)

# os.makedirs(PERSIST_DIR)

# vector_store = FaissVectorStore(faiss_index=faiss_index)
# storage_context = StorageContext.from_defaults(
#     vector_store=vector_store
# )

# documents = SimpleDirectoryReader(DOCUMENT_DIR).load_data()[:5]

# index = VectorStoreIndex.from_documents(
#     documents,
#     storage_context=storage_context,
#     service_context=service_context
#     )

# index.storage_context.persist(persist_dir = PERSIST_DIR)

In [29]:
# LOAD

from llama_index.vector_stores import FaissVectorStore
from llama_index import load_index_from_storage

# Restore the FAISS vector store from disk and hand it to the storage context
# explicitly, together with the same persist directory.
vector_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    persist_dir=PERSIST_DIR
)

# No index id is given here. This rebinds `vector_index`, so later cells that
# use `vector_index` see the FAISS-backed index when cells run top to bottom.
vector_index = load_index_from_storage(storage_context=storage_context)

INFO:root:Loading llama_index.vector_stores.faiss from ../data/emb_store/faiss/openai_1024_20/vector_store.json.
INFO:llama_index.indices.loading:Loading all indices.


In [30]:
# Smoke test against the FAISS-backed index: placeholder query, print the answer text.
query_engine = vector_index.as_query_engine()
sample_query = "test Query"
response = query_engine.query(sample_query)
print(response.response)

I'm sorry, but I cannot answer the query as it does not provide any specific information or context to work with. Please provide more details or a specific question.


### Chroma

In [21]:
# Directory used by the persistent Chroma client; the name encodes the current
# chunk_size / chunk_overlap values.
chroma_dir=os.path.join(DATA_DIR, "emb_store", "chroma",
                        f"openai_{chunk_size}_{chunk_overlap}")

../data/emb_store/chroma/openai/openai_512_20


In [18]:
# from llama_index.vector_stores import ChromaVectorStore

# db = chromadb.PersistentClient(path=chroma_dir)
# chroma_collection = db.get_or_create_collection("test")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# documents = SimpleDirectoryReader(DOCUMENT_DIR).load_data()

# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, service_context=service_context
# )

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
ERROR:chromadb.telemetry.posthog:Failed to send telemetry event ClientStartEvent: module 'chromadb' has no attribute 'get_settings'
ERROR:chromadb.telemetry.posthog:Failed to send telemetry event ClientCreateCollectionEvent: module 'chromadb' has no attribute 'get_settings'
ERROR:chromadb.telemetry.posthog:Failed to send telemetry event CollectionAddEvent: module 'chromadb' has no attribute 'get_settings'


In [28]:
# load from disk
db2 = chromadb.PersistentClient(path=chroma_dir)
chroma_collection = db2.get_or_create_collection("test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    service_context=service_context,
)

query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
ERROR:chromadb.telemetry.posthog:Failed to send telemetry event ClientStartEvent: module 'chromadb' has no attribute 'get_settings'
ERROR:chromadb.telemetry.posthog:Failed to send telemetry event CollectionQueryEvent: module 'chromadb' has no attribute 'get_settings'


### Weaviate

In [63]:
# Directory for the embedded Weaviate data; the name encodes the current
# chunk_size / chunk_overlap values.
weaviate_path = os.path.join(
    DATA_DIR, "emb_store", "weaviate", f"openai_{chunk_size}_{chunk_overlap}"
)
print(weaviate_path)

../data/emb_store/weaviate/openai_512_20


In [65]:
import weaviate
from weaviate.embedded import EmbeddedOptions

# Embedded Weaviate settings: data and binary live under `weaviate_path`.
embedded_options = EmbeddedOptions(
    persistence_data_path=weaviate_path,
    binary_path=os.path.join(weaviate_path, "bin"),
    port=6666
)
# Pass the configured options; a fresh default `EmbeddedOptions()` was passed
# before, which silently discarded the paths and port set above.
client = weaviate.Client(
    embedded_options=embedded_options
    )

embedded weaviate is already listening on port 6666


In [59]:
from llama_index.vector_stores import WeaviateVectorStore

# Load the source documents here so this cell does not rely on a `documents`
# variable that is only assigned much further down the notebook.
documents = SimpleDirectoryReader(DOCUMENT_DIR).load_data()

vector_store = WeaviateVectorStore(weaviate_client=client, index_name="Test")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Pass the shared service context so chunking and embedding settings match the
# other vector-store sections of this notebook.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
)

{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"test_j0sIBOkr08rd","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-10-17T20:37:13+08:00","took":69685}


In [73]:
# Query whichever index was most recently bound to `index`, requesting "hybrid"
# vector-store query mode and the top 2 matches.
query_engine = index.as_query_engine(
    vector_store_query_mode="hybrid", similarity_top_k=2
)
# NOTE(review): this question looks unrelated to the imaging documents — likely a
# leftover smoke-test query; the response is not displayed.
response = query_engine.query("What did the author do growing up?")

## Query Engine

In [75]:
from llama_index.llms import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate

# System message: role, task steps, required output format, and a `{context_str}`
# placeholder for the reference context.
system_template = """
You are a radiologist expert at providing imaging recommendations for patients with musculoskeletal conditions.
If you do not know an answer, just say "I dont know", do not make up an answer.
==========
TASK:
1. Extract from given PATIENT PROFILE relevant information for classification of imaging appropriateness.
Important information includes AGE, SYMPTOMS, DIAGNOSIS (IF ANY), which stage of diagnosis (INITIAL IMAGING OR NEXT STUDY).
2. Refer to the reference information given under CONTEXT to analyse the appropriate imaging recommendations given the patient profile.
3. Recommend if the image scan ordered is appropriate given the PATIENT PROFILE and CONTEXT. If the scan is not appropriate, recommend an appropriate procedure.
STRICTLY answer based on the given PATIENT PROFILE and CONTEXT.
==========
OUTPUT INSTRUCTIONS:
Your output should contain the following:
1. Classification of appropriateness for the ordered scan.
2. Provide explanation for the appropriateness classification.
3. If classification answer is USUALLY NOT APPROPRIATE, either recommend an alternative appropriate scan procedure or return NO SCAN REQUIRED.

Format your output as follow:
1. Classification: Can be one of [USUALLY APPROPRIATE, MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE, INSUFFICIENT INFORMATION]
2. Explanation:
3. Recommendation: Can be alternative procedure, NO SCAN REQUIRED or NO CHANGE REQUIRED 
==========
CONTEXT:
{context_str}
==========
"""

# User message: only the `{query_str}` placeholder.
human_template = "{query_str}"
messages = [
    ChatMessage(role=MessageRole.SYSTEM, content=system_template),
    ChatMessage(role=MessageRole.USER, content=human_template)
]

CHAT_PROMPT_TEMPLATE = ChatPromptTemplate(messages)

# Preview the rendered prompt with each placeholder replaced by its own name.
print(CHAT_PROMPT_TEMPLATE.format(context_str="context_str", query_str="query_str"))

system: 
You are a radiologist expert at providing imaging recommendations for patients with musculoskeletal conditions.
If you do not know an answer, just say "I dont know", do not make up an answer.
TASK:
1. Extract from given PATIENT PROFILE relevant information for classification of imaging appropriateness.
Important information includes AGE, SYMPTOMS, DIAGNOSIS (IF ANY), which stage of diagnosis (INITIAL IMAGING OR NEXT STUDY).
2. Refer to the reference information given under CONTEXT to analyse the appropriate imaging recommendations given the patient profile.
3. Recommend if the image scan ordered is appropriate given the PATIENT PROFILE and CONTEXT. If the scan is not appropriate, recommend an appropriate procedure.
STRICTLY answer based on the given PATIENT PROFILE and CONTEXT.
OUTPUT INSTRUCTIONS:
Your output should contain the following:
1. Classification of appropriateness for the ordered scan.
2. Provide explanation for the appropriateness classification.
3. If classificat

In [76]:
from llama_index import get_response_synthesizer
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
# NOTE(review): SimilarityPostprocessor is imported but not used in this cell.
from llama_index.indices.postprocessor import SimilarityPostprocessor

# Retrieve the 5 most similar nodes from `vector_index` — whichever index was
# last bound to that name by the loading cells above.
retriever = VectorIndexRetriever(
    index = vector_index, similarity_top_k=5
)

# Synthesize answers with the custom chat prompt as the text-QA template.
response_synthesizer = get_response_synthesizer(
    service_context=service_context,
    response_mode="simple_summarize",
    text_qa_template=CHAT_PROMPT_TEMPLATE,
)

# Rebinds `query_engine`, replacing the engines created in earlier cells.
query_engine = RetrieverQueryEngine(
    retriever=retriever, response_synthesizer=response_synthesizer
)

# Run test cases

In [77]:
import pandas as pd

# Test cases live in a CSV under the data directory.
queries_csv = os.path.join(DATA_DIR, "queries", "MSK LLM Fictitious Case Files Full.csv")
testcase_df = pd.read_csv(queries_csv)

patient_profiles = testcase_df["Clinical File"]
scan_orders = testcase_df["MRI scan ordered"]
# Pair each clinical file with its ordered scan and format them into query text.
testcase_df["queries"] = list(map(generate_query, patient_profiles, scan_orders))

testcase_df.head(3)

Unnamed: 0,ACR scenario,Appropriateness Category,MRI scan ordered,Difficulty,Clinical File,AR Rad,AR Ortho,AR LLM,queries
0,Traumatic shoulder pain. Nonlocalized shoulder...,UA,MRI shoulder without IV contrast,L,22 year old Chinese Male. No significant past ...,,,,Patient Profile: 22 year old Chinese Male. No ...
1,Known axial spondyloarthritis. Follow-up for t...,MBA,MRI sacroiliac joints\nwithout and with IV con...,H,43 year old Indian Male. Known ankylosing spo...,,,,Patient Profile: 43 year old Indian Male. Kno...
2,Acute blunt or penetrating trauma to the hand ...,UNA,MRI wrist without IV\ncontrast,L,38 year old Bangladeshi Male. No significant ...,,,,Patient Profile: 38 year old Bangladeshi Male....


In [78]:
# Run the first generated test-case query through the custom query engine.
sample_query = testcase_df["queries"][0]
response = query_engine.query(sample_query)

In [95]:
# Inspect the clinical file with index label 2, split on "." into fragments.
testcase_df["Clinical File"][2].split(".")

['38 year old Bangladeshi Male',
 '  No significant past medical history',
 '   Was using power drill when it slipped and penetrated right wrist',
 '  On examination: oozing wound on dorsum of right wrist, no obvious bone visualised',
 '  Wrist swelling',
 ' Unable to assess range of motion due to pain',
 '  Nil imaging performed thus far',
 '   MRI right wrist without IV contrast to assess soft tissue and bony injury',
 '  ']

# Filter out references

In [4]:
# Load everything under DOCUMENT_DIR; the cells below read each item's
# `file_name` / `page_label` metadata and its text.
documents = SimpleDirectoryReader(DOCUMENT_DIR).load_data()

In [5]:
# Count how many loaded documents belong to each source file.
page_dict = {}
for doc in documents:
    source_name = doc.metadata["file_name"]
    page_dict[source_name] = page_dict.get(source_name, 0) + 1

In [6]:
# Every loaded page whose text contains "References", kept as
# (file name, page label, page text).
reference_pages = [
    (d.metadata["file_name"], d.metadata["page_label"], d.text)
    for d in documents
    if "References" in d.text
]

# One start page per file; when several pages of a file match, the last one wins.
reference_dict = {name: int(label) for name, label, _text in reference_pages}

# For each file, list every page number from the start page up to the number of
# documents loaded for that file (inclusive).
exclude_dict = {
    name: list(range(start_page, page_dict[name] + 1))
    for name, start_page in reference_dict.items()
}

In [70]:
# Write the exclusion map to the path defined once as EXCLUDE_DICT in the config
# cell, instead of rebuilding the same path here.
with open(EXCLUDE_DICT, "w") as f:
    json.dump(exclude_dict, f)