# Import Packages

In [48]:
import os
import json
import sys
import logging
import openai
import tiktoken
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from typing import Union, Dict, List, Callable

import chromadb
from llama_index import get_response_synthesizer
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.embeddings import OpenAIEmbedding
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.node_parser import SentenceWindowNodeParser

from llama_index.schema import MetadataMode, NodeWithScore, Document
from llama_index.storage import StorageContext
from llama_index.vector_stores import ChromaVectorStore

from custom import CustomRetrieverQueryEngine, CustomCombinedRetriever
from utils import generate_vectorindex
from utils import load_vectorindex

# Configure the root logger to write INFO-and-above records to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): presumably enables info-level request logging in the openai
# client — confirm against the installed SDK version.
openai.log = "info"

In [2]:
# Project layout; all paths are relative to the parent of the working directory.
MAIN_DIR = ".."
DATA_DIR = os.path.join(MAIN_DIR, "data")
DOCUMENT_DIR = os.path.join(DATA_DIR, "document_sources")
EXCLUDE_DICT = os.path.join(DATA_DIR, "exclude_pages.json")

# Read the API keys from the local auth file, then expose the OpenAI key both
# through the environment and on the openai module.
api_keys_path = os.path.join(MAIN_DIR, "auth", "api_keys.json")
with open(api_keys_path, "r") as f:
    api_keys = json.load(f)

openai_api_key = api_keys["OPENAI_API_KEY"]
os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

In [3]:
def convert_prompt_to_string(prompt) -> str:
    """Render *prompt* with each template variable filled in by its own name.

    Args:
        prompt: Object exposing ``template_vars`` (iterable of variable names)
            and ``format(**kwargs)``.

    Returns:
        Whatever ``prompt.format`` returns when every variable is bound to its
        own name.
    """
    identity_kwargs = {}
    for var_name in prompt.template_vars:
        identity_kwargs[var_name] = var_name
    return prompt.format(**identity_kwargs)

def generate_query(profile: str, scan: str):
    """Build the two-line query text from a patient profile and the ordered scan.

    Args:
        profile: Text placed after the ``Patient Profile:`` label.
        scan: Text placed after the ``Scan ordered:`` label on the second line.

    Returns:
        The formatted query string.
    """
    template = "Patient Profile: {}\nScan ordered: {}"
    return template.format(profile, scan)

def convert_doc_to_dict(doc: Union[Document, NodeWithScore, Dict]) -> Dict:
    """Flatten a document-like object into a plain dict.

    Args:
        doc: A ``NodeWithScore``, a ``Document``, or a dict with ``"text"`` and
            ``"metadata"`` keys.

    Returns:
        A dict with the keys ``"page_content"``, ``"metadata"`` and ``"score"``.
        ``"score"`` is the node's score for ``NodeWithScore``, the empty string
        for ``Document`` and the string ``"None"`` for dict input. The two
        placeholders differ, but are kept as-is so existing consumers of the
        output are unaffected.

    Raises:
        TypeError: If *doc* is none of the supported types. Previously this
            surfaced as an ``UnboundLocalError`` on the return statement.
        KeyError: If a dict input lacks ``"text"`` or ``"metadata"``.
    """
    if isinstance(doc, NodeWithScore):
        return {
            "page_content": doc.text,
            "metadata": doc.metadata,
            "score": doc.score,
        }
    if isinstance(doc, Document):
        return {
            "page_content": doc.text,
            "metadata": doc.metadata,
            "score": "",
        }
    if isinstance(doc, dict):
        return {
            "page_content": doc["text"],
            "metadata": doc["metadata"],
            "score": "None",
        }
    raise TypeError(
        "convert_doc_to_dict expects a NodeWithScore, Document or dict, "
        "got {}".format(type(doc).__name__)
    )

def get_experiment_logs(description: str, log_folder: str):
    """Return an INFO-level logger that writes to stdout and to a log file.

    ``logging.getLogger`` returns the same logger object for the same name, so
    handlers are attached only if an equivalent one is not already present.
    Calling this function repeatedly (for example by re-running a notebook
    cell) therefore does not duplicate every log line.

    Args:
        description: Name of the logger.
        log_folder: Directory that receives ``logfile.log``; created if missing.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(description)
    logger.setLevel(logging.INFO)

    # exist_ok=True makes this a no-op when the folder is already there.
    os.makedirs(log_folder, exist_ok=True)
    # FileHandler stores an absolute path in baseFilename; compare like with like.
    log_path = os.path.abspath(os.path.join(log_folder, "logfile.log"))

    formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")

    has_stdout_handler = any(
        getattr(handler, "stream", None) is sys.stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

    has_file_handler = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == log_path
        for handler in logger.handlers
    )
    if not has_file_handler:
        file_handler = logging.FileHandler(filename=log_path)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger

def filter_by_pages(
    doc_list: List[Document],
    exclude_info: Dict[str, List]
) -> List[Document]:
    """Drop documents whose page is listed for exclusion.

    Args:
        doc_list: Documents whose metadata holds ``"file_name"`` and
            ``"page_label"``.
        exclude_info: Maps a file name to the page numbers to drop for it.

    Returns:
        The documents that are kept, in their original order. A document is
        dropped only when its file name is a key of *exclude_info* and its
        page label, converted to ``int``, is in that file's list.
    """
    kept = []
    for doc in doc_list:
        source_file, page_label = doc.metadata["file_name"], doc.metadata["page_label"]
        # The page label is only converted when the file has an exclusion list.
        is_excluded = (
            source_file in exclude_info
            and int(page_label) in exclude_info[source_file]
        )
        if not is_excluded:
            kept.append(doc)

    return kept

def count_tokens(
    text: str,
    tokenizer: "tiktoken.Encoding | None" = None,
) -> int:
    """Return the number of tokens *text* encodes to.

    Args:
        text: Text to tokenize.
        tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence. When omitted, the tiktoken encoding for
            ``gpt-3.5-turbo`` is used.

    Returns:
        Length of the encoded token sequence.
    """
    if tokenizer is None:
        # Resolved lazily: a default built in the signature is evaluated once
        # at definition time, so the encoding was loaded even if never used.
        tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(tokenizer.encode(text))

# Prepare Documents

In [21]:
# Embedding client reused by the cells below.
# NOTE(review): embed_batch_size=50 is presumably the number of texts sent per
# embedding request — confirm against the llama_index OpenAIEmbedding docs.
embed_model = OpenAIEmbedding(embed_batch_size=50)

In [26]:
emb_database_path = os.path.join(DATA_DIR, "emb_store")

# Metadata keys passed as excluded_embed_metadata_keys / excluded_llm_metadata_keys.
# "mode" is excluded on the embedding side only.
EMBED_EXCLUDED_KEYS = ['file_name', 'page_label', 'variant', 'mode']
LLM_EXCLUDED_KEYS = ['file_name', 'page_label', 'variant']


def _load_records(file_name):
    """Load one JSON file of ``{"text", "metadata"}`` records from the embedding store."""
    # Read as UTF-8 explicitly rather than relying on the platform's locale
    # default, which can fail on non-ASCII text.
    with open(os.path.join(emb_database_path, file_name), "r", encoding="utf-8") as f:
        return json.load(f)


def _build_documents(records, mode):
    """Tag each record's metadata with *mode* and wrap it in a ``Document``."""
    docs = []
    for record in records:
        # Mutates the loaded record in place, as the original cell did.
        record["metadata"]["mode"] = mode
        docs.append(
            Document(
                text=record["text"],
                metadata=record["metadata"],
                excluded_embed_metadata_keys=list(EMBED_EXCLUDED_KEYS),
                excluded_llm_metadata_keys=list(LLM_EXCLUDED_KEYS),
            )
        )
    return docs


table_list = _load_records("tables.json")
text_list = _load_records("texts.json")

table_docs = _build_documents(table_list, "tabular")

# Table documents get a precomputed embedding of their EMBED-mode metadata
# string, not of the table body.
description_texts = [doc.get_metadata_str(mode=MetadataMode.EMBED) for doc in table_docs]
description_embs = embed_model.get_text_embedding_batch(description_texts)

for doc, emb in zip(table_docs, description_embs):
    doc.embedding = emb

text_docs = _build_documents(text_list, "text")

# Create Vectorstore

In [24]:
# Root folder for the persisted vector stores built in this notebook.
multimodal_vector_path = os.path.join(emb_database_path, "multimodal-chroma-textwindow")
desc_persist_dir = os.path.join(multimodal_vector_path, "descriptions")
# exist_ok=True already makes this a no-op for an existing directory, so the
# separate (and racy) os.path.exists check is not needed.
os.makedirs(desc_persist_dir, exist_ok=True)

## Table Store

In [34]:
# Table Index: create the Chroma vector store for the table documents.
tables_index_dir = os.path.join(desc_persist_dir, "tables")

generate_vectorindex(
    embeddings=embed_model,
    emb_size=1536,
    documents=table_docs,
    output_directory=tables_index_dir,
    emb_store_type="chroma",
    chunk_size=1024,
    chunk_overlap=0,
    index_name="tables",
)

2023-11-01 15:30:41,592:INFO: Processing documents from provided list.
INFO:config:Processing documents from provided list.
2023-11-01 15:30:41,594:INFO: 199 documents remained after page filtering.
INFO:config:199 documents remained after page filtering.
2023-11-01 15:30:41,596:INFO: Total number of documents to create vector index store: 199
INFO:config:Total number of documents to create vector index store: 199
2023-11-01 15:30:41,598:INFO: Creating chroma Vectorstore
INFO:config:Creating chroma Vectorstore
INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
2023-11-01 15:30:44,069:INFO: Successfully created chroma vectorstore at ../data/multimodal-chroma-textwindow/descriptions/tables
INFO:config:Successfully created chroma vectorstore at ../data/multimodal-chroma-textwindow/descriptions/tables


## Text Store

Using `SentenceWindowNodeParser`:
- Embedded content: the individual sentence
- Synthesis content: metadata + the surrounding sentence window (window_size=5)

In [40]:
# index_name = "texts"
# chroma_client = chromadb.PersistentClient(path=os.path.join(desc_persist_dir, "texts"))
# chroma_collection = chroma_client.get_or_create_collection(index_name)
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


In [41]:
# node_parser = SentenceWindowNodeParser(window_size=5)
# service_context = ServiceContext.from_defaults(embed_model=embed_model, node_parser=node_parser)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# vector_index = VectorStoreIndex.from_documents(
#     text_docs, storage_context=storage_context, service_context=service_context
# )

# Load and test

In [61]:
# Load the "tables" Chroma index from disk and wrap it in a top-5 retriever.
tables_db_dir = os.path.join(desc_persist_dir, "tables")
tables_index = load_vectorindex(
    db_directory=tables_db_dir,
    emb_store_type="chroma",
    index_name="tables",
)

table_retriever = tables_index.as_retriever(similarity_top_k=5)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


2023-11-01 16:35:28,009:INFO: chroma VectorStore successfully loaded from ../data/multimodal-chroma-textwindow/descriptions/tables.
INFO:config:chroma VectorStore successfully loaded from ../data/multimodal-chroma-textwindow/descriptions/tables.


In [62]:
# Load the "texts" Chroma index from disk and wrap it in a top-5 retriever.
texts_db_dir = os.path.join(desc_persist_dir, "texts")
texts_index = load_vectorindex(
    db_directory=texts_db_dir,
    emb_store_type="chroma",
    index_name="texts",
)

text_retriever = texts_index.as_retriever(similarity_top_k=5)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


2023-11-01 16:35:29,841:INFO: chroma VectorStore successfully loaded from ../data/multimodal-chroma-textwindow/descriptions/texts.
INFO:config:chroma VectorStore successfully loaded from ../data/multimodal-chroma-textwindow/descriptions/texts.


In [63]:
# Merge the table and text retrievers behind a single retriever with a
# 7000-token limit.
combined_retriever = CustomCombinedRetriever(
    table_retriever=table_retriever,
    text_retriever=text_retriever,
    token_limit=7000,
)

response_synthesizer = get_response_synthesizer()
# NOTE(review): presumably swaps each node's text for the value stored under
# its "window" metadata key before synthesis — confirm in the llama_index docs.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

retriever_query_engine = CustomRetrieverQueryEngine(
    retriever=combined_retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[window_postprocessor],
)

In [64]:
# Example query: a free-text patient profile followed by the ordered scan.
# NOTE(review): the labels used here ("Patient profile:" / "Scan Order:") differ
# from the ones generate_query produces ("Patient Profile:" / "Scan ordered:") —
# confirm which form the retriever and prompts are meant to receive.
sample_query = """Patient profile: Patient age: 68 Sex: Male. Chief complaint: Severe pain in the left hip after a fall. The patient fell down from the stairs while carrying groceries. He has a history of osteoporosis and takes medication for high blood pressure. The patient is unable to bear weight on the left leg and has severe pain in the left hip. There is no visible deformity, but there is tenderness in the left hip region. 
Scan Order: X-ray pelvis and left hip"""

# Run the query through the query engine built above.
sample_response = retriever_query_engine.query(sample_query)

In [66]:
# Print every retrieved node using its LLM-mode content.
for retrieved_node in sample_response.source_nodes:
    node = retrieved_node.node
    node_content = node.get_content(MetadataMode.LLM)
    print("\n" + node_content)
    # For nodes that carry "window" metadata, also show the stored original
    # sentence. The key is read with .get() so a node that has "window" but
    # no "original_text" no longer raises KeyError.
    original_text = node.metadata.get("original_text")
    if "window" in node.metadata and original_text is not None:
        # "\E" was an invalid escape sequence; a leading newline was intended.
        print("\nEmbedded Sentence:", original_text)


description:  Acute hip pain. Fall or minor trauma. Suspect fracture. Initial imaging.
condition: acute hip pain suspected fracture
mode: tabular

| Procedure | Appropriateness Category |
|---|---|
| Radiography hip | Usually Appropriate |
| Radiography pelvis | Usually Appropriate |
| Radiography pelvis and hips | Usually Appropriate |
| CT pelvis and hips with IV contrast | Usually Not Appropriate |
| CT pelvis and hips without and with IV contrast | Usually Not Appropriate |
| CT pelvis and hips without IV contrast | Usually Not Appropriate |
| MRI pelvis and affected hip without and with IV contrast | Usually Not Appropriate |
| MRI pelvis and affected hip without IV contrast | Usually Not Appropriate |
| Bone scan hips | Usually Not Appropriate |
| US hip | Usually Not Appropriate |

description:  Acute hip pain. Fall or minor trauma. Negative radiographs. Suspect fracture. Next imaging study.
condition: acute hip pain suspected fracture
mode: tabular

| Procedure | Appropriatene