In [1]:
import os
import json
import sys
import logging
import openai

# Import pysqlite3 and re-register it in sys.modules under the name "sqlite3",
# so any later `import sqlite3` resolves to pysqlite3 instead of the stdlib module.
# This must run before `import chromadb` below.
# NOTE(review): presumably needed because chromadb requires a newer SQLite than
# the system build provides — confirm against the deployment environment.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import chromadb

from typing import Union, Dict, List
from llama_index import SimpleDirectoryReader, ServiceContext, VectorStoreIndex
from llama_index.storage.storage_context import StorageContext
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.schema import Document
from llama_index import load_index_from_storage

# Send root-logger output at INFO level and above to stdout so it shows up in
# the notebook cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): presumably switches the openai client to info-level logging —
# confirm this attribute is honoured by the installed openai version.
openai.log = "info"

In [2]:
# Project layout, expressed relative to the notebook's working directory.
MAIN_DIR = ".."
DATA_DIR = os.path.join(MAIN_DIR, "data")
DOCUMENT_DIR = os.path.join(DATA_DIR, "document_sources")
EXCLUDE_DICT = os.path.join(DATA_DIR, "exclude_pages.json")

# Credentials are read from a JSON file on disk rather than hard-coded here.
with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    api_keys = json.load(f)

# Expose the key both through the environment variable and on the openai
# module (targets are assigned left to right: environment first).
os.environ["OPENAI_API_KEY"] = openai.api_key = api_keys["OPENAI_API_KEY"]

In [3]:
def convert_prompt_to_string(prompt) -> str:
    """Render a prompt with every template variable filled in by its own name.

    Args:
        prompt: Object exposing ``template_vars`` (an iterable of variable
            names) and ``format(**kwargs)``.
            NOTE(review): presumably a llama_index prompt template — confirm.

    Returns:
        Whatever ``prompt.format`` returns when each variable is passed its
        own name as the value.
    """
    placeholder_values = {}
    for var_name in prompt.template_vars:
        placeholder_values[var_name] = var_name
    return prompt.format(**placeholder_values)

def generate_query(profile: str, scan: str):
    """Build the two-line query text from a patient profile and an ordered scan.

    Args:
        profile: Text inserted after ``Patient Profile: ``.
        scan: Text inserted after ``Scan ordered: ``.

    Returns:
        The profile line and the scan line joined by a single newline.
    """
    query_template = "Patient Profile: {}\nScan ordered: {}"
    return query_template.format(profile, scan)

def convert_doc_to_dict(doc: Union["Document", Dict]) -> Dict:
    """Convert a document into a plain ``page_content`` / ``metadata`` dict.

    Args:
        doc: Either a ``Document`` (read through its ``text`` and ``metadata``
            attributes) or a dict with ``"text"`` and ``"metadata"`` keys.
            In both cases the metadata must contain ``"file_name"`` and
            ``"page_label"``.

    Returns:
        ``{"page_content": <text>, "metadata": {"source": <file_name>,
        "page": <page_label>}}``.

    Raises:
        TypeError: If ``doc`` is neither a ``Document`` nor a dict.
        KeyError: If a required key is missing from the dict or its metadata.
    """
    if isinstance(doc, Document):
        text, metadata = doc.text, doc.metadata
    elif isinstance(doc, dict):
        text, metadata = doc["text"], doc["metadata"]
    else:
        # Previously this case fell through and failed on an unbound local;
        # fail with a message that names the offending type instead.
        raise TypeError(
            "doc must be a Document or a dict, got {}".format(type(doc).__name__)
        )

    return {
        "page_content": text,
        "metadata": {
            "source": metadata["file_name"],
            "page": metadata["page_label"],
        },
    }

def get_experiment_logs(description: str, log_folder: str) -> logging.Logger:
    """Return a logger that writes to stdout and to ``<log_folder>/logfile.log``.

    ``logging.getLogger`` hands back the same logger object for the same name,
    so calling this function again (for example by re-running a notebook cell)
    must not attach a second pair of handlers; otherwise every record is
    emitted multiple times and file handles pile up. Handlers are therefore
    only added when an equivalent one is not already attached.

    Args:
        description: Name of the logger to fetch or create.
        log_folder: Directory for ``logfile.log``; created if it is missing.

    Returns:
        The configured logger, with its level set to INFO.
    """
    logger = logging.getLogger(description)
    logger.setLevel(logging.INFO)

    os.makedirs(log_folder, exist_ok=True)
    # FileHandler stores its target as an absolute path, so compare like with like.
    log_path = os.path.abspath(os.path.join(log_folder, "logfile.log"))

    formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")

    # FileHandler subclasses StreamHandler, hence the exact type check.
    has_stdout_handler = any(
        type(handler) is logging.StreamHandler and handler.stream is sys.stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

    has_file_handler = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
        for handler in logger.handlers
    )
    if not has_file_handler:
        file_handler = logging.FileHandler(filename=log_path)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # NOTE(review): records still propagate to the root logger; when the root
    # logger also has a stdout handler, console lines appear once per handler.
    return logger

def filter_by_pages(
    doc_list: List["Document"],
    exclude_info: Dict[str, List]
) -> List["Document"]:
    """Drop documents whose page number is listed as excluded for their file.

    Args:
        doc_list: Documents whose ``metadata`` holds ``"file_name"`` and,
            where applicable, ``"page_label"``.
        exclude_info: Maps a file name to the page numbers to exclude for
            that file. Files that are not listed are kept in full.

    Returns:
        A new list, in the original order, of the documents that are kept.
        Documents from a listed file whose page label is missing or is not
        an integer are kept, because such a label cannot match any excluded
        page number.
    """
    filtered_list = []
    for doc in doc_list:
        file_name = doc.metadata["file_name"]
        if file_name not in exclude_info:
            # No exclusions for this file: the page label is not needed, so
            # documents without one must not fail here.
            filtered_list.append(doc)
            continue

        try:
            page_number = int(doc.metadata["page_label"])
        except (KeyError, TypeError, ValueError):
            # Missing or non-numeric label (e.g. roman numerals).
            filtered_list.append(doc)
            continue

        if page_number not in exclude_info[file_name]:
            filtered_list.append(doc)

    return filtered_list

In [4]:
# Load everything under DOCUMENT_DIR; the result is a list (len() is taken below).
documents = SimpleDirectoryReader(DOCUMENT_DIR).load_data()
print("Total no of docs before filtering:", len(documents))
# EXCLUDE_DICT is a JSON file; its contents are passed to filter_by_pages as
# the per-file page exclusions.
with open(EXCLUDE_DICT, "r") as f:
    exclude_pages = json.load(f)
# Rebinds `documents` to the filtered list; the unfiltered list is no longer
# reachable under this name afterwards.
documents = filter_by_pages(doc_list=documents, exclude_info=exclude_pages)
print("Total number of docs after filtering", len(documents))

Total no of docs before filtering: 546
Total number of docs after filtering 395


In [5]:
def organize_by_files(
    doc_list: List[Document]
):
    """Group documents by the ``file_name`` entry of their metadata.

    Args:
        doc_list: Documents to group; each must have ``metadata["file_name"]``.

    Returns:
        A dict mapping each file name to the list of its documents, with both
        the keys and each list in order of first appearance in ``doc_list``.
    """
    grouped = {}
    for document in doc_list:
        # setdefault creates the bucket on first sight of a file name.
        grouped.setdefault(document.metadata["file_name"], []).append(document)
    return grouped

In [11]:
# os.listdir returns entries in arbitrary order; sort so the chosen sample is
# the same on every run and machine.
path_list = sorted(os.listdir(DOCUMENT_DIR))
doc_dict = organize_by_files(documents)
# The directory can contain entries with no documents in `doc_dict` (hidden
# files, or files whose pages were all filtered out), so take the first entry
# that actually has documents instead of indexing blindly.
sample_file = next((name for name in path_list if name in doc_dict), None)
if sample_file is None:
    raise ValueError(f"No loaded documents match any file in {DOCUMENT_DIR!r}")
sample_docs = doc_dict[sample_file]

In [17]:
# Text of the first document of the sample file. The bare expression on the
# last line makes the notebook display its repr (escapes such as \n shown literally).
sample_text = sample_docs[0].text
sample_text

'Revised 2018  \nACR Appropriateness Criteria® 1 Acute Hand and Wrist Trauma  American College of Radiology  \nACR Appropriateness Criteria® \nAcute Hand and Wrist Trauma  \nVariant 1:  Acute blunt or penetrating trauma  to the hand or wrist.  Initial imaging.  \nProcedure  Appropriateness Category  Relative Radiation Level  \nRadiography  area of int erest  Usually Appropriate  Varies  \nCT area of interest  with IV contrast  Usually Not Appropriate  Varies  \nCT area of interest  without and with IV \ncontrast  Usually Not Appropriate  Varies  \nCT area of interest  without IV contrast  Usually Not Appropriate  Varies  \nMRI area of interest  without and with IV \ncontrast  Usually Not Appropriate  O \nMRI area of interest  without IV contrast  Usually Not Appropriate  O \nBone scan area of interest  Usually Not Appropriate  ☢☢☢  \nUS area of interest  Usually Not Appropriate  O \nVariant 2: Suspect acute hand  or wrist trauma . Initial radiographs negative or equivocal. Next imaging

In [18]:
# print() renders the newlines, unlike the repr shown by a bare expression.
print(sample_text)

Revised 2018  
ACR Appropriateness Criteria® 1 Acute Hand and Wrist Trauma  American College of Radiology  
ACR Appropriateness Criteria® 
Acute Hand and Wrist Trauma  
Variant 1:  Acute blunt or penetrating trauma  to the hand or wrist.  Initial imaging.  
Procedure  Appropriateness Category  Relative Radiation Level  
Radiography  area of int erest  Usually Appropriate  Varies  
CT area of interest  with IV contrast  Usually Not Appropriate  Varies  
CT area of interest  without and with IV 
contrast  Usually Not Appropriate  Varies  
CT area of interest  without IV contrast  Usually Not Appropriate  Varies  
MRI area of interest  without and with IV 
contrast  Usually Not Appropriate  O 
MRI area of interest  without IV contrast  Usually Not Appropriate  O 
Bone scan area of interest  Usually Not Appropriate  ☢☢☢  
US area of interest  Usually Not Appropriate  O 
Variant 2: Suspect acute hand  or wrist trauma . Initial radiographs negative or equivocal. Next imaging 
study. 
Procedu