# Setup

In [86]:
import os, json
import pandas as pd
import yaml
from copy import deepcopy

import re
from tqdm import tqdm
import sys
import logging
from typing import Dict, Union, Optional, List, Literal
from datetime import datetime
from config import MAIN_DIR
from custom_storage import load_vectorindex
import tiktoken

from llama_index.vector_stores import SimpleVectorStore
from llama_index import ServiceContext
from llama_index.storage.storage_context import StorageContext
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.response.schema import Response
from llama_index.schema import Document, NodeWithScore
from llama_index import load_index_from_storage, get_response_synthesizer
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import LongContextReorder
from llama_index.indices.postprocessor.types import BaseNodePostprocessor
from llama_index.callbacks import CallbackManager, TokenCountingHandler

In [140]:
# Project directory layout; every path is derived from MAIN_DIR (imported from config).
DATA_DIR = os.path.join(MAIN_DIR, "data")
ARTIFACT_DIR = os.path.join(MAIN_DIR, "artifacts")
EMB_DIR = os.path.join(DATA_DIR, "emb_store")

# Credentials are read from a JSON file on disk rather than hard-coded in the notebook.
with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    api_keys = json.load(f)

# Exported through the environment so the OpenAI clients created later can pick it up.
os.environ["OPENAI_API_KEY"] = api_keys["OPENAI_API_KEY"]

# Helper Functions

In [3]:
# Prompt for a second LLM pass that restates a free-text answer as two labelled
# lines ("Appropriateness: ..." / "Recommendation: ..."); process_result_json
# parses that output with a regex.
# NOTE(review): the prompt text contains typos ("Appropropriateness", "should contains").
# They are part of the runtime string sent to the model and are left untouched here.
fix_prompt_template = """TASK: Extract the following information from the provided text query.
1. Appropropriateness of the scan ordered.
2. Most Appropriate Imaging Modality
===============
FORMAT INSTRUCTIONS: Your output should contains the following:
Appropriateness: Can be one of [USUALLY APPROPRIATE, MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE, INSUFFICIENT INFORMATION]
Recommendation: The most appropriate imaging modality
===============
TEXT QUERY: {query}
"""

# NOTE(review): these imports sit mid-notebook rather than in the top import cell.
# PromptTemplate here is langchain's class; the helper signatures below annotate with it.
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

FIX_PROMPT = PromptTemplate.from_template(fix_prompt_template)

# gpt-3.5-turbo chain at temperature 0 that applies FIX_PROMPT to a single `query` input.
fixing_chain = LLMChain(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=512),
    prompt=FIX_PROMPT
)

In [151]:
def convert_prompt_to_string(prompt) -> str:
    """Render a prompt with every template variable replaced by its own name.

    This gives a readable, loggable view of the prompt text without real inputs.

    Args:
        prompt: Object exposing ``template_vars`` (iterable of variable names)
            and ``format(**kwargs)``.

    Returns:
        Whatever ``prompt.format`` returns when each variable is bound to its name.
    """
    self_named_values = {}
    for var_name in prompt.template_vars:
        self_named_values[var_name] = var_name
    return prompt.format(**self_named_values)

def generate_query(profile: str, scan: str):
    """Build the two-line query text from a patient profile and the ordered scan.

    Args:
        profile: Patient profile text.
        scan: Description of the scan that was ordered.

    Returns:
        A string with a "Patient Profile:" line followed by a "Scan ordered:" line.
    """
    return f"Patient Profile: {profile}\nScan ordered: {scan}"

def convert_doc_to_dict(doc: Union[Document, NodeWithScore, Dict]) -> Dict:
    """Normalise a retrieved document into a plain, JSON-serialisable dict.

    Args:
        doc: Either a ``Document`` / ``NodeWithScore`` (read through ``.text`` and
            ``.metadata``) or a dict with ``"text"`` and ``"metadata"`` keys. In both
            cases the metadata must provide ``"file_name"`` and ``"page_label"``.

    Returns:
        ``{"page_content": <text>, "metadata": {"source": <file_name>, "page": <page_label>}}``

    Raises:
        TypeError: If ``doc`` is none of the supported types.
        KeyError: If an expected text or metadata key is missing.
    """
    # A tuple of classes works on every Python 3 version; isinstance() against a
    # typing.Union is only accepted from Python 3.10 onwards.
    if isinstance(doc, (Document, NodeWithScore)):
        text, metadata = doc.text, doc.metadata
    elif isinstance(doc, dict):
        text, metadata = doc["text"], doc["metadata"]
    else:
        # Previously this case fell through and failed with UnboundLocalError.
        raise TypeError(
            "Unsupported document type: {}".format(type(doc).__name__)
        )

    return {
        "page_content": text,
        "metadata": {
            "source": metadata["file_name"],
            "page": metadata["page_label"],
        },
    }

def get_experiment_logs(description: str, log_folder: str):
    """Return an INFO-level logger that writes to stdout and ``<log_folder>/logfile.log``.

    ``logging.getLogger`` hands back the same logger object for a given name, so
    calling this function repeatedly with the same ``description`` (e.g. re-running
    a notebook cell) used to stack a new pair of handlers each time and print
    every message once per earlier call. Handlers left over from earlier calls
    are now removed and closed before the new ones are attached.

    Args:
        description: Logger name.
        log_folder: Directory for ``logfile.log``; created if it does not exist.

    Returns:
        The configured ``logging.Logger`` with exactly one stream handler and one
        file handler.
    """
    logger = logging.getLogger(description)
    logger.setLevel(logging.INFO)

    # Drop handlers from previous calls so messages are not duplicated and the
    # old log files are released.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    os.makedirs(log_folder, exist_ok=True)

    formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")
    stream_handler = logging.StreamHandler(sys.stdout)
    file_handler = logging.FileHandler(filename=os.path.join(log_folder, "logfile.log"))

    for handler in (stream_handler, file_handler):
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger

def remove_final_sentence(
    text: str,
    return_final_sentence: bool = False
):
    """Split off the last "."-delimited segment of ``text``.

    The text is stripped of surrounding whitespace and of one trailing period,
    then cut at its last remaining period.

    Args:
        text: Input text.
        return_final_sentence: When True, also return the removed segment.

    Returns:
        The text before the last period, or a ``(before, final_segment)`` tuple
        when ``return_final_sentence`` is True. If no period remains, ``before``
        is the empty string and the whole text is the final segment.
    """
    trimmed = text.strip()
    body = trimmed[:-1] if trimmed.endswith(".") else trimmed
    # rpartition cuts at the last period; with no period it yields ("", "", body).
    leading_text, _, last_sentence = body.rpartition(".")
    if return_final_sentence:
        return (leading_text, last_sentence)
    return leading_text

def query_wrapper(
    template: str,
    input_text: Union[str, Dict[str, str]]
) -> str:
    """Fill the ``{placeholder}`` fields of ``template`` with ``input_text``.

    Args:
        template: Format string whose fields look like ``{name}``.
        input_text: A single string (the template must then have exactly one
            placeholder) or a mapping from placeholder name to value.

    Returns:
        The formatted template.

    Raises:
        AssertionError: If the placeholder count does not match the input, or a
            mapping key is not a placeholder of the template.
    """
    field_names = re.findall(r"{([A-Za-z0-9_-]+)}", template)

    if not isinstance(input_text, str):
        # Mapping input: every key must name a placeholder found in the template.
        assert len(input_text) == len(field_names)
        for key in input_text:
            assert key in field_names, f"{key} not present in template."
        return template.format(**input_text)

    assert len(field_names) == 1, "Must Provide a single placeholder when input_text is string."
    only_field = field_names[0]
    return template.format(**{only_field: input_text})

def setup_query_engine(
    db_directory: str,
    emb_store_type: Literal["simple", "faiss"] = "simple",
    index_name: Optional[str] = None,
    similarity_top_k: int = 4,
    text_qa_template: Optional[PromptTemplate] = None,
    synthesizer_llm: str = "gpt-3.5-turbo",
    emb_type: str = "openai",
    synthesizer_temperature: float = 0,
    synthesizer_max_tokens: int = 512,
    response_mode: str = "simple_summarize",
    node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
    callback_manager: Optional[CallbackManager] = None,
):
    """Build a retriever-backed query engine over a stored vector index.

    Args:
        db_directory: Directory of the persisted index, passed to ``load_vectorindex``.
        emb_store_type: Vector store backend, passed to ``load_vectorindex``.
        index_name: Optional index name, passed to ``load_vectorindex``.
        similarity_top_k: Number of nodes the retriever returns per query.
        text_qa_template: Optional QA prompt handed to the response synthesizer.
        synthesizer_llm: OpenAI model name used for answer synthesis.
        emb_type: Embedding backend; only ``"openai"`` is implemented.
        synthesizer_temperature: Sampling temperature of the synthesizer LLM.
        synthesizer_max_tokens: Completion token limit of the synthesizer LLM.
        response_mode: Response mode passed to ``get_response_synthesizer``.
        node_postprocessors: Optional postprocessors applied to retrieved nodes.
        callback_manager: Optional callback manager attached to the service context.

    Returns:
        A ``RetrieverQueryEngine``.

    Raises:
        ValueError: If ``emb_type`` is not a supported embedding backend.
    """
    # Validate before any index loading: an unknown emb_type previously left
    # `embs` unbound and failed later with UnboundLocalError.
    if emb_type == "openai":
        embs = OpenAIEmbedding()
    else:
        raise ValueError(
            f"Unsupported emb_type {emb_type!r}; only 'openai' is implemented."
        )

    vector_index = load_vectorindex(db_directory, emb_store_type=emb_store_type, index_name=index_name)

    retriever = VectorIndexRetriever(
        index=vector_index, similarity_top_k=similarity_top_k
    )

    # Setup Synthesizer
    service_context = ServiceContext.from_defaults(
        llm=OpenAI(
            temperature=synthesizer_temperature,
            model=synthesizer_llm, max_tokens=synthesizer_max_tokens
        ),
        embed_model=embs, callback_manager=callback_manager
    )

    response_synthesizer = get_response_synthesizer(
        service_context=service_context, response_mode=response_mode,
        text_qa_template=text_qa_template
    )

    # Setup QueryEngine
    query_engine = RetrieverQueryEngine(
        retriever=retriever, response_synthesizer=response_synthesizer,
        node_postprocessors=node_postprocessors
    )

    return query_engine

def process_result_json(
    testcase_df: pd.DataFrame,
    responses: List[Response],
    save_path: Optional[str] = None
):
    """Convert query-engine responses into JSON-serialisable records.

    For each test case the raw answer is passed through ``fixing_chain`` and the
    chain's text output is parsed for an "Appropriateness: ..." line followed by
    a "Recommendation: ..." line.

    Args:
        testcase_df: Frame with ``"queries"`` and ``"MRI scan ordered"`` columns,
            aligned row-for-row with ``responses``.
        responses: Query-engine responses, one per test case.
        save_path: Optional path; when given, the records are dumped there as JSON.

    Returns:
        A list of dicts with keys ``question``, ``result``, ``source_documents``,
        ``appropriateness`` and ``recommendation``. The last two are empty
        strings when the fixed answer could not be parsed.
    """
    # Both labelled values are captured up to the end of their line.
    answer_pattern = r"^[^\n]*Appropriateness: ([^\n]+)\n+[^\n]*Recommendation: ([^\n]+)$"

    json_responses = []
    queries = testcase_df["queries"]
    scan_orders = testcase_df["MRI scan ordered"]

    tk = tqdm(zip(queries, responses, scan_orders), total=len(responses))
    for query, response, scan_order in tk:
        testcase_info = {
            "question": query,
            "result": response.response,
            "source_documents": [convert_doc_to_dict(doc) for doc in response.source_nodes]
        }
        answer_query = "Scan Ordered: {}\nAnswer: {}".format(scan_order, testcase_info["result"])
        fixed_answer = fixing_chain(answer_query)
        try:
            appropriateness, recommendation = re.findall(answer_pattern, fixed_answer["text"])[0]
        except (IndexError, KeyError, TypeError, ValueError):
            # Best-effort parsing: a missing or malformed answer gives blank fields.
            # Only parse failures are caught, so KeyboardInterrupt and unrelated
            # errors are no longer swallowed.
            appropriateness, recommendation = "", ""
        testcase_info["appropriateness"] = appropriateness
        testcase_info["recommendation"] = recommendation

        json_responses.append(testcase_info)

    if save_path:
        with open(save_path, "w") as f:
            json.dump(json_responses, f)
    return json_responses

def process_result_df(
    testcase_df: pd.DataFrame, results: Union[List[Dict], List[Response]], save_path: Optional[str] = None
):
    """Join model results onto the test cases and score them against the ground truth.

    Args:
        testcase_df: Test cases; must contain an ``"Appropriateness Category"`` column.
        results: Either records as produced by ``process_result_json`` or raw
            ``Response`` objects (which are converted first).
        save_path: Optional CSV path for the resulting frame.

    Returns:
        A copy of ``testcase_df`` with ``gpt_raw_answer``, ``gpt_classification``
        (upper-cased), ``gpt_recommendation`` and ``context`` columns added, the
        ground-truth column renamed to ``human_gt`` with its abbreviations
        expanded, and a boolean ``match`` column.
    """
    # Raw engine responses are converted to the record form first.
    if isinstance(results[0], Response):
        results = process_result_json(testcase_df, results)

    def _format_context(documents):
        # One "Guideline / Page / Page Content" paragraph per source document.
        return "\n\n".join(
            "Guideline: {}, Page: {}\nPage Content: {}".format(
                document["metadata"]["source"],
                document["metadata"]["page"],
                document["page_content"],
            )
            for document in documents
        )

    raw_answers, classifications, recommendations, contexts = [], [], [], []
    for record in results:
        raw_answers.append(record["result"])
        classifications.append(record["appropriateness"])
        recommendations.append(record["recommendation"])
        contexts.append(_format_context(record["source_documents"]))

    result_df = deepcopy(testcase_df)
    result_df["gpt_raw_answer"] = raw_answers
    result_df["gpt_classification"] = classifications
    result_df["gpt_classification"] = result_df["gpt_classification"].str.upper()
    result_df["gpt_recommendation"] = recommendations
    result_df["context"] = contexts

    result_df = result_df.rename(columns={"Appropriateness Category": "human_gt"})

    # Expand the ground-truth abbreviations; the anchors restrict each
    # substitution to cells that consist of the abbreviation.
    label_expansions = (
        (r"^UA$", "USUALLY APPROPRIATE"),
        (r"^UNA$", "USUALLY NOT APPROPRIATE"),
        (r"^MBA$", "MAY BE APPROPRIATE"),
        (r"^ICI$", "INSUFFICIENT INFORMATION"),
    )
    ground_truth = result_df["human_gt"]
    for abbreviation_pattern, full_label in label_expansions:
        ground_truth = ground_truth.str.replace(abbreviation_pattern, full_label, regex=True)
    result_df["human_gt"] = ground_truth

    result_df["match"] = (result_df["gpt_classification"] == result_df["human_gt"])

    if save_path:
        result_df.to_csv(save_path)

    return result_df

def run_test_cases(
    testcase_df: pd.DataFrame,
    exp_args: Dict,
    text_qa_template: Optional[PromptTemplate] = None,
    node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
    artifact_dir: str = ARTIFACT_DIR,
    emb_folder: str = EMB_DIR,
):
    """Run every query in ``testcase_df`` through a query engine and save the artifacts.

    A timestamped folder is created under ``artifact_dir`` and receives the log
    file, ``settings.yaml``, ``results.json`` and ``result.csv``.

    Args:
        testcase_df: Test cases; needs the ``"queries"`` column plus the columns
            read by ``process_result_json`` / ``process_result_df``.
        exp_args: Experiment settings. Keys read here: ``synthesizer_llm``,
            ``chunk_size``, ``chunk_overlap``, ``description``, ``vectorstore``,
            ``emb_type``, ``index_name``, ``similarity_top_k``,
            ``synthesizer_temperature``, ``synthesizer_max_tokens`` and,
            optionally, ``response_mode`` (defaults to ``"simple_summarize"``).
        text_qa_template: Optional QA prompt for the response synthesizer.
        node_postprocessors: Optional postprocessors for retrieved nodes.
        artifact_dir: Root directory for experiment output folders.
        emb_folder: Root directory of the persisted embedding stores.

    Returns:
        ``(json_responses, result_df, responses)``: the parsed records, the scored
        DataFrame and the raw query-engine responses.
    """
    save_folder = os.path.join(
        artifact_dir, "{}_{}_{}_{}_{}".format(
            exp_args["synthesizer_llm"],
            exp_args["chunk_size"],
            exp_args["chunk_overlap"],
            exp_args["description"],
            datetime.now().strftime("%d-%m-%Y-%H-%M")
        )
    )

    if not os.path.exists(save_folder):
        print(save_folder)
        os.makedirs(save_folder)

    logger = get_experiment_logs(exp_args["description"], log_folder=save_folder)

    db_directory = os.path.join(
        emb_folder, exp_args["vectorstore"],
        "{}_{}_{}".format(exp_args["emb_type"], exp_args["chunk_size"], exp_args["chunk_overlap"])
    )

    logger.info(f"--------------------\nLoading VectorDB from {db_directory}")

    token_counter = TokenCountingHandler(
        tokenizer=tiktoken.encoding_for_model(exp_args["synthesizer_llm"]).encode
    )
    callback_manager = CallbackManager([token_counter])

    # Honour the configured response mode. It used to be hard-coded to
    # "simple_summarize", so settings.yaml could record a mode that never ran.
    response_mode = exp_args.get("response_mode", "simple_summarize")

    query_engine = setup_query_engine(
        db_directory,
        emb_store_type=exp_args["vectorstore"],
        index_name=exp_args["index_name"],
        similarity_top_k=exp_args["similarity_top_k"],
        text_qa_template=text_qa_template,
        synthesizer_llm=exp_args["synthesizer_llm"],
        synthesizer_temperature=exp_args["synthesizer_temperature"],
        synthesizer_max_tokens=exp_args["synthesizer_max_tokens"],
        response_mode=response_mode,
        node_postprocessors=node_postprocessors,
        callback_manager=callback_manager
    )

    logger.info(
        "-------------\nExperiment settings:\n{}".format(
            "\n".join([f"{k}:{v}" for k, v in exp_args.items()])
        )
    )

    with open(os.path.join(save_folder, "settings.yaml"), "w") as f:
        yaml.dump(exp_args, f)

    # Start counting from zero so the totals cover only the test-case queries.
    token_counter.reset_counts()
    responses = []

    # NOTE(review): reads private attributes of the query engine to log the prompt.
    logger.info(
        "-------------\nPROMPT: {}".format(convert_prompt_to_string(query_engine._response_synthesizer._text_qa_template))
    )

    logger.info(
        "------START RUNNING TEST CASES---------"
    )

    for test_case in tqdm(testcase_df["queries"], total=len(testcase_df["queries"])):
        response = query_engine.query(test_case)
        responses.append(response)

    logger.info("--------------\nTokens Consumption: Total: {}, Prompt: {}, Completion: {}, Embeddings: {}"
                .format(token_counter.total_llm_token_count,
                        token_counter.prompt_llm_token_count,
                        token_counter.completion_llm_token_count,
                        token_counter.embedding_token_counts))

    logger.info(f"----------\nTest case Completed. Saving Artifacts into {save_folder}")
    json_responses = process_result_json(
        testcase_df, responses=responses, save_path=os.path.join(save_folder, "results.json")
    )

    result_df = process_result_df(
        testcase_df, json_responses, save_path=os.path.join(save_folder, "result.csv")
    )

    # Percentage of cases whose parsed classification equals the ground truth.
    accuracy = result_df["match"].sum() / len(result_df) * 100

    logger.info("------EVALUATION-----")
    logger.info(f"Accuracy score: {accuracy}")
    # Match counts grouped both ways (prediction-first and ground-truth-first).
    logger.info(
        str(result_df.groupby(["gpt_classification", "human_gt"])["match"].value_counts())
    )
    logger.info(
        str(result_df.groupby(["human_gt", "gpt_classification"])["match"].value_counts())
    )

    return json_responses, result_df, responses

# Load Test Data

In [164]:
# Load the case-file CSV, keeping only the listed columns.
testcase_df = pd.read_csv(
        os.path.join(DATA_DIR, "queries", "MSK LLM Fictitious Case Files Full.csv"),
        usecols = ['ACR scenario', 'Appropriateness Category', 'MRI scan ordered',
                   'Difficulty', 'Clinical File']
        )
# Column views that the experiment cells below use to build their queries.
patient_profiles = testcase_df["Clinical File"]
scan_orders = testcase_df["MRI scan ordered"]

# Baseline Experiment (Rau et al 2023)

## Exp Settings Define

In [25]:
# Prepare Prompt: the baseline defines no custom chat prompt, so None is what
# later gets passed on as text_qa_template.
CHAT_PROMPT_TEMPLATE = None

In [19]:
# Settings for the baseline run; run_test_cases reads these keys and dumps the
# whole mapping to settings.yaml.
exp_args = {
    # Retrieval
    "emb_type": "openai",
    "vectorstore": "simple",
    "chunk_size": 512,
    "chunk_overlap": 20,
    "similarity_top_k": 3,
    "index_name": "msk-mri",
    "description": "BaselineExperimentRau2023",

    # Generation
    "synthesizer_llm": "gpt-3.5-turbo",
    "synthesizer_max_tokens": 512,
    "synthesizer_temperature": 0,
    "response_mode": "compact",
}

# NOTE(review): this counter and manager are not passed to run_test_cases in the
# visible run cell; confirm whether they are still needed.
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)
callback_manager = CallbackManager([token_counter])

text_qa_template = CHAT_PROMPT_TEMPLATE
node_postprocessors = [LongContextReorder()]

## Prepare Test Cases

In [21]:
# Baseline question format (the section title attributes it to Rau et al. 2023):
# the case text, the ordered scan, then the appropriateness question.
rau_prompt_template = (
    "Case: {input_text}\n"
    "Scan Ordered: {scan_order}\n"
    "Question: Is this imaging modality for this case USUALLY APPROPRIATE, "
    "MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE or INSUFFICIENT INFORMATION. "
    "Then state precisely the most appropriate imaging modality and if contrast "
    "agent is needed"
    )

# One query per test case, built from the full clinical file and its ordered scan.
# This adds or overwrites the "queries" column of the shared testcase_df.
testcase_df["queries"] = [
    query_wrapper(rau_prompt_template, {"input_text": patient_profile, "scan_order": scan_order})
    for patient_profile, scan_order in zip(patient_profiles, scan_orders)
    ]

## Run Test Cases

In [23]:
# Run the baseline experiment; returns the parsed JSON records, the scored
# DataFrame and the raw query-engine responses.
json_responses, result_df, responses = run_test_cases(
    testcase_df=testcase_df,
    exp_args=exp_args,
    text_qa_template=text_qa_template,
    node_postprocessors=node_postprocessors
    )

/mnt/c/Users/User/Desktop/lbp_mri/artifacts/gpt-3.5-turbo_512_20_BaselineExperimentRau2023_22-10-2023-15-34


# Baseline Experiment (Rau et al 2023) - Remove Final Sentence

## Exp Settings Define

In [54]:
# Prepare Prompt: as in the first baseline, no custom chat prompt is defined,
# so None is what later gets passed on as text_qa_template.
CHAT_PROMPT_TEMPLATE = None

In [55]:
# Settings for the baseline variant that drops the final sentence of each profile;
# run_test_cases reads these keys and dumps the whole mapping to settings.yaml.
exp_args = {
    # Retrieval
    "emb_type": "openai",
    "vectorstore": "simple",
    "chunk_size": 512,
    "chunk_overlap": 20,
    "similarity_top_k": 3,
    "index_name": "msk-mri",
    "description": "BaselineExperimentRau2023RemoveFinalSentence",

    # Generation
    "synthesizer_llm": "gpt-3.5-turbo",
    "synthesizer_max_tokens": 512,
    "synthesizer_temperature": 0,
    "response_mode": "compact",
}

# NOTE(review): this counter and manager are not passed to run_test_cases in the
# visible run cell; confirm whether they are still needed.
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)
callback_manager = CallbackManager([token_counter])

text_qa_template = CHAT_PROMPT_TEMPLATE
node_postprocessors = [LongContextReorder()]

## Prepare Test Cases

In [56]:
# Same baseline question format as the previous experiment: the case text, the
# ordered scan, then the appropriateness question.
rau_prompt_template = (
    "Case: {input_text}\n"
    "Scan Ordered: {scan_order}\n"
    "Question: Is this imaging modality for this case USUALLY APPROPRIATE, "
    "MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE or INSUFFICIENT INFORMATION. "
    "Then state precisely the most appropriate imaging modality and if contrast "
    "agent is needed"
    )

# One query per test case; the last sentence of each clinical file is removed
# before it is inserted as the case text.
# This adds or overwrites the "queries" column of the shared testcase_df.
testcase_df["queries"] = [
    query_wrapper(rau_prompt_template, {"input_text": remove_final_sentence(patient_profile), "scan_order": scan_order})
    for patient_profile, scan_order in zip(patient_profiles, scan_orders)
    ]

## Run Test Cases

In [None]:
# Run the baseline variant; returns the parsed JSON records, the scored
# DataFrame and the raw query-engine responses.
json_responses, result_df, responses = run_test_cases(
    testcase_df=testcase_df,
    exp_args=exp_args,
    text_qa_template=text_qa_template,
    node_postprocessors=node_postprocessors
    )

# Run 1 - GPT-4, Chunk=512, Chunk_no=5

## Exp Settings Define

In [65]:
# NOTE(review): these imports are repeated in later cells; they would normally
# live in the import cell at the top of the notebook.
from llama_index.llms import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate

# System prompt for Run 1: role, task steps, scoring criteria and an explicit
# three-part output format. {context_str} is the only placeholder in this string;
# presumably the response synthesizer fills it with the retrieved guideline text.
system_template = """
You are a radiologist expert at providing imaging recommendations for patients with musculoskeletal conditions.
If you do not know an answer, just say "I dont know", do not make up an answer.
==========
TASK:
1. Extract from given PATIENT PROFILE relevant information for classification of imaging appropriateness.
Important information includes AGE, SYMPTOMS, DIAGNOSIS (IF ANY), which stage of diagnosis (INITIAL IMAGING OR NEXT STUDY).
2. Refer to the reference information given under CONTEXT to analyse the appropriate imaging recommendations given the patient profile.
3. Given the PATIENT PROFILE and CONTEXT, refer to the SCORING CRITERIA and recommend if the image scan ordered is USUALLY APPROPRIATE, MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE or there is INSUFFICIENT INFORMATION to recommend the appropriateness.  
If the scan is not appropriate, recommend an appropriate procedure.

STRICTLY answer based on the given PATIENT PROFILE and CONTEXT. 
==========
SCORING CRITERIA:
- USUALLY APPROPRIATE: The imaging procedure or treatment is indicated in the specified clinical scenarios at a favorable risk-benefit ratio for patients.
- MAY BE APPROPRIATE: The imaging procedure or treatment may be indicated in the specified clinical scenarios as an alternative to imaging procedures or treatments with a more favorable risk-benefit ratio, or the risk-benefit ratio for patients is equivocal.
- USUALLY NOT APPROPRIATE: The imaging procedure or treatment is unlikely to be indicated in the specified clinical scenarios, or the risk-benefit ratio for patients is likely to be unfavorable.
- INSUFFICIENT INFORMATION: There is not enough information from PATIENT PROFILE and CONTEXT information to conclude the appropriateness
==========
OUTPUT INSTRUCTIONS:
Your output should contain the following:
1. Classification of appropriateness for the ordered scan.
2. Provide explanation for the appropriateness classification.
3. If classification answer is USUALLY NOT APPROPRIATE, either recommend an alternative appropriate scan procedure or return NO SCAN REQUIRED.

Format your output as follow:
1. Classification: Can be one of [USUALLY APPROPRIATE, MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE, INSUFFICIENT INFORMATION]
2. Explanation:
3. Recommendation: Can be alternative procedure, NO SCAN REQUIRED or NO CHANGE REQUIRED 
==========
CONTEXT:

{context_str}
==========
"""

# The user turn is just the query itself.
human_template = "{query_str}"
messages = [
    ChatMessage(role=MessageRole.SYSTEM, content=system_template),
    ChatMessage(role=MessageRole.USER, content=human_template)   
]

# Two-message chat prompt (system + user), later used as text_qa_template.
CHAT_PROMPT_TEMPLATE = ChatPromptTemplate(messages)

In [78]:
# Settings for Run 1; run_test_cases reads these keys and dumps the whole
# mapping to settings.yaml.
exp_args = {
    # Retrieval
    "emb_type": "openai",
    "vectorstore": "faiss",
    "chunk_size": 512,
    "chunk_overlap": 20,
    "similarity_top_k": 5,
    "index_name": "msk-mri",
    "description": "Topk=5_RemoveFinalSentence",

    # Generation
    "synthesizer_llm": "gpt-4",
    "synthesizer_max_tokens": 512,
    "synthesizer_temperature": 0,
    "response_mode": "simple_summarize",
}

text_qa_template = CHAT_PROMPT_TEMPLATE
node_postprocessors = [LongContextReorder()]

## Prepare Test Cases

In [109]:
# Query layout for this run: the profile text followed by the ordered scan.
question_template = "Patient Profile: {profile}\nScan ordered: {scan_order}"

# remove_final_sentence(..., True) returns (text_before_last_sentence, last_sentence);
# the last sentence of each clinical file is used as the ordered scan.
testcase_df["queries"] = [
    query_wrapper(question_template, {"profile": profile_text, "scan_order": ordered_scan})
    for profile_text, ordered_scan in (
        remove_final_sentence(patient_profile, True) for patient_profile in patient_profiles
    )
]

## Run Test Cases

In [None]:
# Execute Run 1; returns the parsed JSON records, the scored DataFrame and the
# raw query-engine responses.
json_responses, result_df, responses = run_test_cases(
    testcase_df=testcase_df,
    exp_args=exp_args,
    text_qa_template=text_qa_template,
    node_postprocessors=node_postprocessors
    )

# Run 2 - GPT-4, Chunk=1024, Overlap=128, Chunk_no=7

## Exp Settings Define

In [136]:
# NOTE(review): these imports duplicate an earlier cell; they would normally
# live in the import cell at the top of the notebook.
from llama_index.llms import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate

# System prompt for Run 2: role, task steps, scoring criteria and output
# instructions, without the explicit "Format your output as follow" section.
# {context_str} is the only placeholder in this string; presumably the response
# synthesizer fills it with the retrieved guideline text.
system_template = """
You are a radiologist expert at providing imaging recommendations for patients with musculoskeletal conditions.
If you do not know an answer, just say "I dont know", do not make up an answer.
==========
TASK: 
1. Extract from given PATIENT PROFILE relevant information for classification of imaging appropriateness.
Important information includes AGE, SYMPTOMS, DIAGNOSIS (IF ANY), which stage of diagnosis (INITIAL IMAGING OR NEXT STUDY).
2. Refer to the reference information given under CONTEXT to analyse the appropriate imaging recommendations given the patient profile.
3. Given the PATIENT PROFILE and CONTEXT, refer to the SCORING CRITERIA and recommend if the image scan ordered is USUALLY APPROPRIATE, MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE or there is INSUFFICIENT INFORMATION to recommend the appropriateness.  
If the scan is not appropriate, recommend an appropriate procedure.

STRICTLY answer based on the given PATIENT PROFILE and CONTEXT. 
==========
SCORING CRITERIA:
- USUALLY APPROPRIATE: The imaging procedure or treatment is indicated in the specified clinical scenarios at a favorable risk-benefit ratio for patients.
- MAY BE APPROPRIATE: The imaging procedure or treatment may be indicated in the specified clinical scenarios as an alternative to imaging procedures or treatments with a more favorable risk-benefit ratio, or the risk-benefit ratio for patients is equivocal.
- USUALLY NOT APPROPRIATE: The imaging procedure or treatment is unlikely to be indicated in the specified clinical scenarios, or the risk-benefit ratio for patients is likely to be unfavorable.
- INSUFFICIENT INFORMATION: There is not enough information from PATIENT PROFILE and CONTEXT information to conclude the appropriateness
==========
OUTPUT INSTRUCTIONS:
Your output should contain the following:
1. Classification of appropriateness for the ordered scan.
2. Provide explanation for the appropriateness classification.
3. If classification answer is USUALLY NOT APPROPRIATE, either recommend an alternative appropriate scan procedure or return NO SCAN REQUIRED.
==========
CONTEXT:

{context_str}
==========
"""

# The user turn is just the query itself.
human_template = "{query_str}"
messages = [
    ChatMessage(role=MessageRole.SYSTEM, content=system_template),
    ChatMessage(role=MessageRole.USER, content=human_template)   
]

# Two-message chat prompt (system + user), later used as text_qa_template.
CHAT_PROMPT_TEMPLATE = ChatPromptTemplate(messages)

In [137]:
# Settings for Run 2 (larger chunks and more retrieved nodes); run_test_cases
# reads these keys and dumps the whole mapping to settings.yaml.
exp_args_4 = {
    # Retrieval
    "emb_type": "openai",
    "vectorstore": "faiss",
    "chunk_size": 1024,
    "chunk_overlap": 128,
    "similarity_top_k": 7,
    "index_name": "msk-mri",
    "description": "Topk=7_RemoveFinalSentence",

    # Generation
    "synthesizer_llm": "gpt-4",
    "synthesizer_max_tokens": 512,
    "synthesizer_temperature": 0,
    "response_mode": "simple_summarize",
}

text_qa_template = CHAT_PROMPT_TEMPLATE
node_postprocessors = [LongContextReorder()]

## Prepare Test Cases

In [138]:
# Query layout for this run: the profile text followed by the ordered scan.
question_template = "Patient Profile: {profile}\nScan ordered: {scan_order}"

# remove_final_sentence(..., True) returns (text_before_last_sentence, last_sentence);
# the last sentence of each clinical file is used as the ordered scan.
testcase_df["queries"] = [
    query_wrapper(question_template, {"profile": profile_text, "scan_order": ordered_scan})
    for profile_text, ordered_scan in (
        remove_final_sentence(patient_profile, True) for patient_profile in patient_profiles
    )
]

## Run Test Cases

In [None]:
# Execute Run 2 with its own settings dict (exp_args_4); returns the parsed JSON
# records, the scored DataFrame and the raw query-engine responses.
json_responses, result_df, responses = run_test_cases(
    testcase_df=testcase_df,
    exp_args=exp_args_4,
    text_qa_template=text_qa_template,
    node_postprocessors=node_postprocessors
    )

# Run 3 - GPT-4, Chunk=512, Chunk_no=5, Step-by-step Prompt (MBA/ICI cases only)

## Exp Settings Define

In [155]:
# NOTE(review): these imports duplicate earlier cells; they would normally
# live in the import cell at the top of the notebook.
from llama_index.llms import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate

# System prompt for Run 3: asks for step-by-step reasoning and has no output
# instructions section. {context_str} is the only placeholder in this string;
# presumably the response synthesizer fills it with the retrieved guideline text.
system_template = """
You are a radiologist expert at providing imaging recommendations for patients with musculoskeletal conditions.
If you do not know an answer, just say "I dont know", do not make up an answer.
==========
TASK: You are given a PATIENT PROFILE and a SCAN ORDER. Your task is to evaluate if the appropriateness of the SCAN ORDER based on the PATIENT PROFILE.
Perform step-by-step the following sequence of reasoning.
1. Extract from given PATIENT PROFILE relevant information including AGE, SYMPTOMS, previous DIAGNOSIS (IF ANY), which stage of diagnosis (INITIAL IMAGING OR NEXT STUDY).
2. Refer to the reference information given under CONTEXT to analyse the appropriate imaging recommendations given the patient profile.
3. Based on the SCORING CRITERIA, recommend if the image scan ordered is USUALLY APPROPRIATE, MAY BE APPROPRIATE, USUALLY NOT APPROPRIATE or there is INSUFFICIENT INFORMATION to recommend the appropriateness.  
If the scan is not appropriate, recommend an appropriate procedure.

STRICTLY answer based on the given PATIENT PROFILE and CONTEXT. 
==========
SCORING CRITERIA:
- USUALLY APPROPRIATE: The imaging procedure or treatment is indicated in the specified clinical scenarios at a favorable risk-benefit ratio for patients.
- MAY BE APPROPRIATE: The imaging procedure or treatment may be indicated in the specified clinical scenarios as an alternative to imaging procedures or treatments with a more favorable risk-benefit ratio, or the risk-benefit ratio for patients is equivocal.
- USUALLY NOT APPROPRIATE: The imaging procedure or treatment is unlikely to be indicated in the specified clinical scenarios, or the risk-benefit ratio for patients is likely to be unfavorable.
- INSUFFICIENT INFORMATION: There is not enough information from PATIENT PROFILE and CONTEXT information to conclude the appropriateness
==========
CONTEXT:

{context_str}
==========
"""

# The user turn is just the query itself.
human_template = "{query_str}"
messages = [
    ChatMessage(role=MessageRole.SYSTEM, content=system_template),
    ChatMessage(role=MessageRole.USER, content=human_template)   
]

# Two-message chat prompt (system + user), later used as text_qa_template.
CHAT_PROMPT_TEMPLATE = ChatPromptTemplate(messages)

In [156]:
# Settings for Run 3; run_test_cases reads these keys and dumps the whole
# mapping to settings.yaml.
exp_args = {
    # Retrieval
    "emb_type": "openai",
    "vectorstore": "faiss",
    "chunk_size": 512,
    "chunk_overlap": 20,
    "similarity_top_k": 5,
    "index_name": "msk-mri",
    "description": "Topk=5_RemoveFinalSentence",

    # Generation
    "synthesizer_llm": "gpt-4",
    "synthesizer_max_tokens": 512,
    "synthesizer_temperature": 0,
    "response_mode": "simple_summarize",
}

text_qa_template = CHAT_PROMPT_TEMPLATE
node_postprocessors = [LongContextReorder()]

## Prepare Test Cases

In [165]:
# Query layout for this run: the profile text followed by the ordered scan.
question_template = "Patient Profile: {profile}\nScan ordered: {scan_order}"

# remove_final_sentence(..., True) returns (text_before_last_sentence, last_sentence);
# the last sentence of each clinical file is used as the ordered scan.
testcase_df["queries"] = [
    query_wrapper(question_template, {"profile": profile_text, "scan_order": ordered_scan})
    for profile_text, ordered_scan in (
        remove_final_sentence(patient_profile, True) for patient_profile in patient_profiles
    )
]

# Restrict this run to the cases whose ground-truth category is MBA or ICI.
sub_testcase_df = testcase_df[testcase_df["Appropriateness Category"].isin(['MBA','ICI'])]

## Run Test Cases

In [166]:
# Execute Run 3 on the MBA/ICI subset only; returns the parsed JSON records, the
# scored DataFrame and the raw query-engine responses.
json_responses, result_df, responses = run_test_cases(
    testcase_df=sub_testcase_df,
    exp_args=exp_args,
    text_qa_template=text_qa_template,
    node_postprocessors=node_postprocessors
    )

/mnt/c/Users/User/Desktop/lbp_mri/artifacts/gpt-4_512_20_Topk=5_RemoveFinalSentence_23-10-2023-00-11
2023-10-23 00:11:28,982:INFO: --------------------
Loading VectorDB from /mnt/c/Users/User/Desktop/lbp_mri/data/emb_store/faiss/openai_512_20
2023-10-23 00:11:28,982:INFO: --------------------
Loading VectorDB from /mnt/c/Users/User/Desktop/lbp_mri/data/emb_store/faiss/openai_512_20
2023-10-23 00:11:28,982:INFO: --------------------
Loading VectorDB from /mnt/c/Users/User/Desktop/lbp_mri/data/emb_store/faiss/openai_512_20
2023-10-23 00:11:28,982:INFO: --------------------
Loading VectorDB from /mnt/c/Users/User/Desktop/lbp_mri/data/emb_store/faiss/openai_512_20
2023-10-23 00:11:28,982:INFO: --------------------
Loading VectorDB from /mnt/c/Users/User/Desktop/lbp_mri/data/emb_store/faiss/openai_512_20
2023-10-23 00:11:28,982:INFO: --------------------
Loading VectorDB from /mnt/c/Users/User/Desktop/lbp_mri/data/emb_store/faiss/openai_512_20
2023-10-23 00:11:29,246:INFO: faiss VectorStore

100%|██████████| 24/24 [06:38<00:00, 16.62s/it]

2023-10-23 00:18:08,276:INFO: --------------
Tokens Consumption: Total: 62529, Prompt: 56166, Completion: 6363, Embeddings: []
2023-10-23 00:18:08,276:INFO: --------------
Tokens Consumption: Total: 62529, Prompt: 56166, Completion: 6363, Embeddings: []
2023-10-23 00:18:08,276:INFO: --------------
Tokens Consumption: Total: 62529, Prompt: 56166, Completion: 6363, Embeddings: []
2023-10-23 00:18:08,276:INFO: --------------
Tokens Consumption: Total: 62529, Prompt: 56166, Completion: 6363, Embeddings: []
2023-10-23 00:18:08,276:INFO: --------------
Tokens Consumption: Total: 62529, Prompt: 56166, Completion: 6363, Embeddings: []
2023-10-23 00:18:08,276:INFO: --------------
Tokens Consumption: Total: 62529, Prompt: 56166, Completion: 6363, Embeddings: []
2023-10-23 00:18:08,301:INFO: ----------
Test case Completed. Saving Artifacts into /mnt/c/Users/User/Desktop/lbp_mri/artifacts/gpt-4_512_20_Topk=5_RemoveFinalSentence_23-10-2023-00-11
2023-10-23 00:18:08,301:INFO: ----------
Test case Co


100%|██████████| 24/24 [00:29<00:00,  1.22s/it]

2023-10-23 00:18:37,579:INFO: ------EVALUATION-----
2023-10-23 00:18:37,579:INFO: ------EVALUATION-----
2023-10-23 00:18:37,579:INFO: ------EVALUATION-----
2023-10-23 00:18:37,579:INFO: ------EVALUATION-----
2023-10-23 00:18:37,579:INFO: ------EVALUATION-----
2023-10-23 00:18:37,579:INFO: ------EVALUATION-----
2023-10-23 00:18:37,585:INFO: Accuracy score: 37.5
2023-10-23 00:18:37,585:INFO: Accuracy score: 37.5
2023-10-23 00:18:37,585:INFO: Accuracy score: 37.5
2023-10-23 00:18:37,585:INFO: Accuracy score: 37.5
2023-10-23 00:18:37,585:INFO: Accuracy score: 37.5
2023-10-23 00:18:37,585:INFO: Accuracy score: 37.5
2023-10-23 00:18:37,604:INFO: gpt_classification        human_gt                  match
                          MAY BE APPROPRIATE        False    1
INSUFFICIENT INFORMATION  INSUFFICIENT INFORMATION  True     5
                          MAY BE APPROPRIATE        False    1
MAY BE APPROPRIATE        INSUFFICIENT INFORMATION  False    1
                          MAY BE APPROPRIA


