# Setup

In [None]:
!pip install --quiet langchain openai faiss-cpu tiktoken pypdf PyMuPDF

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m33.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive

drive.mount("/content/drive")

%cd drive/MyDrive/LLM/ulcerative_colitis

Mounted at /content/drive
/content/drive/MyDrive/LLM/ulcerative_colitis


# Import dependencies

In [26]:
import os, sys, json, logging
import os.path as osp
import re
import pandas as pd

from typing import Union, Sequence, Dict, Callable, List, Optional
from pydantic import BaseModel, Field, validator
from pprint import pprint
from tqdm.auto import tqdm
from IPython.display import display, Markdown
from time import time
from datetime import datetime

from langchain import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders.pdf import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import PydanticOutputParser

from config import MAIN_DIR, DATA_DIR, EMBSTORE_DIR, ARTIFACT_DIR, DOCUMENT_SOURCE

from shutil import rmtree
from utils import load_single_document

In [23]:
PROJECT = "uc"

with open(osp.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    keys = json.load(f)

OPENAI_KEY = keys["OPENAI_API_KEY"]

In [11]:
LOGGER = logging.getLogger()

log_path = os.path.join(MAIN_DIR, "log_file.txt")
file_handler = logging.FileHandler(
    filename=log_path)

formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")
file_handler.setFormatter(formatter)

LOGGER.setLevel(logging.INFO)
LOGGER.addHandler(file_handler)

# User-defined Functions (UDF)

In [12]:
def generate_vectorstore(
    data_directory: str,
    embedder,
    output_directory: str = "./vectorstore",
    chunk_size: int=1000,
    chunk_overlap: int=250,
    relevant_pages: Optional[Dict] = None
    ):
    LOGGER.info("Creating new vectorstore.")

    if osp.exists(output_directory):
        rmtree(output_directory)
    os.makedirs(output_directory, exist_ok = True)

    document_files = os.listdir(data_directory)
    documents = []

    for filename in document_files:
        relevant_page_idx = relevant_pages[filename] if filename in relevant_pages else []
        for page in PyMuPDFLoader(os.path.join(data_directory, filename)).load():
            if page.metadata["page"] in relevant_page_idx:
                page.metadata["modal"] = "text"
                documents.append(page)

    print(f"Total number of pages to be processed: {len(documents)}")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"No of chunks: {len(texts)}")

    docsearch = FAISS.from_documents(texts, embedder)
    docsearch.save_local(output_directory)
    assert "index.faiss" in os.listdir(output_directory) and "index.pkl" in os.listdir(output_directory)
    LOGGER.info(f"Successfully created vectorstore at {output_directory}")

    return docsearch

# Experiment Class

In [13]:
class DrugOutput(BaseModel):
    drug_name: str = Field(description = "Name of the drug")
    advantages: str = Field(description = "Advantages of the drug ")
    disadvantages: str = Field(description = "Disadvantages of the drug")

class Experiment():
    def __init__(
        self,
        prompt_template: Union[PromptTemplate, ChatPromptTemplate],
        vector_store: str,
        llm_type: str="gpt-3.5-turbo",
        emb: str="text-embedding-ada-002",
        keys_json: str=osp.join(MAIN_DIR, "auth", "api_keys.json"),
        temperature: float = 0,
        max_tokens: int = 512,
        gt: Optional[str] = None,
        verbose: bool = False
        ):

        self.llm_type = llm_type.lower()
        self.temperature = temperature
        self.max_tokens = max_tokens

        with open(keys_json, "r") as f:
            keys = json.load(f)

        self.openai_key = keys["OPENAI_API_KEY_FOR_GPT4"] if self.llm_type == "gpt-4" else keys["OPENAI_API_KEY"]

        if isinstance(prompt_template,ChatPromptTemplate):
            self.llm = ChatOpenAI(model_name=self.llm_type, temperature=self.temperature,
                            max_tokens=self.max_tokens, openai_api_key=self.openai_key)
        else:
            self.llm = OpenAI(model_name=self.llm_type, temperature=self.temperature,
                            max_tokens=self.max_tokens, openai_api_key=self.openai_key
                            )
        self.embedder = OpenAIEmbeddings(openai_api_key = self.openai_key)
        try:
            self.load_vectorstore(vector_store)
        except:
            print("Vectorstore invalid. Please load valid vectorstore or create new vectorstore.")
            self.docsearch = None

        self.prompt_template = prompt_template
        self.questions = []
        self.answers = []
        self.sources = []
        self.ground_truth = pd.read_csv(gt, encoding = "ISO-8859-1") if gt else None
        self.drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)
        self.chain = None
        self.verbose = verbose

    def load_vectorstore(self, vectorstore_path):
        assert "index.faiss" in os.listdir(vectorstore_path) and "index.pkl" in os.listdir(vectorstore_path), "Invalid Vectorstore"
        self.docsearch = FAISS.load_local(vectorstore_path, self.embedder)
        LOGGER.info("Successfully loaded existing vectorstore from local storage")

    def generate_vectorstore(
        self,
        data_directory: str,
        output_directory: str = "./vectorstore",
        chunk_size: int=1000,
        chunk_overlap: int=250,
        relevant_pages: Optional[Dict] = None
        ):
        self.docsearch = generate_vectorstore(
            data_directory = data_directory,
            embedder = self.embedder,
            output_directory = output_directory,
            chunk_size = chunk_size,
            chunk_overlap = chunk_overlap,
            relevant_pages = relevant_pages
        )


    def run_test_cases(self, test_cases: Union[List[str], str]):
        if isinstance(test_cases, str):
            with open(test_cases, "r", encoding = "utf-8-sig") as f:
                test_cases = f.readlines()
            test_cases = [test_case.rstrip() for test_case in test_cases]

        if not self.chain:
            self._create_retriever_chain()

        for test_case in test_cases:
            print("Query: {}".format(test_case))
            output = self.chain(test_case)
            self.questions.append(output["question"])
            self.answers.append(output["answer"])
            sources = []
            for document in output["source_documents"]:
                sources.append(
                    {
                        "title": document.metadata["title"],
                        "filename": document.metadata["source"].split("/")[-1],
                        "page": document.metadata["page"],
                        "modal": document.metadata["modal"],
                        "text": document.page_content
                    }
                )

            self.sources.append(sources)

    @staticmethod
    def convert_prompt_to_string(prompt):
        return prompt.format(**{v:v for v in prompt.input_variables})

    @staticmethod
    def format_results(result_dict):
        sources = [
        str(re.findall(r"[ \w-]+?(?=\.)", name)[0])
        for name in (
            list(set([doc.metadata["source"] for doc in result_dict["source_documents"]])))]

        response = f"""### Response:
        {result_dict['answer']}
        \n### Relevant sources:
        {', '.join(sources)}
        """
        return response

    @classmethod
    def print_result(cls, result):
        display(Markdown(Experiment.format_results(result)))

    @staticmethod
    def process_source(source):
        return "\n\n".join([f"{k}: {v}" for k, v in source.items()])

    def save_json(self, output_path):
        output_dict = {}
        output_dict["prompt"] = Experiment.convert_prompt_to_string(self.prompt_template)
        output_dict["test_cases"] = []

        for question, answer, source in zip(self.questions, self.answers, self.sources):
            output_dict["test_cases"].append(
                {
                    "question": question,
                    "answer": answer,
                    "sources": source
                }
            )

        with open(output_path, "w") as f:
            json.dump(output_dict, f)

    def load_groundtruth(self, gt_path):
        self.groundtruth = pd.read_csv(gt_path)

    def reset(self):
        self.questions = []
        self.answers = []
        self.sources = []
        self.ground_truth = None

    def load_json(self, json_path, reset = False):
        if reset:
            self.reset()
        with open(json_path, "r") as f:
            input_dict = json.load(f)
        for test_case in input_dict["test_cases"]:
            self.questions.append(test_case["question"])
            self.answers.append(test_case["answer"])
            self.sources.append(test_case["sources"])

    def write_csv(self, output_csv: str):

        pd_answers = [[], []]
        pd_pros = [[], []]
        pd_cons = [[], []]
        pd_sources = [[], [], [], [], [], []]

        for answer, sources in zip(self.answers, self.sources):
            drugs = [self.drug_parser.parse(drug) for drug in re.findall(re.compile(r"{[^{}]+}"), answer)]
            pd_answers[0].append(drugs[0].drug_name if len(drugs) > 0 else None)
            pd_answers[1].append(drugs[1].drug_name if len(drugs) > 1 else None)
            pd_pros[0].append(drugs[0].advantages if len(drugs) > 0 else None)
            pd_cons[0].append(drugs[0].disadvantages if len(drugs) > 0 else None)
            pd_pros[1].append(drugs[1].advantages if len(drugs) > 1 else None)
            pd_cons[1].append(drugs[1].disadvantages if len(drugs) > 1 else None)

            for idx, source in enumerate(sources):
                pd_sources[idx].append(Experiment.process_source(source))

            if idx + 1 < len(pd_sources):
                for i in range(idx+1, len(pd_sources)):
                    pd_sources[i].append(None)

        info = {"question": self.questions}

        if self.ground_truth is not None:
            info["gt_rec1"] = self.ground_truth["Recommendation 1"].tolist()
            info["gt_rec2"] = self.ground_truth["Recommendation 2"].tolist()
            info["gt_rec3"] = self.ground_truth["Recommendation 3"].tolist()
            info["gt_avoid"] = self.ground_truth["Drug Avoid"].tolist()
            info["gt_reason"] = self.ground_truth["Reasoning"].tolist()

        info["prompt"] = [Experiment.convert_prompt_to_string(self.prompt_template)] * len(self.questions)
        info["raw_answer"] = self.answers
        info["answer1"] = pd_answers[0]; info["pro1"] = pd_pros[0]; info["cons1"] = pd_cons[0]
        info["answer2"] = pd_answers[1]; info["pro2"] = pd_pros[1]; info["cons2"] = pd_cons[1]
        info["source1"] = pd_sources[0]; info["source2"] = pd_sources[1]; info["source3"] = pd_sources[2]
        info["source4"] = pd_sources[3]; info["source5"] = pd_sources[4]; info["source6"] = pd_sources[5]


        panda_df = pd.DataFrame(
            info
        )

        panda_df.to_csv(output_csv, header = True)

    def _create_retriever_chain(
        self,
        chain_type: str = "stuff",
        return_source_documents=True,
        reduce_k_below_max_tokens=True,
        ):
        self.chain = RetrievalQAWithSourcesChain.from_chain_type(
            llm=self.llm,
            chain_type=chain_type,
            retriever=self.docsearch.as_retriever(),
            return_source_documents=return_source_documents,
            chain_type_kwargs={"prompt": self.prompt_template},
            reduce_k_below_max_tokens=reduce_k_below_max_tokens,
            verbose=self.verbose
            )

# Create Vectorstore

In [37]:
for filename in os.listdir(os.path.join(DOCUMENT_SOURCE, PROJECT)):
    pages = PyMuPDFLoader(os.path.join(DOCUMENT_SOURCE, PROJECT, filename)).load()
    pprint(pages[0].__dict__["metadata"])

{'author': 'Juan S Lasa MD',
 'creationDate': "D:20211223225432+05'30'",
 'creator': 'Elsevier',
 'file_path': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/1-s2.0-S2468125321003770-main.pdf',
 'format': 'PDF 1.7',
 'keywords': '',
 'modDate': "D:20220108175830+05'30'",
 'page': 0,
 'producer': 'Acrobat Distiller 6.0 for Windows',
 'source': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/1-s2.0-S2468125321003770-main.pdf',
 'subject': 'The Lancet Gastroenterology & Hepatology, 7 (2022) 161-170. '
            'doi:10.1016/S2468-1253(21)00377-0',
 'title': 'Efficacy and safety of biologics and small molecule drugs for '
          'patients with moderate-to-severe ulcerative colitis: a systematic '
          'review and network meta-analysis',
 'total_pages': 10,
 'trapped': ''}
{'author': 'Manasi Agrawal',
 'creationDate': "D:20210619033246+05'30'",
 'creator': 'Elsevier',
 'file_path': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/

In [None]:
RELEVANT_PAGES = {
    'agrawal.pdf': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'PIIS1542356520300446.pdf': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, ],
    'gutjnl-2021-326390R2 CLEAN.pdf': [1] + list(range(3, 31)),
    'otad009.pdf': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    '1-s2.0-S2468125321003770-main.pdf': [0, 1, 2, 3, 4, 5, 6, 7, 8],
    'juillerat 2022.pdf':[0, 1, 2, 3, 4, 5]
}

datastore_paths = [os.path.join(DATA_DIR, file_name) for file_name in os.listdir(DATA_DIR) if file_name.endswith(".pdf")]
print("Number of documents in datastore:", len(datastore_paths))
for i, path in enumerate(datastore_paths):
    print(f"Index {i + 1}: {path}")

Number of documents in datastore: 6
Index 1: /content/drive/MyDrive/LLM/ulcerative_colitis/data/agrawal.pdf
Index 2: /content/drive/MyDrive/LLM/ulcerative_colitis/data/PIIS1542356520300446.pdf
Index 3: /content/drive/MyDrive/LLM/ulcerative_colitis/data/gutjnl-2021-326390R2 CLEAN.pdf
Index 4: /content/drive/MyDrive/LLM/ulcerative_colitis/data/otad009.pdf
Index 5: /content/drive/MyDrive/LLM/ulcerative_colitis/data/1-s2.0-S2468125321003770-main.pdf
Index 6: /content/drive/MyDrive/LLM/ulcerative_colitis/data/juillerat 2022.pdf


In [None]:
sample_path = "/content/drive/MyDrive/LLM/ulcerative_colitis/data/1-s2.0-S2468125321003770-main.pdf"
pdfloader = PyMuPDFLoader(sample_path)

sample_data = pdfloader.load()
print("Number of pages:", len(sample_data))
sample_page = sample_data[4]
print(str(sample_page.metadata) + "\n")
content = sample_page.page_content
print(f"Text Length: {len(content)}\n")
# content = re.sub(r"\t+", " ", content)
pprint(content[:1000])
metadata = sample_page.metadata
pprint(metadata)

In [None]:
generate_vectorstore(DATA_DIR, OpenAIEmbeddings(openai_api_key = OPENAI_KEY), EMBSTORE_DIR, relevant_pages = RELEVANT_PAGES)

# Prototypes

## Test Cases

In [15]:
with open(osp.join(DATA_DIR, "queries", "uc.txt"), "r", encoding = "utf-8-sig") as f:
    test_cases = f.readlines()

test_cases = [test_case.rstrip() for test_case in test_cases]
test_cases

['40 year old male with newly diagnosed moderate UC and articular extraintestinal manifestations',
 '70 year old female with newly diagnosed severe UC',
 '35 year old male with known moderate UC with prior exposure to infliximab but has worsening colitis on endoscopy despite compliance',
 '60 year old female with newly diagnosed moderate UC with a background of congestive cardiac failure',
 '38 year old female with newly diagnosed moderate UC and psoriasis',
 '25 year old pregnant woman with severe distal ulcerative colitis',
 '56 year old man with moderate to severe ulcerative colitis and ankylosing spondylitis',
 '38 year old man with severe ulcerative colitis and has lost response to vedolizumab',
 '28 year old woman who has severe extensive ulcerative colitis and has a history of lymphoma which was treated 4 years ago',
 '36 year old woman with moderate ulcerative colitis and multiple sclerosis']

## Experiment 1: Only Text - Normal Prompt Template - GPT4

### Prompt Setup

In [None]:
### STANDARD PROMPT TEMPLATE
drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)

prompt_template = """Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

{summaries}

{format_instructions}

Question: {question}
Answer:
"""

TEST_PROMPT_TEMPLATE_1 = PromptTemplate(
    template = prompt_template,
    input_variables = ["summaries", "question"],
    partial_variables={"format_instructions": drug_parser.get_format_instructions()}
)

print(TEST_PROMPT_TEMPLATE_1.format(summaries = "Summaries", question = "User Query"))

Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

Summaries

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
t

### Run Experiments

In [16]:
# Settings
LLM_TYPE = "gpt-4"
DESCRIPTION = "Text_Only"
MAX_TOKENS = 1024
TIME = datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
VERBOSE = True
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

In [None]:
# Create and run experiment
exp1 = Experiment(
    prompt_template = TEST_PROMPT_TEMPLATE_1,
    vector_store = EMBSTORE_DIR,
    llm_type = LLM_TYPE,
    max_tokens = MAX_TOKENS,
    gt = osp.join(MAIN_DIR, "ground_truth.csv"),
    verbose = VERBOSE
)

exp1.run_test_cases(test_cases)

# Save Output
exp1.save_json(save_path+".json")
exp1.write_csv(save_path+".csv")

## Experiment 2: Only Text - CHAT Prompt Template - GPT4

### Prompt Setup

In [None]:
### CHAT PROMTP TEMPLATE
system_prompt = """
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Whether treated patient is new patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

{summaries}

"""

TEST_PROMPT_TEMPLATE_2 = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_prompt, input_variables = ["summaries"]),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)

print(TEST_PROMPT_TEMPLATE_2.format(summaries = "Summaries", question = "User Query"))

System: 
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Whether treated patient is new patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

Summaries


Human: User Query


### Run Experiments

In [None]:
# Settings
LLM_TYPE = "gpt-3.5-turbo"
DESCRIPTION = "Text_Only_With_CHAT_Prompt"
MAX_TOKENS = 1024
TIME = datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
VERBOSE = True
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

In [None]:
# Create and run experiment
exp2 = Experiment(
    prompt_template = TEST_PROMPT_TEMPLATE_2,
    vector_store = EMBSTORE_DIR,
    llm_type = LLM_TYPE,
    max_tokens = MAX_TOKENS,
    gt = osp.join(MAIN_DIR, "ground_truth.csv"),
    verbose = VERBOSE
)

exp2.run_test_cases(test_cases)

# Save Output
exp2.save_json(save_path+".json")
exp2.write_csv(save_path+".csv")

INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Successfully loaded faiss with AVX2 support.
INFO:root:Successfully loaded existing vectorstore from local storage


Query: 40 year old male with newly diagnosed moderate UC and articular extraintestinal manifestations


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
Query: 70 year old female with newly diagnosed severe UC


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
Query: 35 year old male with known moderate UC with prior exposure to infliximab but has worsening colitis on endoscopy despite compliance


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
Query: 60 year old female with newly diagnosed moderate UC with a background of congestive cardiac failure


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
Query: 38 year old female with newly diagnosed moderate UC and psoriasis


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
Query: 25 year old pregnant woman with severe distal ulcerative colitis


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
Query: 56 year old man with moderate to severe ulcerative colitis and ankylosi

In [None]:
# Create and run experiment
exp2 = Experiment(
    prompt_template = TEST_PROMPT_TEMPLATE_2,
    vector_store = EMBSTORE_DIR,
    llm_type = LLM_TYPE,
    max_tokens = MAX_TOKENS,
    gt = osp.join(MAIN_DIR, "ground_truth.csv"),
    verbose = VERBOSE
)

exp2.load_json(save_path + ".json")

# Save Output
exp2.write_csv(save_path+".csv")

INFO:root:Successfully loaded existing vectorstore from local storage


## Experiment 3:

In [68]:
add_docs_path = os.path.join(MAIN_DIR, "data/additional_docs.json")

In [71]:
with open(add_docs_path, "r") as f:
    additional_documents = json.load(f)

In [72]:
additional_documents

[{'filename': 'tables/uc_juillerat_2022_tab1.csv',
  'mode': 'table',
  'description': 'Efficacy of biological treatments according to the line of treatment, earlier exposure, disease phenotype and patient characteristics. ',
  'metadata': {'author': 'Pascal Juillerat',
   'creator': 'Elsevier',
   'file_path': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/juillerat 2022.pdf',
   'keywords': '',
   'page': 3,
   'source': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/juillerat 2022.pdf',
   'subject': 'Current Research in Pharmacology and Drug Discovery, 3 (2022) 100104. doi:10.1016/j.crphar.2022.100104',
   'title': 'Positioning biologics in the treatment of IBD: A practical guide - Which mechanism of action for whom?',
   'total_pages': 9}}]

In [64]:
documents = []
for table in additional_documents:
    rows = load_single_document(os.path.join(DATA_DIR, table["filename"]))
    for row in rows:
        row_no = row.metadata["row"]
        metadata = {k:v for k, v in table["metadata"].items()}
        metadata["row"] = row_no
        row.page_content = table["description"] + ":" + row.page_content
        row.metadata = metadata
        documents.append(row)

In [24]:
docsearch = FAISS.load_local(
    os.path.join(EMBSTORE_DIR, PROJECT, "faiss/text-embedding-ada-002/v1"),
    OpenAIEmbeddings(openai_api_key = OPENAI_KEY)
)

## Custom Agent

### QA from text Agent

### CSV Agent

In [None]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

model_name = 'text-davinci-003'
temperature = 0.0
model = OpenAI(model_name=model_name, temperature=temperature, openai_api_key = OPENAI_KEY)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

joke_query = "Tell me a joke."
_input = prompt.format_prompt(query=joke_query)

output = model(_input.to_string())

In [None]:
print(_input.text)

Answer the user query.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"drug_name": {"title": "Drug Name", "description": "Name of the drug", "type": "string"}, "description": {"title": "Description", "description": "Overall summary of the drug", "type": "string"}, "advantages": {"title": "Advantages", "description": "Advantages of the drug ", "type": "string"}, "disadvantages": {"title": "Disadvantages", "description": "Disadvantages of the drug", "type": "string"}}, "required": ["drug_name", "description", "advantages", "disadvantages"]}
```
Tell me a joke.



In [None]:
output

'\n{"drug_name": "Joke", "description": "A joke to make you laugh", "advantages": "It can make you laugh and bring joy", "disadvantages": "It may not be funny"}'