# Setup

In [None]:
!pip install --quiet langchain openai faiss-cpu tiktoken pypdf PyMuPDF

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m33.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive

drive.mount("/content/drive")

%cd drive/MyDrive/LLM/ulcerative_colitis

Mounted at /content/drive
/content/drive/MyDrive/LLM/ulcerative_colitis


# Import dependencies

In [87]:
import os, sys, json, logging
import os.path as osp
import re
import pandas as pd

from typing import Union, Sequence, Dict, Callable, List, Optional
from pydantic import BaseModel, Field, validator
from pprint import pprint
from tqdm.auto import tqdm
from IPython.display import display, Markdown
from time import time
from datetime import datetime
from langchain.docstore.document import Document
from chromadb.config import Settings

from langchain import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import VectorStore, FAISS, Chroma, Pinecone
import pinecone
from langchain.document_loaders.pdf import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import PydanticOutputParser

from config import MAIN_DIR, DATA_DIR, EMBSTORE_DIR, ARTIFACT_DIR, DOCUMENT_SOURCE

from shutil import rmtree
from utils import load_single_document, load_documents

In [23]:
# Short project code; used below as a sub-folder name for documents and vector stores.
PROJECT = "uc"

# API credentials are read from a JSON file under MAIN_DIR rather than hardcoded here.
_api_keys_path = osp.join(MAIN_DIR, "auth", "api_keys.json")
with open(_api_keys_path, "r") as f:
    keys = json.load(f)

OPENAI_KEY = keys["OPENAI_API_KEY"]

In [116]:
# Root logger, shared by every function and class defined in this notebook.
LOGGER = logging.getLogger()

log_path = os.path.join(MAIN_DIR, "log", "logfile.txt")
# FileHandler does not create missing parent directories.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Re-running this cell must not stack a second handler for the same file,
# otherwise every message is written to the log once per execution of the cell.
for _stale_handler in list(LOGGER.handlers):
    if (
        isinstance(_stale_handler, logging.FileHandler)
        and _stale_handler.baseFilename == os.path.abspath(log_path)
    ):
        LOGGER.removeHandler(_stale_handler)
        _stale_handler.close()

file_handler = logging.FileHandler(
    filename=log_path)

formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")
file_handler.setFormatter(formatter)

LOGGER.setLevel(logging.INFO)
LOGGER.addHandler(file_handler)

# User-defined Functions (UDF)

In [118]:
def convert_csv_to_documents(table_info: Dict) -> List[Document]:
    """Turn the rows of a described .csv table into Documents.

    Each row returned by ``load_single_document`` is updated in place: its text
    is prefixed with the table description and its metadata is replaced by a
    copy of the table metadata plus the row number and the table mode.

    Args:
        table_info (Dict): Table descriptor. The keys read here are "mode"
            (must equal "table"), "filename" (must end with ".csv"),
            "description" and "metadata".

    Returns:
        List[Document]: The loaded rows, one Document per row.
    """
    assert table_info["mode"] == "table" and table_info["filename"].endswith(".csv")

    converted = []
    for loaded_row in load_single_document(table_info["filename"]):
        row_index = loaded_row.metadata["row"]

        # Start from a copy so the caller's metadata dict is never modified.
        merged_metadata = dict(table_info["metadata"].items())
        merged_metadata.update(row=row_index, modal=table_info["mode"])

        loaded_row.page_content = (
            table_info["description"] + ":" + loaded_row.page_content
        )
        loaded_row.metadata = merged_metadata
        converted.append(loaded_row)

    return converted


def convert_json_to_documents(json_info: Dict) -> List[Document]:
    """Placeholder for converting a json descriptor into Documents.

    Not implemented yet: the argument is ignored and no Document is produced.

    Args:
        json_info (Dict): Dictionary containing .json information (currently unused).

    Returns:
        List[Document]: Always an empty list.
    """
    no_documents: list = []
    return no_documents


def generate_vectorstore(
    embeddings: Callable,
    source_directory: Optional[str] = None,
    output_directory: str = "./vectorstore",
    emb_store_type: str = "faiss",
    chunk_size: int = 1000,
    chunk_overlap: int = 250,
    exclude_pages: Optional[Dict] = None,
    pinecone_idx_name: Optional[str] = None,
    additional_docs: Optional[List] = None,
    key_path: Optional[str] = os.path.join(MAIN_DIR, "auth", "api_keys.json"),
) -> VectorStore:
    """Generate New Vector Index Database

    Loads and chunks the documents found in ``source_directory`` (if given),
    appends ``additional_docs`` as-is, then builds a vector store of the
    requested type. ``output_directory`` is emptied and re-created before the
    store is built.

    Args:
        embeddings (Callable): Function to convert text to vector embeddings
        source_directory (Optional[str], optional): Directory contains source documents.
            When None, only ``additional_docs`` are indexed. Defaults to None.
        output_directory (str, optional): Output directory of vector index database. Defaults to "./vectorstore".
        emb_store_type (str, optional): Type of vector index database, one of
            "chroma", "faiss" or "pinecone". Defaults to "faiss".
        chunk_size (int, optional): Maximum size of text chunks (characters) after split. Defaults to 1000.
        chunk_overlap (int, optional): Maximum overlapping window between text chunks. Defaults to 250.
        exclude_pages (Optional[Dict], optional): Dictionary of pages to be excluded from documents. Defaults to None.
        pinecone_idx_name (Optional[str], optional): Name of pinecone index to be created or loaded. Defaults to None.
        additional_docs (Optional[List], optional): Additional Tables, Images or Json to be added to doc list. Defaults to None.
        key_path (Optional[str], optional): Path to file containing API info (only read for "pinecone").
            Defaults to os.path.join(MAIN_DIR, "auth", "api_keys.json").

    Raises:
        ValueError: If ``emb_store_type`` is not supported, or if "pinecone" is
            requested without a ``key_path``.

    Returns:
        Vectorstore: Vector Database
    """
    # Validate first: failing after the directory wipe below would destroy an
    # existing store and then crash on an unbound `db`.
    supported_store_types = ("chroma", "faiss", "pinecone")
    if emb_store_type not in supported_store_types:
        raise ValueError(
            f"Unsupported emb_store_type {emb_store_type!r}; "
            f"expected one of {supported_store_types}"
        )
    if emb_store_type == "pinecone" and not key_path:
        raise ValueError("key_path is required when emb_store_type is 'pinecone'")

    if source_directory:
        LOGGER.info(f"Loading documents from {source_directory}")

        documents = load_documents(source_directory, exclude_pages=exclude_pages)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        texts = text_splitter.split_documents(documents)

        LOGGER.info(f"Loaded {len(documents)} documents from {source_directory}")
        LOGGER.info(
            f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)"
        )
    else:
        texts = []

    if additional_docs:
        # Additional documents are indexed as-is, without being split.
        texts.extend(additional_docs)

    LOGGER.info(
        f"Total number of text chunks to create vector index store: {len(texts)}"
    )
    if not texts:
        LOGGER.warning("No text chunks to index; the vector store build may fail")

    # Wipe the previous store only once the new content has been loaded, so a
    # loading failure does not leave the caller without any store.
    if os.path.exists(output_directory):
        rmtree(output_directory)
    os.makedirs(output_directory, exist_ok=True)

    if emb_store_type == "chroma":
        chroma_settings = Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=output_directory,
            anonymized_telemetry=False,
        )
        db = Chroma.from_documents(
            texts,
            embeddings,
            persist_directory=output_directory,
            client_settings=chroma_settings,
        )
        db.persist()

    elif emb_store_type == "faiss":
        db = FAISS.from_documents(texts, embedding=embeddings)
        db.save_local(output_directory)
        # Sanity check that both files written by save_local are present.
        assert "index.faiss" in os.listdir(
            output_directory
        ) and "index.pkl" in os.listdir(output_directory)

    else:  # "pinecone" (validated above)
        with open(key_path, "r") as f:
            # json.load reads from a file object; json.loads expects a string.
            keys = json.load(f)
        PINECONE_API_KEY = keys["PINECONE_API"]["KEY"]
        PINECONE_ENV = keys["PINECONE_API"]["ENV"]

        pinecone.init(
            api_key=PINECONE_API_KEY,
            environment=PINECONE_ENV,
        )

        if not pinecone_idx_name:
            # Pinecone index names only allow lowercase alphanumerics and "-",
            # so the generated default must not contain "_" or ":".
            pinecone_idx_name = "index-{}".format(
                datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
            )

        if pinecone_idx_name not in pinecone.list_indexes():
            db = Pinecone.from_documents(
                texts, embedding=embeddings, index_name=pinecone_idx_name
            )

        else:
            # Index already exists: append to it instead of re-creating it.
            db = Pinecone.from_existing_index(pinecone_idx_name, embeddings)
            db.add_documents(texts)

    LOGGER.info(
        f"Successfully created {emb_store_type} vectorstore at {output_directory}"
    )

    return db

# Experiment Class

In [82]:
# Schema of one drug recommendation expected in the LLM answer.
# It is handed to PydanticOutputParser, both to build the format instructions
# injected into the prompt and to parse the JSON objects found in the answers.
# NOTE: documented with comments on purpose. pydantic copies a model docstring
# into the generated JSON schema, which would change the format instructions
# sent to the LLM. The field descriptions are part of that schema as well, so
# they must not be edited cosmetically.
class DrugOutput(BaseModel):
    # Name of the recommended drug.
    drug_name: str = Field(description = "Name of the drug")
    # Free-text advantages ("pros") of the drug.
    advantages: str = Field(description = "Advantages of the drug ")
    # Free-text disadvantages ("cons") of the drug.
    disadvantages: str = Field(description = "Disadvantages of the drug")

class Experiment:
    """Experiment Module

    Bundles an LLM, an embedding model and a vector index. Queries are run
    through a ``RetrievalQAWithSourcesChain``; the questions, answers and
    retrieved sources are kept in memory and can be exported to JSON / CSV.
    """

    # Number of "sourceN" columns written by ``write_csv``.
    MAX_SOURCE_COLUMNS = 6

    def __init__(
        self,
        prompt_template: Union[PromptTemplate, ChatPromptTemplate],
        vector_store: str,
        llm_type: str = "gpt-3.5-turbo",
        emb: str = "text-embedding-ada-002",
        keys_json: str = osp.join(MAIN_DIR, "auth", "api_keys.json"),
        temperature: float = 0,
        max_tokens: int = 512,
        gt: Optional[str] = None,
        verbose: bool = False,
    ):
        """Initiate Instance for an experiment run

        Args:
            prompt_template (Union[PromptTemplate, ChatPromptTemplate]): Prompt to be feed to LLM
            vector_store (str): Path to Vector Index Database
            llm_type (str, optional): Type of LLM Model. Defaults to "gpt-3.5-turbo".
            emb (str, optional): Type of Embedding Model. Defaults to "text-embedding-ada-002".
            keys_json (str, optional): Path to API Keys. Defaults to osp.join(MAIN_DIR, "auth", "api_keys.json").
            temperature (float, optional): Temperature Settings for LLM model. Lower temperature makes LLM more deterministic
                while higher temperature makes LLM more random. Defaults to 0.
            max_tokens (int, optional): Max_Tokens Settings for LLM model. Defaults to 512.
            gt (Optional[str], optional): Path to Ground Truth file. Defaults to None.
            verbose (bool, optional): Verbose Setting. Defaults to False.
        """

        self.llm_type = llm_type.lower()
        self.temperature = temperature
        self.max_tokens = max_tokens

        with open(keys_json, "r") as f:
            keys = json.load(f)

        # A dedicated key is used for GPT-4, the default key for every other model.
        self.openai_key = (
            keys["OPENAI_API_KEY_FOR_GPT4"]
            if self.llm_type == "gpt-4"
            else keys["OPENAI_API_KEY"]
        )

        # Chat prompts need the chat model wrapper, plain prompts the completion one.
        if isinstance(prompt_template, ChatPromptTemplate):
            self.llm = ChatOpenAI(
                model_name=self.llm_type,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                openai_api_key=self.openai_key,
            )
        else:
            self.llm = OpenAI(
                model_name=self.llm_type,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                openai_api_key=self.openai_key,
            )
        self.embedder = OpenAIEmbeddings(model=emb, openai_api_key=self.openai_key)

        # Define the attributes up-front so a failed load leaves a usable
        # instance (a store can still be attached later on).
        self.docsearch = None
        self.chain = None
        try:
            self.load_vectorstore(vector_store)
        except ValueError:
            print(
                "Vectorstore invalid. Please load valid vectorstore or create new vectorstore."
            )

        self.prompt_template = prompt_template
        self.questions = []
        self.answers = []
        self.sources = []
        self.ground_truth = self.load_groundtruth(gt) if gt else None
        self.drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)
        self.verbose = verbose

    def load_vectorstore(self, vectorstore_path: str):
        """Load Vectorstore from path

        Args:
            vectorstore_path (str): Path to vector database folder.

        Raises:
            ValueError: If the folder does not exist or does not contain both
                "index.faiss" and "index.pkl".
        """
        # ValueError (rather than assert) so that __init__ can actually catch
        # the failure and so the check survives `python -O`.
        if not os.path.isdir(vectorstore_path):
            raise ValueError(
                f"Invalid Vectorstore: {vectorstore_path} is not a directory"
            )
        stored_files = os.listdir(vectorstore_path)
        if "index.faiss" not in stored_files or "index.pkl" not in stored_files:
            raise ValueError("Invalid Vectorstore")

        self.docsearch = FAISS.load_local(vectorstore_path, self.embedder)
        # Force the retrieval chain to be rebuilt on top of the new store.
        self.chain = None
        LOGGER.info("Successfully loaded existing vectorstore from local storage")

    def generate_vectorstore(
        self,
        data_directory: Optional[str] = None,
        output_directory: str = "./vectorstore",
        emb_store_type: str = "faiss",
        chunk_size: int = 1000,
        chunk_overlap: int = 250,
        exclude_pages: Optional[Dict] = None,
        pinecone_idx_name: Optional[str] = None,
        additional_docs: Optional[List] = None,
        key_path: Optional[str] = os.path.join(MAIN_DIR, "auth", "api_keys.json"),
    ):
        """Generate New vectorstore

        Args:
            data_directory (str): Directory contains source documents
            output_directory (str, optional): Output directory of vector index database. Defaults to "./vectorstore".
            emb_store_type (str, optional): Type of vector index database. Defaults to "faiss".
            chunk_size (int, optional): Maximum size of text chunks (characters) after split. Defaults to 1000.
            chunk_overlap (int, optional): Maximum overlapping window between text chunks. Defaults to 250.
            exclude_pages (Optional[Dict], optional): Dictionary of pages to be excluded from documents. Defaults to None.
            pinecone_idx_name (Optional[str], optional): Name of pinecone index to be created or loaded. Defaults to None.
            additional_docs (Optional[List], optional): Additional Tables, Images or Json to be added to doc list. Defaults to None.
            key_path (Optional[str], optional): Path to file containing API info.
                Defaults to os.path.join(MAIN_DIR, "auth", "api_keys.json").
        """
        # The module-level function names these parameters `source_directory`
        # and `embeddings`; it is looked up in the module namespace, so this
        # method does not call itself.
        self.docsearch = generate_vectorstore(
            embeddings=self.embedder,
            source_directory=data_directory,
            output_directory=output_directory,
            emb_store_type=emb_store_type,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            exclude_pages=exclude_pages,
            pinecone_idx_name=pinecone_idx_name,
            additional_docs=additional_docs,
            key_path=key_path,
        )
        # Any existing chain still points at the previous store.
        self.chain = None

    @staticmethod
    def _describe_document(document) -> Dict:
        """Summarise one retrieved Document as a plain dictionary.

        Metadata keys that are absent are reported as None so that one
        incomplete document does not abort a whole run.
        """
        metadata = document.metadata
        return {
            "title": metadata.get("title"),
            "filename": str(metadata.get("source", "")).split("/")[-1],
            "page": metadata.get("page"),
            "modal": metadata.get("modal"),
            "text": document.page_content,
        }

    def run_test_cases(self, test_cases: Union[List[str], str]):
        """Run and save test cases to memory

        Args:
            test_cases (Union[List[str], str]): List of test queries, or path to
                a text file holding one query per line (blank lines are skipped).
        """
        if isinstance(test_cases, str):
            with open(test_cases, "r", encoding="utf-8-sig") as f:
                file_lines = [line.rstrip() for line in f]
            # A trailing blank line must not be sent to the LLM as a query.
            test_cases = [line for line in file_lines if line]

        if not self.chain:
            self._create_retriever_chain()

        for test_case in test_cases:
            print("Query: {}".format(test_case))
            output = self.chain(test_case)
            # Build the sources before touching the result lists so the three
            # lists always stay the same length, even if something fails here.
            sources = [
                Experiment._describe_document(document)
                for document in output["source_documents"]
            ]
            self.questions.append(output["question"])
            self.answers.append(output["answer"])
            self.sources.append(sources)

    @staticmethod
    def convert_prompt_to_string(
        prompt: Union[PromptTemplate, ChatPromptTemplate]
    ) -> str:
        """Convert Prompt Object to string format

        Every input variable is filled with its own name, so the result shows
        the template text with the variable names in place of the braces.

        Args:
            prompt (Union[PromptTemplate, ChatPromptTemplate]): Prompt Template

        Returns:
            str: Prompt String Template
        """
        return prompt.format(**{v: v for v in prompt.input_variables})

    @staticmethod
    def process_source(source: Dict) -> str:
        """Render a source dictionary as "key: value" entries separated by blank lines

        Args:
            source (Dict): Source Document Information

        Returns:
            str: Source Document Information in string
        """
        return "\n\n".join([f"{k}: {v}" for k, v in source.items()])

    def save_json(self, output_path: str):
        """Save Output of test case runs to json file

        Args:
            output_path (str): Output Path to json file.
        """
        output_dict = {}
        output_dict["prompt"] = Experiment.convert_prompt_to_string(
            self.prompt_template
        )
        output_dict["test_cases"] = []

        for question, answer, source in zip(self.questions, self.answers, self.sources):
            output_dict["test_cases"].append(
                {"question": question, "answer": answer, "sources": source}
            )

        with open(output_path, "w") as f:
            json.dump(output_dict, f)

    def load_groundtruth(self, gt_path: str) -> pd.DataFrame:
        """Load Ground Truth information from .csv file

        Args:
            gt_path (str): Path to Ground Truth file

        Returns:
            pd.DataFrame: DataFrame containing Ground Truth data.
        """
        return pd.read_csv(gt_path, encoding="ISO-8859-1")

    def reset(self):
        """Reset queries, answers and sources; the ground truth is dropped as well"""
        self.questions = []
        self.answers = []
        self.sources = []
        self.ground_truth = None

    def load_json(self, json_path: str, reset: bool = False):
        """Load Queries and Answers from Json file

        Loaded test cases are appended to the ones already in memory.

        Args:
            json_path (str): Path to json output file to load into instance
            reset (bool, optional): If reset, clear queries and answers from memory before loading. Defaults to False.
        """
        if reset:
            self.reset()
        with open(json_path, "r") as f:
            input_dict = json.load(f)
        for test_case in input_dict["test_cases"]:
            self.questions.append(test_case["question"])
            self.answers.append(test_case["answer"])
            self.sources.append(test_case["sources"])

    def write_csv(self, output_csv: str):
        """Write questions and answers to .csv files

        Up to two drug recommendations are parsed out of every raw answer and
        up to ``MAX_SOURCE_COLUMNS`` sources are written per question; missing
        entries are left empty.

        Args:
            output_csv (str): Path to output csv file
        """

        pd_answers = [[], []]
        pd_pros = [[], []]
        pd_cons = [[], []]
        pd_sources = [[] for _ in range(self.MAX_SOURCE_COLUMNS)]

        for answer, sources in zip(self.answers, self.sources):
            # Every innermost {...} group of the answer is a candidate drug object.
            drugs = []
            for candidate in re.findall(re.compile(r"{[^{}]+}"), answer):
                try:
                    drugs.append(self.drug_parser.parse(candidate))
                except ValueError as parse_error:
                    # One malformed fragment must not abort the whole export;
                    # the untouched text stays available in "raw_answer".
                    LOGGER.warning(
                        "Skipping unparsable drug entry: %s", parse_error
                    )

            for slot in range(2):
                drug = drugs[slot] if slot < len(drugs) else None
                pd_answers[slot].append(drug.drug_name if drug is not None else None)
                pd_pros[slot].append(drug.advantages if drug is not None else None)
                pd_cons[slot].append(
                    drug.disadvantages if drug is not None else None
                )

            if len(sources) > len(pd_sources):
                LOGGER.warning(
                    "Only the first %d of %d sources are written to csv",
                    len(pd_sources),
                    len(sources),
                )
            # Pad per row so every column has exactly one entry per question,
            # including rows without any source.
            for slot, column in enumerate(pd_sources):
                column.append(
                    Experiment.process_source(sources[slot])
                    if slot < len(sources)
                    else None
                )

        info = {"question": self.questions}

        if self.ground_truth is not None:
            info["gt_rec1"] = self.ground_truth["Recommendation 1"].tolist()
            info["gt_rec2"] = self.ground_truth["Recommendation 2"].tolist()
            info["gt_rec3"] = self.ground_truth["Recommendation 3"].tolist()
            info["gt_avoid"] = self.ground_truth["Drug Avoid"].tolist()
            info["gt_reason"] = self.ground_truth["Reasoning"].tolist()

        info["prompt"] = [
            Experiment.convert_prompt_to_string(self.prompt_template)
        ] * len(self.questions)
        info["raw_answer"] = self.answers
        info["answer1"] = pd_answers[0]
        info["pro1"] = pd_pros[0]
        info["cons1"] = pd_cons[0]
        info["answer2"] = pd_answers[1]
        info["pro2"] = pd_pros[1]
        info["cons2"] = pd_cons[1]
        for slot, column in enumerate(pd_sources):
            info[f"source{slot + 1}"] = column

        panda_df = pd.DataFrame(info)

        panda_df.to_csv(output_csv, header=True)

    def _create_retriever_chain(
        self,
        chain_type: str = "stuff",
        return_source_documents: bool = True,
        reduce_k_below_max_tokens: bool = True,
    ):
        """Initiate QA from Source Chain

        Args:
            chain_type (str, optional): Chain Type. Can be stuff|map_reduce|refine|map_rerank. Defaults to "stuff".
            return_source_documents (bool, optional): Whether to return source documents along side answers. Defaults to True.
            reduce_k_below_max_tokens (bool, optional): Forwarded unchanged to
                ``RetrievalQAWithSourcesChain.from_chain_type``. Defaults to True.

        Raises:
            ValueError: If no vectorstore has been loaded or generated yet.
        """
        if getattr(self, "docsearch", None) is None:
            raise ValueError(
                "No vectorstore available. Please load valid vectorstore or create new vectorstore."
            )
        self.chain = RetrievalQAWithSourcesChain.from_chain_type(
            llm=self.llm,
            chain_type=chain_type,
            retriever=self.docsearch.as_retriever(),
            return_source_documents=return_source_documents,
            chain_type_kwargs={"prompt": self.prompt_template},
            reduce_k_below_max_tokens=reduce_k_below_max_tokens,
            verbose=self.verbose,
        )


# Create Vectorstore

In [37]:
# Show the metadata attached to the first page of every PDF in the project folder.
for filename in os.listdir(os.path.join(DOCUMENT_SOURCE, PROJECT)):
    # The folder may hold non-PDF material that PyMuPDFLoader cannot open;
    # this mirrors the ".pdf" filter used when building `datastore_paths`.
    if not filename.endswith(".pdf"):
        continue
    pages = PyMuPDFLoader(os.path.join(DOCUMENT_SOURCE, PROJECT, filename)).load()
    if not pages:
        # Nothing was loaded for this file, so there is no first page to show.
        continue
    pprint(pages[0].metadata)

{'author': 'Juan S Lasa MD',
 'creationDate': "D:20211223225432+05'30'",
 'creator': 'Elsevier',
 'file_path': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/1-s2.0-S2468125321003770-main.pdf',
 'format': 'PDF 1.7',
 'keywords': '',
 'modDate': "D:20220108175830+05'30'",
 'page': 0,
 'producer': 'Acrobat Distiller 6.0 for Windows',
 'source': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/1-s2.0-S2468125321003770-main.pdf',
 'subject': 'The Lancet Gastroenterology & Hepatology, 7 (2022) 161-170. '
            'doi:10.1016/S2468-1253(21)00377-0',
 'title': 'Efficacy and safety of biologics and small molecule drugs for '
          'patients with moderate-to-severe ulcerative colitis: a systematic '
          'review and network meta-analysis',
 'total_pages': 10,
 'trapped': ''}
{'author': 'Manasi Agrawal',
 'creationDate': "D:20210619033246+05'30'",
 'creator': 'Elsevier',
 'file_path': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/

In [91]:
# Pages to leave out of the index, keyed by PDF file name.
# NOTE(review): the values are passed to `load_documents(exclude_pages=...)`;
# presumably 0-based page indices (as in the loader metadata) - confirm in utils.
EXCLUDE_DICT = {
    "agrawal.pdf": [13, 14, 15, 16, 17, 18],
    "PIIS1542356520300446.pdf": [12, 13, 14, 15, 16, 17, 18],
    # Same values, in the same order, as the original literal + range concatenation.
    "gutjnl-2021-326390R2 CLEAN.pdf": [0, 2, *range(31, 46), *range(3, 31)],
    "otad009.pdf": [15, 16],
    "1-s2.0-S2468125321003770-main.pdf": [9],
    "juillerat 2022.pdf": [6, 7, 8],
}

_project_document_dir = os.path.join(DOCUMENT_SOURCE, PROJECT)
# os.listdir returns entries in arbitrary order; sort them (case-insensitively)
# so that positional access such as `datastore_paths[0]` is reproducible.
datastore_paths = [
    os.path.join(_project_document_dir, file_name)
    for file_name in sorted(os.listdir(_project_document_dir), key=str.lower)
    if file_name.endswith(".pdf")
]
print("Number of documents in datastore:", len(datastore_paths))
for i, path in enumerate(datastore_paths):
    print(f"Index {i + 1}: {path}")

Number of documents in datastore: 6
Index 1: /mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/1-s2.0-S2468125321003770-main.pdf
Index 2: /mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/agrawal.pdf
Index 3: /mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/gutjnl-2021-326390R2 CLEAN.pdf
Index 4: /mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/juillerat 2022.pdf
Index 5: /mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/otad009.pdf
Index 6: /mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/PIIS1542356520300446.pdf


In [93]:
# Load the first PDF of the datastore and inspect one of its pages.
sample_path = datastore_paths[0]
pdfloader = PyMuPDFLoader(sample_path)
sample_data = pdfloader.load()
print("Number of pages:", len(sample_data))

# Element 4 of the loaded list is used as the sample page.
sample_page = sample_data[4]
metadata = sample_page.metadata
content = sample_page.page_content

print(str(metadata) + "\n")
print(f"Text Length: {len(content)}\n")
# Only the first 1000 characters are shown to keep the output readable.
pprint(content[:1000])
pprint(metadata)

Number of pages: 10
{'source': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/1-s2.0-S2468125321003770-main.pdf', 'file_path': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/1-s2.0-S2468125321003770-main.pdf', 'page': 4, 'total_pages': 10, 'format': 'PDF 1.7', 'title': 'Efficacy and safety of biologics and small molecule drugs for patients with moderate-to-severe ulcerative colitis: a systematic review and network meta-analysis', 'author': 'Juan S Lasa MD', 'subject': 'The Lancet Gastroenterology & Hepatology, 7 (2022) 161-170. doi:10.1016/S2468-1253(21)00377-0', 'keywords': '', 'creator': 'Elsevier', 'producer': 'Acrobat Distiller 6.0 for Windows', 'creationDate': "D:20211223225432+05'30'", 'modDate': "D:20220108175830+05'30'", 'trapped': ''}

Text Length: 5743

('Articles\n'
 'www.thelancet.com/gastrohep   Vol 7   February 2022 \n'
 '165\n'
 'U-ACCOMPLISH8). No phase 3 RCTs with etrasimod or \n'
 'TD-1473 were found.\n'
 'Among 22 studies evaluat

In [None]:
# Build a FAISS store from the project PDFs, skipping the pages in EXCLUDE_DICT.
_vectorstore_options = dict(
    embeddings=OpenAIEmbeddings(openai_api_key = OPENAI_KEY),
    source_directory=os.path.join(DOCUMENT_SOURCE, PROJECT),
    output_directory=os.path.join(
        EMBSTORE_DIR, PROJECT, "faiss", "text-embedding-ada-002", "test"
    ),
    emb_store_type="faiss",
    chunk_size=1000,
    chunk_overlap=250,
    exclude_pages=EXCLUDE_DICT,
)
generate_vectorstore(**_vectorstore_options)

# Prototypes

## Test Cases

In [83]:
# One query per line; "utf-8-sig" drops a leading byte-order mark if present.
_queries_path = osp.join(DATA_DIR, "queries", "uc.txt")
with open(_queries_path, "r", encoding = "utf-8-sig") as f:
    # Strip trailing whitespace (including the newline) from every line.
    test_cases = [raw_line.rstrip() for raw_line in f.readlines()]

test_cases

['40 year old male with newly diagnosed moderate UC and articular extraintestinal manifestations',
 '70 year old female with newly diagnosed severe UC',
 '35 year old male with known moderate UC with prior exposure to infliximab but has worsening colitis on endoscopy despite compliance',
 '60 year old female with newly diagnosed moderate UC with a background of congestive cardiac failure',
 '38 year old female with newly diagnosed moderate UC and psoriasis',
 '25 year old pregnant woman with severe distal ulcerative colitis',
 '56 year old man with moderate to severe ulcerative colitis and ankylosing spondylitis',
 '38 year old man with severe ulcerative colitis and has lost response to vedolizumab',
 '28 year old woman who has severe extensive ulcerative colitis and has a history of lymphoma which was treated 4 years ago',
 '36 year old woman with moderate ulcerative colitis and multiple sclerosis']

## Experiment 1: Only Text - Normal Prompt Template - GPT4

### Prompt Setup

In [None]:
### STANDARD PROMPT TEMPLATE
# Parser whose format instructions (derived from DrugOutput) are injected into the prompt.
drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)

# The prompt text below is experiment input: any edit, even a typo fix,
# changes what is sent to the LLM and therefore the results.
# Placeholders: {summaries} and {question} are declared as input variables,
# {format_instructions} is pre-filled from the parser.
prompt_template = """Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

{summaries}

{format_instructions}

Question: {question}
Answer:
"""

TEST_PROMPT_TEMPLATE_1 = PromptTemplate(
    template = prompt_template,
    input_variables = ["summaries", "question"],
    partial_variables={"format_instructions": drug_parser.get_format_instructions()}
)

# Render the template with dummy values to check the final prompt layout.
print(TEST_PROMPT_TEMPLATE_1.format(summaries = "Summaries", question = "User Query"))

Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

Summaries

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
t

### Run Experiments

In [16]:
# Settings
LLM_TYPE = "gpt-4"
DESCRIPTION = "Text_Only"
MAX_TOKENS = 1024
# The timestamp ends up in artifact file names, so it must not contain ":",
# which is not a valid file-name character on Windows file systems.
TIME = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
VERBOSE = True
# Common path prefix of the .json / .csv artifacts written for this run.
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

In [None]:
# Create and run experiment
exp1 = Experiment(
    prompt_template = TEST_PROMPT_TEMPLATE_1,
    vector_store = EMBSTORE_DIR,
    llm_type = LLM_TYPE,
    max_tokens = MAX_TOKENS,
    gt = osp.join(MAIN_DIR, "ground_truth.csv"),
    verbose = VERBOSE
)

exp1.run_test_cases(test_cases)

# Save Output
exp1.save_json(save_path+".json")
exp1.write_csv(save_path+".csv")

## Experiment 2: Only Text - CHAT Prompt Template - GPT4

### Prompt Setup

In [None]:
### CHAT PROMPT TEMPLATE
# The system prompt below is experiment input: any edit, even a typo fix,
# changes what is sent to the LLM and therefore the results.
# {summaries} is the only placeholder in the system message.
system_prompt = """
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Whether treated patient is new patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

{summaries}

"""

# System message carries the instructions and context; the human message is the raw query.
TEST_PROMPT_TEMPLATE_2 = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_prompt, input_variables = ["summaries"]),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)

# Render the template with dummy values to check the final prompt layout.
print(TEST_PROMPT_TEMPLATE_2.format(summaries = "Summaries", question = "User Query"))

System: 
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Whether treated patient is new patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

Summaries


Human: User Query


### Run Experiments

In [None]:
# Settings for the chat-prompt experiment
LLM_TYPE = "gpt-3.5-turbo"
DESCRIPTION = "Text_Only_With_CHAT_Prompt"
MAX_TOKENS = 1024
# Timestamp that makes each run's artifact name unique. The time part uses
# hyphens instead of colons because the value ends up in a file name and ":"
# is not a valid file-name character on Windows file systems.
TIME = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
VERBOSE = True
# Extension-less path prefix shared by this run's saved outputs
# (".json" / ".csv" are appended by the cells that save or reload results).
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

In [None]:
# Experiment 2: evaluate the chat-style prompt over the test cases.
exp2_settings = {
    "prompt_template": TEST_PROMPT_TEMPLATE_2,
    "vector_store": EMBSTORE_DIR,
    "llm_type": LLM_TYPE,
    "max_tokens": MAX_TOKENS,
    "gt": osp.join(MAIN_DIR, "ground_truth.csv"),
    "verbose": VERBOSE,
}
exp2 = Experiment(**exp2_settings)
exp2.run_test_cases(test_cases)

# Save the results as JSON and CSV under the shared run prefix.
exp2.save_json(f"{save_path}.json")
exp2.write_csv(f"{save_path}.csv")

In [None]:
# Re-create the experiment object and reload the JSON saved for this run
# (no test cases are executed in this cell).
exp2_reload_settings = {
    "prompt_template": TEST_PROMPT_TEMPLATE_2,
    "vector_store": EMBSTORE_DIR,
    "llm_type": LLM_TYPE,
    "max_tokens": MAX_TOKENS,
    "gt": osp.join(MAIN_DIR, "ground_truth.csv"),
    "verbose": VERBOSE,
}
exp2 = Experiment(**exp2_reload_settings)
exp2.load_json(f"{save_path}.json")

# Write the CSV again from the reloaded results.
exp2.write_csv(f"{save_path}.csv")

INFO:root:Successfully loaded existing vectorstore from local storage


## Experiment 3 - ADD Table:

In [117]:
### STANDARD PROMPT TEMPLATE
# Parser bound to the DrugOutput model; it supplies the format instructions
# that are embedded into the prompt below.
drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)

# Single-string prompt with three placeholders: {summaries} (context),
# {format_instructions} (pre-filled from the parser) and {question}.
prompt_template = """Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

{summaries}

{format_instructions}

Question: {question}
Answer:
"""

# Only `summaries` and `question` stay open for the caller; the format
# instructions are fixed here as a partial variable.
drug_format_instructions = drug_parser.get_format_instructions()
TEST_PROMPT_TEMPLATE_3 = PromptTemplate(
    input_variables=["summaries", "question"],
    partial_variables=dict(format_instructions=drug_format_instructions),
    template=prompt_template,
)

# Render the template with dummy values to inspect the final prompt.
print(TEST_PROMPT_TEMPLATE_3.format(summaries="Summaries", question="User Query"))

Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

Summaries

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
t

In [None]:
# Settings for the experiment that adds table rows to the vector store
LLM_TYPE = "gpt-4"
DESCRIPTION = "Text_with_juillerat_table"
MAX_TOKENS = 1024
# Timestamp that makes each run's artifact name unique. The time part uses
# hyphens instead of colons because the value ends up in a file name and ":"
# is not a valid file-name character on Windows file systems.
TIME = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
VERBOSE = True
# Directory of the FAISS store used by this experiment ("v2-add" variant).
VECTORSTORE = os.path.join(EMBSTORE_DIR, PROJECT, "faiss", "text-embedding-ada-002", "v2-add")
# Extension-less path prefix shared by this run's saved outputs.
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

In [107]:
# Manifest listing the extra documents to add; each entry is read below for
# its "filename", "description", "metadata" and "mode" keys.
add_docs_path = os.path.join(MAIN_DIR, "data/additional_docs.json")

# Explicit encoding: JSON is UTF-8, while open()'s default depends on the
# platform locale.
with open(add_docs_path, "r", encoding="utf-8") as f:
    additional_documents = json.load(f)

add_docs = []
for table in additional_documents:
    rows = load_single_document(os.path.join(MAIN_DIR, table["filename"]))
    for row in rows:
        # Keep the loader's row index; everything else in the loader metadata
        # is replaced by the manifest metadata.
        # NOTE(review): assumes the loader sets metadata["row"] on every row — confirm.
        row_no = row.metadata["row"]
        # Copy the manifest metadata so rows do not share one dict.
        metadata = dict(table["metadata"])
        metadata["row"] = row_no
        metadata["mode"] = table["mode"]
        # Prefix each row's text with the table description for context.
        row.page_content = table["description"] + ":" + row.page_content
        row.metadata = metadata
        add_docs.append(row)

In [108]:
# Inspect the assembled row documents (description-prefixed text, merged metadata).
add_docs

[Document(page_content='Efficacy of biological treatments according to the line of treatment, earlier exposure, disease phenotype and patient characteristics. :Patient Profile: Fresh No previous treatment\nBest: Infliximab (IFX*)\n2nd Best: Vedoluzimab (VEDO)\n3rd Best: Ustekinumab (USTE)\n4th Best: Golimumab (GOL)\n5th Best: Adalimumab (ADA)', metadata={'author': 'Pascal Juillerat', 'creator': 'Elsevier', 'file_path': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/juillerat 2022.pdf', 'keywords': '', 'page': 3, 'source': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/document_store/uc/juillerat 2022.pdf', 'subject': 'Current Research in Pharmacology and Drug Discovery, 3 (2022) 100104. doi:10.1016/j.crphar.2022.100104', 'title': 'Positioning biologics in the treatment of IBD: A practical guide - Which mechanism of action for whom?', 'total_pages': 9, 'row': 0, 'mode': 'table'}),
 Document(page_content='Efficacy of biological treatments according to the line of trea

In [115]:
# Generate the FAISS vector store for this experiment from the project's source
# documents plus the extra row documents collected in `add_docs`.
faiss_build_options = {
    "embeddings": OpenAIEmbeddings(openai_api_key=OPENAI_KEY),
    "source_directory": os.path.join(DOCUMENT_SOURCE, PROJECT),
    "output_directory": VECTORSTORE,
    "emb_store_type": "faiss",
    "chunk_size": 1000,
    "chunk_overlap": 250,
    "exclude_pages": EXCLUDE_DICT,
    "additional_docs": add_docs,
}
# Left as the cell's last expression so the returned store is displayed.
generate_vectorstore(**faiss_build_options)

<langchain.vectorstores.faiss.FAISS at 0x7fd4c6ae0eb0>

### Run Experiments

In [None]:
# Experiment 3: standard prompt evaluated against the vector store at VECTORSTORE.
exp3_settings = {
    "prompt_template": TEST_PROMPT_TEMPLATE_3,
    "vector_store": VECTORSTORE,
    "llm_type": LLM_TYPE,
    "max_tokens": MAX_TOKENS,
    "gt": osp.join(MAIN_DIR, "ground_truth.csv"),
    "verbose": VERBOSE,
}
exp3 = Experiment(**exp3_settings)
exp3.run_test_cases(test_cases)

# Save the results as JSON and CSV under the shared run prefix.
exp3.save_json(f"{save_path}.json")
exp3.write_csv(f"{save_path}.csv")

## Experiment 4 - Two Steps: Summary -> QA

### Summarization

In [122]:
from langchain.chains.summarize import load_summarize_chain

In [153]:
# Load the Juillerat 2022 PDF and split it into overlapping chunks
# (2500 characters per chunk, 500 overlap) for the summarization step.
juillerat_pdf_path = os.path.join(MAIN_DIR, "data/document_store/uc/juillerat 2022.pdf")
documents = load_single_document(juillerat_pdf_path)

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=500, chunk_size=2500)
texts = text_splitter.split_documents(documents)

# Report how many chunks the split produced.
print(len(texts))

37


In [155]:
# Length of the first chunk's text, for comparison with the splitter's chunk_size.
len(texts[0].page_content)

2479

In [156]:
# Patient the per-chunk summaries are tailored to. Kept out of the prompt text
# so a different profile can be summarised by changing this value only.
PATIENT_PROFILE = "40 year old male with newly diagnosed moderate UC and articular extraintestinal manifestations"

# Map-step prompt: `{text}` receives one chunk; `{patient_profile}` is
# pre-filled below, so the rendered prompt is the same as with the profile
# written inline.
prompt_template = """
Write a summary on the following text:
{text}

The summary will contain information relevant to treatment for moderate to severe ulcerative colitis (UC) for
the following patient profile: 
{patient_profile}

The summary will compare the pros and cons of different biological drugs on the patient.

Summary:"""

# Only `text` is left as an input variable; the profile is bound as a partial.
BULLET_POINT_PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
    partial_variables={"patient_profile": PATIENT_PROFILE},
)

# NOTE(review): other cells pass OPENAI_KEY; `keys` is defined outside this
# view — confirm both refer to the same credential.
summary_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=keys["OPENAI_API_KEY"],
    max_tokens=1024,
    temperature=0,
)

# Map-reduce summarization with the custom prompt used for the map step.
chain = load_summarize_chain(
    summary_llm,
    chain_type="map_reduce",
    map_prompt=BULLET_POINT_PROMPT,
)

output_summary = chain.run(texts)

In [159]:
# Pretty-print the summary returned by the summarization chain.
pprint(output_summary)

('Biological therapies have greatly improved the management of inflammatory '
 'bowel diseases (IBD), with anti-TNF being the main treatment. The number of '
 'available biological therapies has doubled in the last decade, and '
 'personalized treatment is important based on clinical situation, efficacy, '
 'and safety. Patient demographics and comorbidities should also be '
 'considered. Biologic therapies may increase the risk of infection and '
 'malignancy. A 40-year-old male with newly diagnosed moderate UC and '
 'extraintestinal manifestations may benefit from various biologic treatments. '
 'Obesity may affect the effectiveness of anti-TNFalpha agents. Vedolizumab '
 'and ustekinumab can treat inflammatory cutaneous lesions in IBD. The impact '
 'of body mass index on the efficacy of biological therapies in patients with '
 'psoriasis is being studied.')


In [144]:
# Number of chunks in `texts`; counting does not need a list of every page_content.
len(texts)

91

In [120]:
# documents = load_documents(os.path.join(DOCUMENT_SOURCE, PROJECT),
#                            exclude_pages=EXCLUDE_DICT)

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000, chunk_overlap=250
# )
# texts = text_splitter.split_documents(documents)
# print(len(texts))

367


## Custom Agent

### QA from text Agent

### CSV Agent

In [None]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# Completion model for a quick check of the output parser's format
# instructions, created with temperature 0.0.
model_name = 'text-davinci-003'
temperature = 0.0
model = OpenAI(
    openai_api_key=OPENAI_KEY,
    model_name=model_name,
    temperature=temperature,
)

# Prompt layout: fixed instruction, then the parser's format instructions
# (bound as a partial), then the user's query.
answer_template = "Answer the user query.\n{format_instructions}\n{query}\n"
prompt = PromptTemplate(
    template=answer_template,
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Deliberately off-topic query for the check.
joke_query = "Tell me a joke."
_input = prompt.format_prompt(query=joke_query)

# Send the rendered prompt string to the model and keep the raw completion.
rendered_prompt = _input.to_string()
output = model(rendered_prompt)

In [None]:
# Show the rendered prompt text held by `_input`.
print(_input.text)

Answer the user query.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"drug_name": {"title": "Drug Name", "description": "Name of the drug", "type": "string"}, "description": {"title": "Description", "description": "Overall summary of the drug", "type": "string"}, "advantages": {"title": "Advantages", "description": "Advantages of the drug ", "type": "string"}, "disadvantages": {"title": "Disadvantages", "description": "Disadvantages of the drug", "type": "string"}}, "required": ["drug_name", "description", "advantages", "disadvantages"]}
```
Tell me a joke.



In [None]:
# Raw string returned by the model call.
output

'\n{"drug_name": "Joke", "description": "A joke to make you laugh", "advantages": "It can make you laugh and bring joy", "disadvantages": "It may not be funny"}'