# Setup

In [None]:
!pip install --quiet langchain openai faiss-cpu tiktoken pypdf PyMuPDF

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m33.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive so that the project folder is reachable from the Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

# Relative path: it is resolved against the current working directory
# (the recorded output shows it ends up in /content/drive/MyDrive/LLM/ulcerative_colitis).
%cd drive/MyDrive/LLM/ulcerative_colitis

Mounted at /content/drive
/content/drive/MyDrive/LLM/ulcerative_colitis


# Import dependencies

In [80]:
import os, sys, json, logging
import os.path as osp
import re
import pandas as pd

from typing import Any, Union, Tuple, Sequence, Dict, Callable, List, Optional
from pydantic import BaseModel, Field, validator
from pprint import pprint
from tqdm.auto import tqdm
from time import time
from datetime import datetime
from langchain.docstore.document import Document
from chromadb.config import Settings

from langchain import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import VectorStore, FAISS, Chroma, Pinecone
import pinecone
from langchain.document_loaders.pdf import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain, _split_list_of_docs, _collapse_docs
from langchain.chains.combine_documents.map_rerank import MapRerankDocumentsChain
from langchain.chains.combine_documents.refine import RefineDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.callbacks.manager import Callbacks

import config
from config import MAIN_DIR, DATA_DIR, ARTIFACT_DIR, DOCUMENT_SOURCE

from shutil import rmtree
from utils import load_single_document, load_documents
import yaml

from pydantic import root_validator

In [20]:
# Project identifier; used as a sub-folder name for the source documents and
# for the embedding store.
PROJECT = "uc"

# API credentials live in a JSON file under MAIN_DIR/auth.
with open(osp.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    keys = json.load(f)

OPENAI_KEY = keys["OPENAI_API_KEY"]
SOURCE_DATA = osp.join(DOCUMENT_SOURCE, PROJECT)
EMBSTORE_DIR = osp.join(
    config.EMBSTORE_DIR, PROJECT, "faiss", "text-embedding-ada-002"
)

# Pages to leave out, keyed by PDF file name (passed as `exclude_pages` to
# `generate_vectorstore`).
EXCLUDE_DICT = {
    "agrawal.pdf": list(range(13, 19)),
    "PIIS1542356520300446.pdf": list(range(12, 19)),
    "gutjnl-2021-326390R2 CLEAN.pdf": [0, 2, *range(31, 46), *range(3, 31)],
    "otad009.pdf": [15, 16],
    "1-s2.0-S2468125321003770-main.pdf": [9],
    "juillerat 2022.pdf": [6, 7, 8],
}

In [6]:
# Root logger shared by every cell of the notebook.
LOGGER = logging.getLogger()

log_path = os.path.join(MAIN_DIR, "log", "logfile.txt")
# FileHandler does not create missing parent directories.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")

LOGGER.setLevel(logging.INFO)

# Re-running this cell must not attach a second handler for the same file,
# otherwise every record is written once per execution of the cell.
_handler_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath(log_path)
    for handler in LOGGER.handlers
)
if not _handler_attached:
    file_handler = logging.FileHandler(filename=log_path)
    file_handler.setFormatter(formatter)
    LOGGER.addHandler(file_handler)

# User-defined Functions (UDF)

In [83]:
def convert_csv_to_documents(table_info: Dict, concatenate_rows: bool = True) -> List[Document]:
    """Convert a dictionary describing a .csv table into a list of Documents.

    Args:
        table_info (Dict): Table description. The keys read here are "mode"
            (must be "table"), "filename" (a .csv path relative to MAIN_DIR),
            "description" and "metadata".
        concatenate_rows (bool, optional): If True, return a single Document
            whose content is the description followed by every row. If False,
            return one Document per row, each prefixed with the description and
            tagged with its row number and the table mode. Defaults to True.

    Returns:
        List[Document]: One Document for the whole table, or one per row.

    Raises:
        AssertionError: If ``table_info`` does not describe a .csv table.
    """
    assert table_info["mode"] == "table" and table_info["filename"].endswith(".csv"), \
        "table_info must describe a .csv file with mode 'table'"
    rows = load_single_document(os.path.join(MAIN_DIR, table_info["filename"]))
    description = table_info["description"]

    if concatenate_rows:
        # Build the Document once, after all rows are known. The previous
        # version rebuilt it on every iteration and left it undefined (crash)
        # when the table had no rows; an empty table now yields a Document
        # holding only the description.
        table_content = description + "\n\n" + "".join(
            row.page_content + "\n\n" for row in rows
        )
        # Copy the metadata so that later edits to the Document do not leak
        # back into `table_info`.
        return [
            Document(page_content=table_content, metadata=dict(table_info["metadata"]))
        ]

    documents = []
    for row in rows:
        metadata = dict(table_info["metadata"])
        metadata["row"] = row.metadata["row"]
        metadata["modal"] = table_info["mode"]
        row.page_content = description + ":" + row.page_content
        row.metadata = metadata
        documents.append(row)

    return documents


def generate_vectorstore(
    embeddings: Callable,
    source_directory: Optional[str] = None,
    output_directory: str = "./vectorstore",
    emb_store_type: str = "faiss",
    chunk_size: int = 1000,
    chunk_overlap: int = 250,
    exclude_pages: Optional[Dict] = None,
    pinecone_idx_name: Optional[str] = None,
    additional_docs: Optional[List] = None,
    key_path: Optional[str] = os.path.join(MAIN_DIR, "auth", "api_keys.json"),
) -> VectorStore:
    """Generate a new vector index database.

    Args:
        embeddings (Callable): Embedding model used to convert text to vectors.
        source_directory (Optional[str], optional): Directory containing the source documents.
            When omitted, only ``additional_docs`` are indexed. Defaults to None.
        output_directory (str, optional): Output directory of the vector index database.
            Any existing content is deleted. Defaults to "./vectorstore".
        emb_store_type (str, optional): Type of vector index database, one of
            "chroma", "faiss" or "pinecone". Defaults to "faiss".
        chunk_size (int, optional): Maximum size of text chunks (characters) after split. Defaults to 1000.
        chunk_overlap (int, optional): Maximum overlapping window between text chunks. Defaults to 250.
        exclude_pages (Optional[Dict], optional): Dictionary of pages to be excluded from documents. Defaults to None.
        pinecone_idx_name (Optional[str], optional): Name of pinecone index to be created or loaded. Defaults to None.
        additional_docs (Optional[List], optional): Additional documents (tables, images, json) appended
            to the split text chunks without further splitting. Defaults to None.
        key_path (Optional[str], optional): Path to file containing API info (only read for "pinecone").
            Defaults to os.path.join(MAIN_DIR, "auth", "api_keys.json").

    Returns:
        VectorStore: Vector database.

    Raises:
        ValueError: If ``emb_store_type`` is not supported, or if a FAISS index
            is requested without any document to index.
    """
    supported_types = ("chroma", "faiss", "pinecone")
    # Validate before touching the file system: an unknown type used to wipe
    # the output directory and then fail on an unbound `db`.
    if emb_store_type not in supported_types:
        raise ValueError(
            f"Unsupported emb_store_type {emb_store_type!r}; expected one of {supported_types}"
        )

    # Load and split the documents before the old store is deleted, so that a
    # loading failure does not leave the user without any vectorstore.
    if source_directory:
        LOGGER.info(f"Loading documents from {source_directory}")

        documents = load_documents(source_directory, exclude_pages=exclude_pages)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        texts = text_splitter.split_documents(documents)

        LOGGER.info(f"Loaded {len(documents)} documents from {source_directory}")
        LOGGER.info(
            f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)"
        )
    else:
        texts = []

    if additional_docs:
        texts.extend(additional_docs)

    LOGGER.info(
        f"Total number of text chunks to create vector index store: {len(texts)}"
    )

    if emb_store_type == "faiss" and not texts:
        raise ValueError(
            "Cannot build a FAISS index without documents: provide "
            "source_directory and/or additional_docs"
        )

    if os.path.exists(output_directory):
        rmtree(output_directory)
    os.makedirs(output_directory, exist_ok=True)

    if emb_store_type == "chroma":
        chroma_settings = Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=output_directory,
            anonymized_telemetry=False,
        )
        db = Chroma.from_documents(
            texts,
            embeddings,
            persist_directory=output_directory,
            client_settings=chroma_settings,
        )
        db.persist()

    elif emb_store_type == "faiss":
        db = FAISS.from_documents(texts, embedding=embeddings)
        db.save_local(output_directory)
        saved_files = os.listdir(output_directory)
        assert "index.faiss" in saved_files and "index.pkl" in saved_files

    else:  # "pinecone"
        with open(key_path, "r") as f:
            # json.load reads from a file object; json.loads (used before)
            # expects a string and raised TypeError here.
            keys = json.load(f)
        PINECONE_API_KEY = keys["PINECONE_API"]["KEY"]
        PINECONE_ENV = keys["PINECONE_API"]["ENV"]

        pinecone.init(
            api_key=PINECONE_API_KEY,
            environment=PINECONE_ENV,
        )

        if not pinecone_idx_name:
            # Pinecone index names accept only lowercase alphanumerics and
            # hyphens, so the generated default avoids "_" and ":".
            pinecone_idx_name = "index-{}".format(
                datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
            )

        if pinecone_idx_name not in pinecone.list_indexes():
            db = Pinecone.from_documents(
                texts, embedding=embeddings, index_name=pinecone_idx_name
            )

        else:
            db = Pinecone.from_existing_index(pinecone_idx_name, embeddings)
            db.add_documents(texts)

    LOGGER.info(
        f"Successfully created {emb_store_type} vectorstore at {output_directory}"
    )

    return db
    
def check_documents_token(
    docs: List[Document],
    llm = None
    ):
    """Measure the prompt length obtained by stuffing ``docs`` into one prompt.

    Args:
        docs (List[Document]): Documents to measure. A single Document is
            also accepted and wrapped in a list.
        llm (optional): Language model handed to the LLMChain. When None, a
            ``ChatOpenAI`` "gpt-3.5-turbo" model (temperature 0) is created
            with ``OPENAI_KEY``. Defaults to None.

    Returns:
        The value returned by ``StuffDocumentsChain.prompt_length`` for ``docs``.
    """
    # The default model is built lazily: constructing it in the signature ran
    # at definition time and bound whatever OPENAI_KEY held at that moment.
    if llm is None:
        llm = ChatOpenAI(temperature=0,
                         model_name="gpt-3.5-turbo",
                         openai_api_key=OPENAI_KEY)
    if not isinstance(docs, list):
        docs = [docs]
    combine_document_chain = StuffDocumentsChain(
        llm_chain=LLMChain(
            llm=llm,
            # The prompt is just the concatenated documents.
            prompt=PromptTemplate(template="{summaries}",
                                input_variables=["summaries"]),
            verbose=False,
        ),
        verbose=False
    )
    return combine_document_chain.prompt_length(docs)

# Experiment Class

In [27]:
# Schema of one recommended drug. It is handed to PydanticOutputParser, which
# uses it to build the prompt's format instructions and to parse LLM answers.
# NOTE(review): documented with comments on purpose - a class docstring would
# presumably be copied into the generated JSON schema and change the prompt;
# confirm before adding one.
class DrugOutput(BaseModel):
    # Name of the recommended drug.
    drug_name: str = Field(description = "Name of the drug")
    # Free-text pros of the drug.
    advantages: str = Field(description = "Advantages of the drug ")
    # Free-text cons of the drug.
    disadvantages: str = Field(description = "Disadvantages of the drug")

class Experiment:
    """Run a retrieval-augmented QA experiment and export its results.

    An instance bundles an LLM, an embedding model, a FAISS vectorstore and a
    prompt template. Test queries are answered through a
    ``RetrievalQAWithSourcesChain``; the questions, answers and retrieved
    sources are kept in memory and can be exported to JSON or CSV.
    """

    # Number of drug recommendations / source documents that get CSV columns.
    _MAX_DRUGS = 2
    _MAX_SOURCES = 6
    # Matches a flat JSON object (no nested braces) inside a raw LLM answer.
    _JSON_OBJECT_PATTERN = re.compile(r"{[^{}]+}")

    def __init__(
        self,
        prompt_template: Union[PromptTemplate, ChatPromptTemplate],
        vector_store: str,
        llm_type: str = "gpt-3.5-turbo",
        emb: str = "text-embedding-ada-002",
        keys_json: str = osp.join(MAIN_DIR, "auth", "api_keys.json"),
        temperature: float = 0,
        max_tokens: int = 512,
        gt: Optional[str] = None,
        verbose: bool = False,
    ):
        """Initiate Instance for an experiment run

        Args:
            prompt_template (Union[PromptTemplate, ChatPromptTemplate]): Prompt to be feed to LLM
            vector_store (str): Path to Vector Index Database
            llm_type (str, optional): Type of LLM Model. Defaults to "gpt-3.5-turbo".
            emb (str, optional): Type of Embedding Model. Defaults to "text-embedding-ada-002".
            keys_json (str, optional): Path to API Keys. Defaults to osp.join(MAIN_DIR, "auth", "api_keys.json").
            temperature (float, optional): Temperature Settings for LLM model. Lower temperature makes LLM more deterministic
                while higher temperature makes LLM more random. Defaults to 0.
            max_tokens (int, optional): Max_Tokens Settings for LLM model. Defaults to 512.
            gt (Optional[str], optional): Path to Ground Truth file. Defaults to None.
            verbose (bool, optional): Verbose Setting. Defaults to False.
        """

        self.llm_type = llm_type.lower()
        self.temperature = temperature
        self.max_tokens = max_tokens

        with open(keys_json, "r") as f:
            keys = json.load(f)

        # GPT-4 uses its own API key entry in the key file.
        self.openai_key = (
            keys["OPENAI_API_KEY_FOR_GPT4"]
            if self.llm_type == "gpt-4"
            else keys["OPENAI_API_KEY"]
        )

        # A chat prompt needs the chat model wrapper; a plain prompt uses the
        # completion wrapper.
        if isinstance(prompt_template, ChatPromptTemplate):
            self.llm = ChatOpenAI(
                model_name=self.llm_type,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                openai_api_key=self.openai_key,
            )
        else:
            self.llm = OpenAI(
                model_name=self.llm_type,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                openai_api_key=self.openai_key,
            )
        self.embedder = OpenAIEmbeddings(model=emb, openai_api_key=self.openai_key)

        # Loading is best-effort: the user may create a store afterwards with
        # `generate_vectorstore`. The attribute is always defined so that later
        # code can detect the missing store instead of failing on a missing
        # attribute.
        self.docsearch = None
        try:
            self.load_vectorstore(vector_store)
        except Exception:
            LOGGER.exception("Could not load vectorstore from %s", vector_store)
            print(
                "Vectorstore invalid. Please load valid vectorstore or create new vectorstore."
            )

        self.prompt_template = prompt_template
        self.questions = []
        self.answers = []
        self.sources = []
        self.ground_truth = self.load_groundtruth(gt) if gt else None
        self.drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)
        self.chain = None
        self.verbose = verbose

    def load_vectorstore(self, vectorstore_path: str):
        """Load Vectorstore from path

        Args:
            vectorstore_path (str): Path to vector database folder.

        Raises:
            AssertionError: If the folder lacks "index.faiss" or "index.pkl".
        """
        stored_files = os.listdir(vectorstore_path)
        assert (
            "index.faiss" in stored_files and "index.pkl" in stored_files
        ), "Invalid Vectorstore"
        self.docsearch = FAISS.load_local(vectorstore_path, self.embedder)
        # A chain built earlier still points at the previous retriever.
        self.chain = None
        LOGGER.info("Successfully loaded existing vectorstore from local storage")

    def generate_vectorstore(
        self,
        data_directory: Optional[str] = None,
        output_directory: str = "./vectorstore",
        emb_store_type: str = "faiss",
        chunk_size: int = 1000,
        chunk_overlap: int = 250,
        exclude_pages: Optional[Dict] = None,
        pinecone_idx_name: Optional[str] = None,
        additional_docs: Optional[List] = None,
        key_path: Optional[str] = os.path.join(MAIN_DIR, "auth", "api_keys.json"),
    ):
        """Generate New vectorstore

        Args:
            data_directory (Optional[str], optional): Directory contains source documents. Defaults to None.
            output_directory (str, optional): Output directory of vector index database. Defaults to "./vectorstore".
            emb_store_type (str, optional): Type of vector index database. Defaults to "faiss".
            chunk_size (int, optional): Maximum size of text chunks (characters) after split. Defaults to 1000.
            chunk_overlap (int, optional): Maximum overlapping window between text chunks. Defaults to 250.
            exclude_pages (Optional[Dict], optional): Dictionary of pages to be excluded from documents. Defaults to None.
            pinecone_idx_name (Optional[str], optional): Name of pinecone index to be created or loaded. Defaults to None.
            additional_docs (Optional[List], optional): Additional Tables, Images or Json to be added to doc list. Defaults to None.
            key_path (Optional[str], optional): Path to file containing API info.
                Defaults to os.path.join(MAIN_DIR, "auth", "api_keys.json").
        """
        # Inside a method the bare name resolves to the module-level function.
        # Its parameters are called `source_directory` and `embeddings`; the
        # previous keywords (`data_directory`, `embedder`) raised TypeError.
        self.docsearch = generate_vectorstore(
            embeddings=self.embedder,
            source_directory=data_directory,
            output_directory=output_directory,
            emb_store_type=emb_store_type,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            exclude_pages=exclude_pages,
            pinecone_idx_name=pinecone_idx_name,
            additional_docs=additional_docs,
            key_path=key_path,
        )
        # Force the retriever chain to be rebuilt on top of the new store.
        self.chain = None

    def run_test_cases(
        self, test_cases: Union[List[str], str], only_return_source: bool = False
    ):
        """Run and save test cases to memory

        Args:
            test_cases (Union[List[str], str]): List of test queries, or the path
                to a text file holding one query per line (blank lines are skipped).
            only_return_source (bool, optional): If True, only retrieve the source
                documents and store None as the answer (no LLM call). Defaults to False.
        """
        if isinstance(test_cases, str):
            with open(test_cases, "r", encoding="utf-8-sig") as f:
                stripped_lines = [line.rstrip() for line in f]
            # A blank line would otherwise be sent to the LLM as an empty query.
            test_cases = [line for line in stripped_lines if line]

        if not self.chain:
            self._create_retriever_chain()

        if only_return_source:
            LOGGER.info("Perform Semantic Search for Source Documents only (No QA).")

        for test_case in test_cases:
            print("Query: {}".format(test_case))
            if only_return_source:
                question, answer = test_case, None
                source_documents = self.chain._get_docs({"question": test_case})
            else:
                output = self.chain(test_case)
                question, answer = output["question"], output["answer"]
                source_documents = output["source_documents"]

            # `.get` keeps a document without a given metadata key from
            # raising after the LLM call has already been made, which would
            # also leave questions/answers/sources out of step.
            sources = [
                {
                    "title": document.metadata.get("title"),
                    "filename": str(document.metadata.get("source", "")).split("/")[-1],
                    "page": document.metadata.get("page"),
                    "text": document.page_content,
                }
                for document in source_documents
            ]

            # Append the three lists together so they always stay aligned.
            self.questions.append(question)
            self.answers.append(answer)
            self.sources.append(sources)

    @staticmethod
    def convert_prompt_to_string(
        prompt: Union[PromptTemplate, ChatPromptTemplate]
    ) -> str:
        """Convert Prompt Object to string format

        Each input variable is filled with its own name, so the result shows
        where every variable sits in the template.

        Args:
            prompt (Union[PromptTemplate, ChatPromptTemplate]): Prompt Template

        Returns:
            str: Prompt String Template
        """
        return prompt.format(**{v: v for v in prompt.input_variables})

    @staticmethod
    def process_source(source: Dict) -> str:
        """Render one source-document dictionary as readable text

        Args:
            source (Dict): Source Document Information

        Returns:
            str: "key: value" entries separated by blank lines
        """
        return "\n\n".join([f"{k}: {v}" for k, v in source.items()])

    def save_json(self, output_path: str):
        """Save Output of test case runs to json file

        Args:
            output_path (str): Output Path to json file.
        """
        output_dict = {
            "prompt": Experiment.convert_prompt_to_string(self.prompt_template),
            "test_cases": [
                {"question": question, "answer": answer, "sources": source}
                for question, answer, source in zip(
                    self.questions, self.answers, self.sources
                )
            ],
        }

        with open(output_path, "w") as f:
            json.dump(output_dict, f)

    def load_groundtruth(self, gt_path: str) -> pd.DataFrame:
        """Load Ground Truth information from .csv file

        Args:
            gt_path (str): Path to Ground Truth file

        Returns:
            pd.DataFrame: DataFrame containing Ground Truth data.
        """
        return pd.read_csv(gt_path, encoding="ISO-8859-1")

    def reset(self):
        """Reset queries, answers and sources; the ground truth is cleared as well"""
        self.questions = []
        self.answers = []
        self.sources = []
        self.ground_truth = None

    def load_json(self, json_path: str, reset: bool = False):
        """Load Queries and Answers from Json file

        Args:
            json_path (str): Path to json output file to load into instance
            reset (bool, optional): If reset, clear queries and answers from memory before loading. Defaults to False.
        """
        if reset:
            self.reset()
        with open(json_path, "r") as f:
            input_dict = json.load(f)
        for test_case in input_dict["test_cases"]:
            self.questions.append(test_case["question"])
            self.answers.append(test_case["answer"])
            self.sources.append(test_case["sources"])

    def _parse_drugs(self, answer: Optional[str]) -> List["DrugOutput"]:
        """Extract every parsable drug recommendation from a raw LLM answer."""
        if not answer:
            return []
        drugs = []
        for candidate in self._JSON_OBJECT_PATTERN.findall(answer):
            try:
                drugs.append(self.drug_parser.parse(candidate))
            except Exception:
                # Best effort: a JSON-looking fragment that is not a valid
                # drug entry is skipped, but no longer silently.
                LOGGER.warning("Skipping unparsable drug entry: %s", candidate)
        return drugs

    def write_csv(self, output_csv: str):
        """Write questions and answers to .csv files

        At most two drugs and six sources per question get their own columns;
        missing entries are written as empty cells and extra ones are dropped.

        Args:
            output_csv (str): Path to output csv file
        """
        pd_answers = [[] for _ in range(self._MAX_DRUGS)]
        pd_pros = [[] for _ in range(self._MAX_DRUGS)]
        pd_cons = [[] for _ in range(self._MAX_DRUGS)]
        pd_sources = [[] for _ in range(self._MAX_SOURCES)]

        for answer, sources in zip(self.answers, self.sources):
            drugs = self._parse_drugs(answer)
            for slot in range(self._MAX_DRUGS):
                drug = drugs[slot] if slot < len(drugs) else None
                pd_answers[slot].append(drug.drug_name if drug is not None else None)
                pd_pros[slot].append(drug.advantages if drug is not None else None)
                pd_cons[slot].append(drug.disadvantages if drug is not None else None)

            # Every source column receives exactly one value per question. The
            # previous padding reused the loop index after the loop, which was
            # unbound or stale for a question without sources and overflowed
            # with more than six sources.
            for slot in range(self._MAX_SOURCES):
                pd_sources[slot].append(
                    Experiment.process_source(sources[slot])
                    if slot < len(sources)
                    else None
                )

        info = {"question": self.questions}

        if self.ground_truth is not None:
            info["gt_rec1"] = self.ground_truth["Recommendation 1"].tolist()
            info["gt_rec2"] = self.ground_truth["Recommendation 2"].tolist()
            info["gt_rec3"] = self.ground_truth["Recommendation 3"].tolist()
            info["gt_avoid"] = self.ground_truth["Drug Avoid"].tolist()
            info["gt_reason"] = self.ground_truth["Reasoning"].tolist()

        info["prompt"] = [
            Experiment.convert_prompt_to_string(self.prompt_template)
        ] * len(self.questions)
        info["raw_answer"] = self.answers
        for slot in range(self._MAX_DRUGS):
            info[f"answer{slot + 1}"] = pd_answers[slot]
            info[f"pro{slot + 1}"] = pd_pros[slot]
            info[f"cons{slot + 1}"] = pd_cons[slot]
        for slot in range(self._MAX_SOURCES):
            info[f"source{slot + 1}"] = pd_sources[slot]

        panda_df = pd.DataFrame(info)

        panda_df.to_csv(output_csv, header=True)

    def _create_retriever_chain(
        self,
        chain_type: str = "stuff",
        return_source_documents: bool = True,
        reduce_k_below_max_tokens: bool = True,
    ):
        """Initiate QA from Source Chain

        Args:
            chain_type (str, optional): Chain Type. Can be stuff|map_reduce|refine|map_rerank. Defaults to "stuff".
            return_source_documents (bool, optional): Whether to return source documents along side answers. Defaults to True.
            reduce_k_below_max_tokens (bool, optional): If True, automatically reduce the number of source documents to
                ensure that total tokens below max_tokens limit. Defaults to True.

        Raises:
            RuntimeError: If no vectorstore has been loaded or generated yet.
        """
        if getattr(self, "docsearch", None) is None:
            raise RuntimeError(
                "No vectorstore available: call load_vectorstore() or "
                "generate_vectorstore() before running test cases."
            )
        self.chain = RetrievalQAWithSourcesChain.from_chain_type(
            llm=self.llm,
            chain_type=chain_type,
            retriever=self.docsearch.as_retriever(),
            return_source_documents=return_source_documents,
            chain_type_kwargs={"prompt": self.prompt_template},
            reduce_k_below_max_tokens=reduce_k_below_max_tokens,
            verbose=self.verbose,
        )
        

# Create Vectorstore

In [None]:
# Collect every PDF stored in the project's source-document folder.
pdf_dir = os.path.join(DOCUMENT_SOURCE, PROJECT)
datastore_paths = [
    os.path.join(pdf_dir, file_name)
    for file_name in os.listdir(pdf_dir)
    if file_name.endswith(".pdf")
]
print("Number of documents in datastore:", len(datastore_paths))
for position, path in enumerate(datastore_paths, start=1):
    print(f"Index {position}: {path}")

In [None]:
# Sanity check of the loader: load the first PDF and look at one of its pages.
sample_path = datastore_paths[0]
sample_data = load_single_document(sample_path)
print("Number of pages:", len(sample_data))

# Element 4 of the loaded list is used as the sample page.
sample_page = sample_data[4]
metadata = sample_page.metadata
content = sample_page.page_content

print(f"{metadata}\n")
print(f"Text Length: {len(content)}\n")
# content = re.sub(r"\t+", " ", content)
pprint(content[:1000])
pprint(metadata)

### Ada-Text-Embeddings-2: Text + Tables

In [None]:
# JSON file listing the tables (one descriptor per table) that complement the PDF text.
add_docs_path = os.path.join(MAIN_DIR, "data/additional_docs.json")

with open(add_docs_path, "r") as f:
    additional_documents = json.load(f)

# Flatten the per-table Document lists into a single list.
add_docs = [
    table_doc
    for table_info in additional_documents
    for table_doc in convert_csv_to_documents(table_info)
]

# Last expression of the cell: displayed by the notebook.
add_docs

In [10]:
# Embedding model and output location for the "text + tables" vectorstore.
emb_model = OpenAIEmbeddings(openai_api_key = OPENAI_KEY)
# EMBSTORE_DIR already ends with <PROJECT>/faiss/text-embedding-ada-002, so
# only the version folder is appended. Repeating those segments nested the
# store one level deeper than the "v1_1000_50" store the experiments load.
database_path = os.path.join(EMBSTORE_DIR, "v2-add")
embstore_type="faiss"
# Splitter settings (characters).
chunk_size = 1000
chunk_overlap = 200

In [11]:
# Build the vectorstore from the PDFs plus the table documents, using the
# settings chosen in the previous cell. The call is the cell's last
# expression, so the returned store is displayed by the notebook.
generate_vectorstore(
    source_directory=SOURCE_DATA,
    additional_docs=add_docs,
    exclude_pages=EXCLUDE_DICT,
    embeddings=emb_model,
    emb_store_type=embstore_type,
    output_directory=database_path,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

<langchain.vectorstores.faiss.FAISS at 0x7f0afde5cdc0>

# Prototypes

## Test Cases

In [3]:
# Read the test queries (one per line); "utf-8-sig" drops a leading BOM if present.
queries_path = osp.join(DATA_DIR, "queries", "uc.txt")
with open(queries_path, "r", encoding = "utf-8-sig") as f:
    # Trailing whitespace, including the newline, is removed from every query.
    test_cases = [raw_line.rstrip() for raw_line in f]

test_cases

['40 year old male with newly diagnosed moderate UC and articular extraintestinal manifestations',
 '70 year old female with newly diagnosed severe UC',
 '35 year old male with known moderate UC with prior exposure to infliximab but has worsening colitis on endoscopy despite compliance',
 '60 year old female with newly diagnosed moderate UC with a background of congestive cardiac failure',
 '38 year old female with newly diagnosed moderate UC and psoriasis',
 '25 year old pregnant woman with severe distal ulcerative colitis',
 '56 year old man with moderate to severe ulcerative colitis and ankylosing spondylitis',
 '38 year old man with severe ulcerative colitis and has lost response to vedolizumab',
 '28 year old woman who has severe extensive ulcerative colitis and has a history of lymphoma which was treated 4 years ago',
 '36 year old woman with moderate ulcerative colitis and multiple sclerosis']

## Experiment 1: Only Text - Normal Prompt Template - GPT4

### Prompt Setup

In [None]:
### STANDARD PROMPT TEMPLATE
# Parser whose format instructions are injected into the prompt below.
drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)

# Plain-text prompt. The text is sent to the LLM as-is, so every character of
# it (typos included) is part of the experiment.
prompt_template = """Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

{summaries}

{format_instructions}

Question: {question}
Answer:
"""

# "summaries" and "question" are declared as input variables; "format_instructions"
# is pre-filled from the parser as a partial variable.
TEST_PROMPT_TEMPLATE_1 = PromptTemplate(
    template = prompt_template,
    input_variables = ["summaries", "question"],
    partial_variables={"format_instructions": drug_parser.get_format_instructions()}
)

# Preview of the rendered prompt with placeholder values.
print(TEST_PROMPT_TEMPLATE_1.format(summaries = "Summaries", question = "User Query"))

Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

Summaries

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
t

### Run Experiments

In [16]:
# Settings
llm_type = "gpt-4"
# NOTE(review): the description says "1000_200" while the store folder below is
# "v1_1000_50" - confirm which chunk overlap this run is meant to use.
description = "normal_prompt_1000_200"
emb_store_dir = os.path.join(EMBSTORE_DIR, "v1_1000_50")
max_tokens = 1024
# Stored as `run_timestamp` rather than `time`: assigning to `time` overwrote
# the `time` function imported at the top of the notebook.
run_timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
verbose = True
# Output path without extension; ".json" / ".csv" are appended when saving.
save_path = osp.join(ARTIFACT_DIR, f"{llm_type}_{description}_{run_timestamp}")

In [None]:
# Experiment 1: standard prompt template, settings taken from the previous cell.
exp1 = Experiment(
    prompt_template=TEST_PROMPT_TEMPLATE_1,
    vector_store=emb_store_dir,
    llm_type=llm_type,
    max_tokens=max_tokens,
    gt=osp.join(DATA_DIR, "queries", "uc_gt.csv"),
    verbose=verbose,
)

# Answer every test query and keep the results in memory.
exp1.run_test_cases(test_cases)

# Persist the results in both formats next to each other.
exp1.save_json(f"{save_path}.json")
exp1.write_csv(f"{save_path}.csv")

## Experiment 2: Only Text - CHAT Prompt Template - GPT4

### Prompt 1

In [None]:
### CHAT PROMPT TEMPLATE
# System message of the chat prompt. The text is sent to the LLM as-is, so
# every character of it (typos included) is part of the experiment.
system_prompt = """
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Whether treated patient is new patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

{summaries}

"""

# Chat prompt: the system message carries the instructions and {summaries},
# the human message carries the user's {question}.
TEST_PROMPT_TEMPLATE_2 = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_prompt, input_variables = ["summaries"]),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)

# Preview of the rendered prompt with placeholder values.
print(TEST_PROMPT_TEMPLATE_2.format(summaries = "Summaries", question = "User Query"))

System: 
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Whether treated patient is new patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

Summaries


Human: User Query


In [None]:
# Settings for the GPT-4 / chat-prompt run on the text-only embedding store.
llm_type = "gpt-4"
description = "Text_Only_With_CHAT_Prompt"
emb_store_dir = os.path.join(EMBSTORE_DIR, "v1_1000_50")
max_tokens = 1024
# Bound to `timestamp`, not `time`: the notebook does `from time import time`,
# and rebinding that name to a string would break any later `time()` call.
timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
verbose = True
# Output path prefix; the run cell appends ".json" / ".csv" to it.
save_path = osp.join(ARTIFACT_DIR, f"{llm_type}_{description}_{timestamp}")
print("Save directory:", save_path)

In [None]:
# Build the experiment from the current settings, run every test case, then
# write the results as sibling JSON and CSV files.
exp2 = Experiment(
    prompt_template=TEST_PROMPT_TEMPLATE_2,
    vector_store=emb_store_dir,
    llm_type=llm_type,
    max_tokens=max_tokens,
    gt=os.path.join(DATA_DIR, "queries", "uc_gt.csv"),
    verbose=verbose,
)
exp2.run_test_cases(test_cases)

# Persist output: `save_path` is a path prefix, the extension is appended here.
exp2.save_json(f"{save_path}.json")
exp2.write_csv(f"{save_path}.csv")

### Prompt 2

In [28]:
### CHAT PROMPT TEMPLATE
# Second system prompt variant: asks for a severity / risk-factor analysis
# before the drug recommendation. `{summaries}` receives the retrieved context.
system_prompt = """
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

Analyse the patient ulcerative colitis (UC) severity and list all risk factors.
Analyse the patient profile and list all risk factors. Patient profile includes age, gender, pregnancy status, prior reactions to any drugs, whether the patient is newly diagnosed, extraintestinale manifestation, pouchtitis

FINALLY RETURN up to 2 TOP choices of recommended biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

{summaries}
"""

# NOTE(review): this name is rebound to a plain PromptTemplate by the
# "Experiment 3" cell further down — which object an experiment gets depends on
# cell execution order; confirm before reusing.
TEST_PROMPT_TEMPLATE_3 = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_prompt, input_variables = ["summaries"]),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)

# Rendered prompt with dummy values; kept in `prompt_str` so it can be recorded
# in the run settings.
prompt_str = TEST_PROMPT_TEMPLATE_3.format(summaries = "Summaries", question = "User Query")
print(prompt_str)

System: 
Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).

Analyse the patient ulcerative colitis (UC) severity and list all risk factors.
Analyse the patient profile and list all risk factors. Patient profile includes age, gender, pregnancy status, prior reactions to any drugs, whether the patient is newly diagnosed, extraintestinale manifestation, pouchtitis

FINALLY RETURN up to 2 TOP choices of recommended biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.

Summaries

Human: User Query


In [29]:
# Settings for the GPT-4 / chat-prompt-v2 run.
llm_type = "gpt-4"
description = "CHAT_Prompt_V2"
# Store names end in "_<chunk_size>_<chunk_overlap>"; both are parsed from it below.
emb_store = "v5-add-tables_2500_500"
max_tokens = 1024
# Bound to `timestamp`, not `time`: the notebook does `from time import time`,
# and rebinding that name to a string would break any later `time()` call.
timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
verbose = False
save_path = osp.join(ARTIFACT_DIR, f"{llm_type}_{description}_{timestamp}")
print("Save directory:", save_path)

# Snapshot of the run configuration, written next to the results as YAML.
settings = {
    "project": PROJECT,
    "test_case": osp.join(DATA_DIR, "queries", "uc.txt"),
    "prompt": prompt_str,
    "ground_truth": "uc_gt.csv",
    "description": description,
    "verbose": verbose,

    "emb_type": "text-embedding-ada-002",
    "vectorstore": "faiss/text-embedding-ada-002/" + emb_store,
    # Stored as ints so they match the literal 2500 / 500 recorded by the
    # other experiment cells instead of being dumped as quoted strings.
    "chunk_size": int(emb_store.split("_")[-2]),
    "chunk_overlap": int(emb_store.split("_")[-1]),
    "additional_docs": "data/additional_docs.json",
    "pinecone_index_name": None,

    # Taken from the variable that is actually passed to Experiment, so the
    # recorded model cannot drift from the one used.
    "llm_type": llm_type,
    "temperature": 0,
    "max_tokens": max_tokens,
}

Save directory: /mnt/c/Users/QUAN/Desktop/medical-chatbot/artifacts/gpt-4_CHAT_Prompt_V2_07-07-2023-17-34-18


In [None]:
# Create and run experiment
exp4 = Experiment(
    prompt_template = TEST_PROMPT_TEMPLATE_3,
    vector_store = os.path.join(EMBSTORE_DIR, emb_store),
    llm_type = llm_type,
    max_tokens = max_tokens,
    gt = osp.join(DATA_DIR, "queries", "uc_gt.csv"),
    verbose = verbose
)

exp4.run_test_cases(test_cases)

# Save Output
# `save_path` is used as a directory here (results + settings side by side), so
# it has to exist before anything is written into it.
os.makedirs(save_path, exist_ok=True)
# Join the directory and file name as separate components; concatenating them
# inside a single-argument join produced "<save_path>result.json" outside the
# run directory.
exp4.save_json(os.path.join(save_path, "result.json"))
exp4.write_csv(os.path.join(save_path, "result.csv"))

with open(os.path.join(save_path, "settings.yaml"), "w") as f:
    yaml.dump(settings, f)

## Experiment 3 - Added Table:

In [9]:
### STANDARD PROMPT TEMPLATE
# Parser whose format instructions are injected into the prompt below.
drug_parser = PydanticOutputParser(pydantic_object=DrugOutput)

# Single-string (non-chat) prompt with three placeholders: `{summaries}` for the
# retrieved context, `{format_instructions}` for the parser's output schema
# text, and `{question}` for the user query.
prompt_template = """Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

{summaries}

{format_instructions}

Question: {question}
Answer:
"""

# NOTE(review): this rebinds TEST_PROMPT_TEMPLATE_3, which the "Prompt 2" cell
# earlier in the notebook defines as a ChatPromptTemplate — confirm which one a
# given experiment is meant to use.
# `format_instructions` is pre-filled, so callers only supply the other two.
TEST_PROMPT_TEMPLATE_3 = PromptTemplate(
    template = prompt_template,
    input_variables = ["summaries", "question"],
    partial_variables={"format_instructions": drug_parser.get_format_instructions()}
)

# Render with dummy values to inspect the final prompt.
print(TEST_PROMPT_TEMPLATE_3.format(summaries = "Summaries", question = "User Query"))

Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following step

ANALYSE the given patient profile based on given query based on one of the following criteria:
- Freshly treated patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile. Explain the PROS and CONS of the 2 choices.

Summaries

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
t

In [11]:
# Settings for the GPT-4 run against the store that includes table rows.
LLM_TYPE = "gpt-4"
DESCRIPTION = "With Some Tables"
MAX_TOKENS = 1024
# Hyphen-separated like every other settings cell: the value ends up in the
# output file name, and ":" is not a valid file-name character on Windows
# drives (the artifacts directory printed earlier lives under /mnt/c/).
TIME = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
VERBOSE = True
VECTORSTORE = os.path.join(EMBSTORE_DIR, PROJECT, "faiss", "text-embedding-ada-002", "v2-add-rows_1000_200")
# Output path prefix; the run cell appends ".json" / ".csv" to it.
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

### Run Experiments

In [None]:
# Build the experiment from the upper-case settings, run every test case, then
# write the results as sibling JSON and CSV files.
exp3 = Experiment(
    prompt_template=TEST_PROMPT_TEMPLATE_3,
    vector_store=VECTORSTORE,
    llm_type=LLM_TYPE,
    max_tokens=MAX_TOKENS,
    gt=os.path.join(MAIN_DIR, "ground_truth.csv"),
    verbose=VERBOSE,
)
exp3.run_test_cases(test_cases)

# Persist output: `save_path` is a path prefix, the extension is appended here.
exp3.save_json(f"{save_path}.json")
exp3.write_csv(f"{save_path}.csv")

## Experiment 4 - Changing to Different Settings

### uc-3:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_1.py
- faiss/text-embedding-ada-002/v2-add-rows_1000_200

### uc-4:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_1.py
- faiss/text-embedding-ada-002/v3-add-rows_2500_500

### uc-5:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_1.py
- faiss/text-embedding-ada-002/v4-add-tables_1000_200

### uc-6:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_1.py
- faiss/text-embedding-ada-002/v5-add-tables_2500_500

### uc-7:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_1.py
- faiss/text-embedding-ada-002/v6-add-tables_750_100

### uc-8:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_2.py
- faiss/text-embedding-ada-002/v5-add-tables_2500_500

### uc-1_chat:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_chat_1.py
- faiss/text-embedding-ada-002/v5-add-tables_2500_500

### uc-2_chat:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_chat_1.py
- faiss/text-embedding-ada-002/v3-add-rows_2500_500

### uc-3_chat:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_chat_1.py
- faiss/text-embedding-ada-002/v4-add-tables_1000_200

### uc-4_chat:
- test_case: data/queries/uc.txt
- prompt: uc_qa_source_chat_2.py
- faiss/text-embedding-ada-002/v5-add-tables_2500_500

### uc-5_chat:
- test_case: data/queries/uc_long.txt
- prompt: uc_qa_source_chat_3.py
- faiss/text-embedding-ada-002/v5-add-tables_2500_500

In [None]:
# Invoke the multi-experiment shell script with four experiment config files.
# NOTE(review): the experiment list above describes uc-8 as a non-chat config
# and has no "uc-8_chat" entry — confirm `uc_8_chat.yaml` is the intended file.
! ./src/bash/multi_exps.sh uc_7.yaml uc_8_chat.yaml uc_4_chat.yaml uc_5_chat.yaml

## Experiment 5 - Map Reduce

### QA with semantic search

#### Pregnant

In [9]:
# Settings for the pregnancy-specific prompt run.
LLM_TYPE = "gpt-4"
DESCRIPTION = "Map_Reduce_1_Prompt_Pregnant"
MAX_TOKENS = 1024
# Timestamp that makes each run's output directory name unique.
TIME = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
VERBOSE = False
# FAISS store path; the trailing "2500_500" matches the chunk size / overlap
# recorded in the run settings.
VECTORSTORE = os.path.join(EMBSTORE_DIR, PROJECT, "faiss",
                           "text-embedding-ada-002","v5-add-tables_2500_500")
# Used as a directory by the run cell (result.json, result.csv, settings.yaml).
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

In [10]:
# Human message shared by both single-criterion prompts: wraps the user query
# between "=========" separators.
human_prompt = """
=========
QUESTION: {question}
=========
"""

# System prompt that restricts the analysis to one criterion: pregnancy.
# `{summaries}` receives the retrieved reference text.
pregnancy_prompt = """Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
=========
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following steps
1. Identify if patient is pregnant.
2. Search from REFERENCE TEXT the best biological drugs based on whether patient is pregnant.
3. Return up to 2 TOP choices of biological drugs with the PROS and CONS of the 2 choices.

=========
REFERENCE TEXT:
{summaries}
"""

PREGNANCY_PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(
            pregnancy_prompt, input_variables=["summaries"]
        ),
        HumanMessagePromptTemplate.from_template(human_prompt),
    ]
)

# Same structure as the pregnancy prompt, with the criterion swapped to age.
age_prompt = """Make reference to the context given to assess the scenario. If you do not know the answer. just say that "I don't know", don't try to make up an answer.
=========
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC). Perform the following steps
1. Identify from user input the age of patient, if applicable.
2. Search from REFERENCE TEXT the best biological drugs based on patient's age.
3. Return up to 2 TOP choices of biological drugs with the PROS and CONS of the 2 choices.

=========
REFERENCE TEXT:
{summaries}
"""

AGE_PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(
            age_prompt, input_variables=["summaries"]
        ),
        HumanMessagePromptTemplate.from_template(human_prompt),
    ]
)

In [None]:
# Run the pregnancy-only prompt over all test cases.
# NOTE(review): no `gt` is passed here although the settings below record
# "uc_gt.csv" as ground truth — confirm what Experiment does without it.
exp5_pregnant = Experiment(
    prompt_template = PREGNANCY_PROMPT_TEMPLATE,
    vector_store = VECTORSTORE,
    llm_type = LLM_TYPE,
    max_tokens = MAX_TOKENS,
    verbose = VERBOSE
)

exp5_pregnant.run_test_cases(test_cases)

# Snapshot of the run configuration, written next to the results as YAML.
# Values that are also passed to Experiment are read from the same variables so
# the record cannot drift from what was actually run (cells in this notebook
# are re-run out of order with different settings).
settings = {
    "project": PROJECT,
    "test_case": osp.join(DATA_DIR, "queries", "uc.txt"),
    "prompt": "individual_prompts.py",
    "ground_truth": "uc_gt.csv",
    "description": DESCRIPTION,
    "verbose": VERBOSE,

    "emb_type": "text-embedding-ada-002",
    "vectorstore": VECTORSTORE,
    "chunk_size": 2500,
    "chunk_overlap": 500,
    "additional_docs": None,
    "pinecone_index_name": None,

    "llm_type": LLM_TYPE,
    "temperature": 0,
    "max_tokens": MAX_TOKENS,
}

# `save_path` is a per-run directory; create it before writing into it.
os.makedirs(save_path, exist_ok=True)
exp5_pregnant.save_json(os.path.join(save_path, "result.json"))
exp5_pregnant.write_csv(os.path.join(save_path, "result.csv"))
with open(os.path.join(save_path, "settings.yaml"), "w") as f:
    yaml.dump(settings, f)

In [22]:
# Settings for the age-specific prompt run.
LLM_TYPE = "gpt-4"
DESCRIPTION = "Map_Reduce_1_Prompt_Age"
MAX_TOKENS = 1024
# Timestamp that makes each run's output directory name unique.
TIME = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
VERBOSE = True
# FAISS store path; the trailing "2500_500" matches the chunk size / overlap
# recorded in the run settings.
VECTORSTORE = os.path.join(EMBSTORE_DIR, PROJECT, "faiss",
                           "text-embedding-ada-002","v5-add-tables_2500_500")
# Used as a directory by the run cell (result.json, result.csv, settings.yaml).
save_path = osp.join(ARTIFACT_DIR, f"{LLM_TYPE}_{DESCRIPTION}_{TIME}")

In [32]:
# Run the age-only prompt over all test cases.
# NOTE(review): no `gt` is passed here although the settings below record
# "uc_gt.csv" as ground truth — confirm what Experiment does without it.
exp5_age = Experiment(
    prompt_template = AGE_PROMPT_TEMPLATE,
    vector_store = VECTORSTORE,
    llm_type = LLM_TYPE,
    max_tokens = MAX_TOKENS,
    verbose = VERBOSE
)

exp5_age.run_test_cases(test_cases)

# Snapshot of the run configuration, written next to the results as YAML.
# Description, verbosity and model are read from the same variables that drive
# the run and the output path, so the record cannot drift from what was
# actually executed.
settings = {
    "project": PROJECT,
    "test_case": osp.join(DATA_DIR, "queries", "uc.txt"),
    "prompt": "individual_prompts.py",
    "ground_truth": "uc_gt.csv",
    "description": DESCRIPTION,
    "verbose": VERBOSE,

    "emb_type": "text-embedding-ada-002",
    "vectorstore": VECTORSTORE,
    "chunk_size": 2500,
    "chunk_overlap": 500,
    "additional_docs": None,
    "pinecone_index_name": None,

    "llm_type": LLM_TYPE,
    "temperature": 0,
    "max_tokens": MAX_TOKENS,
}

# `save_path` is a per-run directory; create it before writing into it.
os.makedirs(save_path, exist_ok=True)
exp5_age.save_json(os.path.join(save_path, "result.json"))
exp5_age.write_csv(os.path.join(save_path, "result.csv"))
with open(os.path.join(save_path, "settings.yaml"), "w") as f:
    yaml.dump(settings, f)

Query: 40 year old male with newly diagnosed moderate UC and articular extraintestinal manifestations


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
Query: 70 year old female with newly diagnosed severe UC


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
Query: 35 year old male with known moderate UC with prior exposure to infliximab but has worsening colitis on endoscopy despite compliance


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
Query: 60 year old female with newly diagnosed moderate UC with a background of congestive cardiac failure


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
Query: 38 year old female with newly diagnosed moderate UC and psoriasis


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
Query: 25 year old pregnant woman with severe distal ulcerative colitis


[1

### QA from docs
#### Map Reduce

In [86]:
# Load the project's source documents and split them into 5000-character
# chunks (500 overlap), then report token statistics for both views.
documents = load_documents(os.path.join(DOCUMENT_SOURCE, PROJECT),
                           exclude_pages=EXCLUDE_DICT)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000, chunk_overlap=500
)
texts = text_splitter.split_documents(documents)

print("Number of text documents:", len(documents))
print("Number of text chunks:", len(texts))

print("Number of tokens in all documents:", check_documents_token(documents))
print("Number of tokens in all text chunks:", check_documents_token(texts))

# Largest single-item token count in each collection; `default=0` keeps the
# result at 0 when a collection is empty, as the running-max loops did.
doc_max_tokens = max(
    (check_documents_token(document) for document in documents), default=0
)
text_max_tokens = max(
    (check_documents_token(text) for text in texts), default=0
)

print("Max tokens in all documents:", doc_max_tokens)
print("Max tokens in all text chunks:", text_max_tokens)

Number of text documents: 56
Number of text chunks: 86
Number of tokens in all documents: 80453
Number of tokens in all text chunks: 84814
Max tokens in all documents: 3301
Max tokens in all text chunks: 2211


In [93]:
class MapReduceDocumentsChainV2(MapReduceDocumentsChain):
    """Map-reduce documents chain with configurable token budgets.

    The budgets are fields on the chain (instead of call arguments) and are
    clamped at construction time to a per-model ceiling. At most one collapse
    pass is performed; needing a second one raises.
    """

    # Mapped results longer than this (in prompt tokens) are collapsed before
    # the final combine call.
    combine_max_tokens: int = 30000
    # Token budget passed to `_split_list_of_docs` when grouping results for
    # the collapse step.
    collapse_max_tokens: int = 5000

    @root_validator()
    def check_maximum_context_length(cls, values: Dict) -> Dict:
        """Lower each token budget to the ceiling of the model that consumes it.

        The combine budget is checked against the combine chain's model. The
        collapse budget is checked against the collapse chain's model, falling
        back to the combine chain's model when no collapse chain is set.
        Models missing from the table leave the budget untouched.

        Args:
            values: Field values of the chain being validated.

        Returns:
            The same mapping, with budgets lowered where they exceeded a limit.
        """
        max_token_dict = {
            "gpt-3.5-turbo": 3000,
            "gpt-3.5-turbo-16k": 14000,
            "gpt-4": 7000,
            "gpt-4-32k": 30000
        }

        def _clamp(budget_key: str, model_name: str) -> None:
            # Only ever lowers a budget; unknown models are left as configured.
            model_limit = max_token_dict.get(model_name)
            if model_limit is not None and model_limit < values[budget_key]:
                values[budget_key] = model_limit

        combine_doc_llm_model = values["combine_document_chain"].llm_chain.llm.model_name
        _clamp("combine_max_tokens", combine_doc_llm_model)

        if values["collapse_document_chain"]:
            collapse_doc_llm_model = values["collapse_document_chain"].llm_chain.llm.model_name
        else:
            collapse_doc_llm_model = combine_doc_llm_model
        _clamp("collapse_max_tokens", collapse_doc_llm_model)

        return values

    def combine_docs(
        self,
        docs: List[Document],
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> Tuple[str, dict]:
        """Combine documents in a map reduce manner.

        Combine by mapping first chain over all documents, then reducing the results.
        This reducing can be done recursively if needed (if there are many documents).

        Args:
            docs: Documents to run the map chain over.
            callbacks: Callbacks forwarded to every chain call.
            **kwargs: Extra prompt inputs forwarded to every chain call.

        Returns:
            The final combined output and a dict of extra return values.
        """
        results = self.llm_chain.apply(
            # FYI - this is parallelized and so it is fast.
            [{self.document_variable_name: d.page_content, **kwargs} for d in docs],
            callbacks=callbacks,
        )
        return self._process_results(
            results, docs, callbacks=callbacks, **kwargs
        )

    def _process_results(
        self,
        results: List[Dict],
        docs: List[Document],
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> Tuple[str, dict]:
        """Reduce the per-document map results into one final answer.

        Args:
            results: Map-chain outputs, one per entry of ``docs``.
            docs: The documents the map chain was applied to; their metadata
                is attached to the corresponding result.
            callbacks: Callbacks forwarded to the collapse and combine chains.
            **kwargs: Extra prompt inputs forwarded to those chains.

        Returns:
            The combine chain's output, plus ``{"intermediate_steps": [...]}``
            with the raw map outputs when ``return_intermediate_steps`` is set
            (an empty dict otherwise).

        Raises:
            RuntimeError: If the results still exceed ``combine_max_tokens``
                after one collapse pass.
        """
        question_result_key = self.llm_chain.output_key
        result_docs = [
            Document(page_content=r[question_result_key], metadata=docs[i].metadata)
            # This uses metadata from the docs, and the textual results from `results`
            for i, r in enumerate(results)
        ]
        length_func = self.combine_document_chain.prompt_length
        num_tokens = length_func(result_docs, **kwargs)

        def _collapse_docs_func(docs: List[Document], **kwargs: Any) -> str:
            return self._collapse_chain.run(
                input_documents=docs, callbacks=callbacks, **kwargs
            )

        collapse_counter = 0
        while num_tokens is not None and num_tokens > self.combine_max_tokens:
            # Allow a single collapse pass only: entering the loop a second
            # time means one pass did not shrink the results enough, and the
            # run is aborted instead of collapsing again.
            collapse_counter += 1
            if collapse_counter == 2:
                raise RuntimeError("Double Collapse steps. Stop")

            new_result_doc_list = _split_list_of_docs(
                result_docs, length_func, self.collapse_max_tokens, **kwargs
            )
            # One collapsed document per group; `doc_group` avoids rebinding
            # the `docs` parameter.
            result_docs = [
                _collapse_docs(doc_group, _collapse_docs_func, **kwargs)
                for doc_group in new_result_doc_list
            ]
            num_tokens = length_func(result_docs, **kwargs)

        if self.return_intermediate_steps:
            _results = [r[question_result_key] for r in results]
            extra_return_dict = {"intermediate_steps": _results}
        else:
            extra_return_dict = {}
        output = self.combine_document_chain.run(
            input_documents=result_docs, callbacks=callbacks, **kwargs
        )
        return output, extra_return_dict

In [99]:
# Three chat models, all at temperature 0: `llm` is the model for the map
# step; the reduce and collapse steps use the 16k-context variant.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo",
                 openai_api_key=OPENAI_KEY)
reduce_llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k",
                        openai_api_key=OPENAI_KEY)
collapse_llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k",
                        openai_api_key=OPENAI_KEY)

In [101]:
## Map
# Per-chunk prompt: asks the model to return, verbatim, any text from
# `{context}` that is relevant to the patient profile given as `{question}`.
system_question_template = """You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC) using biological drugs.
Use the following portion of a long document to see if any of the text is relevant to treatment of given patient profile using biological drugs. 
Return any relevant text verbatim.
______________________
{context}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_question_template),
    HumanMessagePromptTemplate.from_template("Patient Profile: {question}"),
]
CHAT_QUESTION_PROMPT = ChatPromptTemplate.from_messages(messages)

## Reduce

# Final-answer prompt: receives the extracted passages as `{summaries}` and
# asks for up to two drugs as a JSON list.
system_combine_template = """You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC) using biological drugs.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Given the following extracted information of a long document, return up to 2 top choices of biological drugs given the patient profile. 
Explain the PROS and CONS of the 2 choices with respect to the patient profile.

Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.
______________________
{summaries}"""
# `messages` is rebound here; CHAT_QUESTION_PROMPT was already built from the
# previous list.
messages = [
    SystemMessagePromptTemplate.from_template(system_combine_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_COMBINE_PROMPT = ChatPromptTemplate.from_messages(messages)

# NOTE(review): the collapse step reuses the final-answer prompt, so an
# intermediate collapse is asked for a JSON drug list rather than a condensed
# extract — confirm this is intended.
CHAT_COLLAPSE_PROMPT = CHAT_COMBINE_PROMPT

In [102]:
# One LLMChain per stage, each pairing a stage-specific model with its prompt.
map_chain = LLMChain(llm=llm, prompt=CHAT_QUESTION_PROMPT, verbose=verbose)
reduce_chain = LLMChain(llm=reduce_llm, prompt=CHAT_COMBINE_PROMPT,
                        verbose=verbose)
collapse_chain = LLMChain(llm=collapse_llm, prompt=CHAT_COLLAPSE_PROMPT,
                          verbose=verbose)

# Both stuffing chains feed their documents into the prompt's `{summaries}` slot.
combine_document_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="summaries",
    verbose=verbose
    )

collapse_document_chain = StuffDocumentsChain(
    llm_chain=collapse_chain,
    document_variable_name="summaries",
    verbose=verbose
)

# The map chain reads each document through `{context}`. The token budgets
# given here are upper bounds: the class validator lowers them to its
# per-model ceiling when they exceed it.
# NOTE(review): `return_intermediate_steps` and `return_map_steps` are both
# set; the latter looks like an older alias of the former — confirm one is
# redundant.
map_reduce_qa_chain = MapReduceDocumentsChainV2(
    llm_chain=map_chain,
    combine_document_chain=combine_document_chain,
    collapse_document_chain=collapse_document_chain,
    document_variable_name="context",
    combine_max_tokens = 14000,
    collapse_max_tokens = 6000,
    verbose=verbose,
    return_intermediate_steps=True,
    return_map_steps=True
)

In [105]:
# Run the map-reduce chain for the first test case.
# NOTE(review): this passes `documents`, not the split chunks in `texts` —
# confirm that mapping over unsplit documents is intended.
map_reduce_qa_chain({"input_documents": documents, "question": test_cases[0]})

{'input_documents': [Document(page_content='Articles\nwww.thelancet.com/gastrohep   Vol 7   February 2022 \n161\nLancet Gastroenterol Hepatol \n2022; 7: 161–70\nPublished Online \nNovember 29, 2021 \nhttps://doi.org/10.1016/ \nS2468-1253(21)00377-0\nSee Comment page 110\n*Contributed equally\nInflammatory Bowel Disease \nUnit, Gastroenterology \nSection, Department of \nInternal Medicine, Centro de \nEducación Médica e \nInvestigaciones Clínicas, \nBuenos Aires, Argentina \n(J S Lasa MD, P A Olivera MD); \nGastroenterology Department, \nHospital Británico de Buenos \nAires, Buenos Aires, Argentina \n(J S Lasa); Zane Cohen Centre \nfor Digestive Diseases, \nLunenfeld-Tanenbaum \nResearch Institute, Sinai Health \nSystem, Toronto, ON, Canada \n(P A Olivera); Division of \nGastroenterology, Mount Sinai \nHospital, University of Toronto, \nToronto, ON, Canada \n(P A Olivera); Gastroenterology \nand Endoscopy, IRCCS Ospedale \nSan Raffaele and University \nVita-Salute San Raffaele, \nMilano

### Summarization

In [153]:
# Load a single paper and cut it into overlapping chunks for the
# summarisation chain; print how many chunks were produced.
documents = load_single_document(
    osp.join(MAIN_DIR, "data/document_store/uc/juillerat 2022.pdf")
)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=500, chunk_size=2500)
texts = text_splitter.split_documents(documents)
print(len(texts))

37


In [155]:
len(texts[0].page_content)

2479

In [162]:
# Map-step prompt for the summarisation chain. The patient profile is written
# directly into the template, so `{text}` is the only input variable.
prompt_template = """
Write a summary on the following text:
{text}

The summary will contain information relevant to treatment for moderate to severe ulcerative colitis (UC) for
the following patient profile: 
40 year old male with newly diagnosed moderate UC and articular extraintestinal manifestations

The summary will contain the pros and cons of different biological drugs on the patient.

Summary:"""

BULLET_POINT_PROMPT = PromptTemplate(template=prompt_template,
                        input_variables=["text"])

# Only the map prompt is customised; no combine prompt is passed.
# NOTE(review): other cells pass `OPENAI_KEY`; `keys` is not defined in this
# part of the notebook — confirm it exists.
chain = load_summarize_chain(ChatOpenAI(model_name="gpt-3.5-turbo",
                                        openai_api_key=keys["OPENAI_API_KEY"],
                                        max_tokens=1024,
                                        temperature=0),
                             chain_type="map_reduce",
                             map_prompt=BULLET_POINT_PROMPT
)

# Summarise all chunks currently held in `texts`.
output_summary = chain.run(texts)

KeyboardInterrupt: 

In [161]:
pprint(output_summary)

('The article discusses the efficacy and safety of different biological drugs '
 'for the treatment of inflammatory bowel diseases (IBD), such as ulcerative '
 "colitis (UC) and Crohn's disease (CD). The choice of drug should be based on "
 'individual patient factors such as disease severity, response to previous '
 'treatments, and potential side effects. Anti-TNF agents, vedolizumab, '
 'ustekinumab, etrolizumab, golimumab, and guselkumab are all potential '
 'options for treatment. A personalized approach to treatment is recommended.')


In [144]:
len([text.page_content for text in texts])

91

In [120]:
# documents = load_documents(os.path.join(DOCUMENT_SOURCE, PROJECT),
#                            exclude_pages=EXCLUDE_DICT)

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000, chunk_overlap=250
# )
# texts = text_splitter.split_documents(documents)
# print(len(texts))

367


## Unstructured Images

In [172]:
# Load one table image through UnstructuredImageLoader and show what text
# comes out of it.
sample_img = os.path.join(MAIN_DIR,
                          "data/tables/grading_disease_activity_agrawal.JPG")
loader = UnstructuredImageLoader(sample_img)
data = loader.load()
# Number of Document objects returned, then the text of the first one.
print(len(data))
pprint(data[0].page_content)

1
('Stools (no./d) Formed stool <4 >6 >10 Blood in stools None Intermittent '
 'Frequent Continuous Urgency None Mild, occasional Often Continuous '
 'Hemoglobin Normal Normal pel ee ESR <30 <30 >30 >30 CRP (mg/L) Normal '
 'Elevated Elevated Elevated FC (ug/g) < 150-200 > 150-200 > 150-200 > 150-200 '
 'Endoscopy (Mayo subscore) 0-1 1 2-3 3 UCEIS 0-1 2-4 5-8 7-8')


In [168]:
data

[Document(page_content='Stools (no./d) Formed stool <4 >6 >10 Blood in stools None Intermittent Frequent Continuous Urgency None Mild, occasional Often Continuous Hemoglobin Normal Normal pel ee ESR <30 <30 >30 >30 CRP (mg/L) Normal Elevated Elevated Elevated FC (ug/g) < 150-200 > 150-200 > 150-200 > 150-200 Endoscopy (Mayo subscore) 0-1 1 2-3 3 UCEIS 0-1 2-4 5-8 7-8', metadata={'source': '/mnt/c/Users/QUAN/Desktop/medical-chatbot/data/tables/grading_disease_activity_agrawal.JPG'})]

## Custom Agent

### QA from text Agent

### CSV Agent

In [None]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# Completion-style model at temperature 0.
model_name = 'text-davinci-003'
temperature = 0.0
model = OpenAI(model_name=model_name, temperature=temperature, openai_api_key = OPENAI_KEY)

# Prompt whose `{format_instructions}` slot is pre-filled from the parser, so
# only `{query}` is supplied at format time.
# NOTE(review): `parser` is not defined in this part of the notebook (the
# Experiment 3 cell defines `drug_parser`) — confirm it exists.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

# Smoke test with an unrelated query: format the prompt and send it to the model.
joke_query = "Tell me a joke."
_input = prompt.format_prompt(query=joke_query)

output = model(_input.to_string())

In [None]:
print(_input.text)

Answer the user query.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"drug_name": {"title": "Drug Name", "description": "Name of the drug", "type": "string"}, "description": {"title": "Description", "description": "Overall summary of the drug", "type": "string"}, "advantages": {"title": "Advantages", "description": "Advantages of the drug ", "type": "string"}, "disadvantages": {"title": "Disadvantages", "description": "Disadvantages of the drug", "type": "string"}}, "required": ["drug_name", "description", "advantages", "disadvantages"]}
```
Tell me a joke.



In [None]:
output

'\n{"drug_name": "Joke", "description": "A joke to make you laugh", "advantages": "It can make you laugh and bring joy", "disadvantages": "It may not be funny"}'