# Setup

In [1]:
import os, json

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings

import numpy as np
import pandas as pd

In [2]:
# Project root, relative to the notebook's working directory.
MAIN_DIR = ".."
# Folder that holds the persisted FAISS stores for text-embedding-ada-002.
EMB_DIR = os.path.join(MAIN_DIR, "data", "emb_store", "uc", "faiss", "text-embedding-ada-002")

# The API key is kept in a local JSON file and exported as an environment variable.
with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    api_keys = json.load(f)

os.environ["OPENAI_API_KEY"] = api_keys["OPENAI_API_KEY"]

# One query per line. Strip the line terminator so the question text that is
# later written to CSV does not carry a trailing "\n".
with open(os.path.join(MAIN_DIR, "data", "queries", "uc_all.txt"), "r") as f:
    test_cases_txt = [line.rstrip("\n") for line in f]

# Blank lines at the end of the file are not queries; drop them so they do not
# count as a mismatch in the check below.
while test_cases_txt and not test_cases_txt[-1].strip():
    test_cases_txt.pop()

# Pre-computed embeddings, expected to be in the same order as the queries.
with open(os.path.join(MAIN_DIR, "data", "queries", "uc_all_emb.json"), "r") as f:
    test_cases_emb = json.load(f)

# zip() would silently truncate to the shorter input and could pair a query
# with the wrong vector, so fail loudly instead.
if len(test_cases_txt) != len(test_cases_emb):
    raise ValueError(
        f"Found {len(test_cases_txt)} queries but {len(test_cases_emb)} embeddings; "
        "uc_all.txt and uc_all_emb.json are out of sync."
    )

# Each test case is a (query text, query embedding) tuple.
test_cases = list(zip(test_cases_txt, test_cases_emb))
print("Number of test cases:", len(test_cases))
print("Length of embeddings:", len(test_cases[0][1]))

Number of test cases: 30
Length of embeddings: 1536


# Inspect Single Database

In [3]:
# Standalone handle on the query-embedding method; the cells shown here search
# with pre-computed vectors and do not call it.
embedding_function = OpenAIEmbeddings().embed_query

# Open the persisted FAISS store for the "v8" configuration.
single_store_dir = os.path.join(EMB_DIR, "v8-add-tables_2500_500")
docsearch = FAISS.load_local(single_store_dir, OpenAIEmbeddings())

In [52]:
# Number of neighbours requested per query. It also determines how many
# "Doc N text" / "Doc N score" column pairs the summary contains.
k = 10

# Column-oriented accumulator: one list per output column, one entry per query.
info = {
    "question": [],
    "average_score": [],
    "min_score": [],
    "max_score": []
}

# Derive the per-document columns from k (previously a hard-coded 10) so the
# table stays consistent when k is changed.
for idx in range(k):
    info[f"Doc {idx+1} text"] = []
    info[f"Doc {idx+1} score"] = []

for test_case in test_cases:
    # test_case is (query text, pre-computed query embedding).
    info["question"].append(test_case[0])
    relevant_docs_and_scores = docsearch.similarity_search_with_score_by_vector(test_case[1], k = k)
    scores = [doc_and_score[1] for doc_and_score in relevant_docs_and_scores]

    # np.min / np.max raise on an empty sequence; record NaN when nothing came back.
    has_scores = len(scores) > 0
    info["average_score"].append(np.mean(scores) if has_scores else np.nan)
    info["min_score"].append(np.min(scores) if has_scores else np.nan)
    info["max_score"].append(np.max(scores) if has_scores else np.nan)

    # Fill exactly k slots per query. If the store returns fewer than k hits the
    # remaining slots are padded, otherwise the column lists would end up with
    # different lengths and pd.DataFrame(info) would fail.
    for idx in range(k):
        if idx < len(relevant_docs_and_scores):
            doc, score = relevant_docs_and_scores[idx]
            text = doc.page_content
        else:
            text, score = None, np.nan
        info[f"Doc {idx+1} text"].append(text)
        info[f"Doc {idx+1} score"].append(score)

df = pd.DataFrame(info)
save_folder = os.path.join(MAIN_DIR, "artifacts", "similarity-search-analysis-text-embedding-ada-002")
# exist_ok=True already covers the "folder is there" case; no separate check needed.
os.makedirs(save_folder, exist_ok=True)
df.to_csv(os.path.join(save_folder, "summary.csv"),
          header=True)

# Compare Multiple Databases

In [59]:
# FAISS store folders to compare. The folder name is split on "_" below and the
# second-to-last piece is used as the column prefix.
docstores = [
    "v6-add-tables_750_100",
    "v7-add-tables_1000_200",
    "v8-add-tables_2500_500",
    "v9-add-tables_1500_300"
]

# Column-oriented accumulator; the question column is shared by all stores.
info = {
    "question": [test_case[0] for test_case in test_cases],
}

for docstore in docstores:
    docsearch = FAISS.load_local(
        os.path.join(EMB_DIR, docstore),
        OpenAIEmbeddings()
        )
    # e.g. "v6-add-tables_750_100" -> "750"
    chunk_size = docstore.split("_")[-2]
    info[f"{chunk_size}_average_score"] = []
    info[f"{chunk_size}_min_score"] = []
    info[f"{chunk_size}_max_score"] = []

    for test_case in test_cases:
        # NOTE: k is not defined in this cell; it must already exist in the kernel.
        relevant_docs_and_scores = docsearch.similarity_search_with_score_by_vector(test_case[1], k = k)
        scores = [doc_and_score[1] for doc_and_score in relevant_docs_and_scores]

        # np.min / np.max raise on an empty sequence; record NaN when a store
        # returns nothing for a query so the comparison can still be written.
        has_scores = len(scores) > 0
        info[f"{chunk_size}_average_score"].append(np.mean(scores) if has_scores else np.nan)
        info[f"{chunk_size}_min_score"].append(np.min(scores) if has_scores else np.nan)
        info[f"{chunk_size}_max_score"].append(np.max(scores) if has_scores else np.nan)

df_scores = pd.DataFrame(info)
save_folder = os.path.join(MAIN_DIR, "artifacts", "similarity-search-analysis-text-embedding-ada-002")
# exist_ok=True already covers the "folder is there" case; no separate check needed.
os.makedirs(save_folder, exist_ok=True)

df_scores.to_csv(os.path.join(save_folder, "compare_database.csv"),
                 header=True)

# Inspect Table Contents

In [3]:
from langchain.document_loaders import PyMuPDFLoader

# Page to inspect, counted from 1 (it is shifted to a 0-based index below).
page_no = 5
document = os.path.join(
    MAIN_DIR, "data", "document_store", "uc", "1-s2.0-S2468125321003770-main.pdf"
)

# Load the PDF and pick the requested page out of the loader's result.
docs = PyMuPDFLoader(document).load()
page_with_table = docs[page_no - 1]

In [4]:
# Show the text extracted for the selected PDF page.
print(page_with_table.page_content)

Articles
www.thelancet.com/gastrohep   Vol 7   February 2022 
165
U-ACCOMPLISH8). No phase 3 RCTs with etrasimod or 
TD-1473 were found.
Among 22 studies evaluating maintenance therapy, ten 
were done by use of a treat-straight-through strategy 
(2528 patients)19–22,24,25,29,34,36 and 12 followed a randomised 
responders design (3484 patients).6,27,28,30–32,35,37–41 Only seven 
studies evaluated histological remission.6,29,31,32,35,38,41 The 
main characteristics of the included trials are described 
in the appendix (pp 2–4). All outcomes were assessed 
uniformly on the basis of the standard definition of the 
Mayo score, with follow-up durations of 6–14 weeks for 
induction therapy and 26–66 weeks for maintenance 
therapy (appendix pp 2–4). All studies were industry 
sponsored. A risk of bias assessment showed a low risk 
of bias for most of the included studies (appendix p 5). 
Confidence in the estimates derived from our meta-
analysis is shown in the appendix (pp 6–8).
A network ma

In [25]:
from langchain.docstore.document import Document
from langchain.document_loaders import CSVLoader, UnstructuredCSVLoader

# CSV export of the table under inspection.
table_path = os.path.join(MAIN_DIR, "data", "tables", "uc_juillerat_2022_tab1.csv")

# Two loaders over the same file: CSVLoader with an explicit ISO-8859-1
# encoding, and UnstructuredCSVLoader in "single" mode.
table = CSVLoader(table_path, encoding="ISO-8859-1")
unstructured_table = UnstructuredCSVLoader(table_path, mode="single")

In [27]:
# Caption placed in front of the rows so the text is self-describing.
table_header = (
    "Table Description: Efficacy of biological treatments according, "
    "to the line of treatment, earlier exposure, disease phenotype "
    "and patient characteristics. \n\nTable Content:\n"
)

# NOTE: this rebinds `docs` to the CSV loader's result.
docs = table.load()

# Every loaded row is followed by a blank line.
table_content = table_header + "".join(row.page_content + "\n\n" for row in docs)

# Wrap the assembled text in a single Document.
table_doc = Document(page_content=table_content)

print(table_content)

Table Description: Efficacy of biological treatments according, to the line of treatment, earlier exposure, disease phenotype and patient characteristics. 

Table Content:
Patient Profile: Fresh No previous treatment
Best: Infliximab (IFX*)
2nd Best: Vedoluzimab (VEDO)
3rd Best: Ustekinumab (USTE)
4th Best: Golimumab (GOL)
5th Best: Adalimumab (ADA)

Patient Profile: Currently under maintenance
Best: Vedoluzimab (VEDO)
2nd Best: Infliximab (IFX)
3rd Best: Ustekinumab (USTE), Golimumab (GOL)
4th Best: Adalimumab (ADA)
5th Best: 

Patient Profile: Prior response to Infliximab
Best: Golimumab (GOL)
2nd Best: Adalimumab (ADA)
3rd Best: Ustekinumab (USTE), Vedoluzimab (VEDO)
4th Best: 
5th Best: 

Patient Profile: Prior failure to Anti-TNF agents (1-2x) (PNR)
Best: Ustekinumab (USTE)
2nd Best: Vedoluzimab (VEDO)
3rd Best: 
4th Best: 
5th Best: 

Patient Profile: Prior failure to Vedolizumab
Best: Infliximab (IFX)
2nd Best: Ustekinumab (USTE), Golimumab (GOL)
3rd Best: Adalimumab (ADA)
4th B

In [28]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# CHAT PROMPT TEMPLATE
# System message text. {summaries} is the only placeholder in it.
system_prompt = """
You are a physician assistant giving advice on treatment for moderate to severe ulcerative colitis (UC).
Make reference to the CONTEXT given to assess the scenario.
If the answer cannot be inferred from CONTEXT, return "NO ANSWER", don't try to make up an answer.
=================================
TASK: ANALYSE the given patient profile based on given query based on one of the following criteria:
- Whether treated patient is new patient or patient under maintenance
- Prior response to Infliximab
- Prior failure to Anti-TNF agents
- Prior failure to Vedolizumab
- Age
- Pregnancy
- Extraintestinale manifestations
- Pouchitis

FINALLY RETURN up to 2 TOP choices of biological drugs given patient profile and context. Explain the PROS and CONS of the 2 choices.
If answer cannot be derived from context, RETURN "NO ANSWER" and explain reason.
=================================
OUTPUT INSTRUCTIONS:
Output your answer as a list of JSON objects with keys: drug_name, advantages, disadvantages.
=================================
CONTEXT:
{summaries}
=================================
"""

# System turn carries the instructions and context; human turn carries the
# patient profile through the {question} placeholder.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    system_prompt, input_variables=["summaries"]
)
human_message_prompt = HumanMessagePromptTemplate.from_template("PATIENT PROFILE: {question}")

PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

In [29]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Question-answering chain backed by gpt-3.5-turbo at temperature 0, with
# completions capped at 512 tokens. Only `llm` is passed, so every other chain
# setting is left at the library default.
# NOTE(review): PROMPT_TEMPLATE is not handed to the chain here - confirm
# whether it was meant to be passed (e.g. as `prompt=`).
qa_chain = load_qa_with_sources_chain(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=512),
)