# Import

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain_core.documents import Document # 3 para
from typing import List, Dict

import glob
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

import sqlalchemy
import numpy as np
import pprint

import re

# Set up model

In [2]:
# THE FIRST TIME YOU RUN THIS, IT MIGHT TAKE A WHILE

model_path_or_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)
model = AutoModelForCausalLM.from_pretrained(
    model_path_or_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    bnb_4bit_compute_dtype=torch.float16,
    use_flash_attention_2=True,
    load_in_4bit=True
)


## LoRA instead

In [3]:
from typing import List, Dict
from langchain.vectorstores.pgvector import PGVector

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

from peft import AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [4]:
lora_path = "../III_Finetuning_For_RAG/mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_qkvo_fixedtokens"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

if lora_path:
    # load base LLM model with PEFT Adapter
    model = AutoPeftModelForCausalLM.from_pretrained(
        lora_path,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        use_flash_attention_2=True,
        quantization_config = bnb_config
    )
    # tokenizer = AutoTokenizer.from_pretrained(lora_path)
    tokenizer = AutoTokenizer.from_pretrained(lora_path, padding_side='right')
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_path_or_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        use_flash_attention_2=True,
        quantization_config = bnb_config
    )
    # tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)
    tokenizer = AutoTokenizer.from_pretrained(model_path_or_id, padding_side='right')

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Database strings

In [2]:
# The connection to the database
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver= "psycopg2",
    host = "localhost",
    port = "5432",
    database = "postgres",
    user= "username",
    password="password"
)

# The embedding function that will be used to store into the database
embedding_function = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': True}
)

# Creates the database connection to our existing DB
db = PGVector(
    connection_string = CONNECTION_STRING,
    collection_name = "embeddings",
    embedding_function = embedding_function,
    pre_delete_collection = True, # uncomment this to delete existing database first    
)

# Chunk documents

## `chunk_documents()`

In [6]:
def chunk_document(doc_path: str) -> List[Document]:
    """Chunk a document into smaller langchain Documents for embedding.

    :param doc_path: path to document
    :type doc_path: str
    :return: List of Document chunks
    :rtype: List[Document]
    """
    loader = PyPDFLoader(doc_path)
    documents = loader.load()

    # split document based on the `\n\n` character, quite unintuitive
    # https://stackoverflow.com/questions/76633836/what-does-langchain-charactertextsplitters-chunk-size-param-even-do
    # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) # default
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    
    return text_splitter.split_documents(documents)

In [4]:
def chunk_document(doc_path: str) -> List[Document]:
    """Chunk a document into smaller langchain Documents for embedding.

    :param doc_path: path to document
    :type doc_path: str
    :return: List of Document chunks
    :rtype: List[Document]
    """
    loader = PyPDFLoader(doc_path)
    documents = loader.load()

    # split document based on the `\n\n` character, quite unintuitive
    # https://stackoverflow.com/questions/76633836/what-does-langchain-charactertextsplitters-chunk-size-param-even-do
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) # default
    # text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    
    return text_splitter.split_documents(documents)

In [20]:
docpath_ex = "../../msl-data/MSL Notes_1.pdf"
loader = PyPDFLoader(docpath_ex)
doc_ex = loader.load()
doc_ex

[Document(page_content="Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aetherisol  \nDate:  March 15, 2023  \nProvider:  Dr. Olivia Reynolds  \nTitle:  Rheumatologist  \nInstitution:  Metropolitan Medical Center  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Expressed gratitude fo r the opportunity to discuss Aetherisol, a novel interleukin -17 \ninhibitor.  \n• Discussed Aetherisol's recent approval and its potential impact on refractory \nrheumatoid arthritis (RA) cases.  \n2. Provider's Current Patient Cases:  \n• Inquired about Dr. Reynolds' current patient cases considering Aetherisol.  \n• Discussed three specific RA cases where Aetherisol demonstrated a remarkable 30% \nimprovement in joint function within the first month.  \n3. Efficacy and Clinical Data:  \n• Presented recent c linical data supporting Aetherisol's efficacy.  \n• Highlighted a 60% reduction in disease activity scores observed in a 6 -month clinical trial \ninvolving 200 pa

In [23]:
ts = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
ts.split_documents(doc_ex)

[Document(page_content="Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aetherisol  \nDate:  March 15, 2023  \nProvider:  Dr. Olivia Reynolds  \nTitle:  Rheumatologist  \nInstitution:  Metropolitan Medical Center  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Expressed gratitude fo r the opportunity to discuss Aetherisol, a novel interleukin -17 \ninhibitor.  \n• Discussed Aetherisol's recent approval and its potential impact on refractory \nrheumatoid arthritis (RA) cases.  \n2. Provider's Current Patient Cases:  \n• Inquired about Dr. Reynolds' current patient cases considering Aetherisol.  \n• Discussed three specific RA cases where Aetherisol demonstrated a remarkable 30% \nimprovement in joint function within the first month.  \n3. Efficacy and Clinical Data:  \n• Presented recent c linical data supporting Aetherisol's efficacy.  \n• Highlighted a 60% reduction in disease activity scores observed in a 6 -month clinical trial \ninvolving 200 pa

In [9]:
foo = str(doc_ex[0])
len(foo)

1906

In [11]:
pprint.pprint(foo)

('page_content="Subject:  Medical Science Liaison (MSL) Notes - In-Depth '
 'Discussion on Aetherisol  \\nDate:  March 15, 2023  \\nProvider:  Dr. Olivia '
 'Reynolds  \\nTitle:  Rheumatologist  \\nInstitution:  Metropolitan Medical '
 'Center  \\nSummary of Key Discussion Points:  \\n1. Introduction:  \\n• '
 'Expressed gratitude fo r the opportunity to discuss Aetherisol, a novel '
 "interleukin -17 \\ninhibitor.  \\n• Discussed Aetherisol's recent approval "
 'and its potential impact on refractory \\nrheumatoid arthritis (RA) cases.  '
 "\\n2. Provider's Current Patient Cases:  \\n• Inquired about Dr. Reynolds' "
 'current patient cases considering Aetherisol.  \\n• Discussed three specific '
 'RA cases where Aetherisol demonstrated a remarkable 30% \\nimprovement in '
 'joint function within the first month.  \\n3. Efficacy and Clinical Data:  '
 "\\n• Presented recent c linical data supporting Aetherisol's efficacy.  \\n• "
 'Highlighted a 60% reduction in disease activity scores

In [16]:
doc_ex[0].page_content

"Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aetherisol  \nDate:  March 15, 2023  \nProvider:  Dr. Olivia Reynolds  \nTitle:  Rheumatologist  \nInstitution:  Metropolitan Medical Center  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Expressed gratitude fo r the opportunity to discuss Aetherisol, a novel interleukin -17 \ninhibitor.  \n• Discussed Aetherisol's recent approval and its potential impact on refractory \nrheumatoid arthritis (RA) cases.  \n2. Provider's Current Patient Cases:  \n• Inquired about Dr. Reynolds' current patient cases considering Aetherisol.  \n• Discussed three specific RA cases where Aetherisol demonstrated a remarkable 30% \nimprovement in joint function within the first month.  \n3. Efficacy and Clinical Data:  \n• Presented recent c linical data supporting Aetherisol's efficacy.  \n• Highlighted a 60% reduction in disease activity scores observed in a 6 -month clinical trial \ninvolving 200 patients.  \n4. Safety Pr

## Get list of doc chunks

In [8]:
# load the document and split it into chunks
doc_chunks = []
for doc in glob.glob("../../msl-data/*.pdf"):
    doc_chunks += chunk_document(doc)

In [9]:
len(doc_chunks)

78

In [16]:
len(doc_chunks[0].page_content)

1732

In [18]:
len(doc_chunks[1].page_content)

1363

## Make database from documents
See: https://python.langchain.com/docs/integrations/vectorstores/pgvector

In [10]:
db = PGVector.from_documents(
    doc_chunks,
    connection_string = CONNECTION_STRING,
    collection_name = "embeddings",
    embedding = embedding_function,
    pre_delete_collection = True, # uncomment this to delete existing database first
)

# Q&A with RAG for Doc Context

## Prompt template - `RAG_PROMPT_TEMPLATE`

In [24]:
# Prepare the input for for tokenization, attach any prompt that should be needed
RAG_PROMPT_TEMPLATE = """### Context:
{context}

### Question:
Using only the context above, {question}

### Response:
"""


## Query

In [26]:
question = "What kind of doctor is Jonathan Reynolds?"

## Similarity Search

In [34]:
len(docs_with_scores[0][0].page_content)

1709

In [None]:
len(docs_with_scores[1][0].page_content)

In [31]:
docs_with_scores = db.similarity_search_with_score(question, k = 2) # a list of docs
docs_with_scores

[(Document(page_content="Subject:  Medical Science Liaison (MSL) Notes - Introduction of PulmoVive  \nDate:  August 10, 2024  \nProvider:  Dr. Jonathan Reynolds  \nTitle:  Pulmonologist  \nInstitution:  RespiraLung Pulmonary Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced PulmoVive as an innovative therapy for managing moderate to severe \nchronic obstructive pulmonary disease (COPD).  \n• Emphasized its unique mechanism targeting both bronchoconstriction and airway \ninflammation.  \n2. Provider's Current Patient Cases:  \n• Dr. Re ynolds expressed interest in new interventions for patients experiencing \nexacerbations despite standard COPD treatments.  \n• Discussed specific cases where current bronchodilators and anti -inflammatory agents \nhave shown limited efficacy.  \n3. Efficacy and Clinical Data:  \n• Presented recent clinical data showcasing PulmoVive's ability to improve FEV1 by 15% \nand reduce exacerbation rates by 30%.  \n• Highlighted stat

### `concat_RAG_page_content()`

In [35]:
def concat_RAG_page_content(docs_input):
    """
    docs_input is a list of tuples, where tuple[0] is Document and tuple[1] is score.
    Document has properties Document.page_content, Document.metadata["source"], Document.metadata["page"]
    """
    
    concat_docs = "\n".join([tup[0].page_content for tup in docs_input])
    
    return concat_docs

In [36]:
len(concat_RAG_page_content(docs_with_scores))

3568

In [10]:
docs_with_scores

[(Document(page_content="Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on NeuroGlyde  \nDate:  April 10, 2023  \nProvider:  Dr. James Harper  \nTitle:  Neurologist  \nInstitution:  City Neurology Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced NeuroGlyde, a novel neuroprotective agent, emphasizing its potential in \nslowing disease progression.  \n• Discussed ongoing clinical trials and positive early -phase results.  \n2. Provider's Current Patient C ases:  \n• Explored Dr. Harper's experience with NeuroGlyde in treating neurodegenerative \ndisorders.  \n• Discussed improvements in cognitive function observed in Alzheimer's patients.  \n3. Efficacy and Clinical Data:  \n• Presented data demonstrating a 40% reduction in annualized relapse rates in multiple \nsclerosis patients.  \n• Highlighted significant improvements in quality of life measures.  \n4. Safety Profile:  \n• Discussed the favorable safety profile of NeuroGlyde, with 

In [None]:
source_filename = re.search("([MSL].{1,})", docs_with_scores[0][0].metadata["source"])[0]

In [33]:
import re

foo = docs_with_scores[0][0].metadata["source"]

bar = re.search("([MSL].{1,})", foo)[0]
bar[0]

'M'

In [40]:
concat_RAG_page_content(docs_with_scores)

"Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on NeuroGlyde  \nDate:  April 10, 2023  \nProvider:  Dr. James Harper  \nTitle:  Neurologist  \nInstitution:  City Neurology Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced NeuroGlyde, a novel neuroprotective agent, emphasizing its potential in \nslowing disease progression.  \n• Discussed ongoing clinical trials and positive early -phase results.  \n2. Provider's Current Patient C ases:  \n• Explored Dr. Harper's experience with NeuroGlyde in treating neurodegenerative \ndisorders.  \n• Discussed improvements in cognitive function observed in Alzheimer's patients.  \n3. Efficacy and Clinical Data:  \n• Presented data demonstrating a 40% reduction in annualized relapse rates in multiple \nsclerosis patients.  \n• Highlighted significant improvements in quality of life measures.  \n4. Safety Profile:  \n• Discussed the favorable safety profile of NeuroGlyde, with no serious  adverse even

## Give context prompt based on template

In [59]:
# Prepare the input for for tokenization, attach any prompt that should be needed
RAG_PROMPT_TEMPLATE = """### Context:
{context}

### Question:
Using only the context above, {question}

### Response:


### Source:
Source is: {source_filename}
"""

# Prepare the input for for tokenization, attach any prompt that should be needed
RAG_PROMPT_TEMPLATE = """### Context:
{context}

### Question:
Using only the context above, {question}.

### Response:
"""


question = "What kind of doctor is Jonathan Reynolds? If he is more than one, describe why."

docs_with_scores = db.similarity_search_with_score(question, k = 2) # a list of docs


context_prompt = RAG_PROMPT_TEMPLATE.format(
    # context = docs_with_scores[0][0].page_content,
    # context = docs_with_scores[0][0].page_content + "\nSource is: " + re.search("([MSL].{1,})", docs_with_scores[0][0].metadata["source"])[0],
    context = concat_RAG_page_content(docs_with_scores),
    question = question,
    # source_filename = re.search("([MSL].{1,})", docs_with_scores[0][0].metadata["source"])[0]
)

In [56]:
context_prompt

"### Context:\nSubject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aeth erisol  \nDate:  June 20, 2023  \nProvider:  Dr. Michael Chang  \nTitle:  Dermatologist  \nInstitution:  Coastal Dermatology Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced Aetherisol as a breakthrough treatment for refractory psoriasis.  \n• Discussed its unique mechanism targeting interleukin -17 pathways in the skin.  \n2. Provider's Current Patient Cases:  \n• Explored Dr. Chang's experiences with Aetherisol in managing severe psoriasis cases.  \n• Discussed rapid improvement in skin clearance observed in p atients resistant to \ntraditional therapies.  \n3. Efficacy and Clinical Data:  \n• Presented data showing a 70% reduction in Psoriasis Area and Severity Index (PASI) \nscores within 12 weeks.  \n• Highlighted Aetherisol's efficacy in achieving long -term remission.  \n4. Safety Pr ofile:  \n• Discussed the favorable safety profile of Aetherisol, with no re

## Result

### `generate()`

In [None]:
concat_RAG_page_content

In [57]:
# def generate(prompt):
#     """Convenience function for generating model output"""
#     # Tokenize the input
#     input_ids = tokenizer(
#         prompt, 
#         return_tensors="pt", 
#         truncation=True).input_ids.cuda()
    
#     # Generate new tokens based on the prompt, up to max_new_tokens
#     # Sample aacording to the parameter
#     with torch.inference_mode():
#         outputs = model.generate(
#             input_ids=input_ids, 
#             max_new_tokens=100, 
#             do_sample=True, 
#             top_p=0.9,
#             temperature=0.1,
#             use_cache=True
#         )
#     return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]


def generate(prompt, max_new_tokens = 100, temperature = 0.7):
    """Convenience function for generating model output"""
    # Tokenize the input
    input_ids = tokenizer(
        prompt, 
        return_tensors="pt", 
        truncation=True).input_ids.cuda()
    
    # Generate new tokens based on the prompt, up to max_new_tokens
    # Sample according to the parameter
    with torch.inference_mode(mode=True): # AP: added mode=True
        outputs = model.generate(
            input_ids=input_ids, 
            max_new_tokens=max_new_tokens, 
            do_sample=True,
            top_p=0.9,
            temperature=temperature,
            use_cache=True,
            # pad_token_id=tokenizer.eos_token_id ## ADDED - AP
        )
    return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

#### Q: Jonathan Reynolds doctor type?

In [116]:
question = "What kind of doctor is Jonathan Reynolds? If there is more than one answer, explain why."

docs_with_scores = db.similarity_search_with_score(question, k = 2) # a list of docs


context_prompt = RAG_PROMPT_TEMPLATE.format(
    # context = docs_with_scores[0][0].page_content,
    context = concat_RAG_page_content(docs_with_scores),
    question = question,
)

res = generate(context_prompt, max_new_tokens=1000, temperature=1)

# print(f"Prompt:\n{context_prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated Response:
Jonathan Reynolds is a cardiologist.  However, he is also interested in COPD and PulmoVive.

Answer provided by AP.



In [118]:
question = "What kind of doctor is Jonathan Reynolds? Provide multiple answers if possible."

docs_with_scores = db.similarity_search_with_score(question, k = 2) # a list of docs


context_prompt = RAG_PROMPT_TEMPLATE.format(
    # context = docs_with_scores[0][0].page_content,
    context = concat_RAG_page_content(docs_with_scores),
    question = question,
)

res = generate(context_prompt, max_new_tokens=10000, temperature=1)

# print(f"Prompt:\n{context_prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated Response:
Jonathan Reynolds is a pulmonologist at RespiraLung Pulmonary Clinic. He is also a cardiologist at Advanced Cardiovascular Associates.

Answer provided by AP.



#### Q: Aetherisol treats what?

In [90]:
question = "What does Aetherisol treat?"

docs_with_scores = db.similarity_search_with_score(question, k = 2) # a list of docs


context_prompt = RAG_PROMPT_TEMPLATE.format(
    # context = docs_with_scores[0][0].page_content,
    context = concat_RAG_page_content(docs_with_scores),
    question = question,
)

res = generate(context_prompt, max_new_tokens=1000, temperature=0.5)

# print(f"Prompt:\n{context_prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated Response:
Aetherisol treats refractory psoriasis and rheumatoid arthritis.

Answer provided by AP.



In [99]:
question = "What clinic does Jonathan Reynolds work at? State multiple answers if available."

docs_with_scores = db.similarity_search_with_score(question, k = 2) # a list of docs


context_prompt = RAG_PROMPT_TEMPLATE.format(
    # context = docs_with_scores[0][0].page_content,
    context = concat_RAG_page_content(docs_with_scores),
    question = question,
)

res = generate(context_prompt, max_new_tokens=1000, temperature=0.5)

# print(f"Prompt:\n{context_prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated Response:
Advanced Cardiovascular Associates
RespiraLung Pulmonary Clinic

Answer provided by AP.



In [None]:
question = "What clinic does Jonathan Reynolds work at? State multiple answers if available."

docs_with_scores = db.similarity_search_with_score(question, k = 2) # a list of docs


context_prompt = RAG_PROMPT_TEMPLATE.format(
    # context = docs_with_scores[0][0].page_content,
    context = concat_RAG_page_content(docs_with_scores),
    question = question,
)

res = generate(context_prompt, max_new_tokens=1000, temperature=0.5)

# print(f"Prompt:\n{context_prompt}\n")
print(f"Generated Response:\n{res}")

# Langchain for RAG

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain_core.documents import Document
from typing import List, Dict

import glob
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

import sqlalchemy
import numpy as np
import pprint

## Objects
* langchain
    * document_loaders -> TextLoader, PyPDFLoader
    * embeddings.sentence_transformer -> SentenceTransformerEmbeddings
    * text_splitter -> CharacterTextSplitter
    * vectorstores.pgvector -> PGVector
    * schema -> StrOutputParser
    * schema.runnable -> RunnablePassthrough
    * schema.runnable -> RunnableParallel
    * prompts -> PromptTemplate
    * llms.huggingface_pipeline -> HuggingFacePipeline

* HuggingFacePipeline
    * pipeline?
        * specific time is "text-generation"
    * parameters: model, tokenizer, max_new_tokens
* PGVector.as_retriever()
* itemgetter()
* PromptTemplate
    * Pass in the parameters with {parameter}
        * e.g. {context}, {question}
* model
* OutputParser
    * langchain.schema -> StrOutputParser
* RunnabelParallel

## Import libraries

In [35]:
from operator import itemgetter
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.runnable import RunnableParallel
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

import re

## Set up `retriever`

In [36]:
# Turn our db into a retriever
retriever = db.as_retriever(search_kwargs = {'k' : 1})

## Create pipeline - `HuggingFacePipeline`

In [37]:
# Turn our model into an LLM
pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    max_new_tokens=100)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [76]:
llm = HuggingFacePipeline(pipeline=pipe)

prompt_template = PromptTemplate.from_template("""
Answer the question using only this context and followed by 'Answer provided by AP':

Context: {context}

Question: {question}

Answer: 
""")                                    

In [None]:
re.search("([MSL].{1,})", docs_with_scores[0][0].metadata["source"])[0]

## `format_docs()`

In [70]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


## Chain

### RAG Chain: template -> model -> output parser

In [71]:
# Build a chain with multiple documents for RAG
rag_chain_from_docs = (
    {
        "context": lambda input: format_docs(input["documents"]),
        "question": itemgetter("question"),
    }
    | prompt_template
    | llm
    | StrOutputParser()
)

### RAG chain with retriever

In [77]:
# 2-step chain, first retrieve documents
# Then take those documents and store relevant infomration in `document_sources`
# Pass the prompt into the document chain
rag_chain_with_source = RunnableParallel({
    "documents": retriever, 
     "question": RunnablePassthrough()
}) | {
    "sources": lambda input: [(doc.page_content, doc.metadata) for doc in input["documents"]], # input["documents"] is from the previous link in the chain
    "answer": rag_chain_from_docs,
}

### Invoke

In [86]:
query = "What's the efficacy of Neurosolvix? Cite the source"

In [87]:
res = rag_chain_with_source.invoke(query)

print(res["answer"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Source: 

Medical Science Liaison (MSL) Notes - NeuroSolvix in Neuropathic Pain Management  
Date: March 25, 2023  
Provider: Dr. Joshua Martinez  
Title: Pain Management Specialist  
Institution: PainRelief Clinic  
Summary of Key Discussion Points:  
1. Introduction:  
• Introduced NeuroSolvix as a


In [80]:
len(res["sources"])

2

In [81]:
pprint.pprint(res["sources"])

[('Subject:  Medical Science Liaison (MSL) Notes - NeuroSolvix in Neuropathic '
  'Pain Management  \n'
  'Date:  March  25, 20 23 \n'
  'Provider:  Dr. Joshua Martinez  \n'
  'Title:  Pain Management Specialist  \n'
  'Institution:  PainRelief Clinic  \n'
  'Summary of Key Discussion Points:  \n'
  '1. Introduction:  \n'
  '• Introduced NeuroSolvix as a potential therapy for the management of '
  'neuropathic pain.  \n'
  '• Emphasized its unique mechanism targeting central sensitization and '
  'neural \n'
  'hyperactivity.  \n'
  "2. Provider's Current Patient Cases:  \n"
  '• Dr. Martinez expressed interest in novel approaches for patients with '
  'chronic \n'
  'neuropathic pain.  \n'
  '• Discussed specific cases where current analgesics and neuropathic pain '
  'medications \n'
  'have shown limited efficacy or significant side effects.  \n'
  '3. Efficacy and Clinica l Data:  \n'
  "• Presented recent clinical data showcasing NeuroSolvix's ability to reduce "
  'neuropathic \n

# COMBINED

## Database

In [62]:
def chunk_document(doc_path: str) -> List[Document]:
    """Chunk a document into smaller langchain Documents for embedding.

    :param doc_path: path to document
    :type doc_path: str
    :return: List of Document chunks
    :rtype: List[Document]
    """
    loader = PyPDFLoader(doc_path)
    documents = loader.load()

    # split document based on the `\n\n` character, quite unintuitive
    # https://stackoverflow.com/questions/76633836/what-does-langchain-charactertextsplitters-chunk-size-param-even-do
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    
    return text_splitter.split_documents(documents)


# load the document and split it into chunks
doc_chunks = []
for doc in glob.glob("../../msl-data/*.pdf"):
    doc_chunks += chunk_document(doc)
    
db = PGVector.from_documents(
    doc_chunks,
    connection_string = CONNECTION_STRING,
    collection_name = "embeddings",
    embedding = embedding_function,
    pre_delete_collection = True, # uncomment this to delete existing database first
)

## Model

In [63]:
from operator import itemgetter
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.runnable import RunnableParallel
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

####------------------------------RETRIEVER------------------------------####
# Turn our db into a retriever
retriever = db.as_retriever(search_kwargs = {'k' : 2})

####------------------------------TEMPLATE------------------------------####
prompt_template = PromptTemplate.from_template("""
Answer the question using only this context and followed by 'Answer provided by AP':

Context: {context}

Question: {question}

Answer: 
""")

####------------------------------QUERY------------------------------####
query = "What's the efficacy of Neurosolvix?"

####------------------------------MODEL------------------------------####
# Turn our model into an LLM
pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    max_new_tokens=100)

llm = HuggingFacePipeline(pipeline=pipe)





####------------------------------CHAINS------------------------------####
    # Build a chain with multiple documents for RAG
rag_chain_from_docs = (
    {
        "context": lambda input: format_docs(input["documents"]),
        "question": itemgetter("question"),
    }
    | prompt_template
    | llm
    | StrOutputParser()
)

    # 2-step chain, first retrieve documents
    # Then take those documents and store relevant infomration in `document_sources`
    # Pass the prompt into the document chain
rag_chain_with_source = RunnableParallel({
    "documents": retriever, 
     "question": RunnablePassthrough()
}) | {
    "sources": lambda input: [(doc.page_content, doc.metadata) for doc in input["documents"]], # input["documents"] is from the previous link in the chain
    "answer": rag_chain_from_docs,
}



res = rag_chain_with_source.invoke(query)

print(res["answer"])
print()
print(res["sources"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



NeuroSolvix has shown efficacy in reducing neuropathic pain scores by 40% and improving patient-reported quality of life. Statistically significant improvements in pain-related functional outcomes have also been observed.

[("Subject:  Medical Science Liaison (MSL) Notes - NeuroSolvix in Neuropathic Pain Management  \nDate:  March  25, 20 23 \nProvider:  Dr. Joshua Martinez  \nTitle:  Pain Management Specialist  \nInstitution:  PainRelief Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced NeuroSolvix as a potential therapy for the management of neuropathic pain.  \n• Emphasized its unique mechanism targeting central sensitization and neural \nhyperactivity.  \n2. Provider's Current Patient Cases:  \n• Dr. Martinez expressed interest in novel approaches for patients with chronic \nneuropathic pain.  \n• Discussed specific cases where current analgesics and neuropathic pain medications \nhave shown limited efficacy or significant side effects.  \n3. Efficacy

In [36]:
from operator import itemgetter
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.runnable import RunnableParallel
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def run_model(query_inp, retriever_inp):
    
        ####------------------------------MODEL------------------------------####
    # Turn our model into an LLM
    pipe = pipeline(
        "text-generation", 
        model=model, 
        tokenizer=tokenizer, 
        max_new_tokens=100)

    llm = HuggingFacePipeline(pipeline=pipe)





    ####------------------------------CHAINS------------------------------####
        # Build a chain with multiple documents for RAG
    rag_chain_from_docs = (
        {
            "context": lambda input: format_docs(input["documents"]),
            "question": itemgetter("question"),
        }
        | prompt_template
        | llm
        | StrOutputParser()
    )

        # 2-step chain, first retrieve documents
        # Then take those documents and store relevant infomration in `document_sources`
        # Pass the prompt into the document chain
    rag_chain_with_source = RunnableParallel({
        "documents": retriever_inp, 
         "question": RunnablePassthrough()
    }) | {
        "sources": lambda input: [(doc.page_content, doc.metadata) for doc in input["documents"]], # input["documents"] is from the previous link in the chain
        "answer": rag_chain_from_docs,
    }



    res = rag_chain_with_source.invoke(query_inp)
    
    return (res["answer"], res["sources"])
    

In [49]:
df_queries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Query        6 non-null      object
 1   MSL_Notes_x  6 non-null      object
 2   MSL_Answer   6 non-null      object
dtypes: object(3)
memory usage: 272.0+ bytes


In [74]:
df_output.to_csv("./output_results", index=False)

In [72]:
import pandas as pd

# df_queries = pd.read_csv("../../test_questions.csv")
df_queries = pd.read_csv("../../GPT_test_questions.csv")

pd.options.display.max_colwidth = 500
# queries = df_queries["Query"].astype(str).to_list()
queries = df_queries["Question"].astype(str).to_list()

results = [run_model(query, retriever) for query in queries]
answers = [result[0] for result in results]
sources = [result[1][:50] for result in results]

df_results = pd.DataFrame({"Answers": answers,
                           "Sources": sources,
                          })

df_output = pd.concat([df_queries, df_results], axis=1)
df_output

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Unnamed: 0,Question,Answer,Source,Quote,Answers,Sources
0,What is the efficacy of Tranquilify in reducing anxiety symptoms?,Tranquilify reduces anxiety symptoms by 20%.,MSL Notes_6.pdf,"""Presented data highlighting Tranquilify's efficacy in reducing anxiety symptoms by 20%.""","\nTranquilify has shown an average reduction of 40% in anxiety symptoms. However, the clinical significance of this reduction is debated, with some experts questioning its statistical relevance. Dr. Turner shared concerns about the limited efficacy observed in her initial trials, while Dr. Nguyen expressed interest in new pharmacological options for treatment -resistant anx iety.","[(Subject: Medical Science Liaison (MSL) Notes - Introduction of Tranquilify i n Anxiety Disorders \nDate: July 15, 2024 \nProvider: Dr. Sophia Nguyen \nTitle: Psychiatrist \nInstitution: MindCare Mental Health Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Tranquilify as a potential therapy for anxiety disorders, emphasiz ing its \nunique mechanism targeting specific neurotransmitter pathways. \n• Highlighted the drug's potential in managing both g..."
1,What is the mechanism of action of Aetherisol in rheumatoid arthritis?,Aetherisol targets interleukin-17 pathways.,MSL Notes_1.pdf,"""Explained Aetherisol's unique mechanism of action targeting interleukin-17 pathways.""","\nAetherisol's mechanism of action in rheumatoid arthritis involves targeting interleukin -17 (IL -17) pathways. IL -17 is a pro-inflammatory cytokine that plays a crucial role in the development and progression of rheumatoid arthritis. Aetherisol works by inhibiting the production and activity of IL -17, which leads to a reduction","[(Subject: Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aetherisol \nDate: March 15, 2023 \nProvider: Dr. Olivia Reynolds \nTitle: Rheumatologist \nInstitution: Metropolitan Medical Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Expressed gratitude fo r the opportunity to discuss Aetherisol, a novel interleukin -17 \ninhibitor. \n• Discussed Aetherisol's recent approval and its potential impact on refractory \nrheumatoid arthritis (RA) cases. ..."
2,How does NeuroRelieve target migraine pathophysiology?,NeuroRelieve targets trigeminal nerve hyperactivity.,MSL Notes_19.pdf,"""Explained NeuroRelieve's mechanism, focusing on its targeted modulation of trigeminal nerve hyperactivity.""",NeuroRelieve targets migraine pathophysiology by modulating trigeminal nerve hyperactivity.,"[(Subject: Medical Science Liaison (MSL) Notes - Introduction of NeuroRelieve \nDate: October 15, 2024 \nProvider: Dr. Amanda Rodriguez \nTitle: Neurologist \nInstitution: NeuroWell Neurological Institute \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced NeuroRelieve as a groundbreaking therapy for the management of chronic \nmigraine. \n• Emphasized its unique mechanism targeting trigeminal nerve pathways involved in \nmigraine pathophysiology. \n2. Provide..."
3,What is the safety profile of Revitalizix in heart failure patients?,Revitalizix has a favorable safety profile with minimal reported side effects.,MSL Notes_10.pdf,"""Discussed the favorable safety profile of Revitalizix, with minimal reported central nervous system side effects.""","\nThe safety profile of Revitalizix in heart failure patients is favorable, with low rates of adverse events reported. Dr. Martinez raised concerns about potential long-term safety, particularly in elderly patients with multiple comorbidities. Dr. Nguyen expressed confidence in the drug's safety based on her patient experiences.","[(Subject: Medical Science Liaiso n (MSL) Notes - Mixed Discussion on Revitalizix \nDate: December 7, 2023 \nProvider: Dr. Amanda Martinez \nTitle: Cardiologist \nInstitution: Metropolitan Heart Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Revitalizix as a potential therapy for heart failure with preserved ejection \nfraction. \n• Dr. Martinez expressed cautious optimism, acknowledging the need for additional \ntreatment options in heart failure. ..."
4,How does Vitaligen promote pancreatic beta-cell regeneration in type 2 diabetes?,Vitaligen stimulates pancreatic beta-cell proliferation.,MSL Notes_9.pdf,"""Explained Vitaligen's mechanism, focusing on its role in stimulating pancreatic beta-cell proliferation.""","Vitaligen promotes pancreatic beta-cell regeneration in type 2 diabetes through its unique mechanism of action. It stimulates the proliferation of pancreatic beta-cells, which are responsible for producing insulin and regulating blood sugar levels. This process helps improve insulin sensitivity and glucose control in individuals with type 2 diabetes.","[(Subject: Medical Science Liaison (MSL) Notes - Introduction of Vitaligen \nDate: January 15, 2024 \nProvider: Dr. Kimberly Adams \nTitle: Endocrinologist \nInstitution: Metabolic Wellness Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Vitaligen as an innovative therapy for type 2 diabetes with a focus on \nmetabolic health. \n• Emphasized its unique mecha nism of promoting pancreatic beta -cell regeneration. \n2. Provider's Current Patient Cases:..."
...,...,...,...,...,...,...
104,Where is Dr. Jonathan Reynolds' institution located?,"Dr. Jonathan Reynolds works at the Advanced Cardiovascular Institute, and also the RespiraLung Pulmonary Clinic.","MSL Notes_9.pdf, MSL Notes_17.pdf",,\nDr. Jonathan Reynolds is a pulmonologist at RespiraLung Pulmonary Clinic.,"[(Subject: Medical Science Liaison (MSL) Notes - Introduction of PulmoVive \nDate: August 10, 2024 \nProvider: Dr. Jonathan Reynolds \nTitle: Pulmonologist \nInstitution: RespiraLung Pulmonary Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced PulmoVive as an innovative therapy for managing moderate to severe \nchronic obstructive pulmonary disease (COPD). \n• Emphasized its unique mechanism targeting both bronchoconstriction and airway \ninflammation. ..."
105,What is the specialty of Dr. Melissa Turner?,Dr. Melissa Turner is a pulmonologist.,MSL Notes_18.pdf,,Dr. Melissa Turner is a pulmonologist.,"[(Subject: Medical Science Liaison (MSL) Notes - Mixed Reactions on PulmoVive \nDate: September 5, 2024 \nProvider: Dr. Melissa Turner \nTitle: Pulmonologist \nInstitution: BreatheWell Pulmonary Care Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced PulmoVive as an innovative therapy for COPD management. \n• Dr. Turner expressed cautious optimism, acknowledging the need for advancements in \nCOPD treatment. \n2. Provider 's Current Patient Cases: \n•..."
106,Where is Dr. Rachel Foster's institution located?,Dr. Rachel Foster works at the Advanced Cardiovascular Institute.,MSL Notes_15.pdf,,\nDr. Rachel Foster's institution is the Advanced Cardiovascular Institute.,"[(Subject: Medical Science Liaison (MSL) Notes - Critical Discussion on Vitaligen in Cardiovascular Health \nDate: June 8, 2024 \nProvider: Dr. Rachel Foster \nTitle: Interventional Cardiologist \nInstitution: Advanced Cardiovascular Institute \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Vitaligen as a potential therapy targeting arterial stiffness and cardiovascular \nhealth . \n• Dr. Foster expressed skepticism about the need for another medication in a..."
107,What is the specialty of Dr. Michael Chang?,Dr. Michael Chang is a dermatologist.,MSL Notes_4.pdf,,Dr. Michael Chen is a cardiologist.,"[(Subj ect: Medical Science Liaison (MSL) Notes - Introduction of CardioGuardia \nDate: November 20, 2024 \nProvider: Dr. Michael Chen \nTitle: Cardiologist \nInstitution: HeartCare Cardiovascular Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced CardioGuardia as an innovative therapy for the prevention of atherosclerotic \ncardiovascular events. \n• Emphasized its unique mechanism targeting both lipid metabolism and endothelial \nfunction. \n2. Provid..."


In [73]:
df_output.sample(10)

Unnamed: 0,Question,Answer,Source,Quote,Answers,Sources
45,How does Vitaligen compare to existing therapies for cardiovascular health?,Vitaligen has potential advantages for cardiovascular health.,MSL Notes_14.pdf,"""Compared Vitaligen to existing cardiovascular medications, emphasizing its potential to address a specific aspect of cardiovascular health.""","\nVitaligen is a novel therapy targeting cardiovascular health, specifically focusing on reducing arterial stiffness. Its unique mechanism promotes endothelial function and arterial elasticity. While Vitaligen has shown promise in reducing arterial stiffness by 15% on average, its efficacy in comparison to existing therapies for cardiovascular health is not well established. Dr. Foster expressed a preference for established therapies with proven outcomes, and",[(• Discussed patient feedback indicating a preference for interventions directly addressing \nmajor cardiovascular ev ents. \n7. Emerging Research and Future Developments: \n• Highlighted ongoing research on Vitaligen's potential in preventing cardiovascular \nevents. \n• Dr. Foster questioned the rationale behind expanding research without addressing \nexisting concerns about clinical re levance. \n8. Competitive Landscape: \n• Compared Vitaligen to existing cardiovascular medications...
83,At what institution does Dr. Melissa Turner work?,BreatheWell Pulmonary Care Center,MSL Notes_18.pdf,,Dr. Melissa Turner works at BreatheWell Pulmonary Care Center.,"[(Subject: Medical Science Liaison (MSL) Notes - Mixed Reactions on PulmoVive \nDate: September 5, 2024 \nProvider: Dr. Melissa Turner \nTitle: Pulmonologist \nInstitution: BreatheWell Pulmonary Care Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced PulmoVive as an innovative therapy for COPD management. \n• Dr. Turner expressed cautious optimism, acknowledging the need for advancements in \nCOPD treatment. \n2. Provider 's Current Patient Cases: \n•..."
79,At what institution does Dr. Robert Hernandez work?,Unity Cancer Institute,MSL Notes_7.pdf,,Dr. Robert Hernandez works at Unity Cancer Institute.,"[(Subject: Medical Science Liaison (MSL) Notes - Critical Discussion on Prolifixin \nDate: September 15, 2023 \nProvider: Dr. Robert Hernandez \nTitle: Hematologist -Oncologist \nInstitution: Unity Cancer Institute \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Prolifixin as an advanced targeted therapy for hematologic malignancies. \n• Dr. Hernandez expressed reservations about the necessity of introducing a new therapy \ngiven the existing treatment land..."
81,At what clinic does Dr. Jonathan Reynolds work?,"Dr. Jonathan Reynolds works at the Advanced Cardiovascular Institute, and also the RespiraLung Pulmonary Clinic.","MSL Notes_9.pdf, MSL Notes_17.pdf",,Dr. Jonathan Reynolds works at RespiraLung Pulmonary Clinic.,"[(Subject: Medical Science Liaison (MSL) Notes - Introduction of PulmoVive \nDate: August 10, 2024 \nProvider: Dr. Jonathan Reynolds \nTitle: Pulmonologist \nInstitution: RespiraLung Pulmonary Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced PulmoVive as an innovative therapy for managing moderate to severe \nchronic obstructive pulmonary disease (COPD). \n• Emphasized its unique mechanism targeting both bronchoconstriction and airway \ninflammation. ..."
75,What kind of doctor is Dr. Benjamin Turner?,Rheumatologist,MSL Notes_12.pdf,,Dr. Benjamin Turner is a Rheumatologist.,"[(Subject: Medical Science Liaison (MSL) Notes - Mixed Reactions on PulmoVive \nDate: September 5, 2024 \nProvider: Dr. Melissa Turner \nTitle: Pulmonologist \nInstitution: BreatheWell Pulmonary Care Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced PulmoVive as an innovative therapy for COPD management. \n• Dr. Turner expressed cautious optimism, acknowledging the need for advancements in \nCOPD treatment. \n2. Provider 's Current Patient Cases: \n•..."
8,How does OsteoGuard work in osteoporosis management?,OsteoGuard reduces bone mineral density loss in osteoporosis.,MSL Notes_13.pdf,"""Presented recent clinical data showcasing OsteoGuard's modest increase in bone mineral density.""","OsteoGuard is an innovative therapy for osteoporosis that focuses on bone health preservation. Its unique mechanism involves targeted osteoclast modulation, promoting bone density increase by 5% within the first year of treatment and statistically significant reductions in fracture risk. The drug works by modulating osteoclast activity to optimize bone remodeling. However, concerns were raised about its efficacy and safety, particularly in patients with complex orthopedic conditions","[(Subject: Medical Science Liaison (MSL) Notes - Introduction of OsteoGuard \nDate: February 25, 2024 \nProvider: Dr. Benjamin Turner \nTitle: Rheumatologist \nInstitution: Arthritis and Osteoporosis Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced OsteoGuard as an innovative therapy for osteoporosis, focusing on bone \nhealth preservation. \n• Emphasized its unique mechanism of promotin g bone density through targeted \nosteoclast modulation. \n2. P..."
20,How does Revitalizix target heart failure with preserved ejection fraction?,Revitalizix targets myocardial stiffness and inflammation.,MSL Notes_5.pdf,"""Explained Revitalizix's mechanism targeting myocardial stiffness and inflammation.""","Revitalizix targets heart failure with preserved ejection fraction by addressing myocardial stiffness and inflammation. The drug's mechanism involves modulating the expression of genes involved in these processes, ultimately leading to improved left ventricular diastolic function.","[(• Highlighted ongoing research on Revitalizix's potential applications in other \ncardiovascular conditions. \n• Discussed plans for exploring combination therapies to further enhance patient \noutcomes. \n8. Competitive Landscape: \n• Compared Revitalizix to existing heart failure therapies, emphasizing its unique focus on \npreserved ejection fraction. \n• Discussed the potential for Revitalizix to become a cornerstone in heart failure \nmanagement. \n9. Access and Reimbursement: ..."
74,What kind of doctor is Dr. Kimberly Adams?,Endocrinologist,MSL Notes_3.pdf,,Dr. Kimberly Adams is an Endocrinologist at the Metabolic Wellness Center.,"[(Subject: Medical Science Liaison (MSL) Notes - Introduction of Vitaligen \nDate: January 15, 2024 \nProvider: Dr. Kimberly Adams \nTitle: Endocrinologist \nInstitution: Metabolic Wellness Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Vitaligen as an innovative therapy for type 2 diabetes with a focus on \nmetabolic health. \n• Emphasized its unique mecha nism of promoting pancreatic beta -cell regeneration. \n2. Provider's Current Patient Cases:..."
32,What is the efficacy of Tranquilify in reducing anxiety symptoms?,Tranquilify reduces anxiety symptoms by 40% on average.,MSL Notes_5.pdf,"""Presented recent clinical data showcasing Tranquilify's ability to reduce anxiety symptoms by 40% on average.""","\nTranquilify has shown an average reduction of 40% in anxiety symptoms. However, the clinical significance of this reduction is debated, with some experts questioning its statistical relevance. Dr. Turner shared concerns about the limited efficacy observed in her initial trials, while Dr. Nguyen expressed interest in new pharmacological options for treatment -resistant anx iety.","[(Subject: Medical Science Liaison (MSL) Notes - Introduction of Tranquilify i n Anxiety Disorders \nDate: July 15, 2024 \nProvider: Dr. Sophia Nguyen \nTitle: Psychiatrist \nInstitution: MindCare Mental Health Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Tranquilify as a potential therapy for anxiety disorders, emphasiz ing its \nunique mechanism targeting specific neurotransmitter pathways. \n• Highlighted the drug's potential in managing both g..."
13,How does Vitaligen improve cardiovascular health?,Vitaligen enhances nitric oxide production and reduces arterial stiffness.,MSL Notes_14.pdf,"""Explained Vitaligen's mechanism, focusing on its role in enhancing nitric oxide production and reducing arterial stiffness.""","\nVitaligen improves cardiovascular health by reducing arterial stiffness. Its unique mechanism promotes endothelial function and arterial elasticity, leading to statistically significant improvements in blood pressure control and vascular health. The drug enhances nitric oxide production and reduces oxidative stress, contributing to its effectiveness. However, the clinical significance of these improvements and their relevance to cardiovascular outcomes are debated among healthcare professi...","[(Subject: Medical Science Liaison (MSL) Notes - Introduction of Vitaligen in Cardiovascular Health \nDate: May 20, 2024 \nProvider: Dr. Carlos Rodriguez \nTitle: Cardiologist \nInstitution: CardioVital Heart Center \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Vitaligen as a novel therapy targeting cardiovascu lar health, specifically \nfocusing on reducing arterial stiffness. \n• Emphasized its unique mechanism promoting endothelial function and arteri..."


In [58]:
# query it, note that the score here is a distance metric (lower is more related)
new_query = "Dr. Michael Chang"
docs_with_scores = db.similarity_search_with_score(new_query, k = 4)

In [59]:
docs_with_scores

[(Document(page_content="Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aeth erisol  \nDate:  June 20, 2023  \nProvider:  Dr. Michael Chang  \nTitle:  Dermatologist  \nInstitution:  Coastal Dermatology Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced Aetherisol as a breakthrough treatment for refractory psoriasis.  \n• Discussed its unique mechanism targeting interleukin -17 pathways in the skin.  \n2. Provider's Current Patient Cases:  \n• Explored Dr. Chang's experiences with Aetherisol in managing severe psoriasis cases.  \n• Discussed rapid improvement in skin clearance observed in p atients resistant to \ntraditional therapies.  \n3. Efficacy and Clinical Data:  \n• Presented data showing a 70% reduction in Psoriasis Area and Severity Index (PASI) \nscores within 12 weeks.  \n• Highlighted Aetherisol's efficacy in achieving long -term remission.  \n4. Safety Pr ofile:  \n• Discussed the favorable safety profile of Aetherisol, 

In [54]:
df_output["Sources"][2]

[("Subject:  Medical Science Liaison (MSL) Notes - Mixed Reactions on CardioRelief Discussion  \nDate:  January 15, 2023 \nProvider:  Dr. Kevin Chen  \nTitle:  Cardiologist  \nInstitution:  CardioCare Specialists  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced CardioRelief as an innovative therapy for managing hypertension and \nreducing cardiovascular risk.  \n• Dr. Chen expressed cautious interest, noting the need for additional options in \nhypertension management.  \n2. Provider's Current Patient Cases:  \n• Dr. Chen  shared enthusiasm for novel treatments, particularly in patients struggling with \nadherence to traditional antihypertensive medications.  \n• Discussed specific cases where current medications have shown limitations or \nintolerances.  \n3. Efficacy and Clinical Data:  \n• Presented recent clinical data showcasing CardioRelief's ability to effectively lower \nblood pressure and improve vascular function.  \n• Dr. Chen acknowledged the promisi

In [45]:
results[0][0]

'\nNeuroSolvix has shown efficacy in reducing neuropathic pain scores by 40% and improving patient-reported quality of life. Statistically significant improvements in pain-related functional outcomes have also been observed.'

In [42]:
pd.options.display.max_colwidth = 500
df_results

Unnamed: 0,Query,Results,Sources
0,What's the efficacy of Neurosolvix?,"(\nNeuroSolvix has shown efficacy in reducing neuropathic pain scores by 40% and improving patient-reported quality of life. Statistically significant improvements in pain-related functional outcomes have also been observed., [(Subject: Medical Science Liaison (MSL) Notes - NeuroSolvix in Neuropathic Pain Management \nDate: March 25, 20 23 \nProvider: Dr. Joshua Martinez \nTitle: Pain Management Specialist \nInstitution: PainRelief Clinic \nSummary of Key Discussion Points: \n1. I...","[(Subject: Medical Science Liaison (MSL) Notes - NeuroSolvix in Neuropathic Pain Management \nDate: March 25, 20 23 \nProvider: Dr. Joshua Martinez \nTitle: Pain Management Specialist \nInstitution: PainRelief Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced NeuroSolvix as a potential therapy for the management of neuropathic pain. \n• Emphasized its unique mechanism targeting central sensitization and neural \nhyperactivity. \n2. Provider's Current ..."
1,What kind of doctoris James Harper?,"(\nDr. James Harper is a neurologist at City Neurology Clinic., [(Subject: Medical Science Liaison (MSL) Notes - In-Depth Discussion on NeuroGlyde \nDate: April 10, 2023 \nProvider: Dr. James Harper \nTitle: Neurologist \nInstitution: City Neurology Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced NeuroGlyde, a novel neuroprotective agent, emphasizing its potential in \nslowing disease progression. \n• Discussed ongoing clinical trials and positive ea...","[(Subject: Medical Science Liaison (MSL) Notes - In-Depth Discussion on NeuroGlyde \nDate: April 10, 2023 \nProvider: Dr. James Harper \nTitle: Neurologist \nInstitution: City Neurology Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced NeuroGlyde, a novel neuroprotective agent, emphasizing its potential in \nslowing disease progression. \n• Discussed ongoing clinical trials and positive early -phase results. \n2. Provider's Current Patient C ases: \n..."
2,What kind of doctor is Michael Chang?,"(\nThe context provides information about two medical professionals, Dr. Kevin Chen and Dr. Michael Chen. Dr. Kevin Chen is a cardiologist at CardioCare Specialists, while Dr. Michael Chen is a cardiologist at HeartCare Cardiovascular Center., [(Subject: Medical Science Liaison (MSL) Notes - Mixed Reactions on CardioRelief Discussion \nDate: January 15, 2023 \nProvider: Dr. Kevin Chen \nTitle: Cardiologist \nInstitution: CardioCare Specialists \nSummary of Key Discussion Points: \n...","[(Subject: Medical Science Liaison (MSL) Notes - Mixed Reactions on CardioRelief Discussion \nDate: January 15, 2023 \nProvider: Dr. Kevin Chen \nTitle: Cardiologist \nInstitution: CardioCare Specialists \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced CardioRelief as an innovative therapy for managing hypertension and \nreducing cardiovascular risk. \n• Dr. Chen expressed cautious interest, noting the need for additional options in \nhypertension management...."
3,What does Aetherisol treat?,"(\nAetherisol is a breakthrough treatment for refractory psoriasis and rheumatoid arthritis. It targets interleukin -17 pathways in the skin and has shown efficacy in achieving long-term remission. In psoriasis cases, Aetherisol has demonstrated a 70% reduction in Psoriasis Area and Severity Index (PASI) scores within 12 weeks. In r, [(Subject: Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aeth erisol \nDate: June 20, 2023 \nProvider: Dr. Michael Chang \nTitle: Dermatol...","[(Subject: Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aeth erisol \nDate: June 20, 2023 \nProvider: Dr. Michael Chang \nTitle: Dermatologist \nInstitution: Coastal Dermatology Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Aetherisol as a breakthrough treatment for refractory psoriasis. \n• Discussed its unique mechanism targeting interleukin -17 pathways in the skin. \n2. Provider's Current Patient Cases: \n• Explored Dr. Chang'..."
4,What is the mechanism of action for Aetherisol?,"(\nAetherisol's mechanism of action targets the interleukin -17 pathway in the skin. It is a breakthrough treatment for refractory psoriasis and has shown efficacy in achieving long-term remission. In rheumatoid arthritis (RA), Aetherisol demonstrates a remarkable 30% improvement in joint function within the first month, with a 60% reduction in disease activity scores observed in, [(Subject: Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aeth erisol \nDate: June 20, 2023 \n...","[(Subject: Medical Science Liaison (MSL) Notes - In-Depth Discussion on Aeth erisol \nDate: June 20, 2023 \nProvider: Dr. Michael Chang \nTitle: Dermatologist \nInstitution: Coastal Dermatology Clinic \nSummary of Key Discussion Points: \n1. Introduction: \n• Introduced Aetherisol as a breakthrough treatment for refractory psoriasis. \n• Discussed its unique mechanism targeting interleukin -17 pathways in the skin. \n2. Provider's Current Patient Cases: \n• Explored Dr. Chang'..."


# Experimentation

## Alternate Retrievers

### MultiVectorRetriever

### ParentChildRetrever

## Alternate chunking strategies

#### Overlap 1000-n chunks by 300 

In [55]:
test_chunk_size = 1000

def chunk_document(doc_path: str) -> List[Document]:
    """Chunk a document into smaller langchain Documents for embedding.

    :param doc_path: path to document
    :type doc_path: str
    :return: List of Document chunks
    :rtype: List[Document]
    """
    loader = PyPDFLoader(doc_path)
    documents = loader.load()

    # split document based on the `\n\n` character, quite unintuitive
    # https://stackoverflow.com/questions/76633836/what-does-langchain-charactertextsplitters-chunk-size-param-even-do
    text_splitter = CharacterTextSplitter(chunk_size=test_chunk_size, 
                                          chunk_overlap=300)
    
    return text_splitter.split_documents(documents)

In [56]:
# load the document and split it into chunks
doc_chunks = []
for doc in glob.glob("../../msl-data/*.pdf"):
    doc_chunks += chunk_document(doc)

In [61]:
import pandas as pd

df_queries = pd.read_csv("../../test_questions.csv")

pd.options.display.max_colwidth = 500
queries = df_queries["Query"].to_list()

results = [run_model(query, retriever) for query in queries]
answers = [result[0] for result in results]
sources = [result[1][:50] for result in results]

df_results = pd.DataFrame({"Answers": answers,
                           "Sources": sources,
                          })

df_output2 = pd.concat([df_queries, df_results], axis=1)
df_output2

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


AttributeError: 'float' object has no attribute 'replace'

## Prompt Engineering