# Using embeddings-based retrieval - Chroma Vector Database

In [1]:
import importlib
import sys


In [26]:
pip install pypdf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pypdf import PdfReader

def extract_pdf_info(pdf_path):
    reader = PdfReader(pdf_path) #"instructions/colonoscopy-guidelines.pdf")
    pdf_texts = [p.extract_text().strip() for p in reader.pages]

    # Filter the empty strings
    pdf_texts = [text for text in pdf_texts if text]

    #pdf_texts[0]
    
    return pdf_texts[0]

In [3]:
# Paths to your PDFs
guidelines_pdf_path = 'instructions/colonoscopy-guidelines.pdf'
patient_pdf_path = 'medical-record-1.pdf'

In [4]:
# Extract text
guidelines_text = extract_pdf_info(guidelines_pdf_path)
patient_text = extract_pdf_info(patient_pdf_path)

In [5]:
guidelines_text

'For educational purposes only   \n[45378] Colonoscopy, flexible; diagnostic  • Colorectal cancer screening, as indicated by 1 or more of the following: o Patient has average-risk or higher, as indicated by ALL of the following § Age 45 years or older § No colonoscopy in past 10 years o High risk family history, as indicated by 1 or more of the following: § Colorectal cancer diagnosed in one or more first-degree relatives of any age and ALL of the following: • Age 40 years or older • Symptomatic (eg, abdominal pain, iron deficiency anemia, rectal bleeding) § Family member with colonic adenomatous polyposis of unknown etiology o Juvenile polyposis syndrome diagnosis indicated by 1 or more of the following: § Age 12 years or older and symptomatic (eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia) § Age younger than 12 years and symptomatic (eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia)'

In [6]:
patient_text

'Co:Helm  \nMEDICAL RECORD Patient Name: James Freeman DOB: 06/16/1982 MRN: 456789123 Sex: Male  PRESENTING COMPLAINT Symptoms: Occasional rectal bleeding and abdominal discomfort for the past 6 months. Duration: 6 months.  PATIENT INFORMATION Name: James Freeman DOB: 06/16/1982 Gender: Male Address: 4521 Maple Avenue, Dallas, Texas 75219 Contact Number: (214) 555-0123 Emergency Contact: Not provided  MEDICAL HISTORY • Family History: Father had colorectal cancer at age 68. • Personal Medical History: Hypertension, managed with medication. • Medications: Lisinopril 10mg daily. • Allergies: No known drug allergies.  ALLERGIES • Allergies not reviewed (last reviewed 11/28/2022) • NKDA'

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter


In [33]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting torchvision (from sentence-transformers)
  Downloading torchvision-0.16.2-cp311-cp311-macosx_10_13_x86_64.whl.metadata (6.6 kB)
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp311-cp311-macosx_10_9_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torch>=1.6.0 (from sentence-transformers)
  Downloading torch-2.1.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Downloading torchvision-0.16.2-cp311-cp311-macosx_10_13_x86_64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.2 MB/s[0m eta [36

In [11]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=100)# max is 384 tokens

token_split_texts = []

token_split_texts += token_splitter.split_text(guidelines_text)

print((token_split_texts))
print(f"\nTotal chunks: {len(token_split_texts)}")

['for educational purposes only [ 45378 ] colonoscopy, flexible ; diagnostic • colorectal cancer screening, as indicated by 1 or more of the following : o patient has average - risk or higher, as indicated by all of the following § age 45 years or older § no colonoscopy in past 10 years o high risk family history, as indicated by 1 or more of the following : § colorectal cancer diagnosed in one or more first - degree relatives of any age and', 'all of the following : • age 40 years or older • symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding ) § family member with colonic adenomatous polyposis of unknown etiology o juvenile polyposis syndrome diagnosis indicated by 1 or more of the following : § age 12 years or older and symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia ) § age', 'younger than 12 years and symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia )']

Total chunks: 3


In [12]:
print((token_split_texts[0]))

for educational purposes only [ 45378 ] colonoscopy, flexible ; diagnostic • colorectal cancer screening, as indicated by 1 or more of the following : o patient has average - risk or higher, as indicated by all of the following § age 45 years or older § no colonoscopy in past 10 years o high risk family history, as indicated by 1 or more of the following : § colorectal cancer diagnosed in one or more first - degree relatives of any age and


In [13]:
print((token_split_texts[1]))

all of the following : • age 40 years or older • symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding ) § family member with colonic adenomatous polyposis of unknown etiology o juvenile polyposis syndrome diagnosis indicated by 1 or more of the following : § age 12 years or older and symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia ) § age


In [14]:
print((token_split_texts[2]))

younger than 12 years and symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia )


In [38]:
pip install chromadb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:
#pip uninstall typing_extensions

In [40]:
pip install typing_extensions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [41]:
pip install --upgrade typing_extensions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [15]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction()


In [16]:
#print(embedding_function([token_split_texts[1]]))

In [19]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("guidelines", embedding_function=embedding_function)

ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)
chroma_collection.count()

3

In [20]:
chroma_collection

Collection(name=guidelines)

### Collecting RAG chunks (documents that are pertinent to our query). 

In [24]:
query = "You are an expert medical assistant. Follow the instructions listed here along with the guidelines document to answer if the requested procedure(s) for the patient should be approved. If not provide details of the additional information that may help in making decision.1.	Please use only the information provided to answer. 2.	Your goal is to approve the requested procedure only when all the required criterias are met.3.	Ingest patient medical record PDF (medical-record-x.pdf) where X is likely a number. 4.	Create a timeline of patient’s medical history on complaints, diagnostics, diagnosis, procedures, treatments, medications etc.5.	Extract all the CPT code(s) from this document which is usually present after the text ‘Requested Procedure’ to identify which procedure(s) have been recommended by the doctor.6.	Display the name of the patient, patient’s date of birth or DOB, calculate the age from the date of birth or DOB and display it . Display MRN if provided. 7.	Identify if any conservative treatment has already been attempted from the patient medical record PDFs whether in medical procedures, clinical procedures or notes.8.	If a prior conservative treatment has already been attempted and if the treatment was successful or have shown signs of improvements, then present evidence that conservative treatment improved the patient’s condition and disapprove the need for the Requested Procedure stating the reason.9.	If the conservative treatment was not found or has failed then mention explicitly that ‘conservative treatment was not found or has failed’, and then look for the criterias present in the guidelines to identify if the Requested Procedure should be allowed or not. Please state which condition(s) in the guidelines were used to arrive at the answer.10.	At any point if you don’t know the answer, just say ‘I cannot arrive at the conclusion. Please provide additional information’. Do not hallucinate or provide incorrect or incomplete information. Please request specific information that could be helpful in making a decision.11.	Walk me through the process with chain-of-thoughts on how you arrived at the conclusion. 12.	At the end, after two blank line, summarise the final conclusion with the title ‘Conclusion:’ in less than 40 words if the requested procedure should be approved or not. Provide the reason why it should be approved or not approved. In case of indecision, specify what further information would be required to decide."

results = chroma_collection.query(query_texts=[query], n_results=len(token_split_texts))
retrieved_documents = results['documents'][0]

for document in retrieved_documents:
    print((document))
    print('\n')

for educational purposes only [ 45378 ] colonoscopy, flexible ; diagnostic • colorectal cancer screening, as indicated by 1 or more of the following : o patient has average - risk or higher, as indicated by all of the following § age 45 years or older § no colonoscopy in past 10 years o high risk family history, as indicated by 1 or more of the following : § colorectal cancer diagnosed in one or more first - degree relatives of any age and


all of the following : • age 40 years or older • symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding ) § family member with colonic adenomatous polyposis of unknown etiology o juvenile polyposis syndrome diagnosis indicated by 1 or more of the following : § age 12 years or older and symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia ) § age


younger than 12 years and symptomatic ( eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia )




In [25]:
import os
import openai
from openai import OpenAI

api_key = os.environ['openai_api_key'] # copy paste your api key here

openai_client = OpenAI(api_key=api_key)

In [30]:
information=patient_text 
information = information.join(retrieved_documents)
information

'for educational purposes only [ 45378 ] colonoscopy, flexible ; diagnostic • colorectal cancer screening, as indicated by 1 or more of the following : o patient has average - risk or higher, as indicated by all of the following § age 45 years or older § no colonoscopy in past 10 years o high risk family history, as indicated by 1 or more of the following : § colorectal cancer diagnosed in one or more first - degree relatives of any age andCo:Helm  \nMEDICAL RECORD Patient Name: James Freeman DOB: 06/16/1982 MRN: 456789123 Sex: Male  PRESENTING COMPLAINT Symptoms: Occasional rectal bleeding and abdominal discomfort for the past 6 months. Duration: 6 months.  PATIENT INFORMATION Name: James Freeman DOB: 06/16/1982 Gender: Male Address: 4521 Maple Avenue, Dallas, Texas 75219 Contact Number: (214) 555-0123 Emergency Contact: Not provided  MEDICAL HISTORY • Family History: Father had colorectal cancer at age 68. • Personal Medical History: Hypertension, managed with medication. • Medicatio

In [31]:
def rag(query, retrieved_documents, model="gpt-4-1106-preview"): #gpt-3.5-turbo
    information=patient_text 
    information = information.join(retrieved_documents)

    messages = [
        {
            "role": "system",
            "content": "You are a helpful medical expert. Use the information provided to answer."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"}
    ]
    
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content
    return content

In [32]:
output = rag(query=query, retrieved_documents=retrieved_documents)

print((output))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
1. Ingest patient medical record PDF: Ingested.

2. Create a timeline of patient's medical history on complaints, diagnostics, diagnosis, procedures, treatments, medications, etc.:
   - Duration of symptoms (occasional rectal bleeding and abdominal discomfort): 6 months.
   - Family History: Father had colorectal cancer at age 68.
   - Personal Medical History: Hypertension.
   - Medications: Lisinopril 10mg daily.

3. Extract all the CPT code(s): 45378 (colonoscopy, flexible; diagnostic).

4. Display the name, DOB, age, and MRN:
   - Name: James Freeman
   - DOB: 06/16/1982
   - Age: As of the knowledge cutoff in 2023, James would be 40 years old.
   - MRN: 456789123

5. Identify if any conservative treatme