<a href="https://colab.research.google.com/github/adsj/DBGuideInstructEmbed/blob/main/DBGuideInstructEmbed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install chromadb
!pip install langchain
!pip install pypdf
!pip install tiktoken
!pip -q install huggingface_hub
!pip -q install InstructorEmbedding sentence_transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pathlib import Path
import os
import chromadb
from chromadb.config import Settings
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

  from tqdm.autonotebook import trange


In [4]:
#Enter file paths for document folder
pdf_files_path = '/content/drive/MyDrive/PDFs/'

In [5]:
#Source datafiles in "ClinGuidelinesPDFs" folder
#Get list of documents to check
pdf_files = os.scandir(pdf_files_path)
pdf_paths = [doc.path for doc in pdf_files]
print(len(pdf_paths), pdf_paths)

3 ['/content/drive/MyDrive/PDFs/High Risk PE.pdf', '/content/drive/MyDrive/PDFs/Hyperkalaemia.pdf', '/content/drive/MyDrive/PDFs/Adult Diabetic Ketoacidosis.pdf']


In [6]:
def list_of_pdfs(collection):
    all_docs = collection.get()
    all_docs_list = []
    for i in range(len(all_docs['ids'])):
        guideline_name = all_docs['metadatas'][i]['source'].split('/')[-1]
        if guideline_name not in all_docs_list:
            all_docs_list.append(guideline_name)
    return all_docs_list

In [7]:
def add_new_pdf(new_pdf_path, db_collection_name, persist_dir, client):
    loader = PyPDFLoader(new_pdf_path)
    raw_documents = loader.load()
    text_splitterR = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 300, length_function = len)
    documents = text_splitterR.split_documents(raw_documents)
    Chroma.from_documents(documents, embeddings_model, collection_name=db_collection_name, persist_directory= persist_dir, client = client)

In [8]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})

Downloading (…)7f436/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)0daf57f436/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)af57f436/config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)7f436/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading (…)f57f436/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [9]:
persist_dir = '/content/drive/MyDrive/VectorDBs/PDFsVecDBInstruct'
client = chromadb.PersistentClient(path = persist_dir)
db_collection_name = 'PDF_collection_Instruct'
embeddings_model = instructor_embeddings
vectordb_Instruct = Chroma(client=client, collection_name=db_collection_name, embedding_function=embeddings_model)

In [10]:
# If collection does not exist then add new pdf, else create list of 'source' docs (from Metadata dictionary)
for pdf_path in pdf_paths:
    #check if collection exists
    try:
        collection = client.get_collection(name = db_collection_name, embedding_function = embeddings_model)
        screen = collection.get(where = {'source': pdf_path})
        if screen['metadatas'] == []:
            add_new_pdf(new_pdf_path=pdf_path, db_collection_name=db_collection_name, persist_dir= persist_dir, client=client)
    except ValueError:
        print ("Collection does not exist so creating it")
        add_new_pdf(new_pdf_path=pdf_path, db_collection_name=db_collection_name, persist_dir= persist_dir, client=client)

In [11]:
# Use Chroma client to interrogate collection
collection = client.get_collection(name = db_collection_name, embedding_function = embeddings_model)

In [13]:
#Check the collection content seems correct (for the 3 guideline pdfs- number = 47)
print(collection.count())

47


In [14]:
def query_db(query):
    if query == 'list':
        print('\n')
        coll_list = list_of_pdfs(collection)
        for l in range(len(coll_list)):
            print(coll_list[l])

    else:
        docs = vectordb_Instruct.similarity_search(query, k=4)
        print("\nTHE TOP 4 RETURNS ARE:")
        for i in range (len(docs)):
            print (docs[i].metadata)
        print("\nEXCERPT FROM THE FIRST DOCUMENT")
        print(docs[0].page_content)
        return docs

In [16]:
# Enter the query as a string, returns top 4 similar results (as Chromadb documents)
print ("""Enter your query (enter "list" to see full list of guidelines)""")
query = input('input query:')
docs = query_db(query)

Enter your query (enter "list" to see full list of guidelines)
input query:how do i treat high potassium

THE TOP 4 RETURNS ARE:
{'page': 1, 'source': '/content/drive/MyDrive/PDFs/Hyperkalaemia.pdf'}
{'page': 3, 'source': '/content/drive/MyDrive/PDFs/Hyperkalaemia.pdf'}
{'page': 5, 'source': '/content/drive/MyDrive/PDFs/Adult Diabetic Ketoacidosis.pdf'}
{'page': 4, 'source': '/content/drive/MyDrive/PDFs/Hyperkalaemia.pdf'}

EXCERPT FROM THE FIRST DOCUMENT
Hyperkalaemia  
 
Printed or personally saved electronic copies of this policy are considered uncontrolled. Refer to the FSFHG Policy 
hub for current controlled electronic policies.   
Page 2 of 5 3.3. Assessment  
• Perform an ECG or commence continuous cardiac monitoring  
• Record vital signs  
• Conduct a neurological examination  
• Assess for signs of renal failure, such as oedema or skin changes  
• Take bloods for U&E, glucose and venous blood gases  
3.4. Severity and Treatment  
• Identify the cause  
• Discontinue or withh

In [17]:
# To show the excerpts of the next recommendation
print(docs[1].page_content)

Printed or personally saved electronic copies of this policy are considered uncontrolled. Refer to the FSFHG Policy hub for current controlled electronic policies.   
Page 4 of 5 9.  Appendices  
9.1. Appendix 1: P harmacological management  options  
DRUG  DOSE & ADMINISTRATION  ONSET  DURATION  EFFECT ON K+ PRECAUTIONS  COMMENTS  
Stabilises cardiac cell membrane (alleviates the membrane depolarisation of severe hyperkalaemia)  
CALCIUM 
GLUCONATE  Calcium gluconate 10% 
10mL (2.2mmol calcium) IV 
over    5 minutes into a large vein  
Repeat after 5 minutes if 
ECG changes persist  1 - 5 min  30 to 60 
minutes  
May need to be repeated  Does not lower serum K
+ Avoid use in digoxin 
toxicity (increases 
digoxin ef fect) 
Hypercalcaemia Monitor response by ECG  
Avoid extravasation 
 
Redistributes extracellular potassium into the cells  
INSULIN WITH 
GLUCOSE  ACTRAPID® 10 units in         
50 mL of glucose 50% IV      
over 15 min  
 
Subsequent IV glucose 
infusion may be required 