<a href="https://colab.research.google.com/github/adsj/DBSearchColabOpenAI/blob/main/DBSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install dependencies
!pip install openai
!pip install chromadb
!pip install langchain
!pip install pypdf
!pip install tiktoken

In [None]:
# Mount Google Drive at /content/drive. The PDF source folder and the vector
# database directory configured further down both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import getpass
import os
from pathlib import Path

import chromadb
from chromadb.config import Settings
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [None]:
# Obtain the OpenAI API key without writing it into the notebook source:
# use the OPENAI_API_KEY environment variable when it is set, otherwise ask for
# the key with a hidden prompt (getpass does not echo what is typed).
api_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API key: ')

In [None]:
# Create a chromadb client that persists the vector database on Google Drive.
# `persist_dir` is the folder where the database is stored; it is also passed
# to `add_new_pdf` when documents are indexed further down.
persist_dir = '/content/drive/MyDrive/VecDB'
client = chromadb.PersistentClient(path = persist_dir)

In [None]:
# Name the collection and choose the embeddings model (OpenAI in this case),
# then wrap the collection in a LangChain Chroma vector store, which is what
# the similarity search uses later on.
db_collection_name = 'Guide_collection'
embeddings_model = OpenAIEmbeddings(openai_api_key=api_key)
vectordb_guidelines = Chroma(
    client=client,
    collection_name=db_collection_name,
    embedding_function=embeddings_model,
)

In [None]:
# Folder in Google Drive that holds the PDF documents to index.
# Every entry found in this folder is later offered to the indexing loop.
pdf_files_path = '/content/drive/MyDrive/PDFs/'

In [None]:
# Collect the paths of the PDF files found in the source folder (`pdf_files_path`).
# Only regular files ending in ".pdf" are kept: every listed path is later
# opened with the PDF loader, so sub-folders and stray non-PDF files are skipped.
# The `with` block closes the directory handle opened by os.scandir.
with os.scandir(pdf_files_path) as pdf_files:
    pdf_paths = [
        doc.path
        for doc in pdf_files
        if doc.is_file() and doc.name.lower().endswith('.pdf')
    ]
print(len(pdf_paths), pdf_paths)

10 ['/content/drive/MyDrive/ClinGuidelinesPDFs/Advanced Response Team (ART).pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Code STEMI.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Chest Pain - Suspected & Confirmed ACS.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Ascitic Drain Management.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Adult Diabetic Ketoacidosis.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Adult Hyperosmolar Hyperglycaemic State.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Anticoagulation Reversal Warfarin & Heparin.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Hyperkalaemia.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/Upper Gastrointestinal Bleed.pdf', '/content/drive/MyDrive/ClinGuidelinesPDFs/High Risk PE Management.pdf']


In [None]:
def list_of_pdfs(collection):
    """Return the distinct source file names stored in a collection.

    Args:
        collection: object whose ``get()`` returns a dict containing a
            ``'metadatas'`` list; each metadata dict is expected to hold the
            document path under the ``'source'`` key.

    Returns:
        list[str]: the file names (the part of ``'source'`` after the last
        ``/``), de-duplicated, in order of first appearance.
    """
    all_docs = collection.get()
    # A dict keeps insertion order, so its keys give an ordered de-duplication
    # without re-scanning a list for every chunk.
    seen = {}
    for metadata in all_docs.get('metadatas') or []:
        # Skip chunks stored without metadata or without a 'source' entry
        # instead of failing the whole listing.
        source = (metadata or {}).get('source')
        if not source:
            continue
        seen[source.split('/')[-1]] = None
    return list(seen)

In [None]:
def add_new_pdf(new_pdf_path, db_collection_name, persist_dir, client,
                chunk_size=1500, chunk_overlap=300):
    """Load one PDF, split it into overlapping chunks and add them to the collection.

    Args:
        new_pdf_path: path of the PDF file to index.
        db_collection_name: name of the Chroma collection the chunks are added to.
        persist_dir: directory handed to Chroma as ``persist_directory``.
        client: chromadb client handed to Chroma.
        chunk_size: maximum chunk length, measured with ``len`` (default 1500).
        chunk_overlap: overlap between consecutive chunks, measured the same
            way (default 300).

    Returns:
        None. The chunks are embedded with the notebook-level
        ``embeddings_model`` and written to the collection as a side effect.
    """
    raw_documents = PyPDFLoader(new_pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    documents = splitter.split_documents(raw_documents)
    if not documents:
        # Nothing to embed (e.g. a PDF without extractable text). Sending an
        # empty batch is pointless and may be rejected by the vector store.
        print(f"No text chunks produced from {new_pdf_path}; nothing added")
        return
    Chroma.from_documents(
        documents,
        embeddings_model,
        collection_name=db_collection_name,
        persist_directory=persist_dir,
        client=client,
    )

In [None]:
# Index every PDF that is not yet in the collection.
# A PDF counts as already indexed when at least one stored chunk carries its
# path as 'source' metadata.
for pdf_path in pdf_paths:
    # Only the lookup is inside the try block: a ValueError raised while adding
    # a PDF must not be mistaken for "collection does not exist", which would
    # re-run the same failing add.
    try:
        collection = client.get_collection(name=db_collection_name, embedding_function=embeddings_model)
        screen = collection.get(where={'source': pdf_path})
        needs_indexing = screen['metadatas'] == []
    except ValueError:
        print ("Collection does not exist so creating it")
        needs_indexing = True
    if needs_indexing:
        add_new_pdf(new_pdf_path=pdf_path, db_collection_name=db_collection_name, persist_dir=persist_dir, client=client)

In [None]:
# Fetch the collection through the chromadb client (rather than the LangChain
# wrapper) so it can be inspected directly: counting and listing stored chunks.
collection = client.get_collection(
    name=db_collection_name,
    embedding_function=embeddings_model,
)

In [None]:
# Sanity check: number of chunks stored in the collection
# (usually approx 3-4 per pdf page)
chunk_count = collection.count()
print(chunk_count)

141


In [None]:
def query_db(query, k=4):
    """Run a query against the guideline collection and print the results.

    Args:
        query: the search text, or the literal string ``'list'`` to print the
            file names of all indexed documents instead of searching.
        k: number of most similar chunks to request (default 4).

    Returns:
        The list of documents returned by the similarity search, or ``None``
        when ``query`` is ``'list'``.

    Relies on the notebook-level ``collection`` and ``vectordb_guidelines``.
    """
    if query == 'list':
        print('\n')
        for guideline_name in list_of_pdfs(collection):
            print(guideline_name)
        return None

    docs = vectordb_guidelines.similarity_search(query, k=k)
    if not docs:
        # Without this guard docs[0] below raises IndexError on an empty result.
        print("\nNO MATCHING DOCUMENTS FOUND")
        return docs
    print(f"\nTHE TOP {k} RETURNS ARE:")
    for doc in docs:
        print (doc.metadata)
    print("\nEXCERPT FROM THE FIRST DOCUMENT")
    print(docs[0].page_content)
    return docs

In [None]:
# Prompt for a query and run it. `docs` receives the matching chunks, or None
# when the "list" command was entered.
print ("""Enter your query (enter "list" to see full list of guidelines)""")
# Strip surrounding whitespace so that e.g. "list " is still recognised as the
# list command instead of being sent to the similarity search.
query = input('input query:').strip()
docs = query_db(query)

Enter your query (enter "list" to see full list of guidelines)
input query:what is the dose of protamine

THE TOP 4 RETURNS ARE:
{'page': 5, 'source': '/content/drive/MyDrive/ClinGuidelinesPDFs/Anticoagulation Reversal Warfarin & Heparin.pdf'}
{'page': 5, 'source': '/content/drive/MyDrive/ClinGuidelinesPDFs/Anticoagulation Reversal Warfarin & Heparin.pdf'}
{'page': 9, 'source': '/content/drive/MyDrive/ClinGuidelinesPDFs/Anticoagulation Reversal Warfarin & Heparin.pdf'}
{'page': 8, 'source': '/content/drive/MyDrive/ClinGuidelinesPDFs/Upper Gastrointestinal Bleed.pdf'}

EXCERPT FROM THE FIRST DOCUMENT
Anticoagulation Reversal Guidelines   
Printed or personally saved electronic copies of this policy are considered uncontrolled. Refer to the FSFHG Policy hub for 
current controlled electronic policies.   
Page 6 of 10 9. Appendices  
9.1. Appendix  1: Antidotes for Anticoagulation  
Reversal Agent  Anticoagulant  Dosage / Comments  
Protamine  Unfractionated 
Heparin  
(UFH) –            

In [None]:
# Show the excerpt of the second suggestion.
# Guarded because `docs` is None after a "list" query and may hold fewer
# results than requested.
if docs and len(docs) > 1:
    print(docs[1].page_content)
else:
    print("No second result for the last query")

In [None]:
# Show the excerpt of the third suggestion.
# Guarded because `docs` is None after a "list" query and may hold fewer
# results than requested.
if docs and len(docs) > 2:
    print(docs[2].page_content)
else:
    print("No third result for the last query")

In [None]:
# Show the excerpt of the fourth suggestion.
# Guarded because `docs` is None after a "list" query and may hold fewer
# results than requested.
if docs and len(docs) > 3:
    print(docs[3].page_content)
else:
    print("No fourth result for the last query")