Suggested LLM and RAG application.

### RAG for the VistA Documentation Library (VDL)

## The VDL is an open set of documents on the web that is frequently used by advanced users of VistA (CACs, Pharmacists, ADPACs).
### Suggestion:
## Create a resource that offers a Q&A session for CACs/Pharmacists/ADPACs to ask questions about the VDL documentation.


In [2]:
import os

import sys
from langchain.document_loaders import WebBaseLoader, PyPDFLoader # load URLs.
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from sentence_transformers import SentenceTransformer

from langchain_chroma import Chroma
from uuid import uuid4
from langchain_core.documents import Document
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from chromadb.utils import embedding_functions
import chromadb
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
 )

# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv())

# openai.api_key = os.environ['OPENAI_API_KEY']

class PreProcess():
    '''
    A set of methods to pre-process text for a bot using the VDL library.

    Typical flow: call concat_pdfdocs() once per PDF, get_chuncks() to split
    the accumulated pages, then load_chroma() to store the chunks, or
    connect_chroma() to reopen a store that was populated earlier.
    '''

    def __init__(self):
        # Pages accumulated by concat_pdfdocs(), in load order.
        self.docs = []

    def concat_pdfdocs(self, pdfurl):
        '''
        Load one PDF and append its pages to self.docs.

        pdfurl - string, path or URL handed to PyPDFLoader
        '''
        loader = PyPDFLoader(pdfurl)
        self.docs.extend(loader.load())

    def get_chuncks(self, chunck_size = 150, chunck_overlap = 10):
        '''
        Split the accumulated documents into overlapping chunks.

        No embedding happens here; embeddings are computed by Chroma when the
        chunks are added to a collection.

        chunck_size    - int, maximum chunk length, measured with len()
        chunck_overlap - int, number of characters shared by adjacent chunks
        returns        - the list produced by split_documents(self.docs)
        '''
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunck_size,
            chunk_overlap=chunck_overlap,
            length_function=len,
            is_separator_regex=True,
        )
        return text_splitter.split_documents(self.docs)

    def delete_collection(self, collection, col_name):
        '''
        Delete the collection called col_name.

        collection - an object exposing delete_collection(name=...), such as
                     a chromadb client. Any other object falls back to the
                     historical collection.delete(name=...) call.
        col_name   - string, name of the collection to delete
        '''
        drop = getattr(collection, "delete_collection", None)
        if callable(drop):
            drop(name=col_name)
        else:
            # NOTE(review): kept only for backward compatibility; a chromadb
            # Collection.delete() does not appear to accept `name` - confirm.
            collection.delete(name=col_name)

    def write_docs(self, list, docs2save = 'docs.txt'):
        '''
        Save the text of each document to a flat text file for debugging.

        list      - iterable of documents exposing a page_content string
                    (the name shadows the builtin; kept for keyword callers)
        docs2save - string, path of the file to (over)write
        '''
        # '**' separates consecutive documents in the output file.
        data_to_write = '**'.join(doc.page_content for doc in list)
        # Explicit encoding so non-ASCII PDF text does not depend on the
        # platform default.
        with open(docs2save, 'w', encoding='utf-8') as file:
            file.write(data_to_write)

    def _open_collection(self, path, col_name=None):
        '''
        Open (or create) a collection in the persistent Chroma store at path.
        '''
        client = chromadb.PersistentClient(path=path)
        default_ef = embedding_functions.DefaultEmbeddingFunction()
        return client.get_or_create_collection(
            # Historical behaviour: the collection is named after the path.
            name=path if col_name is None else col_name,
            embedding_function=default_ef,
        )

    def load_chroma(self, splits, path="test", col_name=None):
        '''
        Load chunks into a Chroma collection one by one.

        splits   - iterable of documents exposing page_content and metadata
        path     - string, directory of the persistent Chroma store
        col_name - string, collection name; defaults to path
        returns  - the collection the chunks were added to
        '''
        collection = self._open_collection(path, col_name)
        # NOTE: ids restart at doc0 on every call, so loading twice into the
        # same collection reuses ids rather than appending new entries.
        for index, chunk in enumerate(splits):
            collection.add(
                # Newlines are stripped so each chunk is stored as one line.
                documents=chunk.page_content.replace("\n", ""),
                metadatas=chunk.metadata,
                ids='doc{0}'.format(index),
            )
        return collection

    def connect_chroma(self, path, col_name=None):
        '''
        Connect to a Chroma collection stored under "path".

        path     - string, directory of the persistent Chroma store
        col_name - string, collection name; defaults to path
        returns  - the existing collection, or a new empty one
        '''
        return self._open_collection(path, col_name)


USER_AGENT environment variable not set, consider setting it to identify your requests.
  from tqdm.autonotebook import tqdm, trange


In [4]:
# Instantiate the pre-processing helper; it starts with an empty `docs` list.
pp = PreProcess()


In [None]:
# VDL PDFs for the CPRS Text Integration Utility (TIU) package.
pdfurls = [
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiutm.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiu_util.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiuum.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiuhl7.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiuqr.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiu_1_250rn.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiu_1_0_297_ig.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiu_1_0_309_ig.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiuim.pdf",
    "https://www.va.gov/vdl/documents/Clinical/CPRS-Text_Integration_Utility_(TIU)/tiuig.pdf",
]

# Load every PDF; the pages accumulate on pp.docs.
for pdfurl in pdfurls:
    pp.concat_pdfdocs(pdfurl)
docs = pp.docs

splits = pp.get_chuncks(chunck_size=240, chunck_overlap=10)
# Uncomment to load the chunks into chromadb:
#vs = pp.load_chroma(splits) # load splits into chroma


In [5]:
# Open the persistent Chroma store at "test"; ask_q() queries this module-level `vs`.
vs = pp.connect_chroma(path = "test") # connect to a collection

In [6]:

#simple query
def ask_q(question, n_res = 5):
    '''
    Answer a question from the documents stored in the module-level `vs`.

    The question is used to retrieve the closest documents from `vs`, and the
    retrieved text is passed with the question to a local Ollama chat model.

    question - string, or a list of strings; when a list is given only the
               first entry is answered, because only the first query's
               documents are placed in the prompt
    n_res    - int, number of documents to retrieve per query
    returns  - the model's answer as a string
    raises   - ValueError if question is an empty list
    '''
    # Chroma accepts a list of query texts; accept a bare string as well.
    query_texts = [question] if isinstance(question, str) else list(question)
    if not query_texts:
        raise ValueError("question must be a non-empty string or list of strings")

    res = vs.query(
        query_texts=query_texts,
        n_results=n_res,
        include=["documents", "distances", "metadatas"]
    )

    # Define the prompt template for the LLM
    prompt = PromptTemplate(
        template="""You are an assistant for question-answering tasks.
        Use the following documents to answer the question.
        If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise:
        Question: {question}
        Documents: {documents}
        Answer:
        """,
        input_variables=["question", "documents"],
    )

    # Initialize the LLM with the llama3.2 model; temperature 0 keeps the
    # sampling as deterministic as the backend allows.
    llm = ChatOllama(
        model="llama3.2",
        temperature=0,
    )
    # Create a chain combining the prompt template and LLM
    rag_chain = prompt | llm | StrOutputParser()

    # res['documents'] holds one list per query text; index 0 is the first
    # query. Separate documents with a real newline.
    doc_texts = "\n".join(res['documents'][0])

    # Pass the plain question text, not the list wrapper, into the prompt.
    return rag_chain.invoke({"question": query_texts[0], "documents": doc_texts})


In [7]:
# The question is wrapped in a list because ask_q() forwards it as query_texts.
question = ["How to create a new note"]
ans = ask_q(question)

In [8]:
from pprint import pprint
# Pretty-print the answer returned by ask_q().
pprint(ans)

('It appears that the text is a user manual for a clinical documentation '
 'system, specifically the Text Integration Utilities (TIU) version 1.0. The '
 'manual provides instructions on how to use the system, including creating '
 'and editing documents, using Interdisciplinary Notes, and setting up '
 'document definitions.\n'
 '\n'
 'The manual covers various topics, such as:\n'
 '\n'
 '* Creating a Progress Note Document Class called Nursing Notes\n'
 '* Creating a Title called Nursing Patient Education Notes\n'
 '* Using Interdisciplinary Notes to express notes from different caregivers '
 'as a single episode of care\n'
 '* Setting up note titles for the initiating note and attachment notes '
 '(parent and child notes)\n'
 '* Using version 15 of the CPRS Windows interface or later\n'
 '\n'
 'The manual also provides guidance on how to use various features, such as:\n'
 '\n'
 '* Dragging and dropping previously created note attachments to the parent '
 'node\n'
 '* Confirming att

In [9]:
# Second query; `ans` is overwritten with the new answer.
question = ["How to delete a new note"]
ans = ask_q(question)

In [10]:
# pprint was already imported in an earlier cell; importing it again is harmless.
from pprint import pprint
pprint(ans)

('To delete a new note, go to the Interdisciplinary Notes menu and select '
 '"Detach from ID Note". This will detach the note from its parent node.')


In [14]:
# Third query; `ans` is overwritten with the new answer.
question = ["How to make an addendum"]
ans = ask_q(question)

In [12]:
# Pretty-print the current value of `ans`.
pprint(ans)

('The text appears to be a user manual for CPRS (Clinical Practice and '
 'Reporting System), a healthcare information system. The manual provides '
 'instructions on how to use the system, including searching for patient '
 "records, viewing progress notes, and navigating the system's interface.\n"
 '\n'
 'Specifically, the manual explains how to:\n'
 '\n'
 '1. Search for patient records by name or other criteria.\n'
 '2. View progress notes, including those with addendums attached.\n'
 "3. Navigate the system's tree-structured arrangement of notes.\n"
 '4. Understand the meaning of various icons used in the system, such as '
 'signed and unsigned notes, Interdisciplinary Notes, and regular notes.\n'
 '\n'
 'The manual also provides instructions on how to perform actions such as:\n'
 '\n'
 '* Quitting the system\n'
 '* Printing documents\n'
 '* Identifying signers\n'
 '* Making addendums\n'
 '\n'
 'Overall, the manual appears to be a comprehensive guide for healthcare '
 'professional

### Identify surgical site infections from text notes.