# Knowledge Retrieval Chatbot LLM App

In [1]:
# Load packages
from openai import OpenAI
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
import os
import argparse
import tempfile

In [17]:
# load environment variables
load_dotenv()

True

In [5]:
# helper function to process the input text file, remove empty lines and unneeded formatting marks
def process_input_text(input_file_path):
    '''
    process_input_text() helper function cleans a text file and writes the
    result to a new temporary file.

    Every line is stripped of leading/trailing whitespace. Lines that are
    empty after stripping, or that consist only of '-' and '_' characters
    (separator rules), are dropped. The remaining lines are joined with
    '\\n' and written to a uniquely named file inside a "temp" directory
    located next to the input file. The output file is NOT deleted
    automatically (delete = False); the caller owns it.

    Parameters:
        input_file_path (str): path to the input text file (read as UTF-8)

    Returns:
        str: path of the processed temporary text file saved in temp/

    Raises:
        OSError: if the input file cannot be read (e.g. FileNotFoundError)
            or the output cannot be written. In either case no temporary
            file is left behind.
        UnicodeDecodeError: if the input file is not valid UTF-8
    '''

    # Read and clean the input first, so that a missing or unreadable input
    # fails before any temporary file has been created on disk.
    with open(input_file_path, 'r', encoding = 'UTF-8') as input_file:
        stripped_lines = (line.strip() for line in input_file)
        # keep lines that have content and are not pure '-'/'_' separators
        non_empty_lines = [
            line for line in stripped_lines
            if line and not all(char in {'-', '_'} for char in line)
        ]

    # create the temporary file in a "temp" directory next to the input file
    temp_dir = os.path.join(os.path.dirname(input_file_path), "temp")
    os.makedirs(temp_dir, exist_ok = True)

    temp_file = tempfile.NamedTemporaryFile(mode = 'w', delete = False, dir = temp_dir, encoding = 'UTF-8')
    try:
        # the context manager closes the file even if the write fails
        with temp_file:
            temp_file.write('\n'.join(non_empty_lines))
    except BaseException:
        # delete = False means nothing else would clean up a partial file
        os.remove(temp_file.name)
        raise

    return temp_file.name

In [6]:
# helper function to ask questions to LLM chain
def answer_question(q, chain):
    '''
    answer_question() is a helper function that sends a single question
    through an LLM chain and returns the answer.

    Parameters:
        q (str): user's question
        chain: object exposing invoke(); it is called with {'question': q}
            and must return a mapping containing an 'answer' key (in this
            notebook a Langchain ConversationalRetrievalChain is passed)

    Returns:
        the value stored under 'answer' in the chain's result
    '''
    payload = {'question': q}
    return chain.invoke(payload)['answer']


In [28]:
input_text_file = "Software_Engineering_Practices.txt"

In [29]:
# process input file
processed_text_file_path = process_input_text(input_text_file)


In [30]:
# Load the processed text file (path returned by process_input_text) with TextLoader.
# The encoding is passed explicitly to match the UTF-8 the processed file was written in.
loader = TextLoader(processed_text_file_path, encoding = 'UTF-8') # need encoding specified
data = loader.load() 

In [31]:
data

[Document(page_content='PHC Imaging\nTechnical Operating Procedure\nSoftware Engineering Practices for Personalized HealthCare Imaging Algorithms\n1. Purpose\nGood software engineering practice adherence is required for compliance with Roche Quality Management System policies. This will ensure that our code development is uniform and well documented to keep us within regulations, allowing us to support development work on regulated systems and devices.\nThe purpose of this document is to provide the teams responsible for the development of Medical Imaging Algorithms, with high-level principles and concepts making up the software engineering practices in the Personalized HealthCare (PHC) Imaging group.\nAs different subteams within the PHC Imaging group have different processes, coding standards and tools that they may use, this document is intended to serve as a high-level and an overarching Technical Operating Procedure (TOP) across the PHC imaging subteams. Individual product teams m

In [32]:
# split the loaded document(s) into overlapping chunks using RecursiveCharacterTextSplitter
# (chunk_size = 1024, chunk_overlap = 80; presumably measured in characters — TODO confirm)
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1024, chunk_overlap = 80)
chunks = text_splitter.split_documents(data)

In [33]:
# Instantiate an OpenAI embedding model (OpenAIEmbeddings from langchain_openai,
# not an Azure-specific class), requesting 1536-dimensional vectors
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small', 
    dimensions=1536)

In [34]:
# Create an in-memory Chroma vector store using the provided text chunks 
# and the embedding model 
# NOTE(review): no persist directory is passed, so the store presumably does not
# survive a kernel restart — confirm against the Chroma defaults
vector_store = Chroma.from_documents(documents = chunks, embedding = embeddings)

In [35]:
vector_store

<langchain_community.vectorstores.chroma.Chroma at 0x1353dcb90>

In [36]:
len(vector_store)

44

In [37]:
# initialize the OpenAI chat model (ChatOpenAI from langchain_openai, not an Azure-specific class)
# The API key and model name are read from the environment.
# NOTE(review): os.getenv returns None when OPENAI_DEPLOYMENT_NAME is unset —
# confirm what ChatOpenAI does with model = None
llm = ChatOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),  
    model = os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature=0) 

In [38]:
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1353d7d90>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x135349690>, temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='')

In [39]:
# Configure vector store to act as a retriever: similarity search that
# returns the top k = 5 matches for each query
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})

In [40]:
# Create a memory buffer to track the conversation; the history is kept under
# the 'chat_history' key and return_messages=True is set
# (presumably so the history is returned as message objects rather than one string — TODO confirm)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

In [41]:
# build messages
# System message: restricts answers to the document and carries the {context}
# placeholder (presumably filled with the retrieved chunks by the chain — TODO confirm).
# NOTE(review): "you response" in both templates looks like a typo for
# "your response"; left untouched here because it is part of the prompt text.
system_template = r'''
You are answering questions only concerning the provided content of the input document.  
If you are asked a question that is not related to the document you response will be:
'The question is not relevant to the domain of interest'.
---------------
Context: ```{context}```
'''

# Human message: repeats the restriction and carries the {question} placeholder
user_template = '''
Answer questions only concerning the provided content of the input document.  
If you are asked a question that is not related to the document you response will be:
'The question is not relevant to the domain of interest'. 
Here is the user's question: ```{question}```
'''

# Order matters: the system message comes first, then the human message
messages= [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
    ]

# Combined chat prompt with input variables 'context' and 'question'
qa_prompt = ChatPromptTemplate.from_messages(messages)

In [42]:
qa_prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="\nYou are answering questions only concerning the provided content of the input document.  \nIf you are asked a question that is not related to the document you response will be:\n'The question is not relevant to the domain of interest'.\n---------------\nContext: ```{context}```\n")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template="\nAnswer questions only concerning the provided content of the input document.  \nIf you are asked a question that is not related to the document you response will be:\n'The question is not relevant to the domain of interest'. \nHere is the user's question: ```{question}```\n"))])

In [43]:
# Set up conversational retrieval chain from the pieces built above:
#   llm       - the chat model
#   retriever - the vector-store retriever
#   memory    - the conversation buffer holding the chat history
#   chain_type = 'stuff' with qa_prompt passed as the prompt of the combine-documents step
#   (presumably all retrieved chunks are inserted into the prompt's {context} — TODO confirm)
crc = ConversationalRetrievalChain.from_llm(
    llm = llm,
    retriever = retriever,
    memory = memory,
    chain_type = 'stuff',
    combine_docs_chain_kwargs = {'prompt': qa_prompt },
    verbose = False)

In [44]:
input_question_file = "questions_list.txt"

In [45]:
# process the input_question_file line by line (one question per line)
# UTF-8 is specified explicitly, matching how the input document is read
with open(input_question_file, 'r', encoding = 'UTF-8') as file:
    question_counter = 0
    for line in file:
        # strip the trailing newline so it is neither sent to the chain nor printed
        new_user_question = line.strip()
        # skip blank lines instead of sending an empty question to the LLM
        if not new_user_question:
            continue
        question_counter += 1
        response = answer_question(q = new_user_question, chain = crc)
        # print the question
        print(f"Answering question {question_counter}: {new_user_question}")
        # print the response
        print(f"Answer to question {question_counter}: \n")
        print(response, "\n")

Answering question 1: What does TOP stand for?

Answer to question 1: 

TOP stands for Technical Operating Procedure as mentioned in the document. 

Answering question 2: What kind of algorithms does it apply to?

Answer to question 2: 

The TOP (Technical Operating Procedure) applies to rule-based and Machine Learning (ML) based algorithms delivered within the PHC Imaging group. 

Answering question 3: What are the phases of the development life cycle?

Answer to question 3: 

The phases of the development life cycle outlined in the document are:
1. Initialization & preparation
2. Development/Experimental phase
3. Qualification Phase
4. Deployment Phase 

Answering question 4: What is their number?

Answer to question 4: 

There are four phases outlined in the development life cycle as described in the document. 

Answering question 5: Give me a recipe for an omelet

Answer to question 5: 

The question is not relevant to the domain of interest. 

Answering question 6: What are coding