In [4]:
ls

CD_seller_signed.pdf                    app_scratch_book.ipynb
Hatching_a_story.docx                   final_signed_offer.pdf
LICENSE                                 image2.png
Purchase_and_sale_final.pdf             image3.png
README.md                               image5.png
RSM_packing_list.docx                   image6.png
RSM_packing_list_2.docx                 img.png
Software_Engineering_Practices_TOP.txt  requirements.txt
app.py                                  [34mtemp[m[m/
app1.py                                 ~$tching_a_story.docx


In [12]:
input_file_path = "Purchase_and_sale_final.pdf"

In [8]:
input_file_path = "final_signed_offer.pdf"

In [17]:
input_file_path = "CD_seller_signed.pdf"

In [1]:
# Install all libraries by running in the terminal: pip install -r requirements.txt
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from docx import Document
import PyPDF2
import os
import tempfile
import tiktoken
import shutil ###########################


# fetch environmental variables
load_dotenv()

True

In [2]:
# helper function to process the input text file, remove empty lines and unneeded formatting marks
def process_input_file(input_file_path):
    '''
    Extract the text of a txt, docx or pdf file, drop blank and separator
    lines, and save the result to a temporary text file.

    The raw text is split into individual lines (a PDF page or a docx
    paragraph may span several lines), every line is stripped, and lines that
    are empty or consist only of '-' / '_' characters are discarded. The
    remaining lines are joined with newlines and written, UTF-8 encoded, to a
    new file in a "temp" directory located next to the input file.

    Parameters:
        input_file_path (str): path to the input .txt, .docx or .pdf file

    Returns:
        str: path of the processed temporary text file saved in temp/.
            The file is created with delete=False, so the caller is
            responsible for removing it.

    Raises:
        ValueError: if the file extension is not .txt, .docx or .pdf
    '''
    file_extension = os.path.splitext(input_file_path)[1].lower()

    # Read the raw text BEFORE creating the temporary file, so that an
    # unsupported or unreadable input never leaves an orphaned file in temp/.
    if file_extension == '.txt':
        with open(input_file_path, 'r', encoding='UTF-8') as input_file:
            raw_blocks = input_file.readlines()
    elif file_extension == '.docx':
        raw_blocks = [p.text for p in Document(input_file_path).paragraphs]
    elif file_extension == '.pdf':
        with open(input_file_path, 'rb') as input_file:
            reader = PyPDF2.PdfReader(input_file)
            # guard against a page with no extractable text (None or '')
            raw_blocks = [page.extract_text() or '' for page in reader.pages]
    else:
        raise ValueError("Unsupported file format: " + file_extension)

    # A block may hold many lines (e.g. a whole PDF page), so filter per line
    # rather than per block.
    separator_chars = {'-', '_'}
    cleaned_lines = []
    for block in raw_blocks:
        for line in block.splitlines():
            stripped = line.strip()
            if stripped and not all(char in separator_chars for char in stripped):
                cleaned_lines.append(stripped)

    # The temporary file lives in a "temp" directory next to the input file
    temp_dir = os.path.join(os.path.dirname(input_file_path), "temp")
    os.makedirs(temp_dir, exist_ok = True)

    with tempfile.NamedTemporaryFile(mode = 'w', delete = False, dir = temp_dir, encoding = 'UTF-8') as temp_file:
        temp_file.write('\n'.join(cleaned_lines))

    return temp_file.name


# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    '''
    Load a UTF-8 text file as LangChain documents using TextLoader.

    Parameters:
        file (str): path to the text file

    Returns:
        The value returned by TextLoader.load(), or None when the loader
        could not be created or the file could not be loaded (the error is
        printed, not raised).
    '''
    try:
        # Creating the loader and reading the file can both fail, so both
        # calls stay inside the same guarded block.
        return TextLoader(file, encoding = 'UTF-8').load()
    except Exception as error:
        print(f"TextLoader failed to load the text from load_document function: {error}")
        return None


# calculate embedding cost using tiktoken
def calculate_input_embedding_cost(texts):
    '''
    Count the tokens of the given documents and estimate the embedding cost.

    Tokens are counted with the tiktoken encoding for 'text-embedding-3-small'.

    Parameters:
        texts: iterable of objects exposing a page_content string

    Returns:
        tuple: (total token count,
                estimated cost computed as total_tokens / 1,000,000 * 0.02)
    '''
    encoder = tiktoken.encoding_for_model('text-embedding-3-small')

    total_tokens = 0
    for document in texts:
        total_tokens += len(encoder.encode(document.page_content))

    # check prices here: https://openai.com/pricing
    estimated_cost = (total_tokens / 1000000) * 0.02
    return total_tokens, estimated_cost


# splitting data in chunks
def chunk_data(data, chunk_size = 1024, chunk_overlap = 80):
    '''
    Split documents into chunks with RecursiveCharacterTextSplitter.

    Parameters:
        data: documents to split, passed to split_documents()
        chunk_size (int): forwarded to the splitter as chunk_size
        chunk_overlap (int): forwarded to the splitter as chunk_overlap

    Returns:
        the chunks produced by split_documents()

    Raises:
        ValueError: if the splitter returns zero chunks
    '''
    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)
    if not len(pieces):
        raise ValueError("Chunking failed - returned zero chunks!")
    return pieces


# create embeddings using OpenAIEmbeddings() and save them in a Chroma vector store
def create_embeddings(chunks, persist_directory='./chroma_db'):
    '''
    Embed the chunks with OpenAIEmbeddings and store them in a Chroma vector store.

    Parameters:
        chunks: documents to embed, passed to Chroma.from_documents()
        persist_directory (str): forwarded to Chroma as persist_directory

    Returns:
        the Chroma vector store built from the chunks
    '''
    # Fall back to 'text-embedding-3-small' when TEXT_EMBEDDING_MODEL is unset
    # or empty, instead of handing model=None to OpenAIEmbeddings.
    embedding_model = os.getenv("TEXT_EMBEDDING_MODEL") or 'text-embedding-3-small'
    embeddings = OpenAIEmbeddings(
        model = embedding_model,
        dimensions=1536)  # 512 works as well
    # Build the Chroma vector store from the text chunks and the embedding
    # model; persist_directory is forwarded, so the store is not purely
    # in-memory.
    vector_store = Chroma.from_documents(
        documents = chunks,
        embedding = embeddings,
        persist_directory = persist_directory)
    return vector_store

# another function that loads existing embeddings from disk to a vector store object
def load_embeddings_chroma(persist_directory = './chroma_db'):
    '''
    Open the Chroma vector store stored in persist_directory.

    Parameters:
        persist_directory (str): forwarded to Chroma as persist_directory

    Returns:
        the Chroma vector store bound to that directory
    '''
    # create_embeddings() reads the model name from TEXT_EMBEDDING_MODEL, so
    # read it here as well: querying with a different model than the one used
    # to build the store would compare unrelated vectors. The previous
    # hard-coded model remains the fallback.
    embedding_model = os.getenv("TEXT_EMBEDDING_MODEL") or 'text-embedding-3-small'
    embeddings = OpenAIEmbeddings(model=embedding_model, dimensions=1536)

    # Bind the store to the directory, using the embedding function above
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    return vector_store

# function to remove the chroma vector store directory
def remove_chroma_vector_store(persist_directory = './chroma_db'):
    '''
    Delete the Chroma persistence directory and print the outcome.

    Best-effort: failures are reported with the underlying error but never
    raised, and a directory that does not exist is reported as such.

    Parameters:
        persist_directory (str): directory to delete
    '''
    # NOTE(review): deleting this directory while a Chroma object created in
    # the same kernel still points at it may be what triggers "attempt to
    # write a readonly database" on the next write - confirm; restarting the
    # kernel before re-creating the store avoids the question.
    try:
        shutil.rmtree(persist_directory, ignore_errors=False)
    except FileNotFoundError:
        print(f"The '{persist_directory}' directory does not exist, nothing to remove.")
    except OSError as error:
        print(f"Error while removing the '{persist_directory}' directory: {error}")
    else:
        print(f"Removed the '{persist_directory}' directory successfully.")
    



In [3]:
pwd

'/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot_Streamlit_app/knowledge_retrieval_LLM_chatbot_Streamlit_app'

In [4]:
input_file_path

NameError: name 'input_file_path' is not defined

In [8]:
input_file_path = "RSM_packing_list_2.docx"

In [3]:
input_file_path = "Hatching_a_story.docx"

In [9]:
input_file_path

'RSM_packing_list_2.docx'

In [10]:
remove_chroma_vector_store() ##########################################

Removed the './chroma_db' directory successfully.


In [11]:
# Clean the input file into a temporary text file, load it as LangChain
# documents, and always delete the temporary file afterwards.
processed_text_file_path = process_input_file(input_file_path)
if processed_text_file_path:
    print(f"Processed input file {input_file_path}")

try:
    data = load_document(processed_text_file_path)
finally:
    # the temporary file is only needed while loading; remove it even when
    # loading raises, so temp/ does not accumulate leftovers
    os.remove(processed_text_file_path)

if data is None:
    print(f"Failed to load document: {input_file_path}")
else:
    print(f"Loaded the processed file {input_file_path}")

Processed input file RSM_packing_list_2.docx
Loaded the processed file RSM_packing_list_2.docx


In [12]:
# Settings that appear both in a call and in a printed message; defined once
# so the message can never disagree with what was actually used.
CHUNK_SIZE = 1024
TOP_K = 5

chunks = chunk_data(data, chunk_size = CHUNK_SIZE)
print(f'Chunk size: {CHUNK_SIZE}, Chunks: {len(chunks)}')

tokens, embedding_cost = calculate_input_embedding_cost(chunks)
print(f'Source document embedding cost: ${embedding_cost:.4f}')

# creating the embeddings and returning the Chroma vector store
vector_store = create_embeddings(chunks = chunks)

# open the store again through load_embeddings_chroma() (default persist directory)
db = load_embeddings_chroma()

# the question to ask (hard-coded here instead of coming from a text input widget)
question = "What is the document about?"
if question: # skip everything when the question is empty
    # build messages
    system_template = r'''
    You are answering questions only concerning the provided content of the input document.  
    If you are asked a question that is not related to the document you response will be:
    'I can answer only the questions related to the source document!'.
    ---------------
    Context: ```{context}```
    '''

    user_template = '''
    Answer questions only concerning the provided content of the input document.  
    If you are asked a question that is not related to the document you response will be:
    'I can answer only the questions related to the source document!'. 
    Here is the user's question: ```{question}```
    '''

    messages= [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(user_template)
        ]

    qa_prompt = ChatPromptTemplate.from_messages(messages)


    print(f'Retrieving top {TOP_K} results from the input text...')

    # initialize LLM
    llm = ChatOpenAI(
        api_key = os.getenv("OPENAI_API_KEY"),
        model = os.getenv("OPENAI_DEPLOYMENT_NAME"),
        temperature = 0)
    # Configure vector store to act as a retriever (finding similar items, returning top k)
    retriever = db.as_retriever(
        search_type = 'similarity',
        search_kwargs={'k': TOP_K})
    # Create a memory buffer to track the conversation
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


    # Set up conversational retrieval chain
    crc = ConversationalRetrievalChain.from_llm(
        llm = llm,
        retriever = retriever,
        memory = memory,
        chain_type = 'stuff',
        combine_docs_chain_kwargs = {'prompt': qa_prompt },
        verbose = False)


    result = crc.invoke({'question': question})
    response = result['answer']


    # history starts empty on every run of this cell
    history = ''

    # the current question and answer
    value = f'Q: {question} \nA: {response}'

    # st.session_state.history = f'{value} \n {"-" * 100} \n {st.session_state.history}'
    # h = st.session_state.history

    # newest exchange first, separated from older ones by a line of dashes
    history = f'{value} \n {"-" * 100} \n {history}'



# run the app: streamlit run app.py

Chunk size: 1024, Chunks: 9
Source document embedding cost: $0.0000


OperationalError: attempt to write a readonly database

In [12]:
result # running chroma from memory version with RSM packing list file 

{'question': 'What is the document about?',
 'chat_history': [HumanMessage(content='What is the document about?'),
  AIMessage(content='The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers camp policies, packing lists, health protocols, and procedures for ensuring a safe and enjoyable camp experience at Camp Sunapee. The letter emphasizes the importance of following camp rules, such as not bringing illegal substances, food, or electronics to camp, and provides instructions for labeling belongings, submitting health forms, and arranging transportation for out-of-state campers.')],
 'answer': 'The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers camp policies, packing lists, health protocols, and procedures for ensuring a safe and enjoyable camp experience at Camp Sunapee. The letter emphasizes the import

In [13]:
history

'Q: What is the document about? \nA: The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers camp policies, packing lists, health protocols, and procedures for ensuring a safe and enjoyable camp experience at Camp Sunapee. The letter emphasizes the importance of following camp rules, such as not bringing illegal substances, food, or electronics to camp, and provides instructions for labeling belongings, submitting health forms, and arranging transportation for out-of-state campers. \n ---------------------------------------------------------------------------------------------------- \n '

In [20]:
result # running chroma from memory version with Hatching story right after RSM packing list file 

{'question': 'What is the document about?',
 'chat_history': [HumanMessage(content='What is the document about?'),
  AIMessage(content='The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers topics such as camp policies, packing lists, health protocols, and procedures for food, illegal substances, electronics, transportation, visiting days, and laundry. The letter emphasizes the importance of following these guidelines to ensure a safe and enjoyable camp experience.')],
 'answer': 'The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers topics such as camp policies, packing lists, health protocols, and procedures for food, illegal substances, electronics, transportation, visiting days, and laundry. The letter emphasizes the importance of following these guidelines to ensure a safe and enjoyable camp experien

In [21]:
history

'Q: What is the document about? \nA: The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers topics such as camp policies, packing lists, health protocols, and procedures for food, illegal substances, electronics, transportation, visiting days, and laundry. The letter emphasizes the importance of following these guidelines to ensure a safe and enjoyable camp experience. \n ---------------------------------------------------------------------------------------------------- \n '

In [27]:
vector_store.get() # vector store after running chroma from memory version with Hatching story right after RSM packing list file 

{'ids': ['02c3c7c6-f3ca-449a-b8ff-fbfc52d954d1',
  '0b45d3db-cf17-49c6-95ca-1459a5da3fda',
  '225920f3-2fa4-4fd0-a5eb-40106540c01e',
  '42f3642a-5d23-4fb3-bd53-cc0bb080d6b9',
  '686f571b-94a0-42ec-934c-3d70577c4099',
  'a2c3870f-96d2-49c9-9692-12a295fd3d5c',
  'bf4d2939-380d-48f7-a966-32d7d04aadcf',
  'dbf09663-b05e-49a2-a484-0df108cba205',
  'dd9ee89c-6e96-464e-ac43-3b0a3486f4e8',
  'e63d07b1-7a8e-4ee6-800d-586fa08c254c'],
 'embeddings': None,
 'metadatas': [{'source': '/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot_Streamlit_app/knowledge_retrieval_LLM_chatbot_Streamlit_app/temp/tmp7zl7jpt4'},
  {'source': '/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot_Streamlit_app/knowledge_retrieval_LLM_chatbot_Streamlit_app/temp/tmp3_l0x_lc'},
  {'source': '/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot_Streamlit_app/knowledge_retrieval_LLM_chatbot_Streamlit_app/temp/tmp7zl7jpt4'},
  {'source': 

In [28]:
vector_store

AttributeError: 'Chroma' object has no attribute 'count'

In [8]:
result # RSM packing list file and running Chroma locally

{'question': 'What is the document about?',
 'chat_history': [HumanMessage(content='What is the document about?'),
  AIMessage(content='The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers topics such as camp policies, packing lists, health protocols, and procedures for ensuring a safe and enjoyable camp experience. The letter emphasizes the importance of following camp rules, such as not bringing illegal substances, food, or electronics to camp, and provides instructions for labeling belongings, submitting health forms, and arranging transportation for campers flying into Manchester, NH.')],
 'answer': 'The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers topics such as camp policies, packing lists, health protocols, and procedures for ensuring a safe and enjoyable camp experience. The letter emphasize

In [9]:
history # RSM packing list file and running Chroma locally

'Q: What is the document about? \nA: The document is a letter from the directors of RSM Camp, providing important information and guidelines for campers and their families. It covers topics such as camp policies, packing lists, health protocols, and procedures for ensuring a safe and enjoyable camp experience. The letter emphasizes the importance of following camp rules, such as not bringing illegal substances, food, or electronics to camp, and provides instructions for labeling belongings, submitting health forms, and arranging transportation for campers flying into Manchester, NH. \n ---------------------------------------------------------------------------------------------------- \n '

In [None]:
#-----------------------------------------------------------------------
result # Hatching a story loaded right after running RSM packing list and deleting local chroma db

In [None]:
history # Hatching a story loaded right after running RSM packing list and deleting local chroma db

In [13]:
# chunk size is used in the call and in the message; define it once so the
# printed value always matches what chunk_data() received
CHUNK_SIZE = 1024

chunks = chunk_data(data, chunk_size = CHUNK_SIZE)
print(f'Chunk size: {CHUNK_SIZE}, Chunks: {len(chunks)}')

tokens, embedding_cost = calculate_input_embedding_cost(chunks)
print(f'Source document embedding cost: ${embedding_cost:.4f}')


Chunk size: 1024, Chunks: 9
Source document embedding cost: $0.0000


In [14]:
# creating the embeddings and returning the Chroma vector store
vector_store = create_embeddings(chunks = chunks)

OperationalError: attempt to write a readonly database

In [None]:
db = load_embeddings_chroma() 

In [None]:
def create_embeddings(chunks, persist_directory='./chroma_db'):
    '''
    Embed the chunks with OpenAIEmbeddings and store them in a Chroma vector store.

    Parameters:
        chunks: documents to embed, passed to Chroma.from_documents()
        persist_directory (str): forwarded to Chroma as persist_directory

    Returns:
        the Chroma vector store built from the chunks
    '''
    # Fall back to 'text-embedding-3-small' when TEXT_EMBEDDING_MODEL is unset
    # or empty, instead of handing model=None to OpenAIEmbeddings.
    embedding_model = os.getenv("TEXT_EMBEDDING_MODEL") or 'text-embedding-3-small'
    embeddings = OpenAIEmbeddings(
        model = embedding_model,
        dimensions=1536)  # 512 works as well
    # Build the Chroma vector store from the text chunks and the embedding
    # model; persist_directory is forwarded, so the store is not purely
    # in-memory.
    vector_store = Chroma.from_documents(
        documents = chunks,
        embedding = embeddings,
        persist_directory = persist_directory)
    return vector_store

In [17]:
# Inline setup of the persist directory and of the embedding model, whose
# name is read from the TEXT_EMBEDDING_MODEL environment variable
persist_directory='./chroma_db'

embeddings = OpenAIEmbeddings(
        model = os.getenv("TEXT_EMBEDDING_MODEL"), 
        dimensions=1536)  # 512 works as well

In [18]:
# Create a Chroma vector store from the text chunks and the embedding model.
# persist_directory is passed, so the store is presumably written to disk
# there rather than kept only in memory - TODO confirm
vector_store = Chroma.from_documents(
    documents = chunks, 
    embedding = embeddings,
    persist_directory = persist_directory)

OperationalError: attempt to write a readonly database

In [7]:
result

{'question': 'What is the document about?',
 'chat_history': [HumanMessage(content='What is the document about?'),
  AIMessage(content="The document is about a person who finds a large, colorful egg that hatches into a baby dragon. The person secretly raises the dragon at home, and as it grows larger, hides it in the backyard. When robbers break into the house while the person's parents are away, the dragon protects the person by using its fiery breath to turn the robbers into ashes. The robbers turn out to be famous, and the person earns a Nobel Prize as a result.")],
 'answer': "The document is about a person who finds a large, colorful egg that hatches into a baby dragon. The person secretly raises the dragon at home, and as it grows larger, hides it in the backyard. When robbers break into the house while the person's parents are away, the dragon protects the person by using its fiery breath to turn the robbers into ashes. The robbers turn out to be famous, and the person earns a N