In [7]:
pwd

'/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot_Streamlit_app/knowledge_retrieval_LLM_chatbot_Streamlit_app'

In [8]:
ls

CD_seller_signed.pdf                    app_scratch_book.ipynb
Hatching_a_story.docx                   app_scratch_pinecone.ipynb
LICENSE                                 final_signed_offer.pdf
Purchase_and_sale_final.pdf             image2.png
README.md                               image3.png
RSM_packing_list.docx                   image5.png
RSM_packing_list_2.docx                 image6.png
Software_Engineering_Practices_TOP.txt  img.png
app.py                                  requirements.txt
app1.py                                 [34mtemp[m[m/
app2.py                                 ~$tching_a_story.docx


In [None]:
#input_file_path = "Purchase_and_sale_final.pdf"

In [None]:
#input_file_path = "final_signed_offer.pdf"

In [None]:
#input_file_path = "CD_seller_signed.pdf"

In [9]:
input_file_path = "RSM_packing_list_2.docx"

In [None]:
#input_file_path = "Hatching_a_story.docx"

In [18]:
# Install all libraries by running in the terminal: pip install -r requirements.txt
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from docx import Document
import PyPDF2
import os
import tempfile
import tiktoken
import shutil ###########################
import pinecone
from langchain_community.vectorstores import Pinecone
from pinecone import ServerlessSpec




In [15]:
# fetch environmental variables
load_dotenv()

True

### Helper functions 
- Process input file
- Load document
- Calculate input costs
- Split data in chunks
- Create embeddings

In [52]:
# helper function to process the input text file, remove empty lines and unneeded formatting marks
def process_input_file(input_file_path):
    '''
    process_input_text() helper function takes the input file in txt, docx or pdf format
    as an argument and removes empty lines and non-essential characters. The output is saved
    in a temporary directory.
    
    Parameters:
        input_file_path (str): path to the input text file
    
    Returns:
        processed temporary text file path saved in temp/
    '''
    # Create a temporary file in the same directory as the input file
    temp_dir = os.path.join(os.path.dirname(input_file_path), "temp")
    os.makedirs(temp_dir, exist_ok = True)

    temp_file = tempfile.NamedTemporaryFile(mode = 'w', delete = False, dir = temp_dir, encoding = 'UTF-8')

    try:
        file_extension = os.path.splitext(input_file_path)[1].lower()

        # Read the contents of the file based on its type
        if file_extension == '.txt':
            with open(input_file_path, 'r', encoding='UTF-8') as input_file:
                lines = input_file.readlines()
        elif file_extension == '.docx':
            doc = Document(input_file_path)
            lines = [p.text for p in doc.paragraphs]
        elif file_extension == '.pdf':
            with open(input_file_path, 'rb') as input_file:
                reader = PyPDF2.PdfReader(input_file)
                lines = []
                for page_num in range(len(reader.pages)):
                    page = reader.pages[page_num] 
                    lines.append(page.extract_text())
        else:
            raise ValueError("Unsupported file format: " + file_extension)

        # Remove empty lines and lines consisting only of '-' or '_'
        non_empty_lines = [line.strip() for line in lines if line.strip() and not all(char in {'-', '_'} for char in line.strip())]

        # Write processed text to the temporary file
        temp_file.write('\n'.join(non_empty_lines))
    finally:
        # Close the temporary file
        temp_file.close()

    # Get the path of the temporary file
    temp_file_path = temp_file.name

    return temp_file_path


# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    '''
    load_documents() is a helper function to load txt file
    as langchain documents
    
    Parameters:
        file (str): path to file
    '''
    try:
        loader = TextLoader(file, encoding = 'UTF-8')
    except:
        print("TextLoader failed to load the text from load_documents function")
    
    data = loader.load()
    return data


# calculate embedding cost using tiktoken
def calculate_input_embedding_cost(texts):
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    # print(f'Total Tokens: {total_tokens}')
    # print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00002:.6f}')
    return total_tokens, (total_tokens / 1000000) * 0.02


# splitting data in chunks
def chunk_data(data, chunk_size = 1024, chunk_overlap = 80):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap)
    chunks = text_splitter.split_documents(data)
    if len(chunks) == 0:
        raise ValueError("Chunking failed - returned zero chunks!")
    return chunks


def create_embeddings(chunks, index_name = "real-estate-rag", namespace = 'user1'):

    pc = pinecone.Pinecone()
        
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)  # 512 works as well

    # create index if does not exist yet
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=ServerlessSpec(
                cloud='aws', 
                region='us-east-1'
            )
        )

    # processing the input documents, generating embeddings using the provided `OpenAIEmbeddings` instance,
    # inserting the embeddings into the index and returning a new Pinecone vector store object. 
    vector_store = Pinecone.from_documents(
        documents = chunks, 
        embedding = embeddings, 
        index_name = index_name, 
        namespace = namespace) 
    # processing the input documents, the chunks, geenrating the embeddings
    # using the provided openAI embeddings instance, inserting the embedding intot he index and returning pincone vectoor store object.
    print(f'Created vector store within {index_name} index and in {namespace} namespace')
        
    return vector_store


def delete_pinecode_index(index_name = 'all'):
    pc = pinecone.Pinecone()

    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print("Deleting all pinecone indexes...")
        for index in indexes:
            pc.delete_index(index)
    else:
        print(f"Deleting {index_name} pinecone index...")
        pc.delete_index(index_name)
        print(f"{index_name} pinecone index deleted")


In [37]:
pc.list_indexes().names()

['real-estate-rag']

In [53]:
delete_pinecode_index(index_name = 'all')

Deleting all pinecone indexes...


In [54]:
pc.list_indexes().names()

[]

### Process the input document, remove temporary file and load the document

In [38]:
processed_text_file_path = process_input_file(input_file_path)
if processed_text_file_path:
    print(f"Processed input file {input_file_path}")

data = load_document(processed_text_file_path)

os.remove(processed_text_file_path)

if data is None:
    print(f"Failed to load document: {input_file_path}")
else:
    print(f"Loaded the processed file {input_file_path}")

Processed input file RSM_packing_list_2.docx
Loaded the processed file RSM_packing_list_2.docx


### Chunk the text, calculate embedding cost and create the vector store

In [55]:
chunks = chunk_data(data, chunk_size = 1024)
print(f'Chunk size: 1024, Chunks: {len(chunks)}')

tokens, embedding_cost = calculate_input_embedding_cost(chunks)
print(f'Source document embedding cost: ${embedding_cost:.4f}')

# creating the embeddings and returning the Chroma vector store
vector_store = create_embeddings(chunks = chunks, namespace = 'user1')
print(vector_store)

Chunk size: 1024, Chunks: 9
Source document embedding cost: $0.0000
Created the vector store
<langchain_community.vectorstores.pinecone.Pinecone object at 0x12e76ea50>


In [56]:
question = "What is the document about?"

In [57]:
# build messages
system_template = r'''
You are answering questions only concerning the provided content of the input document.  
If you are asked a question that is not related to the document you response will be:
'I can answer only the questions related to the source document!'.
---------------
Context: ```{context}```
'''

user_template = '''
Answer questions only concerning the provided content of the input document.  
If you are asked a question that is not related to the document you response will be:
'I can answer only the questions related to the source document!'. 
Here is the user's question: ```{question}```
'''

messages= [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
    ]

qa_prompt = ChatPromptTemplate.from_messages(messages)

In [58]:
# initialize LLM
llm = ChatOpenAI(
    api_key = os.getenv("OPENAI_API_KEY"),  
    model = os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature = 0)

In [59]:
# Configure vector store to act as a retriever (finding similar items, returning top k)
retriever = vector_store.as_retriever(
    search_type='similarity', search_kwargs={'k': 3, 'namespace': 'user1'})

In [60]:
# Create a memory buffer to track the conversation
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

In [61]:
 # Set up conversational retrieval chain
crc = ConversationalRetrievalChain.from_llm(
    llm = llm,
    retriever = retriever,
    memory = memory,
    chain_type = 'stuff',
    combine_docs_chain_kwargs = {'prompt': qa_prompt },
    verbose = False)

In [62]:
result = crc.invoke({'question': question})
response = result['answer']

In [63]:
response

'The document is about the policies, procedures, and packing list for Camp Sunapee, directed by Olga Pristin, Ann Baranov, and Ellen Arbeznik. It emphasizes the importance of compliance with camp rules to maintain a safe and enjoyable environment. The document includes detailed instructions on what to pack, health guidelines, and a strict policy against bringing food into the cabins to avoid attracting pests and creating litter. It also mentions the necessity of labeling all belongings and the submission of health forms by a specific deadline.'

#############################################################################

### Now reading another document and creating another vector db using a different namespace within the same pinecone index

In [64]:
input_file_path = "Hatching_a_story.docx"

In [65]:
processed_text_file_path = process_input_file(input_file_path)
if processed_text_file_path:
    print(f"Processed input file {input_file_path}")

data = load_document(processed_text_file_path)

os.remove(processed_text_file_path)

if data is None:
    print(f"Failed to load document: {input_file_path}")
else:
    print(f"Loaded the processed file {input_file_path}")

Processed input file Hatching_a_story.docx
Loaded the processed file Hatching_a_story.docx


In [66]:
chunks = chunk_data(data, chunk_size = 1024)
print(f'Chunk size: 1024, Chunks: {len(chunks)}')

tokens, embedding_cost = calculate_input_embedding_cost(chunks)
print(f'Source document embedding cost: ${embedding_cost:.4f}')

# creating the embeddings and returning the Chroma vector store
vector_store = create_embeddings(chunks = chunks, namespace = 'user2') # used different namespace
print(vector_store)

Chunk size: 1024, Chunks: 1
Source document embedding cost: $0.0000
Created the vector store
<langchain_community.vectorstores.pinecone.Pinecone object at 0x12e7679d0>


In [75]:
chunks[0]

Document(metadata={'source': '/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot_Streamlit_app/knowledge_retrieval_LLM_chatbot_Streamlit_app/temp/tmpef0oq761', 'text': "Hatching a story\nThe egg was a light blue the size of a football and had red and yellow spots and began to crack and now it popped a baby dragon! it coughed and I'll pop the little flame I took it home and secretly fed it he got too big so I had to hide it in my backyard after a few years passed grow goo got too big and turned into a fully sized dragon then robbers broke into my house my mom and dad were away on the theater play so I was home alone I ran into my backyard robbers after me then my dragon stepped out and used his breath and turned the robbers into ashes after that I found out they were famous Roberts and earned the Nobel Prize"}, page_content="Hatching a story\nThe egg was a light blue the size of a football and had red and yellow spots and began to crack and now it popped a 

In [67]:
question = "What is the document about?"

In [68]:
# initialize LLM
llm = ChatOpenAI(
    api_key = os.getenv("OPENAI_API_KEY"),  
    model = os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature = 0)

In [76]:
# Configure vector store to act as a retriever (finding similar items, returning top k)
retriever = vector_store.as_retriever(
    search_type='similarity', search_kwargs={'k': 3, 'namespace': 'user2'}) # use the second namespace

In [77]:
# Create a memory buffer to track the conversation
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

In [78]:
 # Set up conversational retrieval chain
crc = ConversationalRetrievalChain.from_llm(
    llm = llm,
    retriever = retriever,
    memory = memory,
    chain_type = 'stuff',
    combine_docs_chain_kwargs = {'prompt': qa_prompt },
    verbose = False)

In [79]:
result = crc.invoke({'question': question})
response = result['answer']

In [80]:
response

"The document is about a person who finds a large, uniquely colored egg that hatches into a baby dragon. The person secretly raises the dragon at home, and as it grows larger, hides it in the backyard. When robbers break into the house while the person's parents are away, the dragon protects the person by using its fiery breath to turn the robbers into ashes. The robbers turn out to be famous, and the person earns a Nobel Prize as a result of the incident."