# Importing All Necessary Packages

In [1]:
import os
import logging
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
load_dotenv()

True

# Defining Prompt Template

In [2]:
prompt_template = """
    Use the following pieces of context to answer the question at the end in one sentence.
    If you don't know the answer, don't try to make up an answer.

    {context}

    Question: {question}
"""

# Set a Custom Prompt

In [3]:
def set_custom_prompt():
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    return prompt

# Loading LLM using Ollama

In [4]:
def load_llm():
    llm = Ollama(
        model="llama2", # tried out mistral-7b, llama2, tinyllama
        verbose=True,
        temperature=0.2,
        callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    )
    return llm

# Creating a QA Chain

In [5]:
def retrieval_qa_chain(llm, prompt, vectorstore):
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        chain_type="stuff", # tried refine, map_reduce, map_rerank 
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
    )
    return qa_chain

# Creating a QA Bot by Initializing Chain with LLM, Prompt and Retriever (VDB)

In [6]:
def create_retrieval_qa_bot():
    vectorstore = Chroma(persist_directory=os.getenv('DB_PATH'), embedding_function=HuggingFaceEmbeddings())
    try:
        llm = load_llm()
    except Exception as e:
        raise Exception(f"Failed to load model: {str(e)}")
    qa_prompt = set_custom_prompt()
    try:
        qa = retrieval_qa_chain(llm, qa_prompt, vectorstore)
    except Exception as e:
        raise Exception(f"Failed to create retrieval QA chain: {str(e)}")
    return qa

# Function to Get Response from QA Chain

In [7]:
def get_chain_response(chain, message_content):
    cb = CallbackManager([StreamingStdOutCallbackHandler()])
    res = chain.invoke(message_content, callbacks=[cb])
    print(f"response: {res}")
    
    # exception handling if the bot doesn't know the answer
    if "I don't know" in res["result"]:
        fallback_message = (
            "I'm sorry I don't know the answer to that."
            "Please contact support@mail.com for further assistance regarding your query."
        )
        res["result"] = fallback_message
    return res

# Main Interaction with Bot

In [8]:
def main_interaction():
    chain = create_retrieval_qa_bot()
    while True:
        user_query = input("Enter your query: ")
        if user_query.lower() in ["exit", "quit"]:
            break
        response = get_chain_response(chain, user_query)
        print(response["result"])

# Importing Packages for Document Chunking & Vectorization

In [9]:
import json
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import NLTKTextSplitter
import csv
from pptx import Presentation
from langchain.docstore.document import Document

# Function to Process CSV Documents

In [10]:
def load_csv_documents(file_path):
    documents = []
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader)  # assuming the first row is the header
        for row in reader:
            content = ' '.join(row)  # joining all columns to form the content
            documents.append(Document(page_content=content, metadata={"source": file_path}))
    return documents

# Function to Process PPTX Documents

In [11]:
def load_pptx_documents(file_path):
    documents = []
    presentation = Presentation(file_path)
    for slide in presentation.slides:
        slide_texts = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_texts.append(shape.text)
        content = "\n".join(slide_texts)
        documents.append(Document(page_content=content, metadata={"source": file_path}))
    return documents

# Function to Process JSON Documents

In [12]:
def load_json_documents(file_path):
    documents = []
    with open(file_path, mode='r', encoding='utf-8') as file:
        json_data = json.load(file)
        content = json.dumps(json_data)  # Convert JSON data to a string
        documents.append(Document(page_content=content, metadata={"source": file_path}))
    return documents

# Function to Load all Documents from Local Directory

In [13]:
def load_documents(data_path):
    documents = []

    # load PDF files
    pdf_loader = PyPDFDirectoryLoader(data_path)
    pdf_documents = pdf_loader.load()
    documents.extend(pdf_documents)
    
    # load CSV files
    for file in os.listdir(data_path):
        if file.endswith(".csv"):
            csv_documents = load_csv_documents(os.path.join(data_path, file))
            documents.extend(csv_documents)

    # load PPTX files
    for file in os.listdir(data_path):
        if file.endswith(".pptx"):
            ppt_documents = load_pptx_documents(os.path.join(data_path, file))
            documents.extend(ppt_documents)

    # load JSON files
    for file in os.listdir(data_path):
        if file.endswith(".json"):
            json_documents = load_json_documents(os.path.join(data_path, file))
            documents.extend(json_documents)
    return documents

# Function to Create Vector Store

In [14]:
def create_vector_db():
    data_path = os.getenv('DATA_PATH')
    db_path = os.getenv('DB_PATH')

    if not data_path or not db_path:
        raise ValueError("DATA_PATH or DB_PATH environment variables not set.")

    documents = load_documents(data_path)
    print(f"Processed {len(documents)} pages.")

    text_splitter = NLTKTextSplitter(chunk_size=1024, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")

    vector_store = Chroma.from_documents(
        documents=texts,
        embedding=HuggingFaceEmbeddings(),
        persist_directory=db_path
    )
    vector_store.persist()
    print(f"Vector database persisted at {db_path}.")

In [16]:
create_vector_db()

Created a chunk of size 1814, which is longer than the specified 1024
Created a chunk of size 13251, which is longer than the specified 1024
Created a chunk of size 1900, which is longer than the specified 1024
Created a chunk of size 9390, which is longer than the specified 1024
Created a chunk of size 5503, which is longer than the specified 1024
Created a chunk of size 11272, which is longer than the specified 1024
Created a chunk of size 19601, which is longer than the specified 1024
Created a chunk of size 23993, which is longer than the specified 1024
Created a chunk of size 14816, which is longer than the specified 1024
Created a chunk of size 9439, which is longer than the specified 1024
Created a chunk of size 22207, which is longer than the specified 1024
Created a chunk of size 37796, which is longer than the specified 1024
Created a chunk of size 1868, which is longer than the specified 1024
Created a chunk of size 8214, which is longer than the specified 1024
Created a chu

Processed 152 pages.
Split into 303 chunks.




Vector database persisted at D:/Chatbot/ollama-RAG2/vectorstore/.


  warn_deprecated(


# Main Function

In [17]:
def main():
    main_interaction()

if __name__ == "__main__":
    main()

  warn_deprecated(


ValueError: `run` not supported when there is not exactly one output key. Got ['result', 'source_documents'].