In [1]:
import argparse
import os
from dotenv import load_dotenv
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_chroma import Chroma
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders.pdf import PyPDFDirectoryLoader as PPDL

# Load variables from a local .env file (if any) before reading the key below.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Directory used as the Chroma persist directory.
CHROMA_PATH = "chroma"
# Directory handed to the PDF directory loader.
DATA_PATH = "pdfs"

In [2]:
def load_docs():
    """Load the PDF documents found under ``DATA_PATH``.

    Returns:
        The value produced by ``PyPDFDirectoryLoader(DATA_PATH).load()``;
        downstream code treats it as a list of ``Document`` objects.
    """
    return PPDL(DATA_PATH).load()

# Load the PDFs once; `docs` is reused by the splitting cells further down.
docs = load_docs()
print(len(docs))

6146


In [3]:
def split_docs(docs: list[Document]):
    """Break the loaded documents into overlapping text chunks.

    Args:
        docs: Documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        ``docs``, configured for 1000-character chunks with a 100-character
        overlap.
    """
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 100,
        "length_function": len,  # chunk size is measured in characters
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(docs)

# Split the loaded documents and report how many chunks were produced.
chunks = split_docs(docs)
print("len chunks: ", len(chunks))

len chunks:  12185


In [4]:
def add_to_chroma(chunks: list[Document], batch_size: int = 1000):
    """Insert chunks that are not yet stored into the persistent Chroma DB.

    Every chunk is first tagged with an ID by ``calculate_chunk_ids``; chunks
    whose ID is already present in the store are skipped, so calling this
    again with the same input adds nothing.

    Args:
        chunks: Documents to store. ``calculate_chunk_ids`` writes an ``"id"``
            entry into each chunk's ``metadata`` in place.
        batch_size: Maximum number of chunks passed to ``db.add_documents``
            per call. New chunks are written in slices of this size rather
            than in one request, because Chroma limits how many records a
            single insert may carry (the exact limit depends on the installed
            version/backend). Slicing also means that chunks written before a
            failure are kept and skipped on the next run.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_fn())
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Only the IDs are needed here; an empty `include` asks for nothing else.
    existing_ids = set(db.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Keep only the chunks whose ID is not stored yet.
    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    if not new_chunks:
        print("No new documents to add")
        return

    print(f"Adding new documents: {len(new_chunks)}")
    for start in range(0, len(new_chunks), batch_size):
        batch = new_chunks[start:start + batch_size]
        db.add_documents(batch, ids=[chunk.metadata["id"] for chunk in batch])

In [5]:
def calculate_chunk_ids(chunks):
    """Assign each chunk an ID of the form ``source:page:chunk_index``.

    ``source`` and ``page`` are read from ``chunk.metadata`` (a missing key
    shows up as ``None`` in the ID). ``chunk_index`` counts, starting at 0, how
    many chunks with the same ``source:page`` prefix came earlier in ``chunks``.

    The counter is tracked per page instead of being reset whenever the page
    differs from the previous chunk, so IDs stay unique even when chunks of one
    page are not adjacent in the input. For input that is grouped by page the
    resulting IDs are the same as with a simple running index.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``chunks`` object; each element has ``metadata["id"]`` set.
    """
    # ID structure: Page Source : Page Number : Chunk Index
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # First chunk seen for a page gets index 0, the next one 1, and so on.
        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [6]:
def clear_database():
    """Delete the directory tree at ``CHROMA_PATH`` if that path exists."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [7]:
def embed_fn():
    """Build the embedding function used for both indexing and querying.

    Returns:
        A new ``OpenAIEmbeddings`` instance created with the module-level
        ``api_key``.
    """
    return OpenAIEmbeddings(api_key=api_key)

In [8]:
# Split the documents again and add any chunks not yet in the Chroma store.
chunks = split_docs(docs)
add_to_chroma(chunks)

Number of existing documents in DB: 12185
No new documents to add


In [9]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_openai import ChatOpenAI

# Prompt for the chat model: query_rag fills {context} with the retrieved
# chunk texts and {question} with the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

def query_rag(query_text: str, show_sources: bool = False):
    """Answer a question using chunks retrieved from the Chroma store.

    The five chunks most similar to ``query_text`` are joined into a context
    block, substituted into ``PROMPT_TEMPLATE`` and sent to the
    ``gpt-4o-mini`` chat model.

    Args:
        query_text: The user's question.
        show_sources: When true, print the IDs of the retrieved chunks
            (``metadata["id"]``, or ``None`` for a chunk without one) after
            the model has answered. Defaults to ``False`` so existing callers
            get no extra output.

    Returns:
        The object returned by ``ChatOpenAI.invoke`` for the prompt. This is
        a message object rather than a plain string; the notebook calls
        ``pretty_print()`` on it.
    """
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_fn())

    # Each result is a (document, score) pair; the scores are not used here.
    results = db.similarity_search_with_score(query_text, k=5)
    retrieved_docs = [doc for doc, _score in results]

    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI(model="gpt-4o-mini", api_key=api_key)
    response = model.invoke(prompt)

    if show_sources:
        # Previously the source IDs were collected and then thrown away;
        # they are now available on request without changing the return value.
        sources = [doc.metadata.get("id", None) for doc in retrieved_docs]
        print(f"Sources: {sources}")

    return response


In [11]:
# Read a question interactively, echo it, then print the model's reply.
query = input("Ask your religious question:")
print(query)
response = query_rag(query)
response.pretty_print()

Ask your religious question: Does God exist? What is the answer from the perspectives of different religions?


Does God exist? What is the answer from the perspectives of different religions?

The provided context discusses the existence and attributes of Allah from an Islamic perspective. It emphasizes Allah's role as the creator of the heavens and the earth, His omniscience, and the notion that all matters return to Him. The context underscores the call to believe in Allah as the sole deity and to recognize His authority over all aspects of existence.

Regarding the question of God's existence from the perspectives of different religions, while the text specifically represents Islamic beliefs, other religions offer varying views:

1. **Christianity**: Generally affirms the existence of one God (similar to the concept of Allah), who is seen as the creator and sustainer of the universe, with a personal relationship with humanity through Jesus Christ.

2. **Judaism**: Believes in a single, all-powerful God who created the universe and establishes a covenant with His people.

3. **Hinduism**: Con