In [121]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores import Chroma


In [106]:
# ! pip install -U langchain-community
# ! pip install chromadb
# ! pip install openai
# ! pip install langchain-chroma
# ! pip install tiktoken
# ! pip install faiss-cpu
# ! pip install faiss-gpu

In [122]:
# Directory that load_documents() hands to PyPDFDirectoryLoader.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
DATA_PATH = "/Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data"
# Directory passed to Chroma as persist_directory when the vector store is opened.
CHROMA_PATH = "/Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data/chroma"

In [123]:
def load_documents():
    """Load the PDFs found under DATA_PATH and report how many documents came back.

    Returns:
        The list produced by PyPDFDirectoryLoader(DATA_PATH).load().
    """
    loaded = PyPDFDirectoryLoader(DATA_PATH).load()
    print(f"Loaded {len(loaded)} documents.")
    return loaded

In [124]:
def split_documents(documents: list[Document]):
    """Split documents into overlapping text chunks and report the chunk count.

    Args:
        documents: Documents to split.

    Returns:
        The chunks returned by RecursiveCharacterTextSplitter.split_documents().
    """
    # Chunk size and overlap are measured with len(), i.e. in characters.
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split into {len(pieces)} chunks.")
    return pieces

In [125]:
# Run the ingest pipeline: load the PDFs, then split them into chunks.
documents = load_documents()
chunks = split_documents(documents)
# Uncomment to eyeball the first two chunks:
# print(chunks[0],"\nchunk1\n",chunks[1])

Loaded 20 documents.
Split into 113 chunks.


In [111]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
# from langchain_community.embeddings.bedrock import BedrockEmbeddings


def get_embedding_function_ollama():
    """Return an OllamaEmbeddings instance configured with the "llama3" model."""
    return OllamaEmbeddings(model="llama3")

In [112]:
from langchain.embeddings.openai import OpenAIEmbeddings

def get_embedding_function_openai(api_key=None):
    """Return an OpenAIEmbeddings instance without embedding a secret in the notebook.

    Args:
        api_key: Optional OpenAI API key. When omitted, the key is read from
            the OPENAI_API_KEY environment variable.

    Returns:
        An OpenAIEmbeddings object constructed with the resolved key.

    Raises:
        ValueError: If no key was passed and OPENAI_API_KEY is unset or empty.
    """
    # Imported locally so this cell does not depend on a later cell's imports.
    import os

    resolved_key = api_key or os.environ.get("OPENAI_API_KEY")
    if not resolved_key:
        # Fail here with a clear message instead of a 401 from the API later on.
        raise ValueError(
            "OpenAI API key not found: pass api_key or set the OPENAI_API_KEY "
            "environment variable."
        )
    return OpenAIEmbeddings(openai_api_key=resolved_key)

In [113]:
# Smoke-check that the OpenAI embedding object can be constructed.
# NOTE(review): the saved output of this cell shows the openai_api_key field in
# clear text; clear the output before sharing the notebook.
get_embedding_function_openai()

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x167805d60>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x167843ad0>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='ysk-dz6Xg-c4WLXHiRBsY8U9QpcTvr2-16Jd7QZcB2mlKPT3BlbkFJ-YQ_DlSzsDLNFbzM1YErPSxpUG-czmurGgdNVEMKoA', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [114]:
def calculate_chunk_ids(chunks):
    """Stamp every chunk with an ID of the form "<source>:<page>:<index>".

    The index counts consecutive chunks that share the same source and page,
    starting at 0, e.g. "data/monopoly.pdf:6:2". The ID is written to
    chunk.metadata["id"]; the chunks are modified in place.

    Args:
        chunks: Iterable of objects exposing a dict-like ``metadata`` attribute.

    Returns:
        The same ``chunks`` object that was passed in.
    """
    previous_page_key = None
    index_on_page = 0

    for item in chunks:
        meta = item.metadata
        page_key = "{}:{}".format(meta.get("source"), meta.get("page"))

        # Keep counting while we stay on the same page; otherwise start over.
        index_on_page = index_on_page + 1 if page_key == previous_page_key else 0

        meta["id"] = f"{page_key}:{index_on_page}"
        previous_page_key = page_key

    return chunks


In [115]:
# def add_to_chroma(chunks: list[Document]):
#     # Load the existing database.
#     db = Chroma(
#         persist_directory=CHROMA_PATH, embedding_function=get_embedding_function_ollama()
#     )

#     # Calculate Page IDs.
#     chunks_with_ids = calculate_chunk_ids(chunks)

#     # Add or Update the documents.
#     existing_items = db.get(include=[])  # IDs are always included by default
#     existing_ids = set(existing_items["ids"])
#     print(f"Number of existing documents in DB: {len(existing_ids)}")

#     # Only add documents that don't exist in the DB.
#     new_chunks = []
#     for chunk in chunks_with_ids:
#         if chunk.metadata["id"] not in existing_ids:
#             new_chunks.append(chunk)

#     if len(new_chunks):
#         print(f"👉 Adding new documents: {len(new_chunks)}")
#         new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
#         db.add_documents(new_chunks, ids=new_chunk_ids)
#         db.persist()
#     else:
#         print("✅ No new documents to add \U0001F642")

In [116]:
from langchain.vectorstores import Chroma
from langchain.schema import Document

def add_to_chroma(chunks: list[Document]):
    """Add chunks that are not yet stored to the persistent Chroma database.

    Each chunk is expected to carry a unique ID in ``chunk.metadata["id"]``
    (as written by calculate_chunk_ids). That ID is used as the Chroma
    document ID, so running this function again skips chunks that are already
    stored.

    Args:
        chunks: Document chunks to store.

    Returns:
        None. Progress and errors are reported with print(); failures while
        talking to Chroma are caught and reported rather than raised.
    """
    # Open (or create) the persistent store with the OpenAI embedding function.
    try:
        embedding_function = get_embedding_function_openai()
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
        print("Chroma initialized successfully with embedding function from OpenAI.")
    except Exception as e:
        print(f"Error initializing Chroma: {e}")
        return  # Stop if we cannot initialize the DB

    if not chunks:
        print("No document chunks to add to Chroma.")
        return

    print(f"Attempting to add {len(chunks)} chunks to the Chroma database...")

    # Retrieve existing document IDs from the Chroma DB.
    try:
        existing_items = db.get(include=[])  # IDs are always included by default
        existing_ids = set(existing_items["ids"])
        print(f"Total documents in DB before adding: {len(existing_ids)}")
    except Exception as e:
        print(f"Error retrieving existing documents from Chroma: {e}")
        return

    # Keep only chunks whose ID is neither stored already nor repeated
    # earlier in this batch.
    new_chunks = []
    new_chunk_ids = []
    seen_ids = set(existing_ids)
    for chunk in chunks:
        chunk_id = chunk.metadata.get("id")
        if chunk_id is None:
            # A chunk without an ID cannot be de-duplicated, so it is not stored.
            print("Skipping chunk without an ID; run calculate_chunk_ids() first.")
        elif chunk_id in seen_ids:
            print(f"Skipping duplicate chunk with ID: {chunk_id}")
        else:
            seen_ids.add(chunk_id)
            new_chunks.append(chunk)
            new_chunk_ids.append(chunk_id)

    if new_chunks:
        try:
            # Pass the IDs explicitly: without them the store assigns its own
            # IDs and the de-duplication above can never match on later runs.
            db.add_documents(new_chunks, ids=new_chunk_ids)
            # NOTE(review): persist() may be absent on newer Chroma wrappers
            # that persist automatically; call it only when available.
            if hasattr(db, "persist"):
                db.persist()
            print(f"Successfully added {len(new_chunks)} new chunks to Chroma.")
        except Exception as e:
            print(f"Error adding new chunks to Chroma: {e}")
    else:
        print("No new chunks to add, all chunks already exist in the DB.")

    # Report the number of documents in the DB after adding.
    try:
        existing_items = db.get(include=[])  # IDs are always included by default
        print(f"Total documents in DB after adding: {len(existing_items['ids'])}")
    except Exception as e:
        print(f"Error retrieving documents from Chroma after adding: {e}")


In [117]:
def check_existing_documents():
    """Print how many document IDs the Chroma store at CHROMA_PATH currently holds."""
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function_openai(),
    )
    # An empty include list skips payload fields; the IDs are still returned.
    stored_ids = store.get(include=[])["ids"]
    print(f"Total documents in DB: {len(stored_ids)}")

# Report the current size of the Chroma store.
check_existing_documents()

Total documents in DB: 226


In [118]:
# Stamp each chunk with its "<source>:<page>:<index>" ID, then hand the chunks
# to add_to_chroma().
chunks = calculate_chunk_ids(chunks)
add_to_chroma(chunks)

Chroma initialized successfully with embedding function from Ollama.
Attempting to add 113 chunks to the Chroma database...
Total documents in DB before adding: 226
Skipping duplicate chunk with ID: /Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data/E17-1033.pdf:0:0
Skipping duplicate chunk with ID: /Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data/E17-1033.pdf:0:1
Skipping duplicate chunk with ID: /Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data/E17-1033.pdf:0:2
Skipping duplicate chunk with ID: /Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data/E17-1033.pdf:0:3
Skipping duplicate chunk with ID: /Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data/E17-1033.pdf:0:4
Skipping duplicate chunk with ID: /Users/atreyeemukherjee/Library/Mobile Documents/com~apple~CloudDocs/side projects/data/E17-1033.pdf:0:5
S

In [119]:
import os
import pickle
from langchain.vectorstores import FAISS


# Location of the saved FAISS index used by add_to_faiss(); relative to the
# current working directory.
FAISS_PATH = "faiss_index"

def add_to_faiss(chunks: list):
    """Add chunks to the FAISS index stored at FAISS_PATH, creating it if needed.

    The index is saved and restored with the vector store's own
    save_local() / load_local() helpers instead of pickling the store object,
    and a new index is built with FAISS.from_documents() because the FAISS
    constructor cannot be called with an embedding function alone.

    Args:
        chunks: Document chunks to embed and add.

    Returns:
        None. Progress is reported with print().
    """
    if not chunks:
        # There is nothing to embed, and a new index cannot be built from no documents.
        print("No document chunks to add to FAISS.")
        return

    embedding_function = get_embedding_function_openai()

    print(f"👉 Adding new documents: {len(chunks)}")
    if os.path.exists(FAISS_PATH):
        # SECURITY: load_local() unpickles the stored docstore, so only load
        # an index this notebook wrote itself, never an untrusted file.
        db = FAISS.load_local(
            FAISS_PATH,
            embedding_function,
            allow_dangerous_deserialization=True,
        )
        print("Loaded existing FAISS index.")
        db.add_documents(chunks)
    else:
        db = FAISS.from_documents(chunks, embedding_function)
        print("Created a new FAISS index.")

    # Persist the index and docstore under FAISS_PATH.
    db.save_local(FAISS_PATH)
    print("FAISS index updated and saved. \U0001F642")


In [120]:
import argparse
import sys
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

# from get_embedding_function import get_embedding_function

# Reuse the CHROMA_PATH configured earlier in the notebook so queries read the
# same store the ingest step wrote to. Fall back to the relative "chroma"
# directory only when this cell runs on its own, with no earlier definition.
CHROMA_PATH = globals().get("CHROMA_PATH", "chroma")

# Prompt skeleton used by query_rag(): {context} receives the retrieved chunk
# texts and {question} receives the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


def main(argv=None):
    """Parse the query text and run it through query_rag().

    Args:
        argv: Optional list of command-line style arguments holding the query
            text, e.g. ``["What is POS tagging?"]``. When omitted, the
            notebook's sample query "POS tagging definition" is used.

    Returns:
        None.
    """
    if argv is None:
        # Hand argparse an explicit argument list instead of overwriting the
        # process-wide sys.argv, which other code in the session may still read.
        argv = ["POS tagging definition"]

    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text.")
    args = parser.parse_args(argv)
    query_rag(args.query_text)


def query_rag(query_text: str):
    """Answer a question using chunks retrieved from Chroma as context.

    Retrieves up to five chunks from the store at CHROMA_PATH, prints each
    one's metadata, score and first 500 characters, fills PROMPT_TEMPLATE with
    the chunk texts and the question, and sends the prompt to the Ollama
    "llama3" model.

    Args:
        query_text: The question to answer.

    Returns:
        The model's response text, or the string
        "No relevant documents found." when the search returns nothing.
    """
    vector_store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function_openai(),
    )

    matches = vector_store.similarity_search_with_score(query_text, k=5)
    if not matches:
        print("No documents retrieved.")
        return "No relevant documents found."

    passages = []
    source_ids = []
    for document, score in matches:
        print(f"Document Metadata: {document.metadata}")
        print(f"Similarity Score: {score}")
        print(f"Document Content: {document.page_content[:500]}\n")  # first 500 characters only
        passages.append(document.page_content)
        source_ids.append(document.metadata.get("id", None))

    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context="\n\n---\n\n".join(passages),
        question=query_text,
    )

    answer = Ollama(model="llama3").invoke(prompt)

    print(f"Response: {answer}\nSources: {source_ids}")
    return answer


# Run the sample query when this cell/script is executed directly.
if __name__ == "__main__":
    main()

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: ysk-dz6X************************************************************************************MKoA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [92]:
# def query_rag(query_text: str):
#     # Prepare the DB.
#     embedding_function = get_embedding_function_ollama()
#     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

#     # Debug: Check if documents are in the database
#     existing_items = db.get(include=[])  # IDs are always included by default
#     print(f"Total documents in DB: {len(existing_items['ids'])}")

#     # Search the DB.
#     results = db.similarity_search_with_score(query_text, k=5)

#     if not results:
#         print("No documents retrieved.")
#         return "No relevant documents found."

#     # Display results for debugging
#     for doc, score in results:
#         print(f"Document Metadata: {doc.metadata}")
#         print(f"Similarity Score: {score}")
#         print(f"Document Content: {doc.page_content[:500]}\n")  # Print the first 500 characters


In [93]:
# Run a sample query through the RAG pipeline.
query_rag("POS tagging definition")

Total documents in DB: 0
No documents retrieved.


'No relevant documents found.'