In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
import os
from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex
from llama_index.core import SummaryIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader

from llama_index.core import StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Show the current working directory, then move up one directory so that the
# relative paths used later ("data/...", "./chroma_db") resolve from the parent.
# The starting directory is recorded once per kernel session, so re-running this
# cell no longer climbs one extra level every time it is executed.
print(os.getcwd())
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
    os.chdir("..")
print(os.getcwd())

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# Secrets must not live in the notebook. HF_TOKEN is looked up further down via
# get_env_from_colab_or_os(), after load_dotenv() has read a local .env file.
# The token that was previously committed should be treated as leaked and revoked.

- Step 0: Set up embedding model & LLM
- Step 1: Read PDFs with Docling and assign doc_id
- Step 2: Set up ChromaDB and index

# Step 0: Set up embedding model & LLM

In [None]:
def get_env_from_colab_or_os(key):
    """Look up ``key`` in google.colab's ``userdata`` first, then in the OS environment.

    When ``google.colab`` cannot be imported, or ``userdata`` reports that the
    secret is not found, the lookup falls back to ``os.getenv``.

    Args:
        key: Name of the secret / environment variable.

    Returns:
        The value from ``userdata.get(key)`` when that call succeeds, otherwise
        ``os.getenv(key)`` (``None`` when the variable is not set).
    """
    try:
        from google.colab import userdata as colab_userdata

        try:
            secret = colab_userdata.get(key)
        except colab_userdata.SecretNotFoundError:
            # Secret missing in Colab: fall through to the environment lookup.
            pass
        else:
            return secret
    except ImportError:
        # google.colab is not importable here: use the environment instead.
        pass
    return os.getenv(key)

# Read variables from a local .env file into the process environment before
# the HF_TOKEN lookup below.
load_dotenv()

# Embedding model passed to the vector index.
EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Generation model accessed through the Hugging Face Inference API client.
GEN_MODEL = HuggingFaceInferenceAPI(
    model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    token=get_env_from_colab_or_os("HF_TOKEN"),
)

In [5]:
# Path of the PDF to parse, relative to the current working directory.
SOURCE = "data/pdd/674_PROJ_DESC_674_15MAY2011.pdf"
# Alternative, broader prompt kept for reference:
# QUERY = "Evaluate this project based on the ICVCM CCPs standard. Describe in detail the additionality methodology and any data to support the claim."
# Prompt asking about the additionality methodology and its supporting data.
QUERY = "Describe in detail the additionality methodology and any data to support the claim."

In [None]:
# Parse the source document with Docling; `documents` is what gets indexed below.
reader = DoclingReader()
# node_parser = MarkdownNodeParser()
documents = reader.load_data(SOURCE)

In [None]:
# Open the on-disk Chroma database and the collection that backs the index.
# get_or_create_collection replaces create_collection, which raises when the
# collection already exists and therefore broke every re-run of this cell.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("pdd_collection")

# Set up ChromaVectorStore
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if chroma_collection.count() == 0:
    # Empty collection: embed the parsed documents and store them.
    index = VectorStoreIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
        embed_model=EMBED_MODEL,
    )
else:
    # Collection already holds entries (e.g. from an earlier run): reuse them
    # instead of embedding and inserting the same documents a second time.
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=EMBED_MODEL,
    )

# Update ChromaDB

In [62]:
# Attach to the persisted Chroma database; the collection is fetched if it
# exists and created otherwise.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection(name="pdd_collection")

# Wrap the collection as a vector store and build a storage context around it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Create the index object on top of the existing vector store, using the same
# embedding model as before.
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=EMBED_MODEL)

In [None]:
# Docling reader used by add_docs to parse each new PDF.
reader = DoclingReader()
# Inputs for add_docs: one list for project documents, one for standards.
new_file_paths_proj = [] # paths of new project PDFs (doc_type="project")
new_file_paths_stdrd = [] # paths of new standard PDFs (doc_type="standard")

# Tag newly parsed documents with metadata and add them to the existing index.

def add_docs(list_of_paths: list, doc_type="project"):
    """Parse PDFs with Docling, tag them with metadata and insert them into the index.

    Each file is read with the module-level ``reader``. Every resulting document
    gets ``doc_type`` plus either ``proj_id`` (projects) or ``stdrd_id``
    (standards); the id is the part of the file name before the first underscore
    (e.g. ``674_PROJ_DESC_674_15MAY2011.pdf`` -> ``"674"``). All files are parsed
    before anything is inserted into the module-level ``index``.

    Args:
        list_of_paths (list): List of PDF file paths to be processed.
        doc_type (str, optional): Type of PDF, either "project" or "standard". Defaults to "project".

    Raises:
        ValueError: If ``doc_type`` is neither "project" nor "standard".
    """
    if doc_type not in ("project", "standard"):
        # Previously any other value was silently stored as a standard.
        raise ValueError(
            f'doc_type must be "project" or "standard", got {doc_type!r}'
        )
    id_key = "proj_id" if doc_type == "project" else "stdrd_id"

    new_documents = []
    for file_path in list_of_paths:
        # Path is passed positionally, like the single-file load earlier in the
        # notebook; the reader's parameter is `file_path`, not `file_paths`.
        raw_docs = reader.load_data([file_path])
        # Extract the project/standard ID from the file name (portable across
        # path separators, unlike splitting on "/").
        item_id = os.path.basename(file_path).split("_")[0]
        for doc in raw_docs:
            doc.metadata[id_key] = item_id
            doc.metadata["doc_type"] = doc_type
        new_documents.extend(raw_docs)

    # insert() sends each document through the index's own ingestion path
    # (the same one from_documents uses) before storing it.
    for doc in new_documents:
        index.insert(doc)

# Query a specific document

In [None]:
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

# Exact-match metadata filter on key "doc_id" with value "doc_005".
# NOTE(review): add_docs in this notebook writes "proj_id" / "stdrd_id" /
# "doc_type" metadata, not "doc_id" - confirm this key and value match what is
# actually stored, otherwise the filter may select nothing.
filters = MetadataFilters(filters=[ExactMatchFilter(key="doc_id", value="doc_005")])

# Query engine that answers with GEN_MODEL using the filter above.
query_engine = index.as_query_engine(llm=GEN_MODEL, filters=filters)

In [None]:
# Send the question to the query engine built above and print the answer text.
response = query_engine.query("What do my documents say about additionality?")
print(response.response)

# Mar 10th TODO: agentic RAG code
- Each document has its own code...
- Go back to the tutorial and check the code

- PDDs and project standards should be in the same DB but with different metadata