# process country policy and carbon policy docs

In [None]:
import os
import uuid
from fastapi import UploadFile
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import TitleExtractor
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.readers.docling import DoclingReader
from llama_index.embeddings.gemini import GeminiEmbedding

from dotenv import load_dotenv
load_dotenv()

# The Chroma integration ships as its own package (llama-index-vector-stores-chroma)
# and is imported from the `.chroma` submodule when using the `llama_index.core` layout.
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# NOTE: chromadb.Client() keeps data in memory only; switch to
# chromadb.PersistentClient(path=...) if the collection must survive a restart.
chroma_client = chromadb.Client()
# A collection in ChromaDB is a logical grouping of embeddings, documents, and
# metadata, like a table in a database.
# One collection holds all policies; another one is planned for all PDDs (for similarity search).
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# if a collection with the same name already exists.
chroma_collection = chroma_client.get_or_create_collection("policy_vcm")  # f"{document_name}_{uuid.uuid4()}"
# vector_store is a LlamaIndex wrapper around the ChromaDB collection, providing
# an interface to interact with it (e.g., adding nodes, querying).
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Embedding model used both at ingestion time and at query time.
model_name = "models/embedding-001"
embed_model = GeminiEmbedding(
    model_name=model_name,
    api_key=os.getenv("GEMINI_API_KEY"),  # loaded from .env by load_dotenv() above
)

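# NOTE: replace the placeholder folder path below with the real location of the documents.
# TODO: process every file in the folder (not just a single document)
# and add a metadata field that records each source file's name.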

def get_file_paths(folder_path):
    """Return the full path of every file under ``folder_path``.

    The folder is walked recursively with ``os.walk``; directories themselves
    are not included in the result, only the files they contain.
    """
    return [
        os.path.join(current_dir, entry)
        for current_dir, _subdirs, entries in os.walk(folder_path)
        for entry in entries
    ]

# Folder that holds the policy / VCM documents to ingest.
file_folder_path = os.path.join(os.getcwd(), "policy_vcm_docs")  # Adjust this to your actual path

# Get all file paths in the policy_vcm_docs folder (recursively).
file_paths = get_file_paths(file_folder_path)

# Use DoclingReader to load the data
reader = DoclingReader()

# Build the ingestion pipeline once and reuse it for every file:
# split into chunks, extract a document title into metadata, then embed.
# NOTE(review): TitleExtractor calls an LLM and none is configured in this
# file, so it presumably falls back to the library default. Confirm.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=500, chunk_overlap=50),  # Match your project chunking
        TitleExtractor(),
        embed_model,
    ]
)

# Nodes from all files are collected here so that a single index covers the
# whole folder instead of being overwritten on every iteration.
nodes = []
for file_path in file_paths:
    file_name = os.path.basename(file_path)

    documents = reader.load_data(file_path)

    # Record the source file on every loaded document so it is carried into
    # the metadata of the chunks derived from it.
    for doc in documents:
        if doc.metadata is None:
            doc.metadata = {}
        doc.metadata["file_name"] = file_name

    nodes.extend(pipeline.run(documents=documents))

# The vector store has to be passed through a StorageContext; a bare
# `vector_store=` keyword is not a VectorStoreIndex parameter, so the nodes
# would not be written to the Chroma collection.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes,
    embed_model=embed_model,
    storage_context=storage_context,
)

# persist the index
# storage_dir = os.path.join(os.getcwd(), "storage", f"{document_name}_{uuid.uuid4()}")
# os.makedirs(storage_dir, exist_ok=True)
# index.storage_context.persist(persist_dir=storage_dir)


# To load the storage later:
# from llama_index.core import load_index_from_storage
# storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
# policy_index = load_index_from_storage(storage_context)



# Load from vectorstore

In [None]:
# Rebuild the index on top of the Chroma-backed vector store created earlier
# in this notebook (no Pinecone client is imported or configured here).
# The same embedding model used at ingestion is passed explicitly so that
# query embeddings are comparable with the stored ones.
# NOTE(review): with the in-memory chromadb.Client() this only works within
# the same session; a persistent Chroma client is needed to reload after a restart.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)