In [15]:
import os
from dotenv import load_dotenv, find_dotenv


from langchain_community.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter, NLTKTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma

import openai

In [16]:
# Configuration: load environment variables and hand the OpenAI key to the client.
# A single load_dotenv call is enough: load_dotenv() without arguments performs
# the same find_dotenv() lookup, so repeating it adds nothing.
_ = load_dotenv(find_dotenv())  # read local .env file
openai.api_key = os.environ["OPENAI_API_KEY"]  # raises KeyError when the variable is unset

In [17]:
# Document loading: PDF files to ingest.
sources = ["./retriever_documents/informacion_general_11_24.pdf"]

def load_pdf(sources):
    """Load the given PDF file(s) into one flat list of documents.

    Args:
        sources: An iterable of PDF file paths. A single path given as a
            ``str`` or ``os.PathLike`` is also accepted and handled as a
            one-element collection (a bare string would otherwise be
            iterated character by character).

    Returns:
        list: Whatever ``PyPDFLoader(path).load()`` yields for each path,
        concatenated in the order the paths were given. Empty when
        ``sources`` is empty.
    """
    # A lone path must not be looped over: iterating a str yields characters.
    if isinstance(sources, (str, os.PathLike)):
        sources = [sources]

    docs = []
    for source in sources:
        docs.extend(PyPDFLoader(source).load())

    return docs

In [18]:
# Document splitting helpers
def RecursiveCharacterTextSplitter_text_splitter(docs, chunk_size=800, chunk_overlap=150):
    """Split documents with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split; passed straight to ``split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to 800,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 150, the value that used to be hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_documents(docs)


def NLTKTextSplitter_text_splitter(docs, chunk_size=1000):
    """Split documents with an ``NLTKTextSplitter``.

    Args:
        docs: Documents to split; passed straight to ``split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to
            1000, the value that used to be hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = NLTKTextSplitter(chunk_size=chunk_size)

    return text_splitter.split_documents(docs)

In [19]:
# Build a new index

# Directory handed to Chroma as its persist_directory.
persist_directory = "./chroma_docs/expotech_2025/"

def generate_vectordb_from_pdf_store(source, text_splitter, persist_directory):
    """Create a Chroma vector store from the content of PDF files.

    Args:
        source: PDF path collection forwarded to ``load_pdf`` (a list of
            paths, despite the singular name).
        text_splitter: Callable that takes the loaded documents and returns
            the chunks to index.
        persist_directory: Directory passed to ``Chroma.from_documents`` as
            ``persist_directory``.

    Returns:
        The store returned by ``Chroma.from_documents``.
    """
    chunks = text_splitter(load_pdf(source))
    embedder = OpenAIEmbeddings(openai_api_key=openai.api_key)

    return Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=persist_directory,
    )

In [21]:
# Update an existing index
def update_vectordb_with_new_pdfs(sources, text_splitter, persist_directory):
    """Add the chunks of new PDFs to the Chroma store at ``persist_directory``.

    Args:
        sources: PDF paths forwarded to ``load_pdf``.
        text_splitter: Callable that takes the loaded documents and returns
            the chunks to index.
        persist_directory: Directory passed to ``Chroma`` as
            ``persist_directory``.

    Returns:
        The Chroma store after ``add_documents`` has run, so the caller can
        use it directly, in line with ``generate_vectordb_from_pdf_store``,
        which also returns its store.
    """
    # Open the store backed by persist_directory.
    embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

    # Load and split the new PDFs.
    docs = load_pdf(sources)
    splits = text_splitter(docs)

    # Add the new chunks to the opened index.
    vectordb.add_documents(documents=splits)

    # No explicit persist() call is made here.
    # NOTE(review): presumably the store writes to persist_directory on its
    # own; confirm for the installed Chroma version.
    return vectordb





# Add one more PDF to the index kept under persist_directory.
sources = ["./retriever_documents/visnai.pdf"]
persist_directory = "./chroma_docs/expotech_2025/"

update_vectordb_with_new_pdfs(
    sources=sources,
    text_splitter=RecursiveCharacterTextSplitter_text_splitter,
    persist_directory=persist_directory,
)

