In [15]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups in the following cells can resolve them.
load_dotenv()

True

In [16]:
import os

from langchain.indexes import SQLRecordManager

# Connection string for the Postgres database that stores the indexing records.
POSTGRES_DB_URI = os.environ.get("POSTGRES_DB_URI")

# The namespace keeps this pipeline's records separate from any other indexes
# sharing the same database.
namespace = "postgres/sharepoint_docs"

record_manager = SQLRecordManager(db_url=POSTGRES_DB_URI, namespace=namespace)

# Make sure the record manager's schema exists before the first indexing run.
record_manager.create_schema()

In [17]:
from langchain_community.document_loaders import SharePointLoader

# Identifier of the SharePoint document library to read from.
DOCUMENT_LIBRARY_ID = os.environ.get("DOCUMENT_LIBRARY_ID")

# Only the compliance regulations folder is loaded.
# NOTE(review): auth_with_token=True presumably reuses a previously stored
# O365 token instead of an interactive login - confirm against the loader docs.
loader = SharePointLoader(
    folder_path="/compliance/reglamentos",
    document_library_id=DOCUMENT_LIBRARY_ID,
    auth_with_token=True,
)

docs = loader.load()

# Preview the first few loaded documents (one Document per PDF page, judging
# by the 'page' / 'total_pages' metadata in the output).
docs[:5]

[Document(metadata={'source': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO-AUD-001%20C%C3%B3digo%20Conducta%20en%20Los%20Negocios%20AQ.pdf', 'file_path': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO-AUD-001%20C%C3%B3digo%20Conducta%20en%20Los%20Negocios%20AQ.pdf', 'page': 0, 'total_pages': 25, 'format': 'PDF 1.6', 'title': 'CdC 2023', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 28.0 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20231215133914-03'00'", 'modDate': 'D:20231218135108Z', 'trapped': ''}, page_content='2023\n'),
 Document(metadata={'source': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO-AUD-001%20C%C3%B3digo%20Conducta%20en%20Los%20Negocios%20AQ.pdf', 'file_path': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO

In [18]:
# Inspect the last loaded document; negative indexing replaces the manual
# len(docs) - 1 arithmetic.
docs[-1]

Document(metadata={'source': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/Video%20MPD%20-%20Jose%20Melville.pdf', 'file_path': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/Video%20MPD%20-%20Jose%20Melville.pdf', 'page': 1, 'total_pages': 2, 'format': 'PDF 1.7', 'title': '', 'author': 'Vanessa Noemi Peña Lopez', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word para Microsoft 365', 'producer': 'Microsoft® Word para Microsoft 365', 'creationDate': "D:20241122120705-03'00'", 'modDate': "D:20241122120705-03'00'", 'trapped': ''}, page_content='La Ley refuerza los requisitos mínimos para que las empresas puedan fomentar un \nambiente de autocontrol que demuestre que tienen un Modelo de Prevención de \nDelitos efectivo.  \n \nTe invitamos a que complementes lo aprendido, continúa leyendo lo que sigue a \ncontinuación.  \n \n')

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each loaded document into chunks of at most 1000 characters, with a
# 200-character overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
split_docs = splitter.split_documents(docs)

# Preview the first few chunks.
split_docs[:5]

[Document(metadata={'source': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO-AUD-001%20C%C3%B3digo%20Conducta%20en%20Los%20Negocios%20AQ.pdf', 'file_path': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO-AUD-001%20C%C3%B3digo%20Conducta%20en%20Los%20Negocios%20AQ.pdf', 'page': 0, 'total_pages': 25, 'format': 'PDF 1.6', 'title': 'CdC 2023', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 28.0 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20231215133914-03'00'", 'modDate': 'D:20231218135108Z', 'trapped': ''}, page_content='2023'),
 Document(metadata={'source': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO-AUD-001%20C%C3%B3digo%20Conducta%20en%20Los%20Negocios%20AQ.pdf', 'file_path': 'https://aquachile.sharepoint.com/sites/aqua_pdf/Documentos%20compartidos/compliance/reglamentos/AQ-PO-A

In [20]:
from langchain_community.vectorstores import AzureSearch
from langchain_openai.embeddings import AzureOpenAIEmbeddings

# Azure AI Search connection settings, read from the environment.
AZURE_AI_SEARCH_ENDPOINT = os.getenv("AZURE_AI_SEARCH_ENDPOINT")
AZURE_AI_SEARCH_API_KEY = os.getenv("AZURE_AI_SEARCH_API_KEY")
AZURE_AI_SEARCH_INDEX_NAME = os.getenv("AZURE_AI_SEARCH_INDEX_NAME")

# Embedding model served by the "text-embedding-3-small" Azure OpenAI deployment.
EMBEDDINGS = AzureOpenAIEmbeddings(deployment="text-embedding-3-small")

# Pass the Embeddings object itself rather than its bound `embed_query` method.
# AzureSearch accepts either, but with a bare callable it has to embed every
# chunk through one query call at a time when documents are added; with the
# object it can batch through `embed_documents` and still uses `embed_query`
# for searches.
vectorstore = AzureSearch(
    azure_search_endpoint=AZURE_AI_SEARCH_ENDPOINT,
    azure_search_key=AZURE_AI_SEARCH_API_KEY,
    index_name=AZURE_AI_SEARCH_INDEX_NAME,
    embedding_function=EMBEDDINGS,
    # Retry transient Azure Search client failures up to 4 times.
    additional_search_client_options={"retry_total": 4},
)

In [21]:
from langchain.indexes import index

# Sync the chunks into the vector store, using the record manager to track
# what has already been written. The returned counters (added / updated /
# skipped / deleted) are displayed as the cell output.
# NOTE(review): per the LangChain indexing docs, "incremental" cleanup only
# replaces stale chunks for sources seen in this run - confirm whether files
# removed from SharePoint also need to be purged (that would need "full").
index(
    docs_source=split_docs,
    record_manager=record_manager,
    vector_store=vectorstore,
    source_id_key="source",
    cleanup="incremental",
)

{'num_added': 9, 'num_updated': 0, 'num_skipped': 84, 'num_deleted': 0}

In [22]:
# Smoke-test the index: fetch the 3 chunks most similar to the query and show
# the text of the top hit.
retrieved_doc = vectorstore.similarity_search("conducta", k=3, search_type="similarity")
retrieved_doc[0].page_content

'3.2  CÓDIGO DE CONDUCTA EN LOS NEGOCIOS \nDocumento que contiene reglas para que cada empleado, independiente de su rango jerárquico, \npromueva una conducta basada en un comportamiento cuyos sellos distintivos, sea la rectitud u \nhonestidad, en cada una de sus acciones.'