In [1]:
#banco vetorial
!pip install pinecone
!pip install -qU langchain-pinecone


#leitura e split
!pip install langchain_community
!pip install pypdf
!pip install langchain_cor
!pip install langchain

Collecting weaviate-client
  Downloading weaviate_client-4.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting validators==0.34.0 (from weaviate-client)
  Downloading validators-0.34.0-py3-none-any.whl.metadata (3.8 kB)
Collecting authlib<1.3.2,>=1.2.1 (from weaviate-client)
  Downloading Authlib-1.3.1-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting grpcio-tools<2.0.0,>=1.66.2 (from weaviate-client)
  Downloading grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting grpcio-health-checking<2.0.0,>=1.66.2 (from weaviate-client)
  Downloading grpcio_health_checking-1.69.0-py3-none-any.whl.metadata (1.1 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-health-checking<2.0.0,>=1.66.2->weaviate-client)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading weaviate_client-4.10.4-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.4/330.4 kB[0m [31m5.3

### Lendo dados e criando splits

In [9]:
from google.colab import drive
# Mount Google Drive at /content/drive; the regulation PDF is read from it below.
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Absolute path to the regulation PDF, anchored at the Drive mount point
# ('/content/drive') so it does not depend on the current working directory.
reg_path = '/content/drive/MyDrive/PodeIsso/regulamento.pdf'


In [14]:
from langchain_community.document_loaders import PyPDFLoader

# Load the regulation PDF into a list of LangChain documents
# (presumably one per page -- confirm against the loader's default mode).
loader = PyPDFLoader(file_path=reg_path)
data = loader.load()

In [16]:
# Remove line breaks by joining the lines of each document with a single space.
# Replacing "\n" with an empty string would glue the last word of a line to the
# first word of the next one (and could turn "Art.\n5º" into "Art.5º", which the
# article regex used for splitting would no longer match).
# Lines are stripped and empty ones skipped, so no double spaces are introduced
# and re-running the cell leaves the text unchanged.
for doc in data:
    doc.page_content = " ".join(
        stripped
        for stripped in map(str.strip, doc.page_content.split("\n"))
        if stripped
    )

In [18]:
# Merge all loaded documents into a single document

from langchain_core.documents.base import Document

# Concatenate the text of every loaded document, separated by one space,
# then wrap the result in a single Document.
merged_content = " ".join(page.page_content for page in data)
full_document = Document(page_content=merged_content)


In [20]:
#criando splits

import re
from langchain.text_splitter import TextSplitter


class RegulationTextSplitter(TextSplitter):
    """Split a regulation text into chunks aligned with its articles.

    An article starts at a marker such as "Art. 9º" or "Art. 10." and runs until
    the next marker or the end of the text. Consecutive articles are packed into
    the same chunk (joined by a single space) while the sum of their lengths
    stays within ``chunk_size``. An article longer than ``chunk_size`` is cut
    into fixed-size pieces; every piece after the first is prefixed with
    "<article title> - Continuation: ", so those pieces are longer than
    ``chunk_size`` by the length of that prefix.

    Text that precedes the first article marker (or the whole text when it has
    no marker at all) is not discarded: it is emitted as leading chunk(s) of at
    most ``chunk_size`` characters.

    NOTE(review): any "Art. <number>" occurrence starts a new article, so an
    in-text cross-reference to another article would also split the text there.

    Args:
        chunk_size: Maximum number of characters targeted per chunk.
        chunk_overlap: Forwarded to ``TextSplitter``; not used by ``split_text``.
    """

    # Full article: title marker plus everything up to the next marker / end of text.
    _ARTICLE_PATTERN = re.compile(r"(Art\. \d+[ºo]?.*?)(?=Art\. \d+[ºo]?|$)", re.DOTALL)
    # Article title only, e.g. "Art. 9º" or "Art. 10".
    _TITLE_PATTERN = re.compile(r"(Art\. \d+[ºo]?)")

    def __init__(self, chunk_size=1000, chunk_overlap=0):
        super().__init__(chunk_size, chunk_overlap)
        self.chunk_size = chunk_size

    def _slice(self, text):
        """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters."""
        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size)]

    def split_text(self, text):
        """Return the list of chunk strings for ``text``.

        Args:
            text: Full regulation text.

        Returns:
            A list of strings: optional leading chunk(s) with the text found
            before the first article, followed by the article-based chunks.
            Empty (or whitespace-only) input yields an empty list.
        """
        chunks = []

        # Keep whatever comes before the first article marker as its own
        # chunk(s) instead of silently dropping it. When the text has no marker
        # at all, this covers the entire text.
        first_marker = self._TITLE_PATTERN.search(text)
        preamble = (text[:first_marker.start()] if first_marker else text).strip()
        if preamble:
            chunks.extend(self._slice(preamble))

        # Articles accumulated for the chunk currently being built.
        current_chunk = []
        current_size = 0

        for article in self._ARTICLE_PATTERN.findall(text):
            article_size = len(article)

            # The article alone does not fit in one chunk.
            if article_size > self.chunk_size:
                # Flush pending articles first so the output keeps document order.
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                    current_chunk = []
                    current_size = 0

                title_match = self._TITLE_PATTERN.match(article)
                article_title = title_match.group(1) if title_match else ""

                # First piece is kept verbatim; later pieces carry the article
                # title so each of them can be traced back to its article.
                pieces = self._slice(article)
                chunks.append(pieces[0])
                chunks.extend(
                    f"{article_title} - Continuation: {piece}" for piece in pieces[1:]
                )
                continue

            if current_size + article_size > self.chunk_size:
                # Adding this article would overflow: close the current chunk
                # and start a new one with this article.
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [article]
                current_size = article_size
            else:
                current_chunk.append(article)
                current_size += article_size

        # Emit the trailing chunk, if any.
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

# Usage example: split the merged regulation text into a list of chunk strings.
splitter = RegulationTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_text(full_document.page_content)


### Alimentando banco

In [78]:
from langchain_pinecone import PineconeEmbeddings
import os

# Read the API key once; indexing os.environ raises KeyError right here if the
# variable is missing instead of passing None further down.
pinecone_api_key = os.environ["PINECONE_API_KEY"]

# Embedding model used to vectorize the regulation chunks.
model_name = 'multilingual-e5-large'
embeddings = PineconeEmbeddings(
    model=model_name,
    # Reuse the key read above rather than looking it up a second time.
    pinecone_api_key=pinecone_api_key,
)


In [79]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

# Pinecone client authenticated with the API key read earlier.
pc = Pinecone(api_key=pinecone_api_key)

# Handle to the "regulamento" index.
index = pc.Index("regulamento")

# LangChain vector store backed by that index, using the embeddings defined above.
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings,
)

In [80]:
# Import Document from langchain_core (the same path used when merging the PDF
# text) instead of the legacy `langchain.docstore.document` location.
from langchain_core.documents.base import Document

# Wrap each chunk string in a Document object
documents = [Document(page_content=text) for text in chunks]

In [81]:
# Use deterministic, position-based ids so that re-running this cell overwrites
# the same records instead of adding a duplicate copy of every chunk to the index.
document_ids = vector_store.add_documents(
    documents=documents,
    ids=[f"regulamento-chunk-{position}" for position in range(len(documents))],
)