In [1]:
import os
from langchain_community.embeddings import OllamaEmbeddings

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import UnstructuredEPubLoader

from langchain_community.document_loaders import PyPDFLoader

embeddings = OllamaEmbeddings(model='nomic-embed-text:v1.5')

import logging

# setar logging como INFO
logging.basicConfig(level=logging.INFO)

In [2]:
path_books = './books/import'
path_done_books = './books/done'
path_vector_store = '../vector-store/books'
chunk_size = 1000

In [3]:
def load_books(path_books):
    logging.info(f'Loading books from {path_books}')
    books = []
    for root, dirs, files in os.walk(path_books):
        for file in files:
            if file.endswith('.pdf'):
                books.append(os.path.join(root, file))
    return books

def load_pdf(file):
    logging.info(f'Loading PDF {file}')
    loader = PyPDFLoader(file)
    documents = loader.load()
    return documents

def load_chunked_docs(documents):
    logging.info(f'Chunking documents')
    text_spliter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=100
    )
    chunked_docs = text_spliter.split_documents(documents)
    return chunked_docs

def load_vector_store(chunked_docs, path_vector_store):
    if not os.path.isdir(path_vector_store):
        logging.info(f'Creating vector store')
        os.makedirs(path_vector_store)
        db = FAISS.from_documents(chunked_docs, embeddings)
        db.save_local(path_vector_store)
        logging.info(f'Vector store loaded {db.index.ntotal}')
        return db
    else:
        logging.info(f'Loading vector store')
        db = FAISS.load_local(folder_path=path_vector_store, 
                              embeddings=embeddings, 
                              allow_dangerous_deserialization=True)
        logging.info(f'Adding documents to vector store')
        db.add_documents(chunked_docs)
        db.save_local(path_vector_store)
        logging.info(f'Vector store loaded {db.index.ntotal}')
        return db

In [4]:
books = load_books(path_books)
for book in books:
    logging.info(f'Processing book {book}')
    documents_books = load_pdf(book)
    chunked_docs = load_chunked_docs(documents_books)
    db = load_vector_store(chunked_docs=chunked_docs, path_vector_store=path_vector_store)
    logging.info(f'Moving book {book} to {path_done_books}')
    os.rename(book, os.path.join(path_done_books, os.path.basename(book)))

INFO:root:Loading books from ./books/import
INFO:root:Processing book ./books/import/Por que tarda o pleno Avivamento - Leonard Ravenhill.pdf
INFO:root:Loading PDF ./books/import/Por que tarda o pleno Avivamento - Leonard Ravenhill.pdf
INFO:root:Chunking documents
INFO:root:Creating vector store
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.
INFO:root:Vector store loaded 294
INFO:root:Moving book ./books/import/Por que tarda o pleno Avivamento - Leonard Ravenhill.pdf to ./books/done
INFO:root:Processing book ./books/import/Somos Todos Teólogos - RC Sproul.pdf
INFO:root:Loading PDF ./books/import/Somos Todos Teólogos - RC Sproul.pdf
INFO:root:Chunking documents
INFO:root:Loading vector store
INFO:root:Adding documents to vector store
INFO:root:Vector store loaded 1176
INFO:root:M

In [5]:
#testar se o documento foi adicionado
#db = FAISS.load_local("../vector-store/books", embeddings, allow_dangerous_deserialization=True)
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 100
retriever.search_kwargs["maximal_marginal_relevance"] = True
retriever.search_kwargs["k"] = 3

results = retriever.get_relevant_documents("Calvinismo")


In [6]:
for row in results:
    page_content = row.page_content
    source = row.metadata['source']
    page = row.metadata['page']

    

    # Agora você pode fazer o que quiser com esses dados.
    # Por exemplo, você pode imprimi-los:
    print(f"""
Page Content: 
{page_content.replace('	',' ')}

Source: 
{source} 

Page:
{page}

""")


Page Content: 
5
 John Calvin, 
Commentaries on the Epistle of Paul to the Romans
, trans. and ed. John Owen (repr., Grand Rapids: Baker, 2003),
354.

Source: 
./books/import/Somos Todos Teólogos - RC Sproul.pdf 

Page:
94



Page Content: 
tão clara e desanuviada como a de Jesus; estão engendrados em nosso coração como dispositivos 
biônicos e revelam uma imagem mista e não totalm ente grosseira dos nossos impulsos de dar".
77 
                                                             
 
75 DONALD  E GRAY . Jesus, the way to freedom , p. 45. 
76 William KILPATRICK.  Identity and intimacy.  Nova York: Sheed and Ward, 1975,0.112.  
77 CARL J. JUNG. MODERN man in search of a soul. HARCOURT : BRACE AND WORLD HARVEST  BOOKS , 1933, P. 235.

Source: 
./books/import/Evangelho Maltrapilho - Brennan Manning.pdf 

Page:
70



Page Content: 
6
 Bunyan aqui se refere à criada diante de quem Pedro negou a Cristo;
conforme Mateus 26:69-72 e Lucas 22:56-57.

Source: 
./books/import/O Peregrino -

In [7]:
#loader = UnstructuredEPubLoader("/home/cristian/Documentos/Projetos/personal-search/ingestion/livros/Livro Conselhos para obreiros - Charles Spurgeon.epub", mode="elements")
#data = loader.load()

#documents = loader.load()