# Import books - PDF

## Import libraries

In [1]:
import os
from langchain_community.embeddings import OllamaEmbeddings

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import PyPDFLoader

import logging

# sset logging level to INFO
logging.basicConfig(level=logging.INFO)

## Configuration

In [2]:
# Path to the folder with the books to be imported
path_books = './books/import'
# Path to the folder with the books that have already been imported
path_done_books = './books/done'
# Path to the folder with vector store
path_vector_store = '../vector-store/chroma/books'
# Chunk size for the vector store
# chunkSize controls the max size (in terms of number of characters) of the final documents. 
# chunkOverlap specifies how much overlap there should be between chunks. 
# This is often helpful to make sure that the text isn't split weirdly.
chunk_size = 500
# Define embedding model
embeddings = OllamaEmbeddings(model='nomic-embed-text:v1.5')

In [3]:
def load_books(path_books):
    """
    Load books from the specified path.

    Args:
        path_books (str): The path to the directory containing the books.

    Returns:
        list: A list of file paths for the books found in the directory.
    """
    logging.info(f'Loading books from {path_books}')
    books = []
    for root, dirs, files in os.walk(path_books):
        for file in files:
            if file.endswith('.pdf'):
                books.append(os.path.join(root, file))
    return books

def load_pdf(file):
    """
    Loads a PDF file and returns the extracted documents with PyPDFLoader.

    Args:
        file (str): The path to the PDF file.

    Returns:
        list: A list of extracted documents from the PDF.

    """
    logging.info(f'Loading PDF {file}')
    loader = PyPDFLoader(file)
    documents = loader.load()
    return documents

def load_chunked_docs(documents):
    """
    Chunk documents.

    Args:
        documents (list): A list of documents to be chunked.

    Returns:
        list: A list of chunked documents.

    """
    logging.info(f'Chunking documents')
    text_spliter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=100
    )
    chunked_docs = text_spliter.split_documents(documents)
    return chunked_docs

def load_vector_store(chunked_docs, path_vector_store):
    """
    Loads or creates a vector store using FAISS.

    Args:
        chunked_docs (list): A list of chunked documents.
        path_vector_store (str): The path to the vector store.

    Returns:
        FAISS object: The loaded or created vector store.

    Raises:
        None

    """
    if not os.path.isdir(path_vector_store):
        logging.info(f'Creating vector store')
        os.makedirs(path_vector_store)
        db = Chroma.from_documents(chunked_docs, embeddings, persist_directory=path_vector_store)
        return db
    else:
        logging.info(f'Loading vector store')
        db = Chroma(persist_directory=path_vector_store, embedding_function=embeddings)
        logging.info(f'Adding documents to vector store')
        db.add_documents(chunked_docs)
        return db

In [4]:
# 10min
# Load books from path
books = load_books(path_books)
# Process each book
for book in books:
    logging.info(f'Processing book {book}')
    documents_books = load_pdf(book)
    chunked_docs = load_chunked_docs(documents_books)
    db = load_vector_store(chunked_docs=chunked_docs, path_vector_store=path_vector_store)
    logging.info(f'Moving book {book} to {path_done_books}')
    os.rename(book, os.path.join(path_done_books, os.path.basename(book)))

INFO:root:Loading books from ./books/import
INFO:root:Processing book ./books/import/Por que tarda o pleno Avivamento - Leonard Ravenhill.pdf
INFO:root:Loading PDF ./books/import/Por que tarda o pleno Avivamento - Leonard Ravenhill.pdf
INFO:root:Chunking documents
INFO:root:Creating vector store
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:root:Moving book ./books/import/Por que tarda o pleno Avivamento - Leonard Ravenhill.pdf to ./books/done
INFO:root:Processing book ./books/import/Sociedade do Cansaço - Byung-Chul Han.pdf
INFO:root:Loading PDF ./books/import/Sociedade do Cansaço - Byung-Chul Han.pdf
INFO:root:Chunking documents
INFO:root:Loading vector store
INFO:root:Adding documents to vector store
INFO:root:Moving book ./books/import/Sociedade do Cansaço - Byung-Chul Han.pdf to ./books/done
INFO:root:Processing book ./books/import/Somos Todos Teólogos - RC Sproul.pdf
IN

## DEBUG

In [5]:
#testar se o documento foi adicionado
#db = FAISS.load_local("../vector-store/books", embeddings, allow_dangerous_deserialization=True)
db = Chroma(persist_directory=path_vector_store, embedding_function=embeddings)
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 5, 'fetch_k': 50}
)

results = retriever.get_relevant_documents("Calvinismo")


  warn_deprecated(


In [7]:
for row in results:
    page_content = row.page_content
    source = row.metadata['source']
    page = row.metadata['page']

    

    # Agora você pode fazer o que quiser com esses dados.
    # Por exemplo, você pode imprimi-los:
    print(f"""
Page Content: 
{page_content.replace('	',' ')}

Source: 
{source} 

Page:
{page}

""")


Page Content: 
5
 John Calvin, 
Commentaries on the Epistle of Paul to the Romans
, trans. and ed. John Owen (repr., Grand Rapids: Baker, 2003),
354.

Source: 
./books/import/Somos Todos Teólogos - RC Sproul.pdf 

Page:
94



Page Content: 
de nosso coração? Será que poderíamos convidar o Esp írito Santo 
para caminhar conosco de mãos dadas, p elos corredores del e? Não

Source: 
./books/import/Por que tarda o pleno Avivamento - Leonard Ravenhill.pdf 

Page:
21



Page Content: 
92 HENRI J. M. NOUWEN . ʺTHE PRODIGAL  COMES HOMEʺ, National Catholic Reporter,  4 DE AGOSTO  DE 1989. 
93 Robert BOLT. A man for all seasons. Nova York: Random  House, 1960, p. 140. 
94 G. K. CHESTERTON.  The fame of blessed Thomas More. Nova York: Sheed and Ward, 1929, p.6V

Source: 
./books/import/Evangelho Maltrapilho - Brennan Manning.pdf 

Page:
88



Page Content: 
7
 Repare semelhança com 2Coríntios 11:13-14.

Source: 
./books/import/O Peregrino - John Bunyan.pdf 

Page:
197



Page Content: 
numa sala