# Import books - PDF

## Import libraries

In [1]:
import os
from langchain_community.embeddings import OllamaEmbeddings

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import PyPDFLoader

import logging

# Set the root logger level to INFO so the logging.info() progress messages are shown
logging.basicConfig(level=logging.INFO)

## Configuration

In [2]:
# Path to the folder with the books to be imported
path_books = './books/import'
# Path to the folder where books are moved once they have been imported
path_done_books = './books/done'
# Path to the folder holding the persisted vector store
path_vector_store = '../vector-store/chroma/books'
# Chunking settings for the text splitter.
# chunk_size controls the maximum size (in number of characters) of each chunk.
# The overlap between consecutive chunks is set where the splitter is built
# (see load_chunked_docs); overlap helps make sure the text isn't split awkwardly.
chunk_size = 1000
# Embedding model used to build and to query the vector store
embeddings = OllamaEmbeddings(model='nomic-embed-text:v1.5')

In [3]:
def load_books(path_books):
    """
    Find the PDF books stored under the specified path.

    The directory is walked recursively, so PDFs in sub-folders are included.
    The extension check is case-insensitive ('.pdf', '.PDF', ...) and the
    result is sorted so that books are processed in a reproducible order.

    Args:
        path_books (str): The path to the directory containing the books.

    Returns:
        list: A sorted list of file paths for the PDF books found. The list
            is empty when the directory is missing or contains no PDFs.
    """
    logging.info(f'Loading books from {path_books}')
    books = []
    # The sub-directory names yielded by os.walk are not needed here.
    for root, _dirs, files in os.walk(path_books):
        for file in files:
            # Compare in lower case so that 'Book.PDF' is not silently skipped.
            if file.lower().endswith('.pdf'):
                books.append(os.path.join(root, file))
    # os.walk order depends on the file system; sort for deterministic runs.
    books.sort()
    return books

def load_pdf(file):
    """
    Read a PDF with PyPDFLoader and return the documents it extracts.

    Args:
        file (str): The path to the PDF file.

    Returns:
        list: The documents produced by the loader's load() call.
    """
    logging.info(f'Loading PDF {file}')
    return PyPDFLoader(file).load()

def load_chunked_docs(documents, chunk_overlap=100):
    """
    Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    The maximum chunk size is read from the module-level ``chunk_size``
    setting.

    Args:
        documents (list): A list of documents to be chunked.
        chunk_overlap (int, optional): Overlap between consecutive chunks,
            measured in the same unit as ``chunk_size``. Defaults to 100,
            the value that used to be hard-coded here.

    Returns:
        list: A list of chunked documents.
    """
    logging.info('Chunking documents')
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

def load_vector_store(chunked_docs, path_vector_store):
    """
    Create or open the persistent Chroma vector store and add documents to it.

    If ``path_vector_store`` does not exist yet, the directory is created and
    a new store is built from ``chunked_docs``. Otherwise the existing store
    is opened and ``chunked_docs`` are added to it. Embeddings are computed
    with the module-level ``embeddings`` model.

    Args:
        chunked_docs (list): A list of chunked documents. May be empty, in
            which case nothing is added to the store.
        path_vector_store (str): The path to the vector store directory.

    Returns:
        Chroma: The created or loaded vector store.
    """
    if not chunked_docs:
        # Nothing to insert (for instance, no text could be extracted from the
        # book): return the store as it is instead of handing Chroma an empty
        # batch.
        logging.warning('No documents to add to vector store')
        return Chroma(persist_directory=path_vector_store, embedding_function=embeddings)

    if not os.path.isdir(path_vector_store):
        logging.info('Creating vector store')
        # exist_ok avoids a failure if the directory appears between the
        # isdir() check above and this call.
        os.makedirs(path_vector_store, exist_ok=True)
        return Chroma.from_documents(chunked_docs, embeddings, persist_directory=path_vector_store)

    logging.info('Loading vector store')
    db = Chroma(persist_directory=path_vector_store, embedding_function=embeddings)
    logging.info('Adding documents to vector store')
    db.add_documents(chunked_docs)
    return db

In [4]:
# Load books from path
books = load_books(path_books)
# os.rename does not create missing directories, so make sure the destination
# folder exists before any book is embedded and then moved into it.
os.makedirs(path_done_books, exist_ok=True)
# Process each book: extract pages -> chunk -> store -> move to the "done" folder
for book in books:
    logging.info(f'Processing book {book}')
    documents_books = load_pdf(book)
    chunked_docs = load_chunked_docs(documents_books)
    db = load_vector_store(chunked_docs=chunked_docs, path_vector_store=path_vector_store)
    # The move happens last: if any step above raises, the book stays in the
    # import folder and is picked up again on the next run.
    logging.info(f'Moving book {book} to {path_done_books}')
    os.rename(book, os.path.join(path_done_books, os.path.basename(book)))

INFO:root:Loading books from ./books/import


## DEBUG

In [9]:
# Check that the documents were added by querying the persisted store directly
db = Chroma(persist_directory=path_vector_store, embedding_function=embeddings)
# MMR search: fetch_k=50 candidates are fetched and k=5 of them are returned
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 5, 'fetch_k': 50}
)

# invoke() is the Runnable entry point that replaces the deprecated
# get_relevant_documents()
results = retriever.invoke("Calvinismo")


AttributeError: 'VectorStoreRetriever' object has no attribute 'similarity_search'

In [8]:
# Print the text, source file and page number of every retrieved document.
for row in results:
    page_content = row.page_content
    source = row.metadata['source']
    page = row.metadata['page']

    

    # Now you can do whatever you want with this data.
    # For example, you can print it:
    print(f"""
Page Content: 
{page_content.replace('	',' ')}

Source: 
{source} 

Page:
{page}

""")


Page Content: 
5
 John Calvin, 
Commentaries on the Epistle of Paul to the Romans
, trans. and ed. John Owen (repr., Grand Rapids: Baker, 2003),
354.

Source: 
./books/import/Somos Todos Teólogos - RC Sproul.pdf 

Page:
94



Page Content: 
6
 Bunyan aqui se refere à criada diante de quem Pedro negou a Cristo;
conforme Mateus 26:69-72 e Lucas 22:56-57.

Source: 
./books/import/O Peregrino - John Bunyan.pdf 

Page:
196



Page Content: 
Sumário
1  A violência neuronal
2  Além da sociedade disciplinar
3  O tédio profundo
4  Vita activa
5  Pedagogia do ver
6  O Caso Bartleby
7  Sociedade do cansaço
Textos de capa

Source: 
./books/import/Sociedade do Cansaço - Byung-Chul Han.pdf 

Page:
8



Page Content: 
batalha confiando em nossa força, não devemos nos surpreender se voltarmos manchados pela
derrota. Se não fosse pelo teu poder, ó Espírito de Deus, não poderíamos nem tentar; mas,
quando confiamos em ti, seguimos adiante com fé.
Ultimamente tenho sido tocado, ao olhar a história da Ref