In [None]:
# Sample PDFs used by the loader comparisons in this notebook.
file_1, file_2, file_3 = (
    'knowledge_base/Ari Shokri.pdf',
    'knowledge_base/Isaacson, Walter - Elon Musk (2023).pdf',
    'knowledge_base/top_100_acc_firms.pdf',
)

### Using PyMuPDF

So far, this has been the fastest loader.

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader

# PyMuPDF: the fastest option tried so far for PDFs that contain no images.
loader = PyMuPDFLoader(
    file_path=file_3,
    extract_images=False,
)
docs = loader.load()

### Using PDF Miner

In [None]:
from langchain_community.document_loaders import PDFMinerLoader

# Load the same PDF through PDFMinerLoader; the result is kept in `data`.
loader = PDFMinerLoader(file_path=file_3)
data = loader.load()

### Using PyPDF

PyPDF offers reasonable speed and parses table content reasonably well.

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF through PyPDFLoader with image extraction switched off.
loader = PyPDFLoader(
    file_path=file_3,
    extract_images=False,
)
docs = loader.load()

### Using Unstructured

You first need to install a good number of packages for this.

The OCR results for a document with images are simply exquisite. However, it takes significantly longer than PyMuPDF.

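It also returns all the results as one page, because the loader defaults to `mode="single"`. Pass `mode="paged"` or `mode="elements"` to `UnstructuredPDFLoader` to get the output split into multiple documents.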

In [None]:
# Install poppler and tesseract with conda and unstructured_pytesseract with pip.
# The %conda / %pip magics are used so the packages land in the running kernel's environment.
# NOTE(review): presumably these are the PDF-rendering / OCR dependencies that
# UnstructuredPDFLoader needs; `unstructured` itself is not installed here, so it is
# assumed to be present already — confirm against the Unstructured install docs.
%conda install poppler tesseract
%pip install unstructured_pytesseract

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader

# Load file_2 through UnstructuredPDFLoader using the loader's default settings.
loader = UnstructuredPDFLoader(file_path=file_2)
docs = loader.load()

In [None]:
# Inspect whatever the most recently run loader cell stored in `docs`.
# The Unstructured loader was observed to return the whole PDF as a single
# document, so printing page_content in full would flood the cell output.
# Only a bounded preview of each document is shown.
PREVIEW_CHARS = 2_000  # set to None to print every document in full

print(len(docs), '\n')
if docs:  # guard: docs[0] would raise IndexError when nothing was loaded
    print(docs[0].dict().keys(), '\n')
for index, parsed in enumerate(docs):
    print(f'Page#{index+1}\nType: {parsed.type}\nContent:\n{parsed.page_content[:PREVIEW_CHARS]}\n\n')

### Using PDFPlumber

You can use `itertools.islice` on the lazy iterator returned by `lazy_parse()` (wrapped below in `read_pdfs()`) to consume only a specific number of pages.

In [None]:
from itertools import islice
from langchain_community.document_loaders.parsers import PDFPlumberParser
from langchain_community.document_loaders.blob_loaders import Blob

def read_pdfs(file_path, extract_images=False):
    """Parse a PDF file with PDFPlumber and return the parser's lazy iterator.

    Args:
        file_path: Path of the PDF file to read.
        extract_images: Forwarded unchanged to ``PDFPlumberParser``.

    Returns:
        Whatever ``PDFPlumberParser.lazy_parse`` returns for the file's blob.
        Wrap it in ``itertools.islice`` to take only the first few items.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    # Read the bytes eagerly so a bad path fails here, at call time.
    with open(file_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    # Blob has no `source` field; `source` is a read-only property derived from
    # `path` (or metadata). Passing `source=` to the constructor is dropped, which
    # leaves the parsed documents without a source, so pass the path as `path`.
    blob = Blob.from_data(pdf_bytes, path=file_path)

    pdf_parser = PDFPlumberParser(extract_images=extract_images)
    return pdf_parser.lazy_parse(blob)

# `parser` holds the iterator returned by read_pdfs, not a parser object.
parser = read_pdfs(file_path=file_3)

# Take only the first item from the iterator and print its text.
for parsed in islice(parser, 0, 1):
    print(parsed.page_content)