In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Sample PDFs, all kept under knowledge_base/ relative to the notebook's working directory.
file_1, file_2, file_3 = (
    "knowledge_base/Ari Shokri.pdf",
    "knowledge_base/Isaacson, Walter - Elon Musk (2023).pdf",  # used by the PyMuPDF and Unstructured cells
    "knowledge_base/top_100_acc_firms.pdf",  # used by the PDFMiner, PyPDF and PDFPlumber cells
)

# Shared splitter instance; the splitting cell further down calls
# text_splitter.split_documents(docs) on whatever the last loader produced.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (the splitter's default length function) - confirm against the library docs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

### Using PyMuPDF

This so far has been the fastest loader and should be the default choice for documents with lots of text.

In [30]:
# Author's observation: this has been the fastest option for PDFs without images.
from langchain_community.document_loaders import PyMuPDFLoader

# Load file_2 with PyMuPDF. The result is bound to `docs`, which is the name
# the "Printing Results" cells read.
loader = PyMuPDFLoader(file_2)
docs = loader.load()

### Using PDF Miner

In [None]:
from langchain_community.document_loaders import PDFMinerLoader

# Load file_3 with PDFMiner. The result is bound to `docs`, the same name every
# other loader cell uses, so the "Printing Results" cells can inspect it.
loader = PDFMinerLoader(file_3)
docs = loader.load()
# Backward-compatible alias: this cell originally bound the result to `data`.
data = docs

### Using PyPDF

This provides reasonable speed and reasonable quality when parsing tabular information.

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load file_3 with PyPDF, with image extraction explicitly turned off.
# Rebinds `loader` and `docs`, replacing whatever an earlier loader cell left there.
loader = PyPDFLoader(file_3, extract_images=False)
docs = loader.load()

### Using Unstructured.

You first need to install a good number of packages for this.

The OCR results for a document with images are simply exquisite. However, it takes significantly longer than PyMuPDF.

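It also returns all the results as a single document rather than one per page, because the loader's default `mode` is `"single"`. Pass `mode="paged"` or `mode="elements"` to get finer-grained documents.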

In [None]:
# Extra dependencies for the Unstructured loader used in the next cell.
# One-off environment setup: run once, then restart the kernel if the imports below fail.
%conda install poppler tesseract
%pip install unstructured_pytesseract

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader

# Load file_2 with Unstructured using the loader's default settings.
# Rebinds `loader` and `docs`, replacing whatever an earlier loader cell left there.
loader = UnstructuredPDFLoader(file_2)
docs = loader.load()

### Printing Results.

You can print the docs out to check the structure and can loop over to get the content.

You can also split the docs using an instance of `RecursiveCharacterTextSplitter`.

You may also split the docs while loading them. Instead of using `.load()` you can use `.load_and_split()`, which most loaders have. Pass your text splitter to this method; if you omit it, a default `RecursiveCharacterTextSplitter` is used.

In [31]:
# Inspect the documents produced by whichever loader cell ran last.
# Only the first few are printed: a book-sized PDF yields well over a thousand
# documents, and dumping all of them floods the cell output and bloats the notebook.
DOC_PREVIEW_COUNT = 3  # raise this to print more documents

print(len(docs), '\n')
if docs:  # docs[0] would raise IndexError if the loader returned nothing
    print(docs[0].dict().keys(), '\n')
    print(docs[0].metadata['source'], '\n')
# One entry per loaded document; labelled "Page" because most loaders return one document per page.
for index, doc in enumerate(docs[:DOC_PREVIEW_COUNT], start=1):
    print(f'Page#{index}\nType: {doc.type}\nContent:\n{doc.page_content}\n\n')

1646 

dict_keys(['id', 'metadata', 'page_content', 'type']) 

knowledge_base/Isaacson, Walter - Elon Musk (2023).pdf 

Page#1
Type: Document
Content:
Thank you for downloading this
Simon & Schuster ebook.
Get a FREE ebook when you join our mailing list. Plus, get updates on new releases,
deals, recommended reads, and more from Simon & Schuster. Click below to sign up
and see terms and conditions.
CLICK HERE TO SIGN UP
Already a subscriber? Provide your email again so we can register this ebook and send
you more of what you like to read. You will continue to receive exclusive offers in your
inbox.


Page#2
Type: Document
Content:
To anyone I’ve offended, I just want to say, I reinvented electric cars
and I’m sending people to Mars in a rocket ship. Did you think I was
also going to be a chill, normal dude?
—Elon Musk, Saturday Night Live, May 8, 2021
The people who are crazy enough to think they can change the world
are the ones who do.
—Steve Jobs


Page#3
Type: Document
Content:
PROL

In [29]:
# Split the currently loaded `docs` into chunks with the shared `text_splitter`.
splits = text_splitter.split_documents(docs)

# Only the first few chunks are printed: a book-sized PDF yields well over a
# thousand chunks, and dumping all of them floods the cell output.
SPLIT_PREVIEW_COUNT = 3  # raise this to print more chunks

print(len(splits), '\n')
if splits:  # splits[0] would raise IndexError when there is nothing to split
    print(splits[0].dict().keys(), '\n')
for index, split in enumerate(splits[:SPLIT_PREVIEW_COUNT], start=1):
    print(f'Split#{index}\nType: {split.type}\nContent:\n{split.page_content}\n\n')

1646 

dict_keys(['id', 'metadata', 'page_content', 'type']) 

Split#1
Type: Document
Content:
Thank you for downloading this
Simon & Schuster ebook.
Get a FREE ebook when you join our mailing list. Plus, get updates on new releases,
deals, recommended reads, and more from Simon & Schuster. Click below to sign up
and see terms and conditions.
CLICK HERE TO SIGN UP
Already a subscriber? Provide your email again so we can register this ebook and send
you more of what you like to read. You will continue to receive exclusive offers in your
inbox.


Split#2
Type: Document
Content:
To anyone I’ve offended, I just want to say, I reinvented electric cars
and I’m sending people to Mars in a rocket ship. Did you think I was
also going to be a chill, normal dude?
—Elon Musk, Saturday Night Live, May 8, 2021
The people who are crazy enough to think they can change the world
are the ones who do.
—Steve Jobs


Split#3
Type: Document
Content:
PROLOGUE
Muse of Fire


Split#4
Type: Document
Content:
Th

### Using PDFPlumber

You can use `itertools.islice` with a generator, such as the one returned by `lazy_parse()` (bound to `parser` below), to parse only a specific number of pages.

In [None]:
from itertools import islice
from langchain_community.document_loaders.parsers import PDFPlumberParser
from langchain_community.document_loaders.blob_loaders import Blob

def read_pdfs(file_path, extract_images=False):
    """Parse a PDF with PDFPlumber and return the parser's document iterator.

    Args:
        file_path: Path of the PDF file to read.
        extract_images: Forwarded unchanged to ``PDFPlumberParser``.

    Returns:
        Whatever ``PDFPlumberParser.lazy_parse`` returns for the file's blob.
        Wrap it in ``itertools.islice`` to stop after a fixed number of documents.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    # The file is read eagerly, so a missing file fails here rather than
    # later while the returned iterator is being consumed.
    with open(file_path, 'rb') as f:
        # ``Blob`` exposes ``source`` only as a read-only property derived from
        # ``path`` (or metadata); it is not a constructor field. Passing ``path``
        # is what lets the parsed documents carry the file path as their source.
        blob = Blob(data=f.read(), path=file_path)
    parser = PDFPlumberParser(extract_images=extract_images)
    return parser.lazy_parse(blob)

# Despite its name, `parser` holds the document iterator returned by
# read_pdfs() (the result of lazy_parse), not a parser object.
parser = read_pdfs(file_3, extract_images=False)

# islice(parser, 1) stops after the first yielded document, so only that one is printed.
for doc in islice(parser, 1):
    print(doc.page_content)

In [33]:
# Show which file the currently loaded `docs` came from (useful because every
# loader cell rebinds `docs`).
docs[0].metadata['source']

'knowledge_base/Isaacson, Walter - Elon Musk (2023).pdf'