In [5]:
import getpass
import os

if "UNSTRUCTURED_API_KEY" not in os.environ:
    os.environ["UNSTRUCTURED_API_KEY"] = getpass.getpass(
        "Enter your Unstructured API key: "
    )

In [6]:
%pip install --upgrade --quiet langchain-unstructured unstructured-client unstructured "unstructured[pdf]" python-magic

Note: you may need to restart the kernel to use updated packages.


In [29]:
from langchain_unstructured import UnstructuredLoader

file_paths = [
    "data/attention.pdf",
    "data/paul_graham.txt",
    # "data/DecodingStrategies.jpeg",
]


loader = UnstructuredLoader(file_paths, strategy="hi_res")


In [30]:
docs = loader.load()

docs

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /unstructuredio/yolo_x_layout/resolve/main/yolox_l0.05.onnx (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1006)')))"), '(Request ID: befa0e20-8a7c-4e86-8a02-009cdf33d76a)')

In [6]:
print(docs[0].metadata)

{'source': 'data/attention.pdf', 'coordinates': {'points': ((16.34, 213.92000000000007), (16.34, 253.92000000000007), (36.34, 253.92000000000007), (36.34, 213.92000000000007)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': 'data', 'filename': 'attention.pdf', 'languages': ['eng'], 'last_modified': '2025-02-10T13:30:43', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'da4d57fbf8f55a96700ed365e2d347f3'}


In [8]:
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
print(docs[0].page_content)

3 2 0 2
{'source': 'data/attention.pdf', 'coordinates': {'points': ((16.34, 213.92000000000007), (16.34, 253.92000000000007), (36.34, 253.92000000000007), (36.34, 213.92000000000007)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': 'data', 'filename': 'attention.pdf', 'languages': ['eng'], 'last_modified': '2025-02-10T13:30:43', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'da4d57fbf8f55a96700ed365e2d347f3'}
3 2 0 2


Local Unstructured method

In [11]:
from unstructured.partition.pdf import partition_pdf

# Process the PDF
elements = partition_pdf("data/attention.pdf")

elements

[<unstructured.documents.elements.Text at 0x32c206ed0>,
 <unstructured.documents.elements.NarrativeText at 0x32e2a2b90>,
 <unstructured.documents.elements.Title at 0x32d813a10>,
 <unstructured.documents.elements.Text at 0x326ec93d0>,
 <unstructured.documents.elements.NarrativeText at 0x32df03790>,
 <unstructured.documents.elements.Title at 0x32e2f2d10>,
 <unstructured.documents.elements.Title at 0x32db2f8d0>,
 <unstructured.documents.elements.Title at 0x32db2d1d0>,
 <unstructured.documents.elements.Title at 0x32e3bf810>,
 <unstructured.documents.elements.Title at 0x32d89ad10>,
 <unstructured.documents.elements.Title at 0x326f21810>,
 <unstructured.documents.elements.Title at 0x326880d10>,
 <unstructured.documents.elements.Title at 0x327009390>,
 <unstructured.documents.elements.Title at 0x32700ad50>,
 <unstructured.documents.elements.Title at 0x32700bc10>,
 <unstructured.documents.elements.NarrativeText at 0x32446a650>,
 <unstructured.documents.elements.NarrativeText at 0x32ba79410>,
 

In [12]:
# Separate content by type
text_elements = [e for e in elements if "Text" in str(type(e))]
image_elements = [e for e in elements if "Image" in str(type(e))]
table_elements = [e for e in elements if "Table" in str(type(e))]



In [13]:
# Print results
print("Text Content:")
for text in text_elements:
    print(text.text)



Text Content:
3 2 0 2
g u A 2
7 v 2 6 7 3 0 . 6 0 7 1 : v i X r a
Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.
The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English- to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French 

In [14]:
print("\nExtracted Tables:")
for table in table_elements:
    print(table)



Extracted Tables:


In [19]:
# Print all extracted elements and their types
for element in elements:
    print(type(element), element)


<class 'unstructured.documents.elements.Text'> 3 2 0 2
<class 'unstructured.documents.elements.NarrativeText'> g u A 2
<class 'unstructured.documents.elements.Title'> ] L C . s c [
<class 'unstructured.documents.elements.Text'> 7 v 2 6 7 3 0 . 6 0 7 1 : v i X r a
<class 'unstructured.documents.elements.NarrativeText'> Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.
<class 'unstructured.documents.elements.Title'> Attention Is All You Need
<class 'unstructured.documents.elements.Title'> Ashish Vaswani∗ Google Brain avaswani@google.com
<class 'unstructured.documents.elements.Title'> Noam Shazeer∗ Google Brain noam@google.com
<class 'unstructured.documents.elements.Title'> Niki Parmar∗ Google Research nikip@google.com
<class 'unstructured.documents.elements.Title'> Jakob Uszkoreit∗ Google Research usz@google.com
<class 'unstructured.documents.elements.Title'> Llion 

In [18]:
from unstructured.partition.pdf import partition_pdf
import shutil
import os

# Load PDF and extract elements
# elements = partition_pdf("sample.pdf")

# Create a directory to save images
image_dir = "extracted_images"
os.makedirs(image_dir, exist_ok=True)

# Save images
for idx, element in enumerate(elements):
    if "Image" in str(type(element)):  # Check if the element is an image
        image_path = os.path.join(image_dir, f"image_{idx}.jpg")
        with open(image_path, "wb") as f:
            shutil.copyfileobj(element.raw, f)  # Save image data

print(f"Images saved in: {image_dir}")


Images saved in: extracted_images


In [24]:
from langchain.document_loaders import UnstructuredPDFLoader
import os
import shutil
from unstructured.documents.elements import Text, Table, Image

# Load the PDF using LangChain (UnstructuredPDFLoader runs locally)
loader = UnstructuredPDFLoader("data/attention.pdf", strategy="hi_res")
docs = loader.load()

# Extract text, tables, and images
text_elements = [
    doc.page_content
    for doc in docs
    if isinstance(doc.metadata.get("elements", [{}])[0], Text)
]
table_elements = [
    doc.metadata["elements"]
    for doc in docs
    if isinstance(doc.metadata.get("elements", [{}])[0], Table)
]
image_elements = [
    doc.metadata["elements"]
    for doc in docs
    if isinstance(doc.metadata.get("elements", [{}])[0], Image)
]

# Print extracted data
print("\nExtracted Text:")
for text in text_elements:
    print(text)

print("\nExtracted Tables:")
for table in table_elements:
    print(table)

print("\nExtracted Images:")
for image in image_elements:
    print(image)  # This contains metadata, actual images need saving


SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /unstructuredio/yolo_x_layout/resolve/main/yolox_l0.05.onnx (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1006)')))"), '(Request ID: 95f994b1-caa7-4645-8857-9975dd7f3425)')