In [15]:
import os

# Export the image-block crop padding settings as environment variables.
# Values are strings because os.environ only stores str.
# NOTE(review): presumably read by unstructured when cropping extracted blocks — confirm.
os.environ.update(
    {
        "EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD": "30",
        "EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD": "10",
    }
)

In [47]:
from unstructured.partition.auto import partition

# Partition the PDF with the "hi_res" strategy, asking for Image and Table
# blocks to be extracted into the "output_img" directory.
elements = partition(
    filename="embedded-images-tables.pdf",
    strategy="hi_res",
    extract_image_block_types=["Image", "Table"],
    extract_image_block_output_dir="output_img",
)

INFO: PDF text extraction failed, skip text extraction...
INFO: Reading PDF for file: embedded-images-tables.pdf ...


In [48]:
# One row per element: category, first 6 chars of the element id,
# first 6 chars of the parent id (blank when absent), then the text.
row_format = "{:15} {:10} {:10} {:40}"
for el in elements:
    category = el.category
    short_id = "" if el._element_id is None else el._element_id[:6]
    short_parent_id = "" if el.metadata.parent_id is None else el.metadata.parent_id[:6]
    print(row_format.format(category, short_id, short_parent_id, el.text))

Header          2d5583                454                                     
NarrativeText   f2883d     2d5583     O. Sanni, A.P.. Popoola / Data in Brief 22 (2019) 451-457
Image           82f81c                5 1 os = — 10; =o ° © —" 205 i —~é é —ip a5 — Control -2 — & 2.5 T T T 0.0000001 + —-0.00001 0.001 O14 Current Density (A/cm2)
FigureCaption   53f386                Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO, solution in the presence and absence of ES.
Title           678447                Table 1                                 
FigureCaption   dfc7b7     678447     Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO, solution.
Table           a16555     678447     Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42

In [53]:
def remove_elements_of_type(elements, element_type=("Header", "Footer")):
    """
    Return the elements whose category is not one of the given types.

    Args:
        elements: Iterable of objects exposing a ``category`` attribute.
        element_type: A category name, or a collection of category names,
            to drop. Defaults to ``("Header", "Footer")``.

    Returns:
        A new list holding the remaining elements in their original order.
        The input iterable is not modified.
    """
    # A bare string must count as a single category: `x in "NarrativeText"`
    # is a substring test and would also match a category such as "Text".
    if isinstance(element_type, str):
        element_type = [element_type]
    return [el for el in elements if el.category not in element_type]

In [54]:
# Drop page headers and footers. Uses remove_elements_of_type (defined in the
# cell above); the previous name, remove_header_footer_elements, is not defined
# in this notebook and raises NameError on a fresh kernel.
elements = remove_elements_of_type(elements, element_type=["Header", "Footer"])

In [55]:
# One row per element: category, first 6 chars of the element id,
# first 6 chars of the parent id (blank when absent), then the text.
row_format = "{:15} {:10} {:10} {:40}"
for el in elements:
    category = el.category
    short_id = "" if el._element_id is None else el._element_id[:6]
    short_parent_id = "" if el.metadata.parent_id is None else el.metadata.parent_id[:6]
    print(row_format.format(category, short_id, short_parent_id, el.text))

NarrativeText   f2883d     2d5583     O. Sanni, A.P.. Popoola / Data in Brief 22 (2019) 451-457
Image           82f81c                5 1 os = — 10; =o ° © —" 205 i —~é é —ip a5 — Control -2 — & 2.5 T T T 0.0000001 + —-0.00001 0.001 O14 Current Density (A/cm2)
FigureCaption   53f386                Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO, solution in the presence and absence of ES.
Title           678447                Table 1                                 
FigureCaption   dfc7b7     678447     Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO, solution.
Table           a16555     678447     Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 30

In [56]:
def remove_text_on_elements_of_type(elements, element_type=("Image", "Table")):
    """
    Blank out the text of elements whose category is one of the given types.

    No element is dropped: every input element appears in the result, in the
    same order. Matching elements are modified in place (``text`` is set to
    ``""``), so the caller's original objects lose their text as well.

    Args:
        elements: Iterable of objects exposing ``category`` and ``text``.
        element_type: A category name, or a collection of category names,
            whose text should be cleared. Defaults to ``("Image", "Table")``.

    Returns:
        A new list containing the same element objects.
    """
    # A bare string must count as a single category: `x in "Table"` is a
    # substring test and would also match any category contained in it.
    if isinstance(element_type, str):
        element_type = [element_type]
    ret = []
    for el in elements:
        if el.category in element_type:
            el.text = ""
        ret.append(el)
    return ret

In [57]:
# Blank the text of Image and Table elements; the elements themselves are kept.
elements = remove_text_on_elements_of_type(elements, element_type=["Image", "Table"])

In [58]:
# One row per element: category, first 6 chars of the element id,
# first 6 chars of the parent id (blank when absent), then the text.
row_format = "{:15} {:10} {:10} {:40}"
for el in elements:
    category = el.category
    short_id = "" if el._element_id is None else el._element_id[:6]
    short_parent_id = "" if el.metadata.parent_id is None else el.metadata.parent_id[:6]
    print(row_format.format(category, short_id, short_parent_id, el.text))

NarrativeText   f2883d     2d5583     O. Sanni, A.P.. Popoola / Data in Brief 22 (2019) 451-457
Image           82f81c                                                        
FigureCaption   53f386                Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO, solution in the presence and absence of ES.
Title           678447                Table 1                                 
FigureCaption   dfc7b7     678447     Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO, solution.
Table           a16555     678447                                             
NarrativeText   2c0c28     678447     The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO, follow Langmuir adsorption isotherm. Figs. 6-8 show the SEM/EDX surface morpholog

In [59]:
from unstructured.chunking.basic import chunk_elements

# Combine the filtered elements into chunks; no options are passed, so
# chunk_elements runs with its default settings.
chunks = chunk_elements(elements)

In [60]:
# Show the first chunk as a plain dict (type, element_id, text, metadata).
chunks[0].to_dict()

{'type': 'CompositeElement',
 'element_id': '60c58217-0836-495a-898e-32a0544d0a64',
 'text': 'O. Sanni, A.P.. Popoola / Data in Brief 22 (2019) 451-457\n\nFig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO, solution in the presence and absence of ES.\n\nTable 1\n\nPotentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO, solution.',
 'metadata': {'filename': 'embedded-images-tables.pdf',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'last_modified': '2025-05-15T13:59:22',
  'page_number': 1,
  'orig_elements': 'eJzVVk2P2zYQ/SuETi1gc/lNam/pR9Ae2iywe1ssDEoc2ixkSZDoNNsg/70k7U3dXQdB3bhIb3qjGWqG7+mR9+8r6GALfVwFV12jyjNjuKvrxhMmWucpOCXyI7OuESCqBaq2EK2z0ab891U7DJMLvY0wF9zZx2EXVxsI601MEWpqkmoO4d+Di5sc5SRHxyH0Mdfd3wuisFwgyiSmDwv0EQuGdcbGECxO4JKfAtX8OEfY5hluwjvobkfbQvUhvXAQoY1h6FdtZ+d5NU5Dk9IIVpQQmhJ86KC3W8i1sG3AOXDLsLVrmJfRNh3MeHS+OmTGx7Fk2nHsQmvzwleH153t17tclieqoF9XDyU6x9V2cMEHKHvMCJNLIpdU3lF+LetrxnL1mCpX/S41