## Experiment: gathering elements from an English textbook PDF

In [1]:
from unstructured.partition.pdf import partition_pdf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def partition_document(file_path: str):
    """Partition a PDF into a list of unstructured elements.

    Prints the path being processed, runs ``partition_pdf`` on it, prints how
    many elements came back, and returns them.

    Args:
        file_path: Path to the PDF file to partition.

    Returns:
        The elements returned by ``partition_pdf``.
    """
    print(f"Partitioning document: {file_path}")

    partition_options = {
        # Most accurate (but slower) extraction strategy
        "strategy": "hi_res",
        # Keep tables as structured HTML instead of jumbled text
        "infer_table_structure": True,
        # Pull out the images found in the PDF ...
        "extract_image_block_types": ["Image"],
        # ... and embed them in the element payload as base64
        "extract_image_block_to_payload": True,
    }
    elements = partition_pdf(filename=file_path, **partition_options)

    print(f"Extracted {len(elements)} elements")
    return elements

In [3]:
# Partition the first sample PDF. `file_path` and `elements` are notebook
# globals that the inspection cells below read.
file_path = '../dataset/mitre-attack-philosophy-2020.pdf'
elements = partition_document(file_path)
# Bare last expression: displays the repr of the full element list.
elements

Partitioning document: ../dataset/mitre-attack-philosophy-2020.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Extracted 569 elements


[<unstructured.documents.elements.Image at 0x19e60085d10>,
 <unstructured.documents.elements.NarrativeText at 0x19e71e6b8d0>,
 <unstructured.documents.elements.NarrativeText at 0x19e71e6b510>,
 <unstructured.documents.elements.NarrativeText at 0x19e71eadd10>,
 <unstructured.documents.elements.NarrativeText at 0x19e68dc5850>,
 <unstructured.documents.elements.NarrativeText at 0x19e5fd89010>,
 <unstructured.documents.elements.Title at 0x19e71ecfbd0>,
 <unstructured.documents.elements.Text at 0x19e601ec090>,
 <unstructured.documents.elements.Text at 0x19e62c6fad0>,
 <unstructured.documents.elements.Title at 0x19e5f376a90>,
 <unstructured.documents.elements.Title at 0x19e5fd89b50>,
 <unstructured.documents.elements.NarrativeText at 0x19e71eccb10>,
 <unstructured.documents.elements.NarrativeText at 0x19e71ecf5d0>,
 <unstructured.documents.elements.Title at 0x19e68dfd510>,
 <unstructured.documents.elements.NarrativeText at 0x19e62c6e590>,
 <unstructured.documents.elements.Text at 0x19e62c6dc

In [4]:
# Distinct element classes present in the partition result.
{str(type(el)) for el in elements}

{"<class 'unstructured.documents.elements.FigureCaption'>",
 "<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.ListItem'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

In [5]:
# Inspect one element as a plain dict (index 36 is a Table in the recorded run).
elements[36].to_dict()

{'type': 'Table',
 'element_id': '55fe0ece70623eb50a050a0bcaa8ddda',
 'text': 'Introduction .............................................................................................................................. 1 1.1 Background and History ................................................................................................... 1 2.1 ATT&CK Coverage ......................................................................................................... 4 3.1 The ATT&CK Matrix ....................................................................................................... 6 3.2 Technology Domains ........................................................................................................ 8 3.3 Tactics ............................................................................................................................... 8 3.4 Techniques and Sub-Techniques ...................................................................................... 9

In [6]:
# Keep only the elements categorised as images, report how many there are,
# then show the first one in dict form.
images = [el for el in elements if el.category == 'Image']
print(f"Found {len(images)} images")

images[0].to_dict()

Found 18 images


{'type': 'Image',
 'element_id': 'dfaa85c2560e8a392152814f6c0d6d65',
 'text': 'MITRE',
 'metadata': {'coordinates': {'points': ((np.float64(223.25811111111108),
     np.float64(480.61944444444447)),
    (np.float64(223.25811111111108), np.float64(541.0361111111112)),
    (np.float64(398.2581111111111), np.float64(541.0361111111112)),
    (np.float64(398.2581111111111), np.float64(480.61944444444447))),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2026-01-14T14:15:00',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAA8AK8DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0N

In [7]:
# Keep only the elements categorised as tables, report how many there are,
# then show the first one in dict form.
tables = [el for el in elements if el.category == 'Table']
print(f"Found {len(tables)} tables")

tables[0].to_dict()

Found 12 tables


{'type': 'Table',
 'element_id': '55fe0ece70623eb50a050a0bcaa8ddda',
 'text': 'Introduction .............................................................................................................................. 1 1.1 Background and History ................................................................................................... 1 2.1 ATT&CK Coverage ......................................................................................................... 4 3.1 The ATT&CK Matrix ....................................................................................................... 6 3.2 Technology Domains ........................................................................................................ 8 3.3 Tactics ............................................................................................................................... 8 3.4 Techniques and Sub-Techniques ...................................................................................... 9

In [9]:
from unstructured.chunking.title import chunk_by_title

In [10]:

def create_chunks_by_title(elements):
    """Group partitioned elements into chunks with ``chunk_by_title``.

    Prints a progress message, chunks the elements, prints the number of
    chunks produced, and returns them.

    Args:
        elements: The parsed PDF elements from the partitioning step.

    Returns:
        The chunks returned by ``chunk_by_title``.
    """
    print("Creating smart chunks...")

    chunking_options = {
        # Hard limit: a chunk never exceeds 3000 characters
        "max_characters": 3000,
        # Soft limit: try to start a new chunk after 2400 characters
        "new_after_n_chars": 2400,
        # Merge tiny chunks (under 500 characters) with their neighbours
        "combine_text_under_n_chars": 500,
    }
    chunks = chunk_by_title(elements, **chunking_options)

    print(f"Created {len(chunks)} chunks")
    return chunks

# Create chunks from the most recently partitioned `elements`; the result is
# kept in the notebook-global `chunks` for the inspection cells that follow.
chunks = create_chunks_by_title(elements)

Creating smart chunks...
Created 73 chunks


In [11]:
# Distinct chunk classes produced by the chunking step.
{str(type(chunk)) for chunk in chunks}

{"<class 'unstructured.documents.elements.CompositeElement'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.TableChunk'>"}

In [12]:
# Show the last entry of chunk 11's `metadata.orig_elements` as a dict
# (presumably the source elements the chunk was built from — verify against
# the unstructured docs).
chunks[11].metadata.orig_elements[-1].to_dict()

{'type': 'NarrativeText',
 'element_id': 'c6272dea197beb7793b96419523acbe7',
 'text': 'Defensive Gap Assessment – A defensive gap assessment allows an organization to determine what parts of its enterprise lack defenses and/or visibility. These gaps represent blind spots for potential vectors that allow an adversary to gain access to its networks undetected or unmitigated.',
 'metadata': {'detection_class_prob': 0.9221768379211426,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(200.7029777777778),
     np.float64(1390.9573974609375)),
    (np.float64(200.7029777777778), np.float64(1543.2361111111113)),
    (np.float64(1482.10400390625), np.float64(1543.2361111111113)),
    (np.float64(1482.10400390625), np.float64(1390.9573974609375))),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2026-01-14T14:15:00',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 13,
  'file_directory': '../dataset',


## Experiment: gathering elements from a slide-deck PDF

In [None]:
from unstructured.partition.pdf import partition_pdf

In [None]:
def partition_document(file_path: str):
    """Partition a PDF into a list of unstructured elements.

    Re-declares the same helper as the first section of this notebook, so the
    later definition is the one bound in the kernel after this cell runs.

    Args:
        file_path: Path to the PDF file to partition.

    Returns:
        The elements returned by ``partition_pdf``.
    """
    print(f"Partitioning document: {file_path}")

    partition_options = dict(
        strategy="hi_res",                    # most accurate (but slower) extraction
        infer_table_structure=True,           # keep tables as structured HTML
        extract_image_block_types=["Image"],  # grab images found in the PDF
        extract_image_block_to_payload=True,  # store them as base64 in the payload
    )
    elements = partition_pdf(filename=file_path, **partition_options)

    print(f"Extracted {len(elements)} elements")
    return elements

In [13]:
# Partition the second sample PDF. This rebinds the notebook globals
# `file_path` and `elements`, so every cell below now sees this document.
file_path = '../dataset/owasp-top-10.pdf'
elements = partition_document(file_path)
# Bare last expression: displays the repr of the full element list.
elements

Partitioning document: ../dataset/owasp-top-10.pdf
Extracted 490 elements


[<unstructured.documents.elements.Image at 0x19e5ff7acd0>,
 <unstructured.documents.elements.Image at 0x19e7c23fe50>,
 <unstructured.documents.elements.Text at 0x19e7c23dad0>,
 <unstructured.documents.elements.Text at 0x19e7c23ded0>,
 <unstructured.documents.elements.Title at 0x19e5fdd3690>,
 <unstructured.documents.elements.Text at 0x19e7c23f790>,
 <unstructured.documents.elements.Text at 0x19e7c23d610>,
 <unstructured.documents.elements.Title at 0x19e7c23f610>,
 <unstructured.documents.elements.ListItem at 0x19e7c23f910>,
 <unstructured.documents.elements.ListItem at 0x19e7c23e490>,
 <unstructured.documents.elements.ListItem at 0x19e7c23e090>,
 <unstructured.documents.elements.Text at 0x19e7c23e7d0>,
 <unstructured.documents.elements.Text at 0x19e7c23f890>,
 <unstructured.documents.elements.Image at 0x19e7c1351d0>,
 <unstructured.documents.elements.Image at 0x19e7c1fb390>,
 <unstructured.documents.elements.Text at 0x19e7c30b350>,
 <unstructured.documents.elements.Image at 0x19e7c2ccb

In [14]:
# Distinct element classes present in the partition result.
{str(type(el)) for el in elements}

{"<class 'unstructured.documents.elements.Footer'>",
 "<class 'unstructured.documents.elements.Header'>",
 "<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.ListItem'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

In [21]:
# Keep only the elements categorised as images, report how many there are,
# then show the second one in dict form.
images = [el for el in elements if el.category == 'Image']
print(f"Found {len(images)} images")

images[1].to_dict()

Found 111 images


{'type': 'Image',
 'element_id': 'f59a6c7ecdb4e059603950437b6061f3',
 'text': '',
 'metadata': {'coordinates': {'points': ((np.float64(157.33333166666665),
     np.float64(789.9999408333333)),
    (np.float64(157.33333166666665), np.float64(1289.9999697222222)),
    (np.float64(657.1666524999999), np.float64(1289.9999697222222)),
    (np.float64(657.1666524999999), np.float64(789.9999408333333))),
   'system': 'PixelSpace',
   'layout_width': 2667,
   'layout_height': 1500},
  'last_modified': '2026-01-14T14:15:00',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAH0AfQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3OD

In [46]:
# Filter out irrelevant images on page 4: images whose extracted text is one of
# the known banner/footer strings are skipped.
images = [element for element in elements if element.category == 'Image']

images_in_page_4 = [image for image in images if image.metadata.page_number == 4]

excluded_texts = [
    'Office of Information Security Securing One HHS 2',
    'Health Sector Cybersecurity Coordination Center'
]

images_dict = [image.to_dict() for image in images_in_page_4]

# Start from None so the name is always defined, even when every image on the
# page is excluded. If several images qualify, the last one wins.
relevant_image = None
for img in images_dict:
    if img.get('text') not in excluded_texts:
        # The original assigned to the misspelled `relevent_image`, so the
        # final expression raised NameError on a fresh kernel.
        relevant_image = img

relevant_image

{'type': 'Image',
 'element_id': 'f539346f5e91cbe7a431348ab64504e5',
 'text': "2017 2021 A01:2021-Broken Access Control A02:2021-Cryptographic Failures >» A03:2021-Injection _-(New) A04:2021-Insecure Design os A05:2021-Security Misconfiguration A06:2021-Vulnerable and Outdated Components A07:2017-Cross-Site Scripting (XSS) ' A07:2021-Identification and Authentication Failures AO8:2017-Insecure Deserialization rr eee {New} A08:2021-Software and Data Integrity Failures A09:2017-Using Components with Known Vulnerabilities es A09:2021-Security Logging and Monitoring Failures* A10:2017-Insufficient Logging & Monitoring (New) A10:2021-Server-Side Request Forgery (SSRF)* * From the Survey A01:2017-Injection A02:2017-Broken Authentication A03:2017-Sensitive Data Exposure A04:2017-XML External Entities (XXE) A05:2017-Broken Access Control A06:2017-Security Misconfiguration",
 'metadata': {'coordinates': {'points': ((np.float64(1350.0),
     np.float64(614.9998983333332)),
    (np.float64(1350.0

## Experiment: gathering elements from a Thai-language PDF using unstructured

In [None]:
from unstructured.partition.pdf import partition_pdf

In [191]:
def partition_document(file_path: str):
    """Partition a PDF into unstructured elements using the Thai language setting.

    Calls ``partition_pdf`` with ``languages=["tha"]`` and with image-block
    extraction switched off. Running this cell rebinds ``partition_document``
    in the kernel, replacing any earlier definition of the same name.

    Args:
        file_path: Path to the PDF file to partition.

    Returns:
        The elements returned by ``partition_pdf``.
    """
    print(f"Partitioning document: {file_path}")

    thai_partition_options = dict(
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=None,
        extract_image_block_to_payload=False,
        languages=["tha"],
    )
    elements = partition_pdf(filename=file_path, **thai_partition_options)

    print(f"Successfully extracted {len(elements)} elements.")
    return elements

In [192]:
# Partition the Thai sample PDF. This rebinds the notebook globals `file_path`
# and `elements` once more.
file_path = "../dataset/thailand-web-security-standard-2025.pdf"
elements = partition_document(file_path)
# Bare last expression: displays the repr of the full element list.
elements

Partitioning document: ../dataset/thailand-web-security-standard-2025.pdf
Successfully extracted 894 elements.


[<unstructured.documents.elements.Text at 0x19ea7177fd0>,
 <unstructured.documents.elements.Text at 0x19ea71762d0>,
 <unstructured.documents.elements.Text at 0x19ea7176fd0>,
 <unstructured.documents.elements.Text at 0x19ea71757d0>,
 <unstructured.documents.elements.Text at 0x19ea7177ed0>,
 <unstructured.documents.elements.NarrativeText at 0x19ea7175e90>,
 <unstructured.documents.elements.Text at 0x19ea7177010>,
 <unstructured.documents.elements.NarrativeText at 0x19ea7175490>,
 <unstructured.documents.elements.NarrativeText at 0x19ea7177790>,
 <unstructured.documents.elements.NarrativeText at 0x19ea71778d0>,
 <unstructured.documents.elements.NarrativeText at 0x19ea7177d10>,
 <unstructured.documents.elements.NarrativeText at 0x19ea7175d50>,
 <unstructured.documents.elements.NarrativeText at 0x19ea7175650>,
 <unstructured.documents.elements.NarrativeText at 0x19ea7176090>,
 <unstructured.documents.elements.Text at 0x19ea71769d0>,
 <unstructured.documents.elements.Text at 0x19ea7177890>,


## Another solution: page-level OCR with pdf2image and pytesseract

In [194]:
from pdf2image import convert_from_path
import pytesseract

In [198]:
PDF_PATH = "../dataset/thailand-web-security-standard-2025.pdf"

# Rasterise the PDF into images at 600 dpi. All of them are held in memory at
# once, so this is the expensive step of the cell.
images = convert_from_path(PDF_PATH, dpi=600)

# OCR every image with the Thai + English language data.
#   --psm 6: assume a single uniform block of text
#   --oem 1: use the LSTM (neural net) OCR engine only
# A comprehension is used so no loop temporaries leak into the notebook
# namespace (the old loop left `i`, `img` and `text` behind; `i` was never used).
texts = [
    pytesseract.image_to_string(
        page_image,
        lang="tha+eng",
        config="--psm 6 --oem 1"
    )
    for page_image in images
]

# One text blob for the whole document, one OCR result per line group.
full_text = "\n".join(texts)

# Persist the raw OCR output so the cleaning step can be re-run without re-OCRing.
with open("thai_text.txt", "w", encoding="utf-8") as f:
    f.write(full_text)

In [199]:
import re

In [200]:
def clean_thai_gov_ocr(text):
    """Clean raw OCR text of a Thai government gazette document.

    Steps, in order:
      1. Remove bracketed tags (``[...]``).
      2. Remove page furniture: "หน้า <n>" markers, the gazette header
         ("เล่ม <n> ... ราชกิจจานุเบกษา ..." through the end of that line),
         and centred page numbers that sit alone on a line (e.g. "- ๕ -").
      3. Repair known OCR misreads ("we." -> "พ.ศ.", "๒๕๒๐๒" -> "๒๕๖๒",
         "๒๕๒๐๕" -> "๒๕๖๕").
      4. Convert Thai digits to Arabic digits.
      5. Collapse every run of whitespace, including line breaks, to one space.

    Args:
        text: Raw OCR output as a string.

    Returns:
        The cleaned text on a single line, with surrounding whitespace stripped.
    """
    # 1. Remove bracketed OCR source tags, i.e. any "[...]" span within a line.
    text = re.sub(r'\[.*?\]', '', text)

    # 2. Remove page headers and footers.
    # "Page XX" markers (หน้า ๓๓).
    text = re.sub(r'หน้า\s+[๐-๙\d]+', '', text)
    # Gazette header (เล่ม ... ราชกิจจานุเบกษา ...), up to and including the
    # line break(s) that end it.
    text = re.sub(r'เล่ม\s+[๐-๙\d]+.*?ราชกิจจานุเบกษา.*?[\r\n]+', '', text, flags=re.DOTALL)
    # Centred page numbers like "- ๕ -" or "-ไ๒-". The pattern is anchored to
    # a whole line so hyphenated terms in running text (for example
    # "Cross-Site-Scripting") are not mangled.
    text = re.sub(r'^[ \t]*-[ \t]*[\w๐-๙]+[ \t]*-[ \t]*\r?$', '', text, flags=re.MULTILINE)

    # 3. Repair known OCR misreads.
    # "we." appears to be a misread of "พ.ศ."; the lookbehind keeps English
    # words that merely end in "we." (e.g. "owe.") untouched.
    text = re.sub(r'(?<![A-Za-z])we\.', 'พ.ศ.', text)
    # "๒๕๒๐๒" is a common OCR error for "๒๕๖๒" (Cybersecurity Act year).
    text = text.replace('๒๕๒๐๒', '๒๕๖๒')
    # "๒๕๒๐๕" seems to be a misread of "๒๕๖๕".
    text = text.replace('๒๕๒๐๕', '๒๕๖๕')

    # 4. Thai digits -> Arabic digits. Runs after the year repairs above,
    # which match on Thai digits.
    trans_table = str.maketrans('๐๑๒๓๔๕๖๗๘๙', '0123456789')
    text = text.translate(trans_table)

    # 5. Flatten line breaks and collapse whitespace runs.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [203]:
# Reload the raw OCR text written by the OCR cell.
with open('thai_text.txt', 'r', encoding='utf-8') as f:
    raw_content = f.read()

# Dumps the entire raw text into the cell output for eyeballing OCR quality.
print(raw_content)

# Clean the text and persist the single-line result next to the raw file.
cleaned_text = clean_thai_gov_ocr(raw_content)
with open('cleaned_text.txt', "w", encoding="utf-8") as f:
    f.write(cleaned_text)

หน้า ๓๓
เล่ม ๑๕๒ ตอนพิเศษ ๓๐๕ ง   ราชกิจจานุเบกษา       ๑๐ กันยายน ๒๕๒๐๕
ประกาศคณะกรรมการการรักษาความมันคงปลอดภัยไซเบอร์แห่งชาติ
เรือง มาตรฐานการรักษาความมันคงปลอดภัยสําหรับเว็บไซต์
พ.ศ. ๒๕๒๐๕

โดยที่พระราชบัญญัติการรักษาความมันคงปลอดภัยไซเบอร์ พ.ศ. ๒๕๒๒ กําหนดให้
คณะกรรมการการรักษาความมันคงปลอดภัยไซเบอร์แห่งชาติมีหน้าที่และอํานาจสร้างมาตรฐานเกี่ยวกับ
การรักษาความมั่นคงปลอดภัยไซเบอร์ และกําหนดมาตรฐานขั้นตําที่เกี่ยวข้องกับคอมพิวเตอร์
ระบบคอมพิวเตอร์หรือโปรแกรมคอมพิวเตอร์ จึงสมควรมีมาตรฐานการรักษาความมันคงปลอดภัย
สําหรับเว็บไซต์ เพื่อให้การดําเนินงานเกี่ยวกับการรักษาความมันคงปลอดภัยไซเบอร์เป็นไปอย่างมีประสิทธิภาพ

อาศัยอํานาจตามความในมาตรา ๕ (๕@๕) มาตรา ๒๒ (๑๓) และ (od) แห่งพระราชบัญญัติ
การรักษาความมันคงปลอดภัยไซเบอร์ พ.ศ. ๒๕๒๒ ประกอบกับมติคณะกรรมการบริหารสํานักงาน
คณะกรรมการการรักษาความมั่นคงปลอดภัยไซเบอร์ ในคราวการประชุมครั้งที่ ๒/๒๕๒๕ เมื่อวันที่
๕ พฤศจิกายน ๒๕๒๕ มติคณะกรรมการการรักษาความมันคงปลอดภัยไซเบอร์แห่งชาติ
ในคราวการประชุมครั้งที่ ๕/๒๕๒๒ เมื่อวันที ๒๐ พฤศจิกายน ๒๕๒๒ และมติคณ