In [24]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io

# Specify the installation path of Tesseract and modify it according to your actual installation location
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows

def extract_text_from_coordinates(pdf_path, coordinates):
    # Open PDF file
    doc = fitz.open(pdf_path)
    extracted_texts = {}
    
    for label, (x0, y0, x1, y1) in coordinates.items():
        # Get the pixmap of the entire page (image)
        page = doc[0]  # Assume all pages have the same relevant content
        pix = page.get_pixmap()
        
        # Convert the pixmap to an image object
        img = Image.open(io.BytesIO(pix.tobytes()))
        
        # Define the region of interest (ROI) as a box
        roi = img.crop((x0, y0, x1, y1))
        
        # Use Tesseract to do OCR on the ROI
        text = pytesseract.image_to_string(roi, lang='eng', config='--psm 6').strip()
        extracted_texts[label] = text
    
    return extracted_texts

# Coordinates for the areas to extract
coordinates = {
    "title": (62, 62, 780, 174),  # Title and subtitle
    "notes": (62, 1488, 890, 1618),  # Notes
    "information_block": (1488, 1498, 2327, 1627)  # Information block
}

# coordinates = {
#     "title": (200, 180, 2200, 425),
#     "notes": (180, 4160, 2420, 4485),
#     "information_block_rev_table": (2500, 4160, 4133, 4520),
#     "information_table_A1": (4133, 4160, 4338, 4520),
#     "information_table_customer": (4338, 4160, 4890, 4520),
#     "information_table_ingenia": (4890, 4160, 5480, 4520),
#     "information_status": (5480, 4162, 6463, 4520),
# }


# Paths
pdf_path = r'pdf_files\236-020-STR-001_D.pdf'
output_text_path = r'extract_data\testing.txt'

# Extract text
extracted_text_regions = extract_text_from_coordinates(pdf_path, coordinates)

# Structuring and saving the extracted text
with open(output_text_path, 'w', encoding='utf-8', errors='replace') as file:
    for label, text in extracted_text_regions.items():
        file.write(f"{label}:\n{text}\n\n")

print(f"Extracted text saved to {output_text_path}")

Extracted text saved to extract_data\testing.txt


In [8]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io

# Ensure the correct installation path of Tesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update for your path

def preprocess_image(pix):
    # Convert to a PIL image
    img = Image.open(io.BytesIO(pix.tobytes()))
    # Convert to grayscale
    img = img.convert('L')
    # Apply thresholding
    img = img.point(lambda x: 0 if x<128 else 255, '1')
    return img

def extract_text_from_coordinates(pdf_path, coordinates):
    # Open PDF file
    doc = fitz.open(pdf_path)
    extracted_texts = {}
    
    for label, (x0, y0, x1, y1) in coordinates.items():
        page = doc[0]  # Assuming the first page
        pix = page.get_pixmap()
        
        # Preprocess the image for better OCR results
        img = preprocess_image(pix)
        
        # Define the region of interest (ROI)
        roi = img.crop((x0, y0, x1, y1))
        
        # Use Tesseract to do OCR on the ROI
        # Using PSM 6 assumes a single uniform block of text
        text = pytesseract.image_to_string(roi, lang='eng', config='--psm 6').strip()
        extracted_texts[label] = text
    
    return extracted_texts

# Update these coordinates based on the actual position of the block in your image
coordinates = {
    "title": (62, 62, 780, 174),  # Title and subtitle
    "notes": (62, 1490, 890, 1620),  # Notes
    "information_block": (1488, 1498, 2327, 1627)  # Information block
}

pdf_path = r'pdf_files\236-020-SAS-120_0.pdf'  # Update for your path
output_text_path = r'extract_data\236-020-SAS-120_0.txt'

# Extract text
extracted_text_regions = extract_text_from_coordinates(pdf_path, coordinates)

# Structuring and saving the extracted text
with open(output_text_path, 'w', encoding='utf-8') as file:  # Specify UTF-8 encoding here
    for label, text in extracted_text_regions.items():
        file.write(f"{label}:\n{text}\n\n")

print(f"Extracted text saved to {output_text_path}")

Extracted text saved to extract_data\236-020-SAS-120_0.txt


In [7]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io

# Specify the installation path of Tesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows

def preprocess_image(pix):
    # Convert to a PIL image and preprocess
    img = Image.open(io.BytesIO(pix.tobytes()))
    img = img.convert('L')
    img = img.point(lambda x: 0 if x<128 else 255, '1')
    return img

def extract_text(pdf_path, coordinates=None):
    # Open the PDF file
    doc = fitz.open(pdf_path)
    text = ''
    extracted_texts = {}
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pix = page.get_pixmap()
        
        if coordinates and page_num == 0:  # Only process coordinates for the first page
            for label, (x0, y0, x1, y1) in coordinates.items():
                # Preprocess the image for better OCR results
                img = preprocess_image(pix)
                # Define the region of interest (ROI)
                roi = img.crop((x0, y0, x1, y1))
                # Use Tesseract to do OCR on the ROI
                text = pytesseract.image_to_string(roi, lang='eng', config='--psm 6').strip()
                extracted_texts[label] = text
        else:
            # Convert the entire page to an image
            img = Image.open(io.BytesIO(pix.tobytes()))
            # Use OCR to extract text from the image
            page_text = pytesseract.image_to_string(img, lang='eng')
            text += page_text + '\n'
    
    return text, extracted_texts

pdf_path = r'pdf_files\236-020-SAS-120_0.pdf'   # Modify the path according to the actual situation

# Coordinates for the areas to extract
coordinates = {
    "title": (62, 62, 780, 174),  # Title and subtitle
    "notes": (62, 1490, 890, 1620),  # Notes
    "information_block": (1488, 1498, 2327, 1627)  # Information block
}

# Extract text from the whole PDF and specific coordinates
extracted_text, extracted_text_regions = extract_text(pdf_path, coordinates)

# Printing the full PDF text
print(extracted_text)

# Saving the extracted text regions to a file with UTF-8 encoding
output_text_path = 'extract_data/236-020-SAS-120_0.txt'
with open(output_text_path, 'w', encoding='utf-8') as file:
    for label, text in extracted_text_regions.items():
        file.write(f"{label}:\n{text}\n\n")

print(f"Extracted text regions saved to {output_text_path}")

At Ses ISSUED FOR CONSTRUCTION
een 8 . ar 8 foe IBUWESZE AC a9o159 Melee” “8
etn 10007 RMS SILO
weet McMAHON INgeNIa SILO ROOF HATCH
» e GENERAL ASSEMBLY
; seas coma Py eens es ig00s CIRICDME  oNurber Tipe iain She Reior
Camere MOMBHON SERVICESABE to" Hensy Beach Pr setae 379 BEG. O20 -SAS -120-01. 0ADBRI BIRKENHEAD 1000T RMS SILO

SILO ROOF HATCH

a]
a

370

370

162

SIDE VIEW
(1:5)

TOP VIEW
ITEM: 236-020-SAS-120

NOTES:

if io
LEA] ale
TW
A A
f) CT)
y {yf o Wt
LI 3 as

1. REFER TO 236-020-SAS-001 FOR ASSEMBLY DETAILS, 236-020-GEN-001 FOR GENERAL NOTES & 236-020-RPT-001

2. WELD LINES TO BE OFFSET AS PER STANDARDS,
3. SURFACE TREATMENT AS PER GENERAL NOTES DWG 236-020-GEN-001
4, IF IN DOUBT, ASK

A-A(1:2)
SPONGE SEAL DETAIL

EXTERNAL ISOMETRIC
(1:5)

BOM - GENERAL ASSEMBLY
ITEM ary PART NUMBER DESCRIPTION DETAILED DRAWING
1 1 236-020-WAS-157 HATCH DOOR ASSEMBLY 236-020-WAS-157
2 1 236-020-WAS-158 HATCH BASE ASSEMBLY 236-020-WAS-158
3 1 236-020-PRT-122 WEBFORGE GRATING 0253 236-020-P

In [9]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io

# Specify the installation path of Tesseract and modify it according to your actual installation location
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows

def extract_text_from_coordinates(pdf_path, coordinates):
    # Open PDF file
    doc = fitz.open(pdf_path)
    extracted_texts = {}
    
    for label, (x0, y0, x1, y1) in coordinates.items():
        # Get the pixmap of the entire page (image)
        page = doc[0]  # Assume all pages have the same relevant content
        pix = page.get_pixmap()
        
        # Convert the pixmap to an image object
        img = Image.open(io.BytesIO(pix.tobytes()))
        
        # Define the region of interest (ROI) as a box
        roi = img.crop((x0, y0, x1, y1))
        
        # Use Tesseract to do OCR on the ROI
        text = pytesseract.image_to_string(roi, lang='eng', config='--psm 6').strip()
        extracted_texts[label] = text
    
    return extracted_texts

# Updated coordinates for the areas to extract based on the provided specifications
coordinates = {
    "title": (175, 165, 2260, 425),
    "notes": (180, 4160, 2420, 4485),
    "information_block_rev_table": (2500, 4160, 4133, 4520),
    "information_table_A1": (4133, 4160, 4338, 4520),
    "information_table_customer": (4338, 4160, 4890, 4520),
    "information_table_ingenia": (4890, 4160, 5480, 4520),
    "information_status": (5480, 4162, 6463, 4520),
}

# Paths (Adjust these paths as per your file system and where your files are located)
pdf_path = r'pdf_files\236-020-SAS-120_0.pdf'
output_text_path = r'extract_data\236-020-SAS-120_0.txt'

# Extract text
extracted_text_regions = extract_text_from_coordinates(pdf_path, coordinates)

# Structuring and saving the extracted text
with open(output_text_path, 'w', encoding='utf-8', errors='replace') as file:
    for label, text in extracted_text_regions.items():
        file.write(f"{label}:\n{text}\n\n")

print(f"Extracted text saved to {output_text_path}")

Extracted text saved to extract_data\236-020-SAS-120_0.txt


In [21]:
# 图像预处理
import fitz  # PyMuPDF
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
import io

# Specify the installation path of Tesseract and modify it according to your actual installation location
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows


# 图像预处理函数
def preprocess_image(img):
    # 转换为灰度
    img = img.convert('L')
    # 图像增强 - 对比度增强
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(2)
    # 图像二值化
    img = img.point(lambda x: 0 if x < 128 else 255, '1')
    # 图像去噪（可选）
    # img = img.filter(ImageFilter.MedianFilter())
    return img


def extract_text_from_coordinates(pdf_path, coordinates):
    # Open PDF file
    doc = fitz.open(pdf_path)
    extracted_texts = {}
    
    for label, (x0, y0, x1, y1) in coordinates.items():
        # Get the pixmap of the entire page (image)
        page = doc[0]  # Assume all pages have the same relevant content
        pix = page.get_pixmap()

        
        # Convert the pixmap to an image object
        img = Image.open(io.BytesIO(pix.tobytes()))

        # 调用预处理函数
        img = preprocess_image(img)

        # Define the region of interest (ROI) as a box
        roi = img.crop((x0, y0, x1, y1))
        
        # Use Tesseract to do OCR on the ROI
        text = pytesseract.image_to_string(roi, lang='eng', config='--psm 6 --oem 1').strip()
        extracted_texts[label] = text
    
    return extracted_texts

# Coordinates for the areas to extract
coordinates = {
    "title": (62, 62, 780, 174),  # Title and subtitle
    "notes": (62, 1488, 890, 1618),  # Notes
    "information_block": (1488, 1498, 2327, 1627)  # Information block
}

# coordinates = {
#     "title": (200, 180, 2200, 425),
#     "notes": (180, 4160, 2420, 4485),
#     "information_block_rev_table": (2500, 4160, 4133, 4520),
#     "information_table_A1": (4133, 4160, 4338, 4520),
#     "information_table_customer": (4338, 4160, 4890, 4520),
#     "information_table_ingenia": (4890, 4160, 5480, 4520),
#     "information_status": (5480, 4162, 6463, 4520),
# }


# Paths
pdf_path = r'pdf_files\236-020-STR-001_D.pdf'
output_text_path = r'extract_data\testing_data.txt'

# Extract text
extracted_text_regions = extract_text_from_coordinates(pdf_path, coordinates)

# Structuring and saving the extracted text
with open(output_text_path, 'w', encoding='utf-8', errors='replace') as file:
    for label, text in extracted_text_regions.items():
        file.write(f"{label}:\n{text}\n\n")

print(f"Extracted text saved to {output_text_path}")

Extracted text saved to extract_data\testing_data.txt
