## Dependencies

In [None]:
import pdfplumber

### Extract one page

In [2]:
# Crop region for the page body. It is passed to page.crop() by the extraction
# helper so that text outside the box (headers/footers) is left out.
body_bbox = (50, 130, 550, 750)  # (left_x, top_y, right_x, bottom_y) for pdfplumber
def extract_body_from_page(pdf_path, page_number, bbox):
    """
    Return the text found inside a bounding box on a single PDF page.

    Cropping to the box before extracting is what keeps headers/footers
    (anything outside the box) out of the result.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): 0-indexed page to read.
        bbox (tuple): (x0, top, x1, bottom) crop box handed to page.crop().

    Returns:
        str: Text inside the box, or "" when the page number is out of range
             (an error line is printed in that case) or no text was extracted.
    """
    with pdfplumber.open(pdf_path) as document:
        out_of_range = page_number < 0 or page_number >= len(document.pages)
        if out_of_range:
            print(f"Error: Page number {page_number} is out of range.")
            return ""

        # Crop first, then extract, so only the body region is read.
        body_text = document.pages[page_number].crop(bbox).extract_text()

    # Normalise a None/empty extraction result to an empty string.
    return body_text or ""
    
# Try the single-page extractor on page index 170 (0-indexed) and print what comes back.
textone = extract_body_from_page("longsonpetrochem.pdf", 170, body_bbox)
print(textone)

STAAD SPACE
START JOB INFORMATION
JOB NAME LSP Complex Project Package
JOB CLIENT Long Son Petrochemicals Co. Ltd.
JOB NO SC0353
ENGINEER DATE 30-Sep-19
END JOB INFORMATION
INPUT WIDTH 79
***********************************************************************************************
***********************************************************************************************
***********************************************************************************************
UNIT METER KN
JOINT COORDINATES
1 -3.3 60.6 -6; 2 0 57.95 -6; 3 -3.3 55.3 -6; 4 0 52.65 -6; 5 -3.3 49.95 -6;
6 0 47.35 -6; 7 -3.3 55.3 0; 8 0 52.65 0; 9 -3.3 49.95 0; 10 0 47.35 0;
11 -3.3 60.6 0; 12 0 57.95 0; 13 17.5 60.6 -2.27; 14 20 60.6 -4.78;
15 12.5 60.6 -7.5; 16 17.5 60.6 -7.5; 17 10 60.6 -7.5; 18 9.675 60.6 0;
19 7.5 60.6 -1.73; 20 15.39 63.3 -2; 21 17.5 60.6 -2; 22 12.5 60.6 -2;
23 20 60.6 -2; 24 -0.6 55.3 0; 26 -1.9 55.3 0; 28 -1.5 55.3 0; 30 17.5 65.7 -2;
31 20 65.7 -2; 32 -2.8 55.3 0; 34 12.5 66.75 -2; 35 

In [None]:
def extract_text_from_page_range(pdf_path, start_page, end_page, bbox):
    """
    Extract the cropped body text from an inclusive page range and merge it.

    Bounds outside the document are clamped rather than rejected: a negative
    start becomes 0 and an end past the last page becomes the last page. If
    the range is empty after clamping, an empty string is returned.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): Starting page number (0-indexed).
        end_page (int): Ending page number (0-indexed, inclusive).
        bbox (tuple): (x0, top, x1, bottom) crop box handed to page.crop().

    Returns:
        str: The per-page texts joined with a newline. A page with no
             extractable text contributes an empty string.
    """
    page_texts = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        print(f"PDF has {total_pages} pages total")

        # Clamp the requested range to the pages that actually exist.
        start_page = max(start_page, 0)
        end_page = min(end_page, total_pages - 1)

        print(f"Extracting text from pages {start_page + 1} to {end_page + 1} (1-indexed)")

        for page_num in range(start_page, end_page + 1):
            print(f"Processing page {page_num + 1}...")

            text = pdf.pages[page_num].crop(bbox).extract_text()
            # extract_text() may yield None for a page without text; str.join
            # would raise TypeError on it, so store "" instead.
            page_texts.append(text or "")

    return "\n".join(page_texts)

In [None]:
# Example usage: extract a page range and save the merged text to a file.
pdf_file = "longsonpetrochem.pdf"  # Replace with your PDF file
# Page bounds are 0-indexed and inclusive, which is what
# extract_text_from_page_range expects: index 170 is page 171 when counting from 1.
start_page = 170
end_page = 317
body_bbox = (50, 130, 550, 750)  # (left_x, top_y, right_x, bottom_y) for pdfplumber
output_file = "extracted_content.txt"

print("Starting text extraction...")
extracted_content = extract_text_from_page_range(pdf_file, start_page, end_page, body_bbox)

# Export the extracted content to a text file
with open(output_file, "w", encoding="utf-8") as f:
    f.write(extracted_content)

# Report the requested range from the variables, using the same 1-indexed
# numbering as the progress lines printed during extraction. The function
# clamps bounds to the document, so the range actually read can be shorter.
print(
    f"\nExtraction complete! Text from pages {start_page + 1}-{end_page + 1} "
    f"(1-indexed) saved to '{output_file}'"
)
print(f"Total characters extracted: {len(extracted_content)}")