# In [1]:

import fitz  # PyMuPDF
import os

def clean_text(text):
    """Rejoin words hyphenated across line breaks and normalise whitespace.

    Every hyphen that is immediately followed by a newline is dropped together
    with that newline, so the two halves of the word are glued back together.
    Afterwards each run of whitespace is collapsed into a single space and
    leading/trailing whitespace is removed.
    """
    # Splitting on the hyphen+newline pair and gluing the pieces back
    # together removes every occurrence of that pair.
    dehyphenated = ''.join(text.split('-\n'))
    # split() with no argument discards all whitespace runs, including
    # the remaining newlines.
    words = dehyphenated.split()
    return ' '.join(words)

def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF as one string.

    Each page is extracted with PyMuPDF's plain-text mode (``sort=True``),
    the raw page texts are concatenated, and the result is passed through
    ``clean_text`` once.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The cleaned text of all pages, in page order.
    """
    page_texts = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Extract text in reading order
            raw = page.get_text('text', sort=True)
            # Make sure every page ends with a line break so the last word of
            # one page cannot fuse with the first word of the next, while a
            # word hyphenated across the page break ('-\n') is still rejoined
            # by clean_text below.
            if raw and not raw.endswith('\n'):
                raw += '\n'
            page_texts.append(raw)

    # Clean once over the whole document instead of per page.
    return clean_text(''.join(page_texts))

def extract_images_from_pdf(pdf_path, min_dim=200, min_size=100000):
    """Write the embedded images of a PDF to a ``<pdf name>_images`` directory.

    The directory is created next to the PDF (same path with the extension
    replaced by ``_images``). Each kept image is written as
    ``img_<page index>_<image index>.<ext>``, where ``<ext>`` is the format
    PyMuPDF reports for the extracted bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        min_dim: Width/height threshold used by the filter.
        min_size: Byte-length threshold used by the filter.

    An image is kept when its byte length reaches ``min_size`` OR its width
    OR its height reaches ``min_dim``.
    """
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        img_dir = os.path.splitext(pdf_path)[0] + '_images'
        os.makedirs(img_dir, exist_ok=True)

        for page_num, page in enumerate(doc):
            for image_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                # Skip entries for which nothing could be extracted.
                if not base_image:
                    continue

                image_bytes = base_image['image']
                # Filter images by size and dimension.
                # NOTE(review): any single criterion is enough to keep the
                # image; if all thresholds were meant to be required this
                # should be `and` — confirm the intent before changing.
                keep = (
                    len(image_bytes) >= min_size
                    or base_image['width'] >= min_dim
                    or base_image['height'] >= min_dim
                )
                if not keep:
                    continue

                # The bytes are in the image's embedded format, so use the
                # reported extension instead of always labelling them PNG.
                ext = base_image.get('ext') or 'png'
                image_path = os.path.join(
                    img_dir, f'img_{page_num}_{image_index}.{ext}'
                )
                with open(image_path, 'wb') as f:
                    f.write(image_bytes)

def process_pdf_list(pdf_list_path):
    """Extract text and images for every PDF listed in a text file.

    The list file holds one PDF path per line; surrounding whitespace is
    stripped and blank lines are ignored. For each path the extracted text is
    written next to the PDF with a ``.txt`` extension, then the images are
    extracted with ``extract_images_from_pdf``.

    Args:
        pdf_list_path: Path of the text file listing the PDFs to process.
    """
    with open(pdf_list_path, 'r') as file:
        pdf_paths = [line.strip() for line in file]

    for pdf_path in pdf_paths:
        # A blank line would otherwise be passed on as the path ''.
        if not pdf_path:
            continue

        text = extract_text_from_pdf(pdf_path)
        text_file_path = os.path.splitext(pdf_path)[0] + '.txt'

        # PDF text routinely contains characters outside the platform's
        # default encoding; write UTF-8 so the dump cannot fail on them.
        with open(text_file_path, 'w', encoding='utf-8') as f:
            f.write(text)

        extract_images_from_pdf(pdf_path)

# Run the extraction for the PDFs listed in 'pdfs_to_extract.txt', which is
# looked up relative to the current working directory.
process_pdf_list(pdf_list_path='pdfs_to_extract.txt')
