# Document Preprocessing

In [88]:
import fitz  # pip install PyMuPDF
import os

# Folder names used by the rest of the notebook.
output_dir = "extracted_texts"  # extraction results are written here
pdf_dir = "Papers"  # source PDFs are read from here; change if they live elsewhere

# Make sure the results folder is present; an already existing folder is fine.
os.makedirs(name=output_dir, exist_ok=True)

In [89]:
import re
import json
# Markers for the end of a paper's main body. Extraction of a PDF stops at the
# first text block that contains any of them.
# NOTE(review): this is a case-sensitive substring test, so a body sentence that
# merely mentions e.g. "References" also ends extraction early -- confirm that
# is acceptable for this corpus.
stop_keywords = ["References", "REFERENCES", "references", "Conflict of interest", "Conflicts of Interest"]

# Compiled once here instead of once per text block.
_SECTION_NUMBER_RE = re.compile(r'^\d+\.\s+')
_DISALLOWED_CHARS_RE = re.compile(r'[^A-Za-z0-9\s,.]')
_PAGE_LABEL_RE = re.compile(r'Page \d+')


def _sort_key(item):
    """Order items by page number, then by the top edge (y0) of their bbox."""
    return (item["page_num"], item["bbox"][1])


def _clean_block_text(raw_text):
    """Return the cleaned text of one block, or "" if the block should be dropped.

    Cleaning steps, in order: collapse every whitespace run to one space,
    strip a leading section number such as "3. ", then delete every character
    that is not an ASCII letter, digit, whitespace, comma or period.
    A block is dropped when the result is shorter than 10 characters or starts
    with a "Page <n>" label.
    """
    text = " ".join(raw_text.split())
    text = _SECTION_NUMBER_RE.sub('', text)
    text = _DISALLOWED_CHARS_RE.sub('', text)
    if len(text) < 10 or _PAGE_LABEL_RE.match(text):
        return ""
    return text


def _collect_page_items(page, page_num, items):
    """Append the text and image items of one page to ``items``.

    Returns True as soon as a text block contains a stop keyword; the remaining
    blocks of that page are then skipped. Returns False if the whole page was
    consumed.
    """
    for block in page.get_text("dict")["blocks"]:
        if block["type"] == 0:  # text block
            block_text = " ".join(
                span["text"] for line in block["lines"] for span in line["spans"]
            ).strip()

            # The stop test runs on the raw text, before any cleaning.
            if any(keyword in block_text for keyword in stop_keywords):
                return True

            block_text = _clean_block_text(block_text)
            if not block_text:
                continue  # too short, or a page label

            items.append({
                "type": "text",
                "content": block_text,
                "bbox": block["bbox"],
                "page_num": page_num
            })

        elif block["type"] == 1:  # image block
            # Only the intended file path is recorded. The image bytes are
            # available as block["image"] but are not written to disk here.
            image_filename = f"images/image_{page_num}_{block['number']}.{block['ext']}"
            items.append({
                "type": "image",
                "content": image_filename,
                "bbox": block["bbox"],
                "page_num": page_num
            })
    return False


def _extract_pdf_content(pdf_path):
    """Extract the text/image items of one PDF, sorted by page and y-position.

    Pages are read in order until a stop keyword is found. The document is
    always closed again, including when extraction raises.
    """
    items = []
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            if _collect_page_items(page, page_num, items):
                break
    items.sort(key=_sort_key)
    return items


# Process every PDF in the input directory and write one JSON file per PDF.
for pdf_filename in os.listdir(pdf_dir):
    # Everything below applies to PDFs only. Other directory entries must not
    # reach the save step, or they would be written out with the content of a
    # previously processed PDF. The extension test ignores case so "X.PDF" is
    # picked up as well.
    if not pdf_filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_filename)
    content = _extract_pdf_content(pdf_path)

    # Save the extracted content next to the others, named after the PDF.
    output_filename = os.path.splitext(pdf_filename)[0] + ".json"
    output_filepath = os.path.join(output_dir, output_filename)

    with open(output_filepath, "w", encoding="utf-8") as f:
        json.dump(content, f, ensure_ascii=False, indent=4)

    print(f"Processed {pdf_filename} and saved to {output_filepath}")

Processed FSB2-37-e23130.pdf and saved to extracted_texts/FSB2-37-e23130.json
Processed 13048_2019_Article_582.pdf and saved to extracted_texts/13048_2019_Article_582.json
Processed f1000research-8-16126.pdf and saved to extracted_texts/f1000research-8-16126.json
Processed FVVinObGyn-11-269.pdf and saved to extracted_texts/FVVinObGyn-11-269.json
Processed hoaa002.pdf and saved to extracted_texts/hoaa002.json
Processed 11604_2024_Article_1569.pdf and saved to extracted_texts/11604_2024_Article_1569.json
Processed ijms-24-07503.pdf and saved to extracted_texts/ijms-24-07503.json
Processed 404_2022_Article_6766.pdf and saved to extracted_texts/404_2022_Article_6766.json
Processed biomolecules-12-01721.pdf and saved to extracted_texts/biomolecules-12-01721.json
Processed dead229.pdf and saved to extracted_texts/dead229.json
Processed AOGS-103-1634.pdf and saved to extracted_texts/AOGS-103-1634.json
Processed ijerph-19-06162.pdf and saved to extracted_texts/ijerph-19-06162.json
Processed ho

In [90]:
# `content` is whatever the extraction cell assigned last, i.e. the items of
# the most recently processed PDF. Put them in reading order: by page, then by
# y0 (the y coordinate of the bounding box's top-left corner).
content.sort(key=lambda entry: (entry["page_num"], entry["bbox"][1]))

# Print one line per item; items of any other type are silently skipped.
prefix_by_type = {"text": "Text", "image": "Image"}
for entry in content:
    prefix = prefix_by_type.get(entry["type"])
    if prefix is not None:
        print(f"{prefix}: {entry['content']}")

Text: TYPE Review
Text: PUBLISHED 26 October 2022
Text: DOI 10.3389fendo.2022.1020827
Text: Endometriosisassociated infertility From pathophysiology to tailored treatment
Image: images/image_0_9.jpeg
Text: OPEN ACCESS
Text: EDITED BY Lusine Aghajanova, Stanford Healthcare, United States
Text: REVIEWED BY Michael Strug, Stanford University, United States Antonio Simone Lagan, University of Palermo, Italy
Text: Giulia Bonavina and Hugh S. Taylor 
Text: Department of Obstetrics, Gynecology and Reproductive Sciences, Yale School of Medicine, New Haven, CT, United States
Text: CORRESPONDENCE Hugh S. Taylor hugh.tayloryale.edu
Text: Despite the clinically recognized association between endometriosis and infertility, the mechanisms implicated in endometriosisassociated infertility are not fully understood. Endometriosis is a multifactorial and systemic disease that has pleiotropic direct and indirect effects on reproduction. A complex interaction between endometriosis subtype, pain, in  ammat