# Document Preprocessing

In [74]:
import fitz  # pip install PyMuPDF

# Accumulator for the extracted elements; the extraction cell appends one dict
# per text/image block to it.
content = []

# Open the paper with PyMuPDF; the extraction cell iterates over its pages.
doc = fitz.open("Papers/FSB2-37-e23130.pdf")

In [75]:
import re
stop_processing = False
# Define keywords to stop processing
stop_keywords = ["References", "REFERENCES", "references", "Conflict of interest", "Conflicts of Interest"]

# A stop keyword only counts when it opens a block (optionally preceded by a
# section number such as "7. "), i.e. when it looks like a heading. A plain
# substring test also fires on ordinary body text -- "preferences" contains
# "references" -- and would silently truncate the document there.
# Matching is case-insensitive, so every capitalisation variant is covered.
stop_heading_re = re.compile(
    r'^(?:\d+\.?\s+)?(?:' + "|".join(re.escape(keyword) for keyword in stop_keywords) + r')',
    re.IGNORECASE,
)
# Patterns used for every block, compiled once outside the loops.
section_number_re = re.compile(r'^\d+\.\s+')
# NOTE(review): this whitelist also deletes hyphens, slashes, parentheses and
# non-ASCII letters, which fuses tokens ("estrogen-dependent" becomes
# "estrogendependent"). Behaviour is kept as-is; confirm that is intended.
disallowed_chars_re = re.compile(r'[^A-Za-z0-9\s,.]')
page_marker_re = re.compile(r'Page \d+')


# Walk the document page by page and collect text and image blocks into
# `content` until a stop heading (references / conflict of interest) is reached.
for page_num in range(len(doc)):
    if stop_processing:
        break  # A stop heading was found on an earlier page
    page = doc.load_page(page_num)
    blocks = page.get_text("dict")["blocks"]

    for block in blocks:
        if block["type"] == 0:  # text block
            # Join every span of every line, then collapse runs of whitespace
            # so that multi-word stop keywords match regardless of spacing.
            block_text = " ".join(
                span["text"] for line in block["lines"] for span in line["spans"]
            )
            block_text = " ".join(block_text.split())

            # Stop at the first block that starts with a stop keyword; the
            # rest of this page and all later pages are skipped.
            if stop_heading_re.match(block_text):
                stop_processing = True
                break

            # Clean and process text
            block_text = section_number_re.sub('', block_text)  # Remove section numbering
            block_text = disallowed_chars_re.sub('', block_text)  # Keep letters, digits, whitespace, ',' and '.'

            if len(block_text) < 10 or page_marker_re.match(block_text):
                continue  # Skip short blocks or page numbers

            content.append({
                "type": "text",
                "content": block_text,
                "bbox": block["bbox"],
                "page_num": page_num
            })

        elif block["type"] == 1:  # image block
            image_ext = block["ext"]
            # Only the target path is recorded; the file itself is not written
            # unless the lines below are uncommented.
            image_filename = f"images/image_{page_num}_{block['number']}.{image_ext}"
            # Uncomment the line below if you want to save the image files
            # with open(image_filename, "wb") as img_file:
            #     img_file.write(block["image"])
            content.append({
                "type": "image",
                "content": image_filename,
                "bbox": block["bbox"],
                "page_num": page_num
            })


In [76]:
def _reading_order(entry):
    """Return the sort key (page number, top edge y0 of the bounding box)."""
    return entry["page_num"], entry["bbox"][1]


# Order the elements by page and then top-to-bottom within each page. The sort
# is in place and stable, so entries with equal keys keep their extraction order.
content.sort(key=_reading_order)

# Dump every element with a label that tells text and image entries apart.
for entry in content:
    kind, body = entry["type"], entry["content"]
    if kind == "text":
        print(f"Text: {body}")
    elif kind == "image":
        print(f"Image: {body}")

Text: Received 5 May 2023  Accepted 26 July 2023
Text: DOI 10.1096fj.202300907
Text: R E V I E W A R T I C L E
Text: Endometriosis in the era of precision medicine and impact on sexual and reproductive health across the lifespan and in diverse populations
Text: Linda C. Giudice 1,2  Tomiko T. Oskotsky 1,3  Simileoluwa Falako 1,4  Jessica OpokuAnane 1,5  Marina Sirota 1,3,6
Text: 1 UCSF Stanford Endometriosis Center for Innovation, Training, and Community Outreach ENACT, University of California, San Francisco, San Francisco, California, USA
Text: Abstract Endometriosis is a common estrogendependent disorder wherein uterine lining tissue endometrium is found mainly in the pelvis where it causes inflammation, chronic pelvic pain, pain with intercourse and menses, and infertility. Recent evidence also supports a systemic inflammatory component that underlies associated comorbidities, e.g., migraines and cardiovascular and autoimmune diseases. Genetics and environment contribute significan

## Cleaning the text