In [6]:
import pandas as pd
import numpy as np
import ftfy
import unicodedata
import re
import fitz
from lxml import etree
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [27]:
def clean_text(text):
    # 1. Fix encoding issues like mojibake or smart quotes
    text = ftfy.fix_text(text)

    # 2. Normalize ligatures and full-width characters
    text = unicodedata.normalize("NFKC", text)

    # 3. Remove non-printable control characters (ASCII 0–31 and 127)
    text = re.sub(r'[\x00-\x1F\x7F]', ' ', text)

    # 4. Remove unnecessary backslashes before quotes (like d\'Océanographie → d'Océanographie)
    text = re.sub(r'\\([\'"“”‘’`´])', r'\1', text)

    # 5. Add space between glued lowercase-uppercase boundaries (e.g., datasetEuropean → dataset European)
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

    # 6. Insert space before URLs (e.g., doihttps://... → doi https://...)
    text = re.sub(r'(?<=[a-zA-Z])(?=https?://)', ' ', text)

    # 7. Replace all weird dash characters with standard hyphen
    text = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2212]', '-', text)

    # 8. Add space after punctuation if not followed by space
    text = re.sub(r'([,;!?)\]\}])(?=\S)', r'\1 ', text)

    text = re.sub(r'([/.])\s+', r'\1', text)

    # 9.Remove all symbols **except** alphanumeric, hyphen, underscore, dot, and space
    text = re.sub(r'[^\w\s\:\.\-_\/]', ' ', text)

    # 10.Collapse repeated valid symbols, while preserving `//`
    text = re.sub(r'-{2,}', '-', text)
    text = re.sub(r'_{2,}', '_', text)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'/{3,}', '//', text)  # Keep `//` but collapse longer

    # 11. Flatten all whitespace (spaces, tabs, newlines) into a single space
    text = re.sub(r'\s+', ' ', text)

    # 12. Remove any leftover leading/trailing space
    return text.strip()

In [28]:
def extract_text_from_xml(xml_path):
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(xml_path, parser)
    root = tree.getroot()
    text_parts = []
    for elem in root.iter():
        if elem.text and elem.text.strip():
            text_parts.append(elem.text.strip())
        if elem.tail and elem.tail.strip():
            text_parts.append(elem.tail.strip())
    
    text =  ' '.join(text_parts)
    return clean_text(text)

In [29]:

def extract_text_fitz(pdf_path):
    all_text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text = page.get_text()  # default = "text"
        all_text += text + "\n"
    doc.close()
    return clean_text(all_text)

In [26]:
textpf = extract_text_fitz('data/train/PDF/10.1007_s00259-022-06053-8.pdf')
textpf

'https://doi.org/10.1007/s00259-022-06053-8 ORIGINAL ARTICLE Decentralized collaborative multi-institutional PET attenuation and scatter correction using federated deep learning Isaac Shiri1 Alireza Vafaei Sadr2 3 Azadeh Akhavan1 Yazdan Salimi1 Amirhossein Sanaat1 Mehdi Amini1 Behrooz Razeghi4 Abdollah Saberi1 Hossein Arabi1 Sohrab Ferdowsi5 Slava Voloshynovskiy4 Deniz Gündüz6 Arman Rahmim7 8 Habib Zaidi1 9 10 11 Received: 13 August 2022 / Accepted: 18 November 2022 The Author s 2022 Abstract Purpose Attenuation correction and scatter compensation AC/SC are two main steps toward quantitative PET imaging which remain challenging in PET-only and PET/MRI systems. These can be effectively tackled via deep learning DL methods. However trustworthy and generalizable DL models commonly require well-curated heterogeneous and large datasets from multiple clinical centers. At the same time owing to legal/ethical issues and privacy concerns forming a large collective centralized dataset poses sign

In [83]:
import pdfplumber
from collections import defaultdict

def extract_text_two_columns(pdf_path):
    all_text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            words = page.extract_words()
            mid_x = page.width / 2

            # Separate into left and right column words
            left_words = [w for w in words if w['x0'] < mid_x]
            right_words = [w for w in words if w['x0'] >= mid_x]

            def group_and_reconstruct(words):
                lines = defaultdict(list)
                for word in words:
                    line_key = round(word['top'] / 3)  # Y threshold
                    lines[line_key].append(word)
                reconstructed_lines = []
                for key in sorted(lines.keys()):
                    line_words = sorted(lines[key], key=lambda w: w['x0'])
                    line_text = " ".join(w['text'] for w in line_words)
                    reconstructed_lines.append(line_text)
                return "\n".join(reconstructed_lines)

            # Reconstruct both columns
            left_text = group_and_reconstruct(left_words)
            right_text = group_and_reconstruct(right_words)

            # Append left then right (as it appears in reading order)
            page_text = left_text + "\n" + right_text + "\n"
            all_text += page_text

    return clean_text(all_text)

In [12]:
def extract_context(text, pattern, window=30):
    """
    Extract context around regex matches from given text.

    Parameters:
    - text (str): Input document text.
    - pattern (str): Regex pattern to match.
    - window (int): Number of words before and after match to include in context.

    Returns:
    - List of dicts with 'match' and 'context' keys.
    """

    matches = []

    # Tokenize the full text once
    tokens = text.split()

    # Map character index → token index
    char_to_token = {}
    index = 0
    for i, tok in enumerate(tokens):
        index = text.find(tok, index)
        for j in range(index, index + len(tok)):
            char_to_token[j] = i
        index += len(tok)

    # Find matches using re.finditer()
    for match in re.finditer(pattern, text):
        raw_match = match.group()
        cleaned_match = re.sub(r'[\s]', '', raw_match)  # Remove spaces inside ID

        start_char = match.start()

        if start_char in char_to_token:
            token_index = char_to_token[start_char]
            start = max(0, token_index - window)
            end = min(len(tokens), token_index + window + 1)
            context_window = tokens[start:end]

            matches.append({
                'match': cleaned_match,
                'context': ' '.join(context_window)
            })

    return matches

In [96]:
textpf

'R E S E A R C H A R T I C L E Past present and future of a meandering river in the Bolivian Amazon basin Kattia Rubi Arnez Ferrel1 Jonathan Mark Nelson2 Yasuyuki Shimizu1 Tomoko Kyuka1 1Graduate School of Engineering Hokkaido University Sapporo Japan 2U.S. Geological Survey Golden Colorado USA Correspondence Kattia Rubi Arnez Ferrel Graduate School of Engineering Hokkaido University Sapporo 060-0808 Japan. Email: rubikraf gmail.com Funding information Nitobe School Project Abstract Field observations on small rivers of the Amazon basin are less common due to their remote location and difficult accessibility. Here we show through remote sensing analysis and field works the planform evolution and riverbed topography of a small river located in the upper foreland Amazon basin the Ichilo River. By tracking plan- form changes over 30 years we identified the factors that control meander migration rates in the Ichilo River: cutoffs climate and human interventions. The data suggest that neck 

In [53]:
textx

'open_access -//NLM//DTD JATS Z39.96 Journal Archiving and Interchange DTD v1.1 20151215//EN public JATS-archivearticle1.dtd 1.1 jp2nlmx2.xsl 1 PLo S One PLo S ONE plos plosone PLo S ONE 1932-6203 Public Library of Science San Francisco USA 3742674 23967097 PONE-D-13-07080 10.1371/journal.pone.0070749 Research Article Biology Ecology Evolutionary Ecology Microbial Ecology Genomics Metagenomics Microbiology Microbial Ecology Model Organisms Animal Models Drosophila Melanogaster Population Biology Population Ecology Host Species and Environmental Effects on Bacterial Communities Associated with Drosophila in the Laboratory and in the Natural Environment Drosophila Associated Bacteria Staubach Fabian 1 Baines John F. 2 Künzel Sven 2 Bik Elisabeth M. 3 Petrov Dmitri A. 1 1 Department of Biology Stanford University Stanford California United States of America 2 Max Planck Institute for Evolutionary Biology Plön Germany 3 Department of Microbiology Immunology Stanford School of Medicine Stan

In [None]:
len(extract_context(textpf, r'\bhttps?://(?:doi\.org|dx\.doi\.org)/10\.\d{4,9}/[A-Za-z0-9._\-()/]+\b'))

1

In [None]:
len(extract_context(textx, r'\bdoi:\s*10\.\d{4,9}/[A-Za-z0-9._\-()/]+\b'))

44

In [45]:
extract_context(textx, r'\bdoi:\s*10\.\d{4,9}/[A-Za-z0-9._\-()/]+\b')

[{'match': 'doi:10.5061/dryad.5q1sb',
  'context': '4523267.3 4523269.3 4523286.3 4523288.3 4523290.3 4523292.3 4523294.3 4523296.3 4523271.3 4523300.3 4523298.3 4523304.3 4523250.3 4523275.3 4523277.3 4523302.3 4523273.3 4523272.3 4523301.3 4523299.3 4523305.3 4523251.3 4523276.3 4523278.3 4523303.3 4523274.3 and on datadryad.org under doi:10.5061/dryad.5q1sb. The data in the dryad repository includes sequence quality. Table S3 contains information about the 100 most common OTUs including a representative sequence and relative abundances in all samples. Supporting'},
 {'match': 'doi:10.1126/science.1223813',
  'context': 'Center for providing samples. References 1 Nicholson JK Holmes E Kinross J Burcelin R Gibson G et al 2012 Host-Gut Microbiota Metabolic Interactions . Science 336 : 1262 - 1267 doi: 10.1126/science.1223813 22674330 2 Krause DO Denman SE Mackie RI Morrison M Rae AL et al 2003 Opportunities to improve fiber degradation in the rumen: microbiology ecology and genomics . 

In [24]:
import re

text = """
Aerts HJWL, Wee L, Rios Velazquez E, et al. Data from NSCLC-Radio-
mics [Dataset]. In: The Cancer Imaging Archive; 2019. https://doi.org/ 10.7937/K9/TCIA.2015.PF0M9REI
"""

# Space-tolerant DOI pattern (short version)
doi_short = (
    r'https?\s*:\s*/\s*/\s*'                      # https:// (space-tolerant)
    r'(?:doi\.org|dx\.doi\.org)\s*/\s*'          # domain + optional spaces after /
    r'10\.\d{4,9}/[A-Za-z0-9._\-()/]+'
)

# Standard (no-space) long DOI
doi_long = (
    r'https?://(?:doi\.org|dx\.doi\.org)/10\.\d{4,9}/[A-Za-z0-9._\-()/]+(?: [A-Za-z0-9._\-()/]+)?'
)

# Compile both
matches = set()

for pat in [doi_short, doi_long]:
    compiled = re.compile(pat)
    for m in compiled.finditer(text):
        matches.add(m.group(0).replace(" ", ""))

print(matches)

{'https://doi.org/10.7937/K9/TCIA.2015.PF0M9REI'}


In [25]:
import re

text = """
Aerts HJWL, Wee L, Rios Velazquez E, et al. Data from NSCLC-Radio-
mics [Dataset]. In: The Cancer Imaging Archive; 2019. https://doi.org/ 10.7937/K9/TCIA.2015.PF0M9REI
"""

# Two patterns
doi_short = r'https?\s*:\s*/\s*/\s*(?:doi\.org|dx\.doi\.org)\s*/\s*10\.\d{4,9}/[A-Za-z0-9._\-()/]+'
doi_long = r'\bhttps?://(?:doi\.org|dx\.doi\.org)/10\.\d{4,9}/[A-Za-z0-9._\-()/]+(?: [A-Za-z0-9._\-()/]+)?\b'

# Combine matches
matches = set()

for pat in [doi_short, doi_long]:
    compiled = re.compile(pat)
    for m in compiled.finditer(text):
        # Clean spacing if it was a space-tolerant pattern
        matches.add(m.group(0).replace(" ", ""))

print(matches)

{'https://doi.org/10.7937/K9/TCIA.2015.PF0M9REI'}


In [3]:
matches

{'http://dx.doi.org/10.5061/dryad.p3fg9',
 'http://dx.doi.org/10.5061/dryad.p3fg9 21508925'}

In [21]:
list = extract_context(textpf, re.compile(
    r'\bhttps?\s*:\s*/\s*/\s*(?:doi\.org|dx\.doi\.org)\s*/10\.\d{4,9}/[A-Za-z0-9._\-()/]+\b'
    r'\bhttps?://(?:doi\.org|dx\.doi\.org)/10\.\d{4,9}/[A-Za-z0-9._\-()/]+(?: [A-Za-z0-9._\-()/]+)?\b'
))
matchx = [m['match'] for m in list]
#matchx = [match for match in matchx if not match.startswith('doi:10.1371_journal.pone.0070749')]
matchx

[]

In [56]:
list = extract_context(textp, r'\bhttps?://(?:doi\.org|dx\.doi\.org)/10\.\d{4,9}/[A-Za-z0-9._\-()/]+\b')
matchp = [m['match'] for m in list]
matchp = [match for match in matchp if not match.startswith('https://doi.org/10.1371_journal.pone.0070749')]
matchp

['http://dx.doi.org/10.5061/dryad.p3fg9']

In [15]:
a = ['EPI_ISL_293288']

In [16]:
set(a) - set(matchx)

{'EPI_ISL_293288'}

In [16]:
set(a) - set(matchp)

set()

In [17]:
len(matchx), len(matchp)

(1, 104)

In [18]:
len(set(matchx)) , len(set(matchp))

(1, 102)

(1, 1)