In [146]:
import pandas as pd
import numpy as np
import ftfy
import unicodedata
import re
import fitz
from lxml import etree
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [147]:
def clean_text(text):
    # 1. Fix encoding issues like mojibake or smart quotes
    text = ftfy.fix_text(text)

    # 2. Normalize ligatures and full-width characters
    text = unicodedata.normalize("NFKC", text)

    # 3. Remove non-printable control characters (ASCII 0–31 and 127)
    text = re.sub(r'[\x00-\x1F\x7F]', ' ', text)

    # 4. Remove unnecessary backslashes before quotes (like d\'Océanographie → d'Océanographie)
    text = re.sub(r'\\([\'"“”‘’`´])', r'\1', text)

    # 5. Add space between glued lowercase-uppercase boundaries (e.g., datasetEuropean → dataset European)
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

    # 6. Insert space before URLs (e.g., doihttps://... → doi https://...)
    text = re.sub(r'(?<=[a-zA-Z])(?=https?://)', ' ', text)

    # 7. Replace all weird dash characters with standard hyphen
    text = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2212]', '-', text)

    # 8. Add space after punctuation if not followed by space
    text = re.sub(r'([,;!?)\]\}])(?=\S)', r'\1 ', text)

    # 9.Remove all symbols **except** alphanumeric, hyphen, underscore, dot, and space
    text = re.sub(r'[^\w\s\.\-_\/]', ' ', text)

    # 10.Collapse repeated valid symbols, while preserving `//`
    text = re.sub(r'-{2,}', '-', text)
    text = re.sub(r'_{2,}', '_', text)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'/{3,}', '//', text)  # Keep `//` but collapse longer

    # 11. Flatten all whitespace (spaces, tabs, newlines) into a single space
    text = re.sub(r'\s+', ' ', text)

    # 12. Remove any leftover leading/trailing space
    return text.strip()

In [148]:
def extract_text_from_xml(xml_path):
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(xml_path, parser)
    root = tree.getroot()
    text_parts = []
    for elem in root.iter():
        if elem.text and elem.text.strip():
            text_parts.append(elem.text.strip())
        if elem.tail and elem.tail.strip():
            text_parts.append(elem.tail.strip())
    
    text =  ' '.join(text_parts)
    return clean_text(text)

In [149]:

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = " "
    for page in doc:
        text += page.get_text()
    doc.close()
    return clean_text(text)

In [150]:
def extract_context(text, pattern, window=30):
    """
    Extract context around regex matches from given text.

    Parameters:
    - text (str): Input document text.
    - pattern (str): Regex pattern to match.
    - window (int): Number of words before and after match to include in context.

    Returns:
    - List of dicts with 'match' and 'context' keys.
    """

    matches = []

    # Tokenize the full text once
    tokens = text.split()

    # Map character index → token index
    char_to_token = {}
    index = 0
    for i, tok in enumerate(tokens):
        index = text.find(tok, index)
        for j in range(index, index + len(tok)):
            char_to_token[j] = i
        index += len(tok)

    # Find matches using re.finditer()
    for match in re.finditer(pattern, text):
        raw_match = match.group()
        cleaned_match = re.sub(r'[\s]', '', raw_match)  # Remove spaces inside ID

        start_char = match.start()

        if start_char in char_to_token:
            token_index = char_to_token[start_char]
            start = max(0, token_index - window)
            end = min(len(tokens), token_index + window + 1)
            context_window = tokens[start:end]

            matches.append({
                'match': cleaned_match,
                'context': ' '.join(context_window)
            })

    return matches

In [237]:
textx = extract_text_from_xml('data/train/XML/10.1371_journal.pcbi.1011828.xml')
textp = extract_text_from_pdf('data/train/PDF/10.1371_journal.pcbi.1011828.pdf')

In [238]:
textp

'RESEARCH ARTICLE Integrated meta-analysis of colorectal cancer public proteomic datasets for biomarker discovery and validation Javier Robles1 2 Ananth Prakash3 Juan Antonio Vizcaı no ID3 J. Ignacio Casal ID1 1 Department of Molecular Biomedicine Centro de Investigaciones Biolo gicas Margarita Salas Consejo Superior de Investigaciones Cientı ficas Madrid Spain 2 Protein Alternatives SL Tres Cantos Madrid Spain 3 European Molecular Biology Laboratory-European Bioinformatics Institute EMBL-EBI Wellcome Genome Campus Hinxton Cambridge United Kingdom juan ebi.ac.uk JAV icasal cib.csic.es JIC Abstract The cancer biomarker field has been an object of thorough investigation in the last decades. Despite this colorectal cancer CRC heterogeneity makes it challenging to identify and vali- date effective prognostic biomarkers for patient classification according to outcome and treatment response. Although a massive amount of proteomics data has been deposited in public data repositories this rich

In [239]:
textx

'-//NLM//DTD JATS Z39.96 Journal Publishing DTD v1.1d3 20150301//EN public JATS-journalpublishing1.dtd 39.96 jats2jats3.xsl 1 open_access pmc PLo S Comput Biol PLo S Comput Biol plos PLOS Computational Biology 1553-734X 1553-7358 Public Library of Science San Francisco CA USA 10833860 38252632 10.1371/journal.pcbi.1011828 PCOMPBIOL-D-23-01487 Research Article Medicine and Health Sciences Oncology Cancers and Neoplasms Colorectal Cancer Research and Analysis Methods Database and Informatics Methods Biological Databases Proteomic Databases Biology and Life Sciences Biochemistry Proteomics Proteomic Databases Biology and Life Sciences Biochemistry Biomarkers Biology and Life Sciences Computational Biology Genome Analysis Transcriptome Analysis Biology and Life Sciences Genetics Genomics Genome Analysis Transcriptome Analysis Biology and Life Sciences Molecular Biology Molecular Biology Techniques Molecular Biology Assays and Analysis Techniques Gene Expression and Vector Techniques Protei

In [248]:
len(extract_context(textp, r'\bE-PROT-\d{2,3}\b'))

12

In [249]:
len(extract_context(textx, r'\bE-PROT-\d{2,3}\b'))

12

In [247]:
extract_context(textx, r'\bE-PROT-\d{2,3}\b')

[{'match': 'E-PROT-103',
  'context': 'of patients Number of protein groups Number of peptides Number of unique peptides Number of unique genes canonical proteins mapped Solid samples Mucosa colorectal adenoma colorectal carcinoma Colorectum PXD001676 58 E-PROT-103 16 No 16 8 9 711 215 033 196 017 8 949 PXD002137 59 E-PROT-104 192 Yes 6 32 25 PXD014511 60 E-PROT-105 310 Yes 5 62 52 PXD019504 61'},
 {'match': 'E-PROT-104',
  'context': 'unique genes canonical proteins mapped Solid samples Mucosa colorectal adenoma colorectal carcinoma Colorectum PXD001676 58 E-PROT-103 16 No 16 8 9 711 215 033 196 017 8 949 PXD002137 59 E-PROT-104 192 Yes 6 32 25 PXD014511 60 E-PROT-105 310 Yes 5 62 52 PXD019504 61 E-PROT-106 74 No 74 37 CPTAC PDC000111 31 E-PROT-23 1425 Yes 15 90 90 Total'},
 {'match': 'E-PROT-105',
  'context': 'colorectal adenoma colorectal carcinoma Colorectum PXD001676 58 E-PROT-103 16 No 16 8 9 711 215 033 196 017 8 949 PXD002137 59 E-PROT-104 192 Yes 6 32 25 PXD014511 60 E-PROT-1

In [250]:
list = extract_context(textp, r'\bE-PROT-\d{2,3}\b')
matches = [m['match'] for m in list]
matches

['E-PROT-103',
 'E-PROT-104',
 'E-PROT-105',
 'E-PROT-106',
 'E-PROT-23',
 'E-PROT-100',
 'E-PROT-101',
 'E-PROT-102',
 'E-PROT-107',
 'E-PROT-108',
 'E-PROT-109',
 'E-PROT-110']

In [251]:
a = ['E-PROT-100',
 'E-PROT-101',
 'E-PROT-102',
 'E-PROT-103',
 'E-PROT-104',
 'E-PROT-105',
 'E-PROT-106',
 'E-PROT-107',
 'E-PROT-108',
 'E-PROT-109',
 'E-PROT-110',
 'E-PROT-23',
 'PXD001676',
 'PXD002137',
 'PXD005693',
 'PXD005709',
 'PXD010458',
 'PXD014511',
 'PXD019504',
 'PXD020454',
 'PXD031556',
 'PXD032899']

In [252]:
set(a) - set(matches)

{'PXD001676',
 'PXD002137',
 'PXD005693',
 'PXD005709',
 'PXD010458',
 'PXD014511',
 'PXD019504',
 'PXD020454',
 'PXD031556',
 'PXD032899'}

In [260]:
list = extract_context(textx, r'\bE-PROT-\d{2,3}\b')
matche = [m['match'] for m in list]
set(matches) - set(matches)

set()

In [256]:
len(set(matche)) , len(set(matches))

(12, 12)

In [257]:
len(matche), len(matches)

(12, 12)