In [1]:
import pandas as pd
import numpy as np
import ftfy
import unicodedata
import re
import fitz
from lxml import etree
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def clean_text(text):
    # 1. Fix encoding issues like mojibake or smart quotes
    text = ftfy.fix_text(text)

    # 2. Normalize ligatures and full-width characters
    text = unicodedata.normalize("NFKC", text)

    # 3. Remove non-printable control characters (ASCII 0–31 and 127)
    text = re.sub(r'[\x00-\x1F\x7F]', ' ', text)

    # 4. Remove unnecessary backslashes before quotes (like d\'Océanographie → d'Océanographie)
    text = re.sub(r'\\([\'"“”‘’`´])', r'\1', text)

    # 5. Add space between glued lowercase-uppercase boundaries (e.g., datasetEuropean → dataset European)
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

    # 6. Insert space before URLs (e.g., doihttps://... → doi https://...)
    text = re.sub(r'(?<=[a-zA-Z])(?=https?://)', ' ', text)

    # 7. Replace all weird dash characters with standard hyphen
    text = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2212]', '-', text)

    # 8. Add space after punctuation if not followed by space
    text = re.sub(r'([,;!?)\]\}])(?=\S)', r'\1 ', text)

    # 9.Remove all symbols **except** alphanumeric, hyphen, underscore, dot, and space
    text = re.sub(r'[^\w\s\:\.\-_\/]', ' ', text)

    # 10.Collapse repeated valid symbols, while preserving `//`
    text = re.sub(r'-{2,}', '-', text)
    text = re.sub(r'_{2,}', '_', text)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'/{3,}', '//', text)  # Keep `//` but collapse longer

    # 11. Flatten all whitespace (spaces, tabs, newlines) into a single space
    text = re.sub(r'\s+', ' ', text)

    # 12. Remove any leftover leading/trailing space
    return text.strip()

In [3]:
def extract_text_from_xml(xml_path):
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(xml_path, parser)
    root = tree.getroot()
    text_parts = []
    for elem in root.iter():
        if elem.text and elem.text.strip():
            text_parts.append(elem.text.strip())
        if elem.tail and elem.tail.strip():
            text_parts.append(elem.tail.strip())
    
    text =  ' '.join(text_parts)
    return clean_text(text)

In [4]:

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = " "
    for page in doc:
        text += page.get_text()
    doc.close()
    return clean_text(text)

In [5]:
def extract_context(text, pattern, window=30):
    """
    Extract context around regex matches from given text.

    Parameters:
    - text (str): Input document text.
    - pattern (str): Regex pattern to match.
    - window (int): Number of words before and after match to include in context.

    Returns:
    - List of dicts with 'match' and 'context' keys.
    """

    matches = []

    # Tokenize the full text once
    tokens = text.split()

    # Map character index → token index
    char_to_token = {}
    index = 0
    for i, tok in enumerate(tokens):
        index = text.find(tok, index)
        for j in range(index, index + len(tok)):
            char_to_token[j] = i
        index += len(tok)

    # Find matches using re.finditer()
    for match in re.finditer(pattern, text):
        raw_match = match.group()
        cleaned_match = re.sub(r'[\s]', '', raw_match)  # Remove spaces inside ID

        start_char = match.start()

        if start_char in char_to_token:
            token_index = char_to_token[start_char]
            start = max(0, token_index - window)
            end = min(len(tokens), token_index + window + 1)
            context_window = tokens[start:end]

            matches.append({
                'match': cleaned_match,
                'context': ' '.join(context_window)
            })

    return matches

In [6]:
textx = extract_text_from_xml('data/train/XML/10.1002_2017jc013030.xml')
textp = extract_text_from_pdf('data/train/PDF/10.1002_2017jc013030.pdf')

In [7]:
textp

'RESEARCH ARTICLE 10.1002/2017JC013030 Assessing the Variability in the Relationship Between the Particulate Backscattering Coefficient and the Chlorophyll a Concentration From a Global Biogeochemical-Argo Database Marie Barbieux1 Julia Uitz1 Annick Bricaud1 Emanuele Organelli1 2 Antoine Poteau1 Catherine Schmechtig3 Bernard Gentili1 Grigor Obolensky4 Edouard Leymarie1 Christophe Penkerc h1 Fabrizio D Ortenzio1 and Herve Claustre1 1Sorbonne Universites UPMC Univ Paris 06 CNRS Observatoire Oceanologique de Villefranche Laboratoire d Oceanographie de Villefranche Villefranche-sur-Mer France 2Plymouth Marine Laboratory Prospect Place The Hoe Plymouth United Kingdom 3OSU Ecce Terra UMS 3455 CNRS and Universite Pierre et Marie Curie Paris 6 Paris France 4ERIC Euro-Argo 29280 Plouzane France Abstract Characterizing phytoplankton distribution and dynamics in the world s open oceans requires in situ observations over a broad range of space and time scales. In addition to temperature/salinity m

In [8]:
textx

'Assessing the variability in the relationship between the particulate backscattering coefficient and the chlorophyll a concentration from a global Biogeochemical-Argo database European Union European Commission GMMC LEFE Cyber British Natural Environment Research Council -NERC CNES-TOSCA Agence Nationale de la Recherche Italian Ministry of Education University and Research -MIUR European Research Council Argo-Italy Marie Barbieux barbieux obs-vlfr.fr -Sorbonne Universités UPMC Univ Paris 06 CNRS Observatoire Océanologique de Villefranche OOV Laboratoire d Océanographie de Villefranche LOV 181 Chemin du Lazaret 06 230 Villefranche-sur-Mer France. Sorbonne Universités Observatoire Océanologique de Villefranche OOV Laboratoire d Océanographie de Villefranche LOV UPMC Univ Paris 06 CNRS 181 Chemin du Lazaret 06 230 Villefranche-sur-Mer France Julia Uitz -Sorbonne Universités UPMC Univ Paris 06 CNRS Observatoire Océanologique de Villefranche OOV Laboratoire d Océanographie de Villefranche 

In [9]:
len(extract_context(textp, r'\bhttps://doi\.org/10\.\d{4,9}/[A-Za-z0-9._\-()]+\b'))

104

In [24]:
len(extract_context(textx, r'\bdoi:\s*10\.\d{4,9}/\S+\b'))

121

In [11]:
extract_context(textx, r'\bhttps://doi\.org/10\.\d{4,9}/[A-Za-z0-9._\-()]+\b')

[{'match': 'https://doi.org/10.5194/essd-2017-58',
  'context': 'Syst. Sci. Data 9 2017 Organelli E. et al. 2017b Two databases derived from BGC-Argo float measurements for marine biogeochemical and bio- optical applications Earth Syst. Sci. Data 9 861-880 doi:https://doi.org/10.5194/essd-2017-58. Bio-optical and biogeochemical properties of different trophic regimes in oceanic waters K Oubelkheir H Claustre A Sciandra M Babin 10.4319/lo.2005.50.6.1795 Limnol. Oceanogr 50 6 2005 Oubelkheir K. H. Claustre A.'}]

In [12]:
list = extract_context(textx, r'\bhttps://doi\.org/10\.\d{4,9}/[A-Za-z0-9._\-()]+\b')
matchx = [m['match'] for m in list]
matchx

['https://doi.org/10.5194/essd-2017-58']

In [23]:
list = extract_context(textp, r'\bhttps://doi\.org/10\.\d{4,9}/[A-Za-z0-9._\-()]+\b')
matchp = [m['match'] for m in list]
set(matchx) - set(matchp)

{'https://doi.org/10.5194/essd-2017-58'}

In [14]:
a = ['https://doi.org/10.17882/49388']


In [15]:
set(a) - set(matchx)

{'https://doi.org/10.17882/49388'}

In [16]:
set(a) - set(matchp)

set()

In [17]:
len(matchx), len(matchp)

(1, 104)

In [18]:
len(set(matchx)) , len(set(matchp))

(1, 102)

In [21]:
matchp

['https://doi.org/10.3354/meps11580',
 'https://doi.org/10.4319/lo.2011.56',
 'https://doi.org/10.4319/lo.2003.48.2.0843',
 'https://doi.org/10.1016/S0967-0637',
 'https://doi.org/10.4319/lo.1996.41.8',
 'https://doi.org/10.17882/49388',
 'https://doi.org/10.1002/lno.10011',
 'https://doi.org/10.1016/j.pocean.2007.09.002',
 'https://doi.org/10.1016/j.dsr.2003.09.002',
 'https://doi.org/10.17882/49388',
 'https://doi.org/10.1029/2004GB002299',
 'https://doi.org/10.1038/nclimate2838',
 'https://doi.org/10.1016/j.rse.2016.08',
 'https://doi.org/10.13155/46601',
 'https://doi.org/10.1119/1.1991484',
 'https://doi.org/10.1038/nature05700',
 'https://doi.org/10.1029/2010GL044174',
 'https://doi.org/10.1364/AO.40.005503',
 'https://doi.org/10.1046/j.1529-8817.2002.t01-1-01203.x',
 'https://doi.org/10.1029/2004JC002419',
 'https://doi.org/10.4319/lo.1995.40.2.0393',
 'https://doi.org/10.1029/2011JC007771',
 'https://doi.org/10.5194/bg-12-2179-2015',
 'https://doi.org/10.5270/Ocean',
 'https://

(1, 1)