In [1]:
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import requests
import pandas as pd
from pathlib import Path
import os

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# ✅ Fast Dataset Mention Extraction (Regex + Keywords + Small SpaCy)

import os
import re
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

# --- Paths ---
XML_DIR = r"make-data-count-finding-data-references\train\train_XML\XML"

# --- Load smaller NLP model ---

nlp_scispacy = spacy.load("en_core_sci_sm")

# --- Regex patterns for dataset IDs ---
DATASET_PATTERNS = [
    r'10\.\d{4,9}/[-._;()/:a-z0-9]+',       # DOIs
    r'doi:10\.\d{4,9}/[-._;()/:a-z0-9]+',
    r'gse\d{4,6}', r'srp\d+', r'empiar-?\d+', r'ens[a-z]{0,5}\d+',
    r'nm_\d+', r'bx\d+', r'cp\d+', r'sth\d+', r'f\d+[a-z]+\d*',
    r'rs\d+', r'hgnc:\d+', r'cab\d+', r'hpa\d+', r'[a-z]\d{5}',
    r'epi_isl_\d+', r'k\d+', r'cvcl_\d+', r'e-prot-\d+',
    r'pxd\d+', r'prjna\d+', r'srx\d+', r'ku\d+', r'e-geod-\d+',
    r'\d{1}[a-z0-9]{3}', r'\d+\.\d+\.\d+\.\d+', r'model\d+', r'err\d+', r'srr\d+'
]
regex_combined = re.compile("|".join(DATASET_PATTERNS), re.IGNORECASE)

# --- Keywords for dataset mentions ---
DATASET_KEYWORDS = [
    "data available", "dataset", "accession number", "repository",
    "dryad", "pdb", "geo", "arrayexpress", "figshare", "ebi",
    "sequence read archive", "chembl", "mgnify"
]

def is_dataset_like(sentence):
    """
    Check if a sentence is likely dataset-related.
    """
    if regex_combined.search(sentence.lower()):
        return True
    for kw in DATASET_KEYWORDS:
        if kw in sentence.lower():
            return True
    return False

# --- Extract sentences and identifiers from XML ---
def extract_sentences_from_xml(xml_path):
    with open(xml_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml-xml")

    text_blocks = [tag.get_text(separator=" ", strip=True) 
                   for tag in soup.find_all(["p", "sec", "title", "abstract", "ref", "supplementary-material"])]
    sentences = [sent for block in text_blocks for sent in sent_tokenize(block)]

    # Add all <idno> tags (DOI, PDB, dataset IDs, etc.) as pseudo-sentences
    idno_tags = soup.find_all("idno")
    for tag in idno_tags:
        id_type = tag.get("type", "unknown").upper()
        value = tag.get_text(strip=True)
        if value:
            sentences.append(f"Referenced dataset {id_type}: {value}")

    return sentences

# --- Process all XML files ---
all_results = []
xml_files = [os.path.join(XML_DIR, f) for f in os.listdir(XML_DIR) if f.endswith(".xml")]

for xml_file in xml_files:
    article_id = os.path.basename(xml_file).replace(".xml", "")
    sentences = extract_sentences_from_xml(xml_file)

    for sent in sentences:
        dataset_flag = is_dataset_like(sent)
        all_results.append({
            "article_id": article_id,
            "sentence": sent,
            "is_dataset_like": int(dataset_flag)
        })

# --- Save output ---
df_results = pd.DataFrame(all_results)
df_results.to_csv("fast_dataset_sentences.csv", index=False)
print(f"✅ Extracted {len(df_results)} sentences, {df_results['is_dataset_like'].sum()} marked as dataset-related.")


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


✅ Extracted 277750 sentences, 62009 marked as dataset-related.
