CONVERSION FROM PDF TO XML

In [2]:
import os
import requests

# === Configuration ===
PDF_FOLDER = r"make-data-count-finding-data-references\train\PDF"
XML_FOLDER = "path/to/xml_folder"
GROBID_URL = "http://localhost:8070/api/processFulltextDocument"

# === Statistics ===
success_count = 0
fail_count = 0
skipped_count = 0

# === Ensure output folder exists ===
os.makedirs(XML_FOLDER, exist_ok=True)

# === Process each PDF file ===
for pdf_file in os.listdir(PDF_FOLDER):
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(PDF_FOLDER, pdf_file)
    xml_filename = os.path.splitext(pdf_file)[0] + ".xml"
    xml_path = os.path.join(XML_FOLDER, xml_filename)

    # Skip if XML already exists
    if os.path.exists(xml_path):
        print(f"Skipping {pdf_file} (already converted)")
        skipped_count += 1
        continue

    print(f"Converting {pdf_file}...")

    try:
        with open(pdf_path, "rb") as f:
            response = requests.post(
                GROBID_URL,
                files={"input": f},
                timeout=60
            )

        if response.status_code == 200:
            with open(xml_path, "w", encoding="utf-8") as out_file:
                out_file.write(response.text)
            print(f"✓ Success: {pdf_file}")
            success_count += 1
        else:
            print(f"✗ Failed (HTTP {response.status_code}): {pdf_file}")
            fail_count += 1

    except Exception as e:
        print(f"✗ Error processing {pdf_file}: {e}")
        fail_count += 1

# === Summary ===
print("\n=== Summary ===")
print(f"Total PDFs found: {len([f for f in os.listdir(PDF_FOLDER) if f.lower().endswith('.pdf')])}")
print(f"Converted: {success_count}")
print(f"Failed: {fail_count}")
print(f"Skipped (already exists): {skipped_count}")


Converting 10.1002_2017jc013030.pdf...
✓ Success: 10.1002_2017jc013030.pdf
Converting 10.1002_anie.201916483.pdf...
✓ Success: 10.1002_anie.201916483.pdf
Converting 10.1002_anie.202005531.pdf...
✓ Success: 10.1002_anie.202005531.pdf
Converting 10.1002_anie.202007717.pdf...
✓ Success: 10.1002_anie.202007717.pdf
Converting 10.1002_chem.201902131.pdf...
✓ Success: 10.1002_chem.201902131.pdf
Converting 10.1002_chem.201903120.pdf...
✓ Success: 10.1002_chem.201903120.pdf
Converting 10.1002_chem.202000235.pdf...
✓ Success: 10.1002_chem.202000235.pdf
Converting 10.1002_chem.202001412.pdf...
✓ Success: 10.1002_chem.202001412.pdf
Converting 10.1002_chem.202001668.pdf...
✓ Success: 10.1002_chem.202001668.pdf
Converting 10.1002_chem.202003167.pdf...
✓ Success: 10.1002_chem.202003167.pdf
Converting 10.1002_cssc.202201821.pdf...
✓ Success: 10.1002_cssc.202201821.pdf
Converting 10.1002_ece3.3985.pdf...
✓ Success: 10.1002_ece3.3985.pdf
Converting 10.1002_ece3.4466.pdf...
✓ Success: 10.1002_ece3.4466.p

CONVERSION FROM XML TO JSON

In [4]:
import os
import xmltodict
import json

# === Configuration ===
xml_folder = r"make-data-count-finding-data-references\train\ALL_XML_FILES"
json_folder = r"make-data-count-finding-data-references\train\all_json_files"

# === Ensure output folder exists ===
os.makedirs(json_folder, exist_ok=True)

# === Process each XML file ===
converted = 0
failed = 0

for filename in os.listdir(xml_folder):
    if not filename.lower().endswith(".xml"):
        continue

    xml_path = os.path.join(xml_folder, filename)
    json_filename = os.path.splitext(filename)[0] + ".json"
    json_path = os.path.join(json_folder, json_filename)

    try:
        with open(xml_path, 'r', encoding='utf-8') as xml_file:
            xml_content = xml_file.read()
            data_dict = xmltodict.parse(xml_content)

        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(data_dict, json_file, indent=2)

        print(f"✓ Converted: {filename}")
        converted += 1

    except Exception as e:
        print(f"✗ Failed to convert {filename}: {e}")
        failed += 1

# === Summary ===
print("\n=== Conversion Summary ===")
print(f"Total XML files: {converted + failed}")
print(f"Converted successfully: {converted}")
print(f"Failed: {failed}")


✓ Converted: 10.1002_2017jc013030.xml
✓ Converted: 10.1002_anie.201916483.xml
✓ Converted: 10.1002_anie.202005531.xml
✓ Converted: 10.1002_anie.202007717.xml
✓ Converted: 10.1002_chem.201902131.xml
✓ Converted: 10.1002_chem.201903120.xml
✓ Converted: 10.1002_chem.202000235.xml
✓ Converted: 10.1002_chem.202001412.xml
✓ Converted: 10.1002_chem.202001668.xml
✓ Converted: 10.1002_chem.202003167.xml
✓ Converted: 10.1002_cssc.202201821.xml
✓ Converted: 10.1002_ece3.3985.xml
✓ Converted: 10.1002_ece3.4466.xml
✓ Converted: 10.1002_ece3.5260.xml
✓ Converted: 10.1002_ece3.5395.xml
✓ Converted: 10.1002_ece3.6144.xml
✓ Converted: 10.1002_ece3.6303.xml
✓ Converted: 10.1002_ece3.6784.xml
✓ Converted: 10.1002_ece3.961.xml
✓ Converted: 10.1002_ece3.9627.xml
✓ Converted: 10.1002_ecs2.1280.xml
✓ Converted: 10.1002_ecs2.4619.xml
✓ Converted: 10.1002_ejic.201900904.xml
✓ Converted: 10.1002_ejoc.202000139.xml
✓ Converted: 10.1002_ejoc.202000916.xml
✓ Converted: 10.1002_esp.5058.xml
✓ Converted: 10.1002_esp

REGEX PATTERN ONLY 

In [9]:
import os
import json
import re
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# === Paths ===
JSON_DIR = r"make-data-count-finding-data-references/train/all_json_files"

# === Dataset Regex Patterns ===
DATASET_PATTERNS = [
    r'10\.\d{4,9}/[-._;()/:a-z0-9]+',       # DOIs
    r'doi:10\.\d{4,9}/[-._;()/:a-z0-9]+',   # doi: prefix
    r'\bgse\d{4,6}\b', r'\bsrp\d+\b', r'\bsrx\d+\b', r'\bsrr\d+\b', r'\berr\d+\b',
    r'\bdrr\d+\b', r'\be-mexp-\d+\b', r'\be-geod-\d+\b',
    r'\bens[a-z]{0,5}\d+\b', r'\bnm_\d+\b', r'\brs\d+\b',
    r'\bhgnc:\d+\b', r'\b[opq][0-9][a-z0-9]{3}[0-9]\b',
    r'\bmodel\d+\b', r'\bepi_isl_\d+\b', r'\bchembl\d+\b', r'\bmgyp\d+\b',
    r'\bempiar-?\d+\b', r'\bprjna\d+\b', r'\bsamn\d+\b', r'\bpxd\d+\b',
    r'\bku\d+\b', r'\bcvcl_\d+\b', r'\bcab\d+\b', r'\bhpa\d+\b',
    r'\be-prot-\d+\b',
    r'\b[A-Z]{1,2}\d{6}(?:\.\d+)?\b',       # GenBank accessions (e.g., MH714866)
    r'\b\d+\.\d+\.\d+\.\d+\b'               # IP-like or version numbers
]
regex_combined = re.compile("|".join(DATASET_PATTERNS), re.IGNORECASE)

# === Dataset-related keywords
DATASET_KEYWORDS = [
    "dataset", "data available", "accession number", "repository",
    "deposited", "submitted to", "released", "archive", "downloaded from",
    "data were deposited", "data can be accessed", "hosted in", "publicly available"
]

def contains_keyword(text):
    text = text.lower()
    return any(keyword in text for keyword in DATASET_KEYWORDS)

# === Pre-clean broken patterns like `doi. org` → `doi.org`
def clean_text(text):
    text = re.sub(r'doi\s*\.\s*org', 'doi.org', text, flags=re.IGNORECASE)
    text = re.sub(r'https?\s*:\s*/\s*/\s*doi\s*\.\s*org', 'https://doi.org', text, flags=re.IGNORECASE)
    text = re.sub(r'10\s*\.\s*(\d+)', r'10.\1', text)
    return text

# === Extract all string values recursively from JSON
def extract_strings_from_json(obj):
    if isinstance(obj, dict):
        for v in obj.values():
            yield from extract_strings_from_json(v)
    elif isinstance(obj, list):
        for item in obj:
            yield from extract_strings_from_json(item)
    elif isinstance(obj, str):
        yield obj

# === Filtering Function for Invalid Matches
def is_valid_dataset_id(match, sentence):
    if re.fullmatch(r"(19|20)\d{2}", match):  # year
        return False
    if re.fullmatch(r"\d+\.\d+\.\d+\.\d+", match):  # IP or version
        if re.search(r"(v|version)[\s:=]*" + re.escape(match), sentence, re.IGNORECASE):
            return False
    if match.isdigit() and len(match) <= 3:
        return False
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", match):  # date
        return False
    return True

# === Main Dataset Detection
results = []

for filename in tqdm(os.listdir(JSON_DIR)):
    if not filename.endswith(".json"):
        continue
    path = os.path.join(JSON_DIR, filename)
    article_id = filename.replace(".json", "")

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"❌ Error in {filename}: {e}")
        continue

    all_strings = list(extract_strings_from_json(data))

    for raw_text in all_strings:
        cleaned_text = clean_text(raw_text)
        for sent in sent_tokenize(cleaned_text):
            matches = regex_combined.findall(sent)
            valid_matches = [m for m in matches if is_valid_dataset_id(m, sent)]
            if valid_matches:
                results.append({
                    "article_id": article_id,
                    "sentence": sent.strip(),
                    "dataset_id": "; ".join(sorted(set(valid_matches)))
                })

# === Save to CSV
df = pd.DataFrame(results)
df.to_csv("dataset_mentions_train.csv", index=False)
print(f"✅ Extracted {len(df)} dataset sentences after filtering. Saved to dataset_mentions_train.csv")


100%|██████████| 524/524 [01:01<00:00,  8.58it/s]

✅ Extracted 27428 dataset sentences after filtering. Saved to dataset_mentions_train.csv





PDF TO XML FOR TEST

In [3]:
import os
import requests

# === Configuration ===
PDF_FOLDER = r"make-data-count-finding-data-references\test\PDF"
XML_FOLDER = r"make-data-count-finding-data-references\test\PDF_TO_XML"
GROBID_URL = "http://localhost:8070/api/processFulltextDocument"

# === Statistics ===
success_count = 0
fail_count = 0
skipped_count = 0

# === Ensure output folder exists ===
os.makedirs(XML_FOLDER, exist_ok=True)

# === Process each PDF file ===
for pdf_file in os.listdir(PDF_FOLDER):
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(PDF_FOLDER, pdf_file)
    xml_filename = os.path.splitext(pdf_file)[0] + ".xml"
    xml_path = os.path.join(XML_FOLDER, xml_filename)

    # Skip if XML already exists
    if os.path.exists(xml_path):
        print(f"Skipping {pdf_file} (already converted)")
        skipped_count += 1
        continue

    print(f"Converting {pdf_file}...")

    try:
        with open(pdf_path, "rb") as f:
            response = requests.post(
                GROBID_URL,
                files={"input": f},
                timeout=60
            )

        if response.status_code == 200:
            with open(xml_path, "w", encoding="utf-8") as out_file:
                out_file.write(response.text)
            print(f"✓ Success: {pdf_file}")
            success_count += 1
        else:
            print(f"✗ Failed (HTTP {response.status_code}): {pdf_file}")
            fail_count += 1

    except Exception as e:
        print(f"✗ Error processing {pdf_file}: {e}")
        fail_count += 1

# === Summary ===
print("\n=== Summary ===")
print(f"Total PDFs found: {len([f for f in os.listdir(PDF_FOLDER) if f.lower().endswith('.pdf')])}")
print(f"Converted: {success_count}")
print(f"Failed: {fail_count}")
print(f"Skipped (already exists): {skipped_count}")


Converting 10.1002_2017jc013030.pdf...
✓ Success: 10.1002_2017jc013030.pdf
Converting 10.1002_anie.201916483.pdf...
✓ Success: 10.1002_anie.201916483.pdf
Converting 10.1002_anie.202005531.pdf...
✓ Success: 10.1002_anie.202005531.pdf
Converting 10.1002_anie.202007717.pdf...
✓ Success: 10.1002_anie.202007717.pdf
Converting 10.1002_chem.201902131.pdf...
✓ Success: 10.1002_chem.201902131.pdf
Converting 10.1002_chem.201903120.pdf...
✓ Success: 10.1002_chem.201903120.pdf
Converting 10.1002_chem.202000235.pdf...
✓ Success: 10.1002_chem.202000235.pdf
Converting 10.1002_chem.202001412.pdf...
✓ Success: 10.1002_chem.202001412.pdf
Converting 10.1002_chem.202001668.pdf...
✓ Success: 10.1002_chem.202001668.pdf
Converting 10.1002_chem.202003167.pdf...
✓ Success: 10.1002_chem.202003167.pdf
Converting 10.1002_cssc.202201821.pdf...
✓ Success: 10.1002_cssc.202201821.pdf
Converting 10.1002_ece3.3985.pdf...
✓ Success: 10.1002_ece3.3985.pdf
Converting 10.1002_ece3.4466.pdf...
✓ Success: 10.1002_ece3.4466.p

XML TO JSON FILE

In [4]:
import os
import xmltodict
import json

# === Configuration ===
xml_folder = r"make-data-count-finding-data-references\test\ALL_XML"
json_folder = r"make-data-count-finding-data-references\test\all_json_files"

# === Ensure output folder exists ===
os.makedirs(json_folder, exist_ok=True)

# === Process each XML file ===
converted = 0
failed = 0

for filename in os.listdir(xml_folder):
    if not filename.lower().endswith(".xml"):
        continue

    xml_path = os.path.join(xml_folder, filename)
    json_filename = os.path.splitext(filename)[0] + ".json"
    json_path = os.path.join(json_folder, json_filename)

    try:
        with open(xml_path, 'r', encoding='utf-8') as xml_file:
            xml_content = xml_file.read()
            data_dict = xmltodict.parse(xml_content)

        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(data_dict, json_file, indent=2)

        print(f"✓ Converted: {filename}")
        converted += 1

    except Exception as e:
        print(f"✗ Failed to convert {filename}: {e}")
        failed += 1

# === Summary ===
print("\n=== Conversion Summary ===")
print(f"Total XML files: {converted + failed}")
print(f"Converted successfully: {converted}")
print(f"Failed: {failed}")


✓ Converted: 10.1002_2017jc013030.xml
✓ Converted: 10.1002_anie.201916483.xml
✓ Converted: 10.1002_anie.202005531.xml
✓ Converted: 10.1002_anie.202007717.xml
✓ Converted: 10.1002_chem.201902131.xml
✓ Converted: 10.1002_chem.201903120.xml
✓ Converted: 10.1002_chem.202000235.xml
✓ Converted: 10.1002_chem.202001412.xml
✓ Converted: 10.1002_chem.202001668.xml
✓ Converted: 10.1002_chem.202003167.xml
✓ Converted: 10.1002_cssc.202201821.xml
✓ Converted: 10.1002_ece3.3985.xml
✓ Converted: 10.1002_ece3.4466.xml
✓ Converted: 10.1002_ece3.5260.xml
✓ Converted: 10.1002_ece3.5395.xml
✓ Converted: 10.1002_ece3.6144.xml
✓ Converted: 10.1002_ece3.6303.xml
✓ Converted: 10.1002_ece3.6784.xml
✓ Converted: 10.1002_ece3.961.xml
✓ Converted: 10.1002_ece3.9627.xml
✓ Converted: 10.1002_ecs2.1280.xml
✓ Converted: 10.1002_ecs2.4619.xml
✓ Converted: 10.1002_ejic.201900904.xml
✓ Converted: 10.1002_ejoc.202000139.xml
✓ Converted: 10.1002_ejoc.202000916.xml
✓ Converted: 10.1002_esp.5058.xml
✓ Converted: 10.1002_esp

REGEX PATTERN FOR TEST DATASET

In [7]:
import os
import json
import re
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# === Paths ===
JSON_DIR = r"make-data-count-finding-data-references/test/all_json_files"

# === Dataset Regex Patterns ===
DATASET_PATTERNS = [
    r'10\.\d{4,9}/[-._;()/:a-z0-9]+',       # DOIs
    r'doi:10\.\d{4,9}/[-._;()/:a-z0-9]+',   # doi: prefix
    r'\bgse\d{4,6}\b', r'\bsrp\d+\b', r'\bsrx\d+\b', r'\bsrr\d+\b', r'\berr\d+\b',
    r'\bdrr\d+\b', r'\be-mexp-\d+\b', r'\be-geod-\d+\b',
    r'\bens[a-z]{0,5}\d+\b', r'\bnm_\d+\b', r'\brs\d+\b',
    r'\bhgnc:\d+\b', r'\b[opq][0-9][a-z0-9]{3}[0-9]\b',
    r'\bmodel\d+\b', r'\bepi_isl_\d+\b', r'\bchembl\d+\b', r'\bmgyp\d+\b',
    r'\bempiar-?\d+\b', r'\bprjna\d+\b', r'\bsamn\d+\b', r'\bpxd\d+\b',
    r'\bku\d+\b', r'\bcvcl_\d+\b', r'\bcab\d+\b', r'\bhpa\d+\b',
    r'\be-prot-\d+\b',
    r'\b[A-Z]{1,2}\d{6}(?:\.\d+)?\b',       # GenBank accessions (e.g., MH714866)
    r'\b\d+\.\d+\.\d+\.\d+\b'               # IP-like or version numbers
]
regex_combined = re.compile("|".join(DATASET_PATTERNS), re.IGNORECASE)

# === Dataset-related keywords
DATASET_KEYWORDS = [
    "dataset", "data available", "accession number", "repository",
    "deposited", "submitted to", "released", "archive", "downloaded from",
    "data were deposited", "data can be accessed", "hosted in", "publicly available"
]

def contains_keyword(text):
    text = text.lower()
    return any(keyword in text for keyword in DATASET_KEYWORDS)

# === Pre-clean broken patterns like `doi. org` → `doi.org`
def clean_text(text):
    text = re.sub(r'doi\s*\.\s*org', 'doi.org', text, flags=re.IGNORECASE)
    text = re.sub(r'https?\s*:\s*/\s*/\s*doi\s*\.\s*org', 'https://doi.org', text, flags=re.IGNORECASE)
    text = re.sub(r'10\s*\.\s*(\d+)', r'10.\1', text)
    return text

# === Extract all string values recursively from JSON
def extract_strings_from_json(obj):
    if isinstance(obj, dict):
        for v in obj.values():
            yield from extract_strings_from_json(v)
    elif isinstance(obj, list):
        for item in obj:
            yield from extract_strings_from_json(item)
    elif isinstance(obj, str):
        yield obj

# === Filtering Function for Invalid Matches
def is_valid_dataset_id(match, sentence):
    if re.fullmatch(r"(19|20)\d{2}", match):  # year
        return False
    if re.fullmatch(r"\d+\.\d+\.\d+\.\d+", match):  # IP or version
        if re.search(r"(v|version)[\s:=]*" + re.escape(match), sentence, re.IGNORECASE):
            return False
    if match.isdigit() and len(match) <= 3:
        return False
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", match):  # date
        return False
    return True

# === Main Dataset Detection
results = []

for filename in tqdm(os.listdir(JSON_DIR)):
    if not filename.endswith(".json"):
        continue
    path = os.path.join(JSON_DIR, filename)
    article_id = filename.replace(".json", "")

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"❌ Error in {filename}: {e}")
        continue

    all_strings = list(extract_strings_from_json(data))

    for raw_text in all_strings:
        cleaned_text = clean_text(raw_text)
        for sent in sent_tokenize(cleaned_text):
            matches = regex_combined.findall(sent)
            valid_matches = [m for m in matches if is_valid_dataset_id(m, sent)]
            if valid_matches:
                results.append({
                    "article_id": article_id,
                    "sentence": sent.strip(),
                    "dataset_id": "; ".join(sorted(set(valid_matches)))
                })

# === Save to CSV
df = pd.DataFrame(results)
df.to_csv("dataset_mentions_test.csv", index=False)
print(f"✅ Extracted {len(df)} dataset sentences after filtering. Saved to dataset_mentions_test.csv")


100%|██████████| 30/30 [00:02<00:00, 10.58it/s]

✅ Extracted 2092 dataset sentences after filtering. Saved to dataset_mentions_test.csv



