Original Notebook:

https://www.kaggle.com/code/guanyuzhen/mdc-datareferenceextractor

In [1]:
! uv pip uninstall --system 'tensorflow'
! uv pip install --system --no-index --find-links='/kaggle/input/latest-mdc-whls/whls' 'pymupdf' 'vllm' 'triton' 'logits-processor-zoo' 'numpy<2'
! mkdir -p /tmp/src

[2mUsing Python 3.11.13 environment at: /usr[0m
[2mUninstalled [1m1 package[0m [2min 4.99s[0m[0m
 [31m-[39m [1mtensorflow[0m[2m==2.18.0[0m
[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[2mResolved [1m157 packages[0m [2min 469ms[0m[0m                                       [0m
[2K[2mPrepared [1m52 packages[0m [2min 20.85s[0m[0m                                           
[2mUninstalled [1m14 packages[0m [2min 306ms[0m[0m
[2K[2mInstalled [1m52 packages[0m [2min 146ms[0m[0m=11.6.1.9                     [0m
 [32m+[39m [1mairportsdata[0m[2m==20250622[0m
 [32m+[39m [1mastor[0m[2m==0.8.1[0m
 [32m+[39m [1mblake3[0m[2m==1.0.5[0m
 [32m+[39m [1mcompressed-tensors[0m[2m==0.9.3[0m
 [32m+[39m [1mdepyf[0m[2m==0.18.0[0m
 [32m+[39m [1mdiskcache[0m[2m==5.6.3[0m
 [32m+[39m [1mfastapi-cli[0m[2m==0.0.7[0m
 [32m+[39m [1mgguf[0m[2m==0.17.1[0m
 [32m+[39m [1mhttptools[0m[2m==0.6.4[0m
 [31m-[39m [1mimportlib

In [2]:
%%writefile /tmp/src/helpers.py
import logging, os, kagglehub, inspect
from pathlib import Path
import polars as pl

IS_KAGGLE_ENV = sum(['KAGGLE' in k for k in os.environ]) > 0
IS_KAGGLE_SUBMISSION = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
COMP_DIR = Path(('/kaggle/input/make-data-count-finding-data-references' if IS_KAGGLE_SUBMISSION else kagglehub.competition_download('make-data-count-finding-data-references')))
PDF_DIR = COMP_DIR / ('test' if IS_KAGGLE_SUBMISSION else 'train') / 'PDF'
WORKING_DIR = Path(('/kaggle/working/' if IS_KAGGLE_ENV else '.working/'))

DOI_LINK = 'https://doi.org/'

DEFAULT_LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG").upper() if not IS_KAGGLE_SUBMISSION else "WARNING"
LOG_FILE_PATH = os.getenv("LOG_FILE", "logs/project.log")
LOG_DIR = Path(LOG_FILE_PATH).parent

LOG_DIR.mkdir(parents=True, exist_ok=True)

LOG_FORMAT = "%(levelname)s %(asctime)s  [%(filename)s:%(lineno)d - %(funcName)s()] %(message)s"
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"

def get_logger(name=None):
    if name is None:
        frame = inspect.currentframe()
        if frame is None or frame.f_back is None:
            name = "__main__"
        else:
            name = frame.f_back.f_globals.get("__name__", "__main__")

    logger = logging.getLogger(name)

    if not logger.handlers:
        logger.setLevel(DEFAULT_LOG_LEVEL)
        formatter = logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_DATEFMT)
        ch = logging.StreamHandler()
        ch.setLevel(DEFAULT_LOG_LEVEL)
        ch.setFormatter(formatter)
        fh = logging.FileHandler(LOG_FILE_PATH)
        fh.setLevel(DEFAULT_LOG_LEVEL)
        fh.setFormatter(formatter)
        logger.addHandler(ch)
        logger.addHandler(fh)
        logger.propagate = False
    return logger

def is_doi_link(name: str) -> pl.Expr:
    return pl.col(name).str.starts_with(DOI_LINK)

def string_normalization(name: str) -> pl.Expr:
    return pl.col(name).str.normalize("NFKC").str.replace_all(r"[^\p{Ascii}]", '').str.replace_all(r"https?://zenodo\.org/record/(\d+)", r" 10.5281/zenodo.$1 ")

def get_df(parse_dir: str):
    records = []
    txt_files = list(Path(parse_dir).glob('*.txt'))
    for txt_file in txt_files:
        id_ = txt_file.stem
        with open(txt_file, 'r') as f:
            text = f.read()
        records.append({'article_id': id_, 'text': text})
    return pl.DataFrame(records).with_columns(string_normalization('text').alias('text'))

def assume_type(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.with_columns(pl.when(is_doi_link('dataset_id').or_(pl.col('dataset_id').str.starts_with('SAMN'))).then(pl.lit('Primary')).otherwise(pl.lit('Secondary')).alias('type'))
    )

def score(df, gt, on, tag='all'):
    hits = gt.join(df, on=on)
    tp = hits.height
    fp = df.height - tp
    fn = gt.height - tp
    f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) != 0 else 0.0
    return f"{tag} - f1: {f1:.4f} [{tp}/{fp}/{fn}]"

def evaluate(df, on=['article_id', 'dataset_id']):
    gt = pl.read_csv(COMP_DIR/'train_labels.csv').filter(pl.col('type')!='Missing')
    return (
        score(df, gt, on),
        score(df.filter(is_doi_link('dataset_id')), gt.filter(is_doi_link('dataset_id')), on, 'doi'),
        score(df.filter(~is_doi_link('dataset_id')), gt.filter(~is_doi_link('dataset_id')), on, 'acc'),
    )

Writing /tmp/src/helpers.py


In [3]:
%%writefile /tmp/src/parse.py
import argparse
from pathlib import Path
import pymupdf
from helpers import get_logger, PDF_DIR

l = get_logger()

def pdf_to_txt(output_dir: Path):
    output_dir.mkdir(parents=True, exist_ok=True)
    pdf_files = list(PDF_DIR.glob("*.pdf")) + list(PDF_DIR.glob("*.PDF"))
    existing_txt_files = {f.stem for f in output_dir.glob("*.txt")}
    for pdf_file in pdf_files:
        txt_file = output_dir / f"{pdf_file.stem}.txt"
        if pdf_file.stem in existing_txt_files:
            continue
        try:
            text = ""
            with pymupdf.open(pdf_file) as doc:
                for page in doc:
                    text += page.get_text()
            txt_file.write_text(text, encoding='utf-8')
        except Exception:
            pass

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', type=Path, help='Directory to save text files')
    args = parser.parse_args()
    pdf_to_txt(args.output_dir)

if __name__ == "__main__":
    main()

Writing /tmp/src/parse.py


In [4]:
%%writefile /tmp/src/check_parse.py
import polars as pl
from pathlib import Path
from helpers import *

l=get_logger()

def gt_dataset_id_normalization(name:str) -> pl.Expr:
    return (
        pl.when(is_doi_link(name))
        .then(pl.col(name).str.split(DOI_LINK).list.last())
        .otherwise(name)
        .str.to_lowercase()
    )

def main():
    if IS_KAGGLE_SUBMISSION:
        l.debug('skipping check_parse for submission')
        return
    df = (
        get_df('/tmp/train_parse')
        .with_columns(pl.col('text').str.replace_all('\s+', '').str.to_lowercase().alias('text'))
    )

    gt = (
        pl.read_csv(COMP_DIR/'train_labels.csv')
        .filter(pl.col('article_id').is_in(df['article_id']))
        .filter(pl.col('type')!='Missing')
        .with_columns(gt_dataset_id_normalization('dataset_id').alias('norm_id'))
    )

    l.info(f"pymupdf misses: {gt.join(df, on='article_id').with_columns(hit=pl.col('text').str.contains(pl.col('norm_id'), literal=True)).filter(~pl.col('hit')).height} dataset_ids")

if __name__=='__main__': main()

Writing /tmp/src/check_parse.py


In [5]:
%%writefile /tmp/src/getid.py
import re
import polars as pl
from pathlib import Path
from typing import Optional, Tuple

from helpers import *

COMPILED_PATTERNS = {
'ref_header_patterns': [re.compile(r'\b(R\s*E\s*F\s*E\s*R\s*E\s*N\s*C\s*E\s*S|BIBLIOGRAPHY|LITERATURE CITED|WORKS CITED|CITED WORKS|ACKNOWLEDGEMENTS)\b[:\s]*', re.IGNORECASE)],
'citation_pattern': re.compile(r'^\s*(\[\d+\]|\(\d+\)|\d+\.|\d+\)|\d+(?=\s|$))\s*'),
'first_citation_patterns': [re.compile(r'^\s*\[1\]\s*'), re.compile(r'^\s*\(1\)\s*'), re.compile(r'^\s*1\.\s*'), re.compile(r'^\s*1\)\s*'), re.compile(r'^\s*1(?=\s|$)')],
}

l = get_logger()

def find_last_reference_header(text: str, header_patterns: list[re.Pattern]) -> Optional[int]:
    last_match_idx = None
    for pattern in header_patterns:
        matches = list(pattern.finditer(text))
        if matches: last_match_idx = matches[-1].start()
    return last_match_idx

def find_last_first_citation(text: str) -> Optional[int]:
    lines = text.splitlines()
    last_match_line = None
    for line_num, line in enumerate(lines):
        line = line.strip()
        for pattern in COMPILED_PATTERNS['first_citation_patterns']:
            if pattern.match(line):
                next_lines = lines[line_num:line_num+3]
                if any(COMPILED_PATTERNS['citation_pattern'].match(l.strip()) for l in next_lines[1:]):
                    last_match_line = line_num
                break
    return last_match_line

def find_reference_start(text: str) -> Optional[int]:
    lines = text.splitlines()
    last_first_citation = find_last_first_citation(text)
    if last_first_citation is not None: return last_first_citation
    start_search_idx = int(len(lines) * 0.5)
    for i in range(start_search_idx, len(lines)):
        line = lines[i].strip()
        if COMPILED_PATTERNS['citation_pattern'].match(line):
            next_lines = lines[i:i+3]
            if sum(1 for l in next_lines if COMPILED_PATTERNS['citation_pattern'].match(l.strip())) >= 2:
                for j in range(i, max(-1, i-10), -1):
                    if not COMPILED_PATTERNS['citation_pattern'].match(lines[j].strip()): return j + 1
                return max(0, i-10)
    return None

def split_text_and_references(text: str) -> Tuple[str, str]:
    header_idx = find_last_reference_header(text, COMPILED_PATTERNS['ref_header_patterns'])
    if header_idx is not None:
        header_idx2 = find_last_reference_header(text[:header_idx].strip(), COMPILED_PATTERNS['ref_header_patterns'])
        if header_idx2 is not None:
            header_idx3 = find_last_reference_header(text[:header_idx2].strip(), COMPILED_PATTERNS['ref_header_patterns'])
            if header_idx3 is not None: return text[:header_idx3].strip(), text[header_idx3:].strip()
            return text[:header_idx2].strip(), text[header_idx2:].strip()
        return text[:header_idx].strip(), text[header_idx:].strip()
    ref_start_line = find_reference_start(text)
    if ref_start_line is not None:
        lines = text.splitlines()
        body = '\n'.join(lines[:ref_start_line])
        refs = '\n'.join(lines[ref_start_line:])
        return body.strip(), refs.strip()
    return text.strip(), ''

def get_splits(df):
    main_texts, ref_texts = [], []
    for raw_text in df['text']:
        main, refs = split_text_and_references(raw_text)
        main_texts.append(main)
        ref_texts.append(refs)
    df = df.with_columns(pl.Series('body', main_texts), pl.Series('ref', ref_texts))
    return df

def tidy_extraction(df) -> pl.DataFrame:
    bad_ids = [f'{DOI_LINK}{e}' for e in ['10.5061/dryad', '10.5281/zenodo', '10.6073/pasta']]
    doi_df = (df.with_columns(pl.col('body').str.extract_all(r'10\s*\.\s*\d{4,9}\s*/\s*\S+').alias('match')).explode('match').drop_nulls('match').with_columns(pl.col('match').str.replace_all(r'\s+', '').str.replace(r'[^A-Za-z0-9]+$', '').str.to_lowercase().alias('dataset_id')).group_by('article_id','dataset_id').agg('match').with_columns((DOI_LINK+pl.col('dataset_id')).alias('dataset_id')))
    acc_df = (df.with_columns(pl.col('text').str.extract_all(r'(?i)\b(?:CHEMBL\d+|E-GEOD-\d+|E-PROT-\d+|EMPIAR-\d+|ENSBTAG\d+|ENSOARG\d+|EPI_ISL_\d{5,}|EPI\d{6,7}|HPA\d+|CP\d{6}|IPR\d{6}|PF\d{5}|KX\d{6}|K0\d{4}|PRJNA\d+|PXD\d+|SAMN\d+|dryad\s*\.\s*[^\s"<>]+|pasta\s*/\s*[^\s"<>])').alias('match')).explode('match').drop_nulls('match').with_columns(pl.col('match').str.replace_all(r'\s+', '').str.replace(r'[^A-Za-z0-9]+$', '').alias('dataset_id')).group_by('article_id','dataset_id').agg('match').with_columns(pl.when(pl.col('dataset_id').str.starts_with('dryad.')).then(f'{DOI_LINK}10.5061/' + pl.col('dataset_id')).otherwise('dataset_id').alias('dataset_id')).with_columns(pl.when(pl.col('dataset_id').str.starts_with('pasta/')).then(f'{DOI_LINK}10.6073/' + pl.col('dataset_id')).otherwise('dataset_id').alias('dataset_id')))
    df = pl.concat([doi_df, acc_df])
    df = (df.unique('dataset_id').filter(~pl.col('article_id').str.replace('_','/').str.contains(pl.col('dataset_id').str.split(DOI_LINK).list.last().str.escape_regex())).filter(~pl.col('dataset_id').str.contains(pl.col('article_id').str.replace('_','/').str.escape_regex())).filter(~pl.col('dataset_id').str.contains('figshare', literal=True)).filter(~pl.col('dataset_id').is_in(bad_ids)).filter(pl.when(is_doi_link('dataset_id').and_(pl.col('dataset_id').str.split('/').list.last().str.len_chars()<5)).then(False).otherwise(True)).with_columns(pl.col('match').list.unique()))
    return df

def get_context_window(text: str, substring: str, window: int = 100) -> str:
    index = text.find(substring)
    if index == -1: raise ValueError
    start = max(index - window, 0)
    end = min(index + len(substring) + window, len(text))
    return text[start:end]

def get_window_df(text_df, ids_df):
    df = ids_df.join(text_df, on='article_id')
    windows = []
    for text, match_ids in df.select('text', 'match').rows():
        windows.append(get_context_window(text, match_ids[0]))
    return df.with_columns(pl.Series('window', windows)).select('article_id', 'dataset_id', 'window')


def main():
    text_df = get_df('/tmp/train_parse')
    df = get_splits(text_df)
    df = tidy_extraction(df)
    df = get_window_df(text_df, df)
    df.write_parquet('/tmp/extracted.parquet')
    df = assume_type(df)
    df.select(['article_id', 'dataset_id', 'type']).with_row_index(name='row_id').write_csv('/kaggle/working/submission.csv')
    if not IS_KAGGLE_SUBMISSION:
        results = evaluate(df)
        for r in results: l.info(r) 
        results = evaluate(df, on=['article_id', 'dataset_id', 'type'])
        for r in results: l.info(r) 

if __name__=='__main__': main()

Writing /tmp/src/getid.py


In [6]:
%%writefile /tmp/src/llm_validate.py
import polars as pl
import os

from helpers import *

l = get_logger()

SYS_PROMPT_CLASSIFY_DOI = """
You are give a piece of academic text. Your task is to is to classify whether the DOI citation refers specifically to open-access research data.
Classify the DOI as:
A) Data: if the DOI is related to a dataset.
B) Literature: does not refer to research data or is a written academic paper.

Respond with only one letter: A or B.
""".strip()

def build_df():
    df = pl.read_parquet('/tmp/extracted.parquet')
    df.filter(~is_doi_link('dataset_id')).select('article_id', 'dataset_id').write_csv('/tmp/accid_sub.csv')
    return df.filter(is_doi_link('dataset_id'))

def build_prompt(tokenizer, df):
    prompts = []
    for doi, text in df.select('dataset_id', 'window').rows():
        messages = [{'role':'system','content': SYS_PROMPT_CLASSIFY_DOI}, {'role':'user', 'content': text}]
        prompts.append(tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False))
    return df.with_columns(pl.Series('prompt', prompts))

if __name__=='__main__':
    os.environ["VLLM_USE_V1"] = "0"
    import vllm
    from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
    model_path = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
    llm = vllm.LLM(model_path, quantization='awq', tensor_parallel_size=2, gpu_memory_utilization=0.9, trust_remote_code=True, dtype="half", enforce_eager=True, max_model_len=2048, disable_log_stats=True, disable_custom_all_reduce=True, enable_prefix_caching=True, task='generate')
    tokenizer = llm.get_tokenizer()
    df = build_df()
    df = build_prompt(tokenizer, df)
    prompts = df['prompt'].to_list()
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["A", "B"])
    outputs = llm.generate(prompts, vllm.SamplingParams(seed=777, temperature=0.1, skip_special_tokens=True, max_tokens=1, logits_processors=[mclp], logprobs=len(mclp.choices)), use_tqdm=True)
    logprobs = [{lp.decoded_token: lp.logprob for lp in list(lps)} for lps in [output.outputs[0].logprobs[0].values() for output in outputs]]
    choices = [max(d, key=d.get) for d in logprobs]
    types = {'A': True, 'B': False}
    choices = [types[c] for c in choices]
    df = df.with_columns(pl.Series('type', choices))
    df.filter(pl.col('type')).select('article_id', 'dataset_id').write_csv('/tmp/doi_sub.csv')
    df = pl.concat([pl.read_csv('/tmp/doi_sub.csv'), pl.read_csv('/tmp/accid_sub.csv')])
    df = assume_type(df)
    df.select(['article_id', 'dataset_id', 'type']).with_row_index(name='row_id').write_csv('/kaggle/working/submission.csv')
    if not IS_KAGGLE_SUBMISSION:
        results = evaluate(df)
        for r in results: l.info(r) 
        results = evaluate(df, on=['article_id', 'dataset_id', 'type'])
        for r in results: l.info(r)

Writing /tmp/src/llm_validate.py


In [7]:
%cd /tmp
!LOG_LEVEL=INFO python src/parse.py /tmp/train_parse
! python src/check_parse.py
! python src/getid.py
! python src/llm_validate.py

/tmp
MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error

In [8]:
! cat /tmp/logs/project.log

INFO 2025-07-22 23:16:37  [check_parse.py:31 - main()] pymupdf misses: 42 dataset_ids
INFO 2025-07-22 23:16:43  [getid.py:110 - main()] all - f1: 0.6043 [481/392/238]
INFO 2025-07-22 23:16:43  [getid.py:110 - main()] doi - f1: 0.4356 [164/264/161]
INFO 2025-07-22 23:16:43  [getid.py:110 - main()] acc - f1: 0.7557 [317/128/77]
INFO 2025-07-22 23:16:43  [getid.py:112 - main()] all - f1: 0.5050 [402/471/317]
INFO 2025-07-22 23:16:43  [getid.py:112 - main()] doi - f1: 0.3400 [128/300/197]
INFO 2025-07-22 23:16:43  [getid.py:112 - main()] acc - f1: 0.6532 [274/171/120]
INFO 2025-07-22 23:23:04  [llm_validate.py:52 - <module>()] all - f1: 0.6867 [481/201/238]
INFO 2025-07-22 23:23:04  [llm_validate.py:52 - <module>()] doi - f1: 0.5836 [164/73/161]
INFO 2025-07-22 23:23:04  [llm_validate.py:52 - <module>()] acc - f1: 0.7557 [317/128/77]
INFO 2025-07-22 23:23:04  [llm_validate.py:54 - <module>()] all - f1: 0.5739 [402/280/317]
INFO 2025-07-22 23:23:04  [llm_validate.py:54 - <module>()] doi - f

Result:

Score: 0.533

Rank: 118 (2025-07-23-8:53, JST)

Run time: 8min (Kaggle editor), 30min (Scoring)

Moved up to rank 118 on #kaggle. I'm not addicted. I can quit when I want. https://kaggle.com/competitions/make-data-count-finding-data-references 

Your Best Entry!
Your most recent submission scored 0.533, which is an improvement of your previous score of 0.532. Great job!

Make Data Count, Maggie Demkin, and Walter Reade. Make Data Count - Finding Data References. https://kaggle.com/competitions/make-data-count-finding-data-references, 2025. Kaggle.