# Mamoru PubMed Search + Fetch

## Notebook outline

| Cell | Purpose |
|------|---------|
| 1 | Load env (NCBI_EMAIL, NCBI_API_KEY), set MeSH + tiab query, run ESearch (get WebEnv/QueryKey), print summary |
| 2 | Fetch paper abstracts in batches, format, write .txt to data/pubmed_fetch/. |
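| 3 | Clean up by deleting the local `data/pubmed_fetch/` folder (skipped unless `confirm_delete = True`; Cell 4 uploads from this folder, so delete only after uploading) |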
| 4 | Upload fetched `.txt` files to S3 `raw/` prefix (requires S3_BUCKET) |

---

**Prerequisites.** None (run this notebook first to populate `data/pubmed_fetch/`).

**Env.** `NCBI_EMAIL` (required), `NCBI_API_KEY` (optional), `S3_BUCKET` (cell 4). Set in `.env` or your shell. To create an API key: https://account.ncbi.nlm.nih.gov/settings/

**Reference.** https://biopython.org/docs/latest/Tutorial/chapter_entrez.html#esearch-searching-the-entrez-databases

**Note about MEDLINE format.** We request records from PubMed in **MEDLINE** format (`rettype="medline"`): a standard tagged text format (e.g. `PMID-`, `TI-`, `AB-`) that we parse into structured metadata and abstract text. In this notebook we pull that data out and write one `.txt` file per record. Downstream (see the processing notebook), we process the paper metadata and **chunk and embed the abstract text** into vectors so it can be indexed in the RAG knowledge base for semantic search.

In [5]:
# Cell 1: Load env, define query, run ESearch, print summary.
import os
import time
from typing import Any, Iterator

from Bio import Entrez, Medline

# PubMed search expression: dementia / mild cognitive impairment records that
# also concern clinical decision support or caregivers. [Mesh] restricts a term
# to MeSH terms and [tiab] to title/abstract (see the recorded query_translation).
# NOTE(review): the query_translation in this notebook's recorded ESearch output
# contains no "Mild Cognitive Impairment" clause - confirm that this term is a
# valid MeSH heading and is actually being applied.
DEFAULT_QUERY: str = (
    '("Dementia"[Mesh] OR "Mild Cognitive Impairment"[Mesh]) '
    'AND ("Decision Support Systems, Clinical"[Mesh] OR "Caregivers"[Mesh] '
    'OR caregiver*[tiab] OR "decision support"[tiab])'
)
# Cap on the number of records requested from ESearch and fetched afterwards.
DEFAULT_RETMAX: int = 500
# Local folder that receives one .txt file per fetched record.
DEFAULT_FETCH_DIR: str = "data/pubmed_fetch"
# S3 key prefix under which the fetched files are uploaded.
RAW_PREFIX: str = "raw"


# python-dotenv is optional: when it is installed, populate os.environ from a
# local .env file; otherwise rely on variables already set in the shell.
try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    # Package not installed; the process environment is used as-is.
    pass
except Exception as exc:
    # Loading stays best-effort (never abort the notebook here), but report the
    # problem instead of hiding it so a broken .env file can be diagnosed.
    import warnings

    warnings.warn(f"Could not load .env file: {exc}", RuntimeWarning)


def load_env(reload: bool = False) -> tuple[str, str, str]:
    """Read the NCBI and S3 settings from the environment and validate them.

    Args:
        reload: When True, re-read the .env file first, letting values from the
            file override ones already in the environment, so edits made after
            the first load are picked up. Requires python-dotenv; the reload is
            skipped when that package is not installed.

    Returns:
        ``(email, api_key, s3_bucket)`` with surrounding whitespace removed.
        ``api_key`` and ``s3_bucket`` are empty strings when unset.

    Raises:
        ValueError: If NCBI_EMAIL is unset or blank.
    """
    if reload:
        try:
            from dotenv import load_dotenv

            load_dotenv(override=True)
        except ImportError:
            # python-dotenv is optional; keep using the current environment.
            pass
        except Exception as exc:
            # Reloading is best-effort, but a failure should be visible.
            import warnings

            warnings.warn(
                f"Could not reload .env file: {exc}", RuntimeWarning, stacklevel=2
            )

    # Strip whitespace so a blank-looking value (e.g. "NCBI_EMAIL= ") is treated
    # as missing rather than passed on as an address.
    email = os.getenv("NCBI_EMAIL", "").strip()
    api_key = os.getenv("NCBI_API_KEY", "").strip()
    s3_bucket = os.getenv("S3_BUCKET", "").strip()
    if not email:
        raise ValueError("NCBI_EMAIL must be set in your environment or .env")
    return email, api_key, s3_bucket


def configure_entrez(email: str, api_key: str) -> None:
    """Set the contact email, and the API key when one is given, on Bio.Entrez.

    Args:
        email: Contact address assigned to ``Entrez.email``.
        api_key: NCBI API key; when empty, ``Entrez.api_key`` is left untouched.
    """
    Entrez.email = email
    if not api_key:
        return
    Entrez.api_key = api_key


def run_esearch(query: str, retmax: int) -> dict:
    """Run a PubMed ESearch and return the parsed result.

    ``usehistory="y"`` makes the result carry the WebEnv/QueryKey pair that the
    batched EFetch step uses to page through the same result set.

    Args:
        query: PubMed search expression.
        retmax: Maximum number of IDs to return.

    Returns:
        The record parsed by ``Entrez.read``; this notebook reads its ``Count``,
        ``RetMax``, ``IdList``, ``QueryTranslation``, ``WebEnv`` and
        ``QueryKey`` entries.

    Raises:
        RuntimeError: If the request or the parsing fails; the original
            exception is chained as the cause.
    """
    try:
        stream = Entrez.esearch(db="pubmed", term=query, retmax=retmax, usehistory="y")
        try:
            return Entrez.read(stream)
        finally:
            # Close the response even when parsing raises, so it is not leaked.
            stream.close()
    except Exception as exc:
        raise RuntimeError(f"PubMed search failed: {exc}") from exc


def _format_record(rec: dict[str, Any]) -> str:
    """Render a parsed MEDLINE record as the text block stored in our .txt files.

    Fields are emitted in a fixed order (PMID, title, authors, journal, date,
    abstract); a field that is missing or empty is left out. Returns an empty
    string when none of the fields is present.
    """
    # (MEDLINE tag, text placed in front of the value), in output order.
    layout = (
        ("PMID", "PMID: "),
        ("TI", "Title: "),
        ("AU", "Authors: "),
        ("JT", "Journal: "),
        ("DP", "Date: "),
        ("AB", "Abstract:\n"),
    )
    lines = []
    for tag, prefix in layout:
        value = rec.get(tag)
        if not value:
            continue
        if tag == "AU":
            # Authors arrive as a list of names; show them comma-separated.
            value = ", ".join(value)
        lines.append(f"{prefix}{value}")
    return "\n".join(lines).strip()


def iter_efetch_batches(
    webenv: str | None,
    query_key: str | None,
    target_count: int,
    batch_size: int = 100,
    request_delay_sec: float = 0.34,
) -> Iterator[Any]:
    """Yield EFetch response streams for each batch; caller parses and closes each stream. Yields: open file-like from Entrez.efetch (MEDLINE)."""
    if not (webenv and query_key):
        return
    for start in range(0, target_count, batch_size):
        stream = Entrez.efetch(
            db="pubmed",
            rettype="medline",
            retmode="text",
            retstart=start,
            retmax=min(batch_size, target_count - start),
            webenv=webenv,
            query_key=query_key,
        )
        try:
            yield stream
        finally:
            stream.close()
        if start + batch_size < target_count:
            time.sleep(request_delay_sec)


def fetch_and_write_medline_records(
    record: dict[str, Any],
    retmax: int,
    fetch_dir: str,
    batch_size: int = 100,
    request_delay_sec: float = 0.34,
) -> int:
    """Fetch MEDLINE records for an ESearch result and write one .txt per PMID.

    Args:
        record: ESearch result; its ``WebEnv``, ``QueryKey`` and ``Count``
            entries are read.
        retmax: Upper bound on the number of records to fetch.
        fetch_dir: Output folder; created if it does not exist yet.
        batch_size: Records requested per EFetch call.
        request_delay_sec: Pause between consecutive EFetch calls.

    Returns:
        Number of files written. Records without a PMID, or that format to an
        empty text, are skipped and not counted.
    """
    webenv = record.get("WebEnv")
    query_key = record.get("QueryKey")
    total_count = int(record.get("Count", 0))
    # Fetch no more than the search matched and no more than the caller's cap.
    target_count = min(retmax, total_count)

    # Make sure the output folder exists so the function does not depend on the
    # caller having created it (for example after a cleanup step removed it).
    os.makedirs(fetch_dir, exist_ok=True)

    written = 0
    for stream in iter_efetch_batches(
        webenv, query_key, target_count, batch_size, request_delay_sec
    ):
        for rec in Medline.parse(stream):
            pmid = rec.get("PMID")
            if not pmid:
                continue
            text = _format_record(rec)
            if not text:
                continue
            # The file is named after the PMID; an existing file is overwritten.
            out_path = os.path.join(fetch_dir, f"{pmid}.txt")
            with open(out_path, "w", encoding="utf-8") as handle:
                handle.write(text)
            written += 1
    return written


def get_request_delay(api_key: str) -> float:
    """Return the pause (seconds) between requests: 0.10 with an API key, else 0.34."""
    if api_key:
        return 0.10
    return 0.34


def delete_fetch_dir(fetch_dir: str, confirm: bool = False) -> str:
    """Remove the fetch folder and its contents, but only when ``confirm`` is True.

    Returns a short status message saying whether the folder was deleted, was
    not there, or the cleanup was skipped.
    """
    if confirm:
        import shutil

        folder_exists = os.path.isdir(fetch_dir)
        if not folder_exists:
            return "Nothing to delete."
        shutil.rmtree(fetch_dir)
        return "Deleted fetched data folder."
    return "Cleanup skipped. Set confirm=True to delete local data."


def upload_fetch_dir(fetch_dir: str, bucket: str, raw_prefix: str = RAW_PREFIX) -> int:
    """Upload every ``.txt`` file in ``fetch_dir`` to ``bucket`` under ``raw_prefix``.

    Each file is stored under the key ``<raw_prefix>/<filename>``; entries with
    any other extension are ignored. Returns the number of files uploaded.
    """
    import boto3

    client = boto3.client("s3")
    txt_names = [name for name in os.listdir(fetch_dir) if name.endswith(".txt")]
    for name in txt_names:
        source_path = os.path.join(fetch_dir, name)
        client.upload_file(source_path, bucket, f"{raw_prefix}/{name}")
    return len(txt_names)


# Read credentials from the environment and hand them to Biopython.
EMAIL, API_KEY, _ = load_env()
configure_entrez(EMAIL, API_KEY)

# Settings for this run; edit here to try another query, record cap or folder.
QUERY, RETMAX, FETCH_DIR = DEFAULT_QUERY, DEFAULT_RETMAX, DEFAULT_FETCH_DIR

# The ESearch result is kept in `record`; Cell 2 fetches from it in batches.
record = run_esearch(QUERY, RETMAX)
summary = dict(
    count=int(record.get("Count", 0)),
    retmax=int(record.get("RetMax", 0)),
    returned_ids=len(record.get("IdList", [])),
    query_translation=record.get("QueryTranslation"),
    webenv=record.get("WebEnv"),
    query_key=record.get("QueryKey"),
)
# Trailing expression: shown as the cell output.
summary

{'count': 15669,
 'retmax': 500,
 'returned_ids': 500,
 'query_translation': '"Dementia"[MeSH Terms] AND ("decision support systems, clinical"[MeSH Terms] OR "Caregivers"[MeSH Terms] OR "caregiver*"[Title/Abstract] OR "decision support"[Title/Abstract])',
 'webenv': 'MCID_697c0fd33ad238690c0b587b',
 'query_key': '1'}

In [6]:
# Cell 2: Fetch the abstracts in batches and write one .txt per record to FETCH_DIR.
os.makedirs(FETCH_DIR, exist_ok=True)

# The pause between requests depends on whether an API key is configured.
request_delay = get_request_delay(API_KEY)
written = fetch_and_write_medline_records(
    record=record,
    retmax=RETMAX,
    fetch_dir=FETCH_DIR,
    batch_size=100,
    request_delay_sec=request_delay,
)
# Trailing expression: shown as the cell output.
f"Wrote {written} records to {FETCH_DIR}."

'Wrote 500 records to data/pubmed_fetch.'

In [7]:
# Cell 3: Cleanup — delete local data/pubmed_fetch/ folder.
# Safety switch: leave False to keep the fetched files, set to True to delete
# them. Cell 4 uploads from this folder, so delete only after uploading.
confirm_delete = False

# Returns a status message, shown as the cell output.
delete_fetch_dir(FETCH_DIR, confirm=confirm_delete)

'Cleanup skipped. Set confirm=True to delete local data.'

In [8]:
# Cell 4: Upload fetched .txt files to the S3 raw/ prefix (raises if S3_BUCKET is not set).
# reload=True re-reads .env so an S3_BUCKET added after Cell 1 ran is picked up.
_, _, S3_BUCKET = load_env(reload=True)

# Stop before touching S3 when no bucket is configured.
if not S3_BUCKET:
    raise ValueError("S3_BUCKET must be set in your environment or .env")

uploaded = upload_fetch_dir(FETCH_DIR, S3_BUCKET, raw_prefix=RAW_PREFIX)

# Trailing expression: shown as the cell output.
f"Uploaded {uploaded} files to s3://{S3_BUCKET}/{RAW_PREFIX}/"

'Uploaded 500 files to s3://pubmed-rag-data/raw/'