# Mamoru PubMed Search + Fetch

## Notebook outline

| Cell | Purpose |
|------|---------|
| 1 | Load env (NCBI_EMAIL, NCBI_API_KEY), set MeSH + tiab query, run ESearch (get WebEnv/QueryKey), print summary |
| 2 | Fetch paper abstracts in batches, format, write .txt to data/pubmed_fetch/. |
| 3 | Clean up by deleting the local `data/pubmed_fetch/` folder (commented out by default; Cell 4 reads from this folder, so run cleanup only after uploading) |
| 4 | Upload fetched `.txt` files to S3 `raw/` prefix (requires S3_BUCKET) |

---

Set your email (`NCBI_EMAIL`, required) and, optionally, an NCBI API key (`NCBI_API_KEY`) as environment variables before running.

To create an API key, sign in at https://account.ncbi.nlm.nih.gov/settings/ and generate an access key.

Example env var setup:
- `export NCBI_EMAIL="you@example.com"`
- `export NCBI_API_KEY="YOUR_KEY"`
- `export S3_BUCKET="your-bucket"`

Optional: create a local `.env` (gitignored) which will be loaded with `python-dotenv`.

Reference: https://biopython.org/docs/latest/Tutorial/chapter_entrez.html#esearch-searching-the-entrez-databases

**Note about MEDLINE format.** PubMed returns records in **MEDLINE** format: a standard tagged text format (e.g. `PMID-`, `TI-`, `AB-`) that we parse into structured metadata and abstract text. In this notebook we pull that data out and write one `.txt` per record. Downstream (see the processing notebook), we process the paper metadata and **chunk and embed the abstract text** into vectors so it can be indexed in the RAG knowledge base for semantic search.

In [None]:
# Cell 1: Load env, define query, run ESearch, print summary.
import os
from Bio import Entrez

# python-dotenv is an optional dependency: when it is installed, load variables
# from a local .env file; when it is missing, use the process environment only.
# Only the missing-package case is tolerated, so other failures stay visible.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

# NCBI_EMAIL is required (we raise below when it is missing); NCBI_API_KEY is
# optional and defaults to an empty string.
EMAIL = os.getenv("NCBI_EMAIL")
API_KEY = os.getenv("NCBI_API_KEY", "")

if not EMAIL:
    raise ValueError("NCBI_EMAIL must be set in your environment or .env")

# Configure Biopython's Entrez module; the API key is set only when one was provided.
Entrez.email = EMAIL
if API_KEY:
    Entrez.api_key = API_KEY


def run_esearch(query: str, retmax: int) -> dict:
    """Run a PubMed search and return the parsed ESearch result.

    We use usehistory='y' so we can fetch results in batches later with EFetch
    (WebEnv and QueryKey). Relies on Entrez.email and Entrez.api_key from this notebook.

    Args:
        query: PubMed search term passed to ESearch.
        retmax: Maximum number of IDs ESearch should return.

    Returns:
        The record produced by Entrez.read for the ESearch response.

    Raises:
        RuntimeError: If the request, the parsing, or closing the response
            fails; the original exception is chained as the cause.
    """
    try:
        stream = Entrez.esearch(db="pubmed", term=query, retmax=retmax, usehistory="y")
        try:
            return Entrez.read(stream)
        finally:
            # Close the response even when parsing raises, so the handle never leaks.
            stream.close()
    except Exception as exc:
        raise RuntimeError(f"PubMed search failed: {exc}") from exc


# PubMed query: records under a dementia / mild-cognitive-impairment MeSH heading
# that also match a caregiving or decision-support term (MeSH heading or
# title/abstract text). run_esearch uses usehistory so EFetch can page later.
QUERY = (
    '("Dementia"[Mesh] OR "Mild Cognitive Impairment"[Mesh]) '
    'AND ("Decision Support Systems, Clinical"[Mesh] OR "Caregivers"[Mesh] '
    'OR caregiver*[tiab] OR "decision support"[tiab])'
)
# Maximum number of IDs requested from ESearch.
RETMAX = 500

record = run_esearch(QUERY, RETMAX)

# Condensed view of the ESearch response. As the last expression of the cell,
# `summary` is what the notebook displays.
summary = dict(
    count=int(record.get("Count", 0)),
    retmax=int(record.get("RetMax", 0)),
    returned_ids=len(record.get("IdList", [])),
    query_translation=record.get("QueryTranslation"),
    webenv=record.get("WebEnv"),
    query_key=record.get("QueryKey"),
)
summary


In [None]:
# Cell 2: Fetch paper abstracts in batches, format, write .txt to data/pubmed_fetch/.
import os
import time
from typing import Any, Iterator

from Bio import Entrez, Medline

# Local output folder for the per-record .txt files. It is created up front
# (no error if it already exists) so later writes find the directory in place.
FETCH_DIR = "data/pubmed_fetch"
os.makedirs(FETCH_DIR, exist_ok=True)


def _format_record(rec: dict[str, Any]) -> str:
    """Format a parsed MEDLINE record into a single text block for our .txt files.

    Pulls out PMID, title, authors, journal, date, and abstract when they're there.
    """
    parts = []
    if rec.get("PMID"):
        parts.append(f"PMID: {rec['PMID']}")
    if rec.get("TI"):
        parts.append(f"Title: {rec['TI']}")
    if rec.get("AU"):
        parts.append(f"Authors: {', '.join(rec['AU'])}")
    if rec.get("JT"):
        parts.append(f"Journal: {rec['JT']}")
    if rec.get("DP"):
        parts.append(f"Date: {rec['DP']}")
    if rec.get("AB"):
        parts.append(f"Abstract:\n{rec['AB']}")
    return "\n".join(parts).strip()


def iter_efetch_batches(
    webenv: str | None,
    query_key: str | None,
    target_count: int,
    batch_size: int = 100,
    request_delay_sec: float = 0.34,
) -> Iterator[Any]:
    """Yield EFetch response streams for each batch (caller parses and closes each stream).

    Args:
        webenv: ESearch WebEnv (required with query_key).
        query_key: ESearch QueryKey (required with webenv).
        target_count: Total records to fetch across batches.
        batch_size: Records per EFetch request.
        request_delay_sec: Sleep between batches for rate limiting.

    Yields:
        Open file-like stream from Entrez.efetch (rettype='medline'); caller must consume
        then stream is closed before the next yield.
    """
    if not (webenv and query_key):
        return
    for start in range(0, target_count, batch_size):
        stream = Entrez.efetch(
            db="pubmed",
            rettype="medline",
            retmode="text",
            retstart=start,
            retmax=min(batch_size, target_count - start),
            webenv=webenv,
            query_key=query_key,
        )
        try:
            yield stream
        finally:
            stream.close()
        if start + batch_size < target_count:
            time.sleep(request_delay_sec)


def fetch_and_write_medline_records(
    record: dict[str, Any],
    retmax: int,
    fetch_dir: str,
    batch_size: int = 100,
    request_delay_sec: float = 0.34,
) -> int:
    """Fetch MEDLINE in batches from PubMed, format each record, and write one .txt per PMID.

    Expects an ESearch record (WebEnv, QueryKey, Count). We cap at retmax records,
    sleep between batches to respect rate limits, and return how many files we wrote.

    Args:
        record: ESearch result; its WebEnv, QueryKey and Count entries are read.
        retmax: Upper bound on the number of records to fetch.
        fetch_dir: Folder that receives the `<PMID>.txt` files; created if missing.
        batch_size: Records per EFetch request.
        request_delay_sec: Sleep between batches.

    Returns:
        Number of files written. Records without a PMID, or that format to an
        empty string, are skipped and not counted.
    """
    webenv = record.get("WebEnv")
    query_key = record.get("QueryKey")
    total_count = int(record.get("Count", 0))
    target_count = min(retmax, total_count)

    # Do not depend on the caller having created the folder beforehand: the
    # function accepts any fetch_dir, so make sure it exists before writing.
    os.makedirs(fetch_dir, exist_ok=True)

    written = 0
    for stream in iter_efetch_batches(
        webenv, query_key, target_count, batch_size, request_delay_sec
    ):
        # Parse the whole batch here; the generator closes the stream when resumed.
        for rec in Medline.parse(stream):
            pmid = rec.get("PMID")
            if not pmid:
                continue
            text = _format_record(rec)
            if not text:
                continue
            out_path = os.path.join(fetch_dir, f"{pmid}.txt")
            with open(out_path, "w", encoding="utf-8") as handle:
                handle.write(text)
            written += 1
    return written


# Pause between EFetch batches. NCBI's E-utilities usage guidelines allow a higher
# request rate with an API key than without one, and the key is optional in this
# notebook, so use the short delay only when a key is configured on Entrez.
request_delay = 0.10 if getattr(Entrez, "api_key", None) else 0.34
written = fetch_and_write_medline_records(
    record, RETMAX, FETCH_DIR, batch_size=100, request_delay_sec=request_delay
)
f"Wrote {written} records to {FETCH_DIR}."

In [None]:
# # Cell 3: Cleanup: delete local data/pubmed_fetch/ folder.  Commented out to avoid accidentally running.
# import shutil

# if os.path.isdir(FETCH_DIR):
#     shutil.rmtree(FETCH_DIR)
#     "Deleted fetched data folder."
# else:
#     "Nothing to delete."

In [None]:
# Cell 4: Upload fetched .txt files to S3 raw/ prefix if S3_BUCKET is set.
import os
import boto3

# Destination bucket comes from the environment; objects are stored under raw/.
S3_BUCKET = os.getenv("S3_BUCKET")
RAW_PREFIX = "raw"

if not S3_BUCKET:
    raise ValueError("S3_BUCKET must be set in your environment or .env")

s3 = boto3.client("s3")

# Upload every .txt file found directly in FETCH_DIR, keyed as raw/<filename>.
txt_names = [name for name in os.listdir(FETCH_DIR) if name.endswith(".txt")]
uploaded = 0
for name in txt_names:
    s3.upload_file(os.path.join(FETCH_DIR, name), S3_BUCKET, f"{RAW_PREFIX}/{name}")
    uploaded += 1

f"Uploaded {uploaded} files to s3://{S3_BUCKET}/{RAW_PREFIX}/"