Set your email and optional NCBI API key as environment variables before running.

To create an API key, sign in at https://account.ncbi.nlm.nih.gov/settings/ and generate an access key.

Example env var setup:
- `export NCBI_EMAIL="you@example.com"` (bash/zsh)
- `export NCBI_API_KEY="YOUR_KEY"` (bash/zsh)
- `export S3_BUCKET="your-bucket"` (bash/zsh)
- `export S3_PREFIX="source"` (bash/zsh)

Optional: create a local `.env` (gitignored) and load it with `python-dotenv`.

Reference: https://biopython.org/docs/latest/Tutorial/chapter_entrez.html#esearch-searching-the-entrez-databases

In [4]:
import os

from Bio import Entrez

try:
    from dotenv import load_dotenv

    load_dotenv()
except Exception:
    pass

EMAIL = os.getenv("NCBI_EMAIL")
API_KEY = os.getenv("NCBI_API_KEY", "")

if not EMAIL:
    raise ValueError("NCBI_EMAIL must be set in your environment or .env")

Entrez.email = EMAIL
if API_KEY:
    Entrez.api_key = API_KEY

# Using MeSH (Medical Subject Headings) for broader recall, plus tiab
# (Title/Abstract) terms to keep results focused and more current.
QUERY = (
    '("Dementia"[Mesh] OR "Mild Cognitive Impairment"[Mesh]) '
    'AND ("Decision Support Systems, Clinical"[Mesh] OR "Caregivers"[Mesh] '
    'OR caregiver*[tiab] OR "decision support"[tiab])'
)
RETMAX = 100

# Using usehistory to enable lazy fetches (WebEnv + QueryKey).
# Docs: https://biopython.org/docs/latest/Tutorial/chapter_entrez.html#esearch-searching-the-entrez-databases
try:
    stream = Entrez.esearch(db="pubmed", term=QUERY, retmax=RETMAX, usehistory="y")
    record = Entrez.read(stream)
    stream.close()
except Exception as exc:
    raise RuntimeError(f"PubMed search failed: {exc}")

summary = {
    "count": int(record.get("Count", 0)),
    "retmax": int(record.get("RetMax", 0)),
    "returned_ids": len(record.get("IdList", [])),
    "query_translation": record.get("QueryTranslation"),
    "webenv": record.get("WebEnv"),
    "query_key": record.get("QueryKey"),
}
summary


{'count': 15659,
 'retmax': 100,
 'returned_ids': 100,
 'query_translation': '"Dementia"[MeSH Terms] AND ("decision support systems, clinical"[MeSH Terms] OR "Caregivers"[MeSH Terms] OR "caregiver*"[Title/Abstract] OR "decision support"[Title/Abstract])',
 'webenv': 'MCID_69795d58f23aefa8550d45e3',
 'query_key': '1'}

In [3]:
# Lazy fetch: stream the first 100 records using the search history.
import os

from Bio import Medline

FETCH_DIR = "data/pubmed_fetch"
os.makedirs(FETCH_DIR, exist_ok=True)

webenv = record.get("WebEnv")
query_key = record.get("QueryKey")

batch_size = 100

def iter_fetch_batches():
    if not (webenv and query_key):
        return
    stream = Entrez.efetch(
        db="pubmed",
        rettype="medline",
        retmode="text",
        retstart=0,
        retmax=batch_size,
        webenv=webenv,
        query_key=query_key,
    )
    try:
        yield stream
    finally:
        stream.close()

def format_record(rec):
    parts = []
    if rec.get("PMID"):
        parts.append(f"PMID: {rec['PMID']}")
    if rec.get("TI"):
        parts.append(f"Title: {rec['TI']}")
    if rec.get("AU"):
        parts.append(f"Authors: {', '.join(rec['AU'])}")
    if rec.get("JT"):
        parts.append(f"Journal: {rec['JT']}")
    if rec.get("DP"):
        parts.append(f"Date: {rec['DP']}")
    if rec.get("AB"):
        parts.append(f"Abstract:\n{rec['AB']}")
    return "\n".join(parts).strip()

written = 0
for stream in iter_fetch_batches() or []:
    for rec in Medline.parse(stream):
        pmid = rec.get("PMID")
        if not pmid:
            continue
        text = format_record(rec)
        if not text:
            continue
        out_path = os.path.join(FETCH_DIR, f"{pmid}.txt")
        with open(out_path, "w", encoding="utf-8") as handle:
            handle.write(text)
        written += 1

f"Wrote {written} records to {FETCH_DIR}."

In [None]:
# Cleanup: delete fetched files.
import shutil

if os.path.isdir(FETCH_DIR):
    shutil.rmtree(FETCH_DIR)
    "Deleted fetched data folder."
else:
    "Nothing to delete."

In [None]:
# Upload fetched files to S3.
import os

import boto3

S3_BUCKET = os.getenv("S3_BUCKET")
S3_PREFIX = os.getenv("S3_PREFIX", "")

if not S3_BUCKET:
    raise ValueError("S3_BUCKET must be set in your environment or .env")

s3 = boto3.client("s3")

uploaded = 0
for filename in os.listdir(FETCH_DIR):
    if not filename.endswith(".txt"):
        continue
    local_path = os.path.join(FETCH_DIR, filename)
    key = f"{S3_PREFIX.strip('/')}/{filename}" if S3_PREFIX else filename
    s3.upload_file(local_path, S3_BUCKET, key)
    uploaded += 1

f"Uploaded {uploaded} files to s3://{S3_BUCKET}/{S3_PREFIX.strip('/') if S3_PREFIX else ''}"