# PubMed Processing + Quick Relevance Check

This notebook was created with the help of Cursor to review and process the fetched PubMed data. It reads the fetched `.txt` records from `data/pubmed_fetch/`, normalizes them into a simple structure, and runs a lightweight relevance check to see whether the search filter is producing useful articles. The final cells export the records as JSONL, upload the file to S3, and start a Bedrock knowledge base ingestion job.

It is intentionally simple and fast to run locally before investing time in chunking/embedding.

In [None]:
import glob
import os
import re

# python-dotenv is an optional convenience: later cells read their settings
# with os.getenv and tell the user to set them "in your environment or .env".
try:
    from dotenv import load_dotenv
except ImportError:
    # Package not installed: rely on variables already present in the
    # process environment.
    pass
else:
    # Only the missing-package case is tolerated. A failure while reading the
    # .env file is allowed to surface here instead of showing up later as a
    # misleading "variable not set" error.
    load_dotenv()

# Directory holding the fetched records; each *.txt file is parsed as one
# record further down.
FETCH_DIR = "data/pubmed_fetch"

# Gather the record files and sort them so the processing order is stable.
paths = glob.glob(os.path.join(FETCH_DIR, "*.txt"))
paths.sort()

# Stop right away when there is nothing to process.
if len(paths) == 0:
    raise FileNotFoundError(
        f"No records found in {FETCH_DIR}. Run the search notebook first."
    )


def parse_record(text):
    """Parse one fetched PubMed record from its plain-text form.

    The text is scanned line by line. A line starting with ``PMID: ``,
    ``Title: ``, ``Authors: ``, ``Journal: `` or ``Date: `` sets the matching
    field. A line starting with ``Abstract:`` opens the abstract, and every
    later line that is not one of the header fields above is appended to it.

    Only the leading label is removed from a line, so a value that happens to
    contain its own label again (for example a title with ``Title: `` in the
    middle, or an abstract beginning ``Graphical Abstract:``) is kept intact.

    Args:
        text: Full contents of one record file.

    Returns:
        A dict with the keys ``pmid``, ``title``, ``authors``, ``journal``,
        ``date`` and ``abstract``. ``pmid`` is ``None`` and the other fields
        are empty strings when the corresponding line is missing.
    """
    # (line prefix, record key) pairs, checked in this order for every line.
    header_fields = (
        ("PMID: ", "pmid"),
        ("Title: ", "title"),
        ("Authors: ", "authors"),
        ("Journal: ", "journal"),
        ("Date: ", "date"),
    )
    abstract_label = "Abstract:"

    record = {
        "pmid": None,
        "title": "",
        "authors": "",
        "journal": "",
        "date": "",
        "abstract": "",
    }
    abstract_lines = []
    in_abstract = False

    for line in text.splitlines():
        for label, key in header_fields:
            if line.startswith(label):
                # Slice off the prefix only; str.replace would also delete
                # any later occurrence of the label inside the value.
                record[key] = line[len(label):].strip()
                break
        else:
            # Not a header field: either the start of the abstract or a
            # continuation line of an abstract that is already open.
            if line.startswith(abstract_label):
                in_abstract = True
                abstract_lines.append(line[len(abstract_label):].lstrip())
            elif in_abstract:
                abstract_lines.append(line)

    # Empty lines are dropped; the remaining lines keep their line breaks.
    record["abstract"] = "\n".join(part for part in abstract_lines if part).strip()
    return record


# Read every fetched file as UTF-8 text and turn it into a record dict.
records = []
for path in paths:
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    records.append(parse_record(raw_text))

# Cell output: number of parsed records.
len(records)

In [None]:
from collections import Counter


def normalize(text):
    """Return *text* lower-cased with whitespace runs collapsed to one space.

    ``None`` and other falsy values are treated as an empty string; leading
    and trailing whitespace is removed.
    """
    collapsed = re.sub(r"\s+", " ", text or "")
    trimmed = collapsed.strip()
    return trimmed.lower()


# Lower-case substrings; a record counts as relevant when at least one of
# them occurs in its normalized title + abstract.
signal_terms = [
    "caregiver",
    "caregiving",
    "decision support",
    "clinical decision support",
    "cdss",
    "dementia",
    "alzheimer",
    "mild cognitive impairment",
]


def has_signal(rec):
    """Return True when the record's title or abstract contains a signal term.

    The title and abstract are joined with a space and passed through
    ``normalize`` (lower-casing, whitespace collapsing) before the plain
    substring search, so matching is case-insensitive.
    """
    title = rec.get("title", "")
    abstract = rec.get("abstract", "")
    searchable = normalize(f"{title} {abstract}")
    for term in signal_terms:
        if term in searchable:
            return True
    return False


# Denominator for all ratios; clamped to 1 so an empty record list gives 0
# instead of a ZeroDivisionError.
total_records = len(records)
safe_total = max(total_records, 1)

# Records that have a non-empty abstract, and records matching a signal term.
with_abstract = len([rec for rec in records if rec.get("abstract")])
signal_hits = list(filter(has_signal, records))

# Mean abstract length in characters, taken over all records (records without
# an abstract contribute 0).
abstract_lengths = [len(rec.get("abstract", "")) for rec in records]
avg_abstract_len = sum(abstract_lengths) / safe_total

# Frequency of each journal name.
journal_names = [rec.get("journal", "").strip() for rec in records]
journal_counts = Counter(journal_names)

summary = {
    "total_records": total_records,
    "with_abstract": with_abstract,
    "with_abstract_pct": round(with_abstract / safe_total * 100, 1),
    "signal_match_pct": round(len(signal_hits) / safe_total * 100, 1),
    "avg_abstract_len_chars": int(avg_abstract_len),
    "top_journals": journal_counts.most_common(5),
}

# Cell output.
summary

In [None]:
# Spot-check of the filter: collect the titles of records in which none of the
# signal terms was found, then show the first ten.
no_signal_titles = []
for rec in records:
    if has_signal(rec):
        continue
    no_signal_titles.append(rec.get("title"))

no_signal_titles[:10]

In [None]:
import json
from datetime import datetime


def normalize_whitespace(text):
    """Collapse every whitespace run in *text* to one space and trim the ends.

    Case is left untouched. ``None`` and other falsy values yield ``""``.
    """
    collapsed = re.sub(r"\s+", " ", text or "")
    return collapsed.strip()


def normalize_date(value):
    """Best-effort conversion of a date string to ``YYYY-MM-DD``.

    Recognised inputs (examples observed: "2026 Jan 7", "2025 Dec", "2025"):

    * year, abbreviated month, day -> that exact date
    * year, abbreviated month      -> first day of that month
    * four-digit year              -> January 1st of that year

    Anything else is returned unchanged apart from stripped surrounding
    whitespace. ``None`` and blank input give ``""``.
    """
    cleaned = (value or "").strip()
    if not cleaned:
        return ""

    # (strptime pattern, output pattern) pairs, tried from most to least
    # specific.
    attempts = (
        ("%Y %b %d", "%Y-%m-%d"),
        ("%Y %b", "%Y-%m-01"),
    )
    for parse_format, output_format in attempts:
        try:
            parsed = datetime.strptime(cleaned, parse_format)
        except ValueError:
            continue
        return parsed.strftime(output_format)

    if re.fullmatch(r"\d{4}", cleaned):
        return f"{cleaned}-01-01"

    # Unknown layout: hand the stripped original back.
    return cleaned

In [None]:
# Export the parsed records as JSON Lines: one {"id", "text", "metadata"}
# document per line, written to data/pubmed_records_<YYYYMMDD>.jsonl.
from datetime import timezone

OUTPUT_DIR = "data"
# Stamp the file with the current UTC date. datetime.utcnow() is deprecated
# since Python 3.12; the timezone-aware call below produces the same stamp.
RUN_DATE = datetime.now(timezone.utc).strftime("%Y%m%d")
OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"pubmed_records_{RUN_DATE}.jsonl")

os.makedirs(OUTPUT_DIR, exist_ok=True)

record_docs = []
for rec in records:
    title = normalize_whitespace(rec.get("title", ""))
    abstract = normalize_whitespace(rec.get("abstract", ""))
    # Title and abstract on separate lines; an empty part is left out so the
    # text never starts or ends with a stray newline.
    full_text = "\n".join([t for t in [title, abstract] if t])
    record_docs.append(
        {
            "id": rec.get("pmid"),
            "text": full_text,
            "metadata": {
                "pmid": rec.get("pmid"),
                "title": title,
                "journal": rec.get("journal"),
                "authors": rec.get("authors"),
                "date": normalize_date(rec.get("date")),
                "source": "pubmed_fetch",
            },
        }
    )

# ensure_ascii=True escapes non-ASCII characters, so the file is pure ASCII.
with open(OUTPUT_PATH, "w", encoding="utf-8") as handle:
    for doc in record_docs:
        handle.write(json.dumps(doc, ensure_ascii=True) + "\n")

# Cell output: where the file was written and how many documents it holds.
OUTPUT_PATH, len(record_docs)

In [None]:
# Upload the JSONL export to the processed prefix of the S3 bucket.
import subprocess

S3_BUCKET = os.getenv("S3_BUCKET", "")
S3_PREFIX = os.getenv("S3_PREFIX", "processed/")

if not S3_BUCKET:
    raise ValueError("Set S3_BUCKET in your environment or .env before upload.")

# `aws s3 cp <file> s3://bucket/name` (no trailing slash) writes an object
# literally called "name" instead of placing the file under that prefix.
# Normalise the configured prefix so the destination always ends with "/",
# whether S3_PREFIX was given as "processed", "processed/" or "/processed/".
# An empty prefix targets the bucket root.
_s3_prefix = S3_PREFIX.strip("/")
s3_uri = f"s3://{S3_BUCKET}/{_s3_prefix}/" if _s3_prefix else f"s3://{S3_BUCKET}/"

# Argument list (no shell); check=True raises CalledProcessError on failure.
subprocess.run(
    [
        "aws",
        "s3",
        "cp",
        OUTPUT_PATH,
        s3_uri,
    ],
    check=True,
)

# Cell output: destination prefix the file was copied to.
s3_uri

In [None]:
# Kick off Bedrock KB ingestion
import json
import subprocess
import sys

KB_ID = os.getenv("BEDROCK_KB_ID", "")
DATA_SOURCE_ID = os.getenv("BEDROCK_KB_DATA_SOURCE_ID", "")

if not KB_ID or not DATA_SOURCE_ID:
    raise ValueError("Set BEDROCK_KB_ID and BEDROCK_KB_DATA_SOURCE_ID in your env or .env")

try:
    result = subprocess.run(
        [
            "aws",
            "bedrock-agent",
            "start-ingestion-job",
            "--knowledge-base-id",
            KB_ID,
            "--data-source-id",
            DATA_SOURCE_ID,
            # The response is parsed with json.loads below, so do not depend
            # on the user's configured default output format (text/table/yaml).
            "--output",
            "json",
        ],
        check=True,
        capture_output=True,
        text=True,
    )
except subprocess.CalledProcessError as err:
    # capture_output=True swallows the CLI's own error message; echo it so the
    # cause is visible, then let the original exception propagate.
    if err.stderr:
        print(err.stderr, file=sys.stderr)
    raise

payload = json.loads(result.stdout)
job_id = payload["ingestionJob"]["ingestionJobId"]

# Cell output: identifiers needed to look up the ingestion job later.
{
    "knowledgeBaseId": KB_ID,
    "dataSourceId": DATA_SOURCE_ID,
    "ingestionJobId": job_id,
}