In [12]:
import requests
import json
from datetime import datetime
import os
from xml.etree import ElementTree as ET


In [2]:
import os
import time
import requests
import random
import xml.etree.ElementTree as ET
from tqdm import tqdm
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed

# === CONFIG ===
API_KEY = "5e6b250a2385792b115d07ec55db5ec44908"  # Replace with your actual key

RATE_LIMIT_DELAY = 0.12
MAX_RESULTS = 10000
PMID_FILE = "open_access_pmids.txt"
PUBMED_BATCH_SIZE = 200

MAX_WORKERS = 8  # Number of concurrent threads

def fetch_pubmed_summaries_batch(batch_pmids):
    id_str = ",".join(batch_pmids)
    summary_url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        "?db=pubmed"
        f"&id={id_str}"
        "&retmode=json"
    )
    if API_KEY:
        summary_url += f"&api_key={API_KEY}"
    summary_resp = requests.get(summary_url)
    summary_resp.raise_for_status()
    summaries = summary_resp.json()["result"]
    results = []
    for pmid in batch_pmids:
        item = summaries.get(pmid, {})
        doi = item.get("elocationid") or item.get("doi")
        pmcid = item.get("pmcid") if "pmcid" in item else None
        results.append({
            "doi": doi,
            "pmid": pmid,
            "pmcid": pmcid,
            "source": "PubMed"
        })
    return results

def fetch_pubmed_identifiers(keywords, start_date, end_date, max_results=MAX_RESULTS):
    query = " OR ".join([f'"{kw}"' for kw in keywords]) + " AND pubmed pmc open access[filter]"
    url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        "?db=pubmed"
        f"&term=({query})"
        f"&mindate={start_date}&maxdate={end_date}&datetype=pdat"
        f"&retmode=json&retmax={max_results}"
    )
    if API_KEY:
        url += f"&api_key={API_KEY}"
    print("Fetching PMIDs from:", url)
    response = requests.get(url)
    response.raise_for_status()
    pmids = response.json()["esearchresult"]["idlist"]

    # Concurrent batch fetch summaries
    results = []
    batches = [pmids[i:i+PUBMED_BATCH_SIZE] for i in range(0, len(pmids), PUBMED_BATCH_SIZE)]
    print(f"Fetching PubMed summaries in {len(batches)} batches (concurrent)...")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_batch = {executor.submit(fetch_pubmed_summaries_batch, batch): batch for batch in batches}
        for future in tqdm(as_completed(future_to_batch), total=len(batches), desc="PubMed batches"):
            try:
                batch_results = future.result()
                results.extend(batch_results)
            except Exception as e:
                print(f"Batch failed: {e}")
    return results

def fetch_europepmc_identifiers(keywords, start_date, end_date, max_results=MAX_RESULTS):
    query = " OR ".join([f'"{kw}"' for kw in keywords])
    page_size = 1000
    results = []
    print("Fetching from Europe PMC...")
    for start in tqdm(range(1, max_results+1, page_size), desc="EuropePMC batches"):
        url = (
            "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
            f"?query=({query})+AND+OPEN_ACCESS:Y"
            f"&resultType=core&format=json&pageSize={page_size}&cursorMark=*&sort_date:y"
        )
        r = requests.get(url)
        r.raise_for_status()
        batch = r.json().get("resultList", {}).get("result", [])
        for result in batch:
            results.append({
                "doi": result.get("doi"),
                "pmid": result.get("pmid"),
                "pmcid": result.get("pmcid"),
                "source": "EuropePMC"
            })
        if len(batch) < page_size:
            break
        time.sleep(0.1)
    return results

def fetch_crossref_identifiers(keywords, start_date, end_date, max_results=MAX_RESULTS):
    query = " OR ".join(keywords)
    rows = 1000
    results = []
    print("Fetching from CrossRef...")
    for offset in tqdm(range(0, max_results, rows), desc="CrossRef batches"):
        url = (
            "https://api.crossref.org/works"
            f"?query={query}"
            f"&filter=from-pub-date:{start_date},until-pub-date:{end_date},type:journal-article"
            f"&rows={rows}&offset={offset}"
        )
        r = requests.get(url)
        r.raise_for_status()
        batch = r.json().get("message", {}).get("items", [])
        for item in batch:
            results.append({
                "doi": item.get("DOI"),
                "pmid": None,
                "pmcid": None,
                "source": "CrossRef"
            })
        if len(batch) < rows:
            break
        time.sleep(0.1)
    return results

def merge_and_rank_identifiers(*sources):
    merged = {}
    for source in sources:
        for item in source:
            key = item.get("doi") or item.get("pmid") or item.get("pmcid")
            if not key:
                continue
            if key not in merged:
                merged[key] = item
            else:
                for k in ["doi", "pmid", "pmcid"]:
                    if not merged[key].get(k) and item.get(k):
                        merged[key][k] = item[k]
    ranked = sorted(
        merged.values(),
        key=lambda x: (x.get("doi") is not None, x.get("pmid") is not None, x.get("pmcid") is not None),
        reverse=True
    )
    return ranked

if __name__ == "__main__":
    keywords = [
        "eczema", "atopic dermatitis", "skin inflammation", "eczema treatment",
        "healthcare for eczema patients", "mental health and eczema",
        "eczema and mental health", "eczema and depression", "eczema and anxiety"
    ]
    start_date = "2020-01-01"
    end_date = "2024-12-31"

    start_time = time.time()
    pubmed_results = fetch_pubmed_identifiers(keywords, start_date, end_date)
    europepmc_results = fetch_europepmc_identifiers(keywords, start_date, end_date)
    crossref_results = fetch_crossref_identifiers(keywords, start_date, end_date)
    all_results = merge_and_rank_identifiers(pubmed_results, europepmc_results, crossref_results)
    elapsed = time.time() - start_time

    with open(PMID_FILE, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["doi", "pmid", "pmcid"])
        for item in all_results:
            writer.writerow([
                item.get("doi") or "none",
                item.get("pmid") or "none",
                item.get("pmcid") or "none"
            ])
    print(f"Identifiers saved to {PMID_FILE}")


Fetching PMIDs from: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=("eczema" OR "atopic dermatitis" OR "skin inflammation" OR "eczema treatment" OR "healthcare for eczema patients" OR "mental health and eczema" OR "eczema and mental health" OR "eczema and depression" OR "eczema and anxiety" AND pubmed pmc open access[filter])&mindate=2020-01-01&maxdate=2024-12-31&datetype=pdat&retmode=json&retmax=10000&api_key=5e6b250a2385792b115d07ec55db5ec44908
Fetching PubMed summaries in 36 batches (concurrent)...


PubMed batches: 100%|██████████| 36/36 [00:14<00:00,  2.52it/s]


Fetching from Europe PMC...


EuropePMC batches: 100%|██████████| 10/10 [01:11<00:00,  7.12s/it]


Fetching from CrossRef...


CrossRef batches: 100%|██████████| 10/10 [04:28<00:00, 26.82s/it]

Identifiers saved to open_access_pmids.txt





In [None]:
import os
import time
import random
import requests
from tqdm import tqdm
from xml.etree import ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed

PDF_SAVE_FOLDER = "pubmed_pdfs"
PDF_RATE_LIMIT_DELAY = 0.3  # seconds between requests
PDF_RETRY_LIMIT = 5
MAX_WORKERS = 8  # Adjust for your bandwidth, 5-10 is usually safe

def extract_pmc_id_from_xml(xml_path):
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        for article_id in root.iter("ArticleId"):
            if article_id.attrib.get("IdType") == "pmc":
                return article_id.text.replace("PMC", "")
    except Exception as e:
        print(f"❗ Error parsing {xml_path}: {e}")
    return None

def download_pmc_pdf(pmc_id, folder_path):
    pdf_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/pdf/"
    for attempt in range(PDF_RETRY_LIMIT):
        try:
            response = requests.get(pdf_url, allow_redirects=True, timeout=30)
            if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''):
                os.makedirs(folder_path, exist_ok=True)
                file_path = os.path.join(folder_path, f"PMC{pmc_id}.pdf")
                with open(file_path, "wb") as f:
                    f.write(response.content)
                return (pmc_id, True)
            elif response.status_code == 429:
                print(f"❗ Rate limited by NCBI. Waiting before retrying PMC{pmc_id}...")
                time.sleep(10 + random.uniform(0, 5))
            else:
                return (pmc_id, False)
        except requests.exceptions.RequestException as e:
            print(f"❗ PDF download error for PMC{pmc_id} (attempt {attempt+1}): {e}")
            time.sleep(2 ** attempt + random.uniform(0, 1))
    return (pmc_id, False)

def batch_download_open_access_pdfs(xml_folder, pdf_folder, limit=None, max_workers=MAX_WORKERS):
    xml_files = [os.path.join(xml_folder, f) for f in os.listdir(xml_folder) if f.endswith(".xml")]
    print(f"🔎 Scanning {len(xml_files)} XML files for open-access PDFs...")
    pmc_ids = []
    for xml_path in tqdm(xml_files, desc="Extracting PMC IDs"):
        pmc_id = extract_pmc_id_from_xml(xml_path)
        if pmc_id:
            pmc_ids.append(pmc_id)

    if limit is not None:
        pmc_ids = pmc_ids[:limit]
        print(f"⚡ Limiting download to first {limit} PMC IDs.")

    print(f"✅ Ready to download {len(pmc_ids)} PDFs concurrently with {max_workers} workers.")
    skipped = []
    downloaded = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_pmc = {executor.submit(download_pmc_pdf, pmc_id, pdf_folder): pmc_id for pmc_id in pmc_ids}
        for future in tqdm(as_completed(future_to_pmc), total=len(future_to_pmc), desc="Downloading PDFs"):
            pmc_id, success = future.result()
            if success:
                downloaded += 1
            else:
                skipped.append(pmc_id)
            time.sleep(PDF_RATE_LIMIT_DELAY)

    print(f"\nSummary:")
    print(f"Downloaded PDFs: {downloaded}")
    print(f"Skipped (no PDF found): {len(skipped)}")
    if skipped:
        with open("skipped_pmc_ids.txt", "w") as f:
            for pmc_id in skipped:
                f.write(f"{pmc_id}\n")
        print(f"Skipped PMC IDs saved to skipped_pmc_ids.txt")

# === USAGE ===
if __name__ == "__main__":
    # Only process files already in pubmed_articles
    # Set limit=50 for a test run, or remove for all
    batch_download_open_access_pdfs("pubmed_articles", "pubmed_pdfs", limit=10, max_workers=8)


🔎 Scanning 9951 XML files for open-access PDFs...


Extracting PMC IDs: 100%|██████████| 9951/9951 [00:03<00:00, 2990.37it/s]


⚡ Limiting download to first 10 PMC IDs.
✅ Ready to download 10 PDFs concurrently with 8 workers.


Downloading PDFs: 100%|██████████| 10/10 [00:04<00:00,  2.31it/s]


Summary:
Downloaded PDFs: 0
Skipped (no PDF found): 10
Skipped PMC IDs saved to skipped_pmc_ids.txt



