In [1]:
import requests
import json
import xml.etree.ElementTree as ET

# PubMed query string to run for each tumor class (class label -> query)
search_terms = dict(
    [
        ("Glioma", "Glioma MRI interpretation"),
        ("Meningioma", "Meningioma MRI interpretation"),
        ("Pituitary Tumor", "Pituitary tumor MRI interpretation"),
        ("No Tumor", "Normal brain MRI interpretation"),
    ]
)

# NCBI E-utilities ESearch endpoint; fetch_pmids queries it with db="pubmed"
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

# Function to fetch PubMed IDs (PMIDs) based on search terms
def fetch_pmids(search_term, retmax=100, timeout=30):
    """Return the PubMed IDs (PMIDs) that ESearch reports for ``search_term``.

    Args:
        search_term: Query string sent as the ESearch ``term`` parameter.
        retmax: Maximum number of IDs to request. Defaults to 100, the value
            that used to be hard-coded.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        A list with the text of every ``<Id>`` element in the response, in
        document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    params = {
        "db": "pubmed",
        "term": search_term,
        "retmax": str(retmax),  # Number of results to fetch
        "retmode": "xml"
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    # Surface HTTP failures as such instead of as an XML parse error on an error page.
    response.raise_for_status()
    root = ET.fromstring(response.content)
    return [id_elem.text for id_elem in root.findall(".//Id")]

# Function to fetch article details using PMIDs
def fetch_article_details(pmids, timeout=30):
    """Fetch summary metadata for the given PMIDs through ESummary.

    Args:
        pmids: Iterable of PMID strings.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One dict per ``<DocSum>`` in the response. A key is present only when
        the matching element is: ``pmid`` (from ``<Id>``), ``title``,
        ``journal`` (from ``Source``), ``publication_date`` (from ``PubDate``)
        and ``authors`` (list of author names). An empty ``pmids`` yields
        ``[]`` without contacting the server.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    pmids = list(pmids)
    if not pmids:
        # An ESummary call with an empty id list only produces an error document.
        return []

    fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(pmids),
        "retmode": "xml"
    }
    response = requests.get(fetch_url, params=params, timeout=timeout)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # ESummary item name -> key used in the article dict
    scalar_fields = {
        "Title": "title",
        "Source": "journal",
        "PubDate": "publication_date",
    }

    articles = []
    for docsum in root.findall(".//DocSum"):
        article = {}
        # Keep the PMID so a record can be traced back to its PubMed entry.
        pmid = docsum.findtext("Id")
        if pmid:
            article["pmid"] = pmid
        for item in docsum.findall("Item"):
            name = item.get("Name")
            if name in scalar_fields:
                article[scalar_fields[name]] = item.text
            elif name == "AuthorList":
                # Authors are nested <Item Name="Author"> elements; there is no
                # <Author> tag, so searching for one always came back empty.
                article["authors"] = [
                    author.text
                    for author in item.findall("Item")
                    if author.get("Name") == "Author"
                ]
        articles.append(article)
    return articles

# Main function to scrape and save data
def scrape_and_save_data(output_path="brain_tumor_mri_data.json", terms=None):
    """Collect article metadata for every tumor class and write it to JSON.

    For each ``(tumor_class, search_term)`` pair the PMIDs are looked up with
    ``fetch_pmids`` and expanded with ``fetch_article_details``; the result is
    a ``{tumor_class: [article, ...]}`` mapping dumped to ``output_path``.

    Args:
        output_path: Destination JSON file. Defaults to the file name that
            used to be hard-coded.
        terms: Mapping of class label to PubMed query. ``None`` (default)
            uses the module-level ``search_terms``.

    Returns:
        None. The data is written to ``output_path`` and progress is printed.
    """
    if terms is None:
        terms = search_terms

    all_data = {}
    for tumor_class, search_term in terms.items():
        print(f"Fetching data for {tumor_class}...")
        pmids = fetch_pmids(search_term)
        all_data[tumor_class] = fetch_article_details(pmids)

    # Save the collected data to a JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=4)

    print(f"Data scraping complete and saved to '{output_path}'.")

# Run the script
# The guard is true when this cell itself is executed (the recorded cell output
# shows the progress messages), so the scrape starts immediately.
if __name__ == "__main__":
    scrape_and_save_data()


Fetching data for Glioma...
Fetching data for Meningioma...
Fetching data for Pituitary Tumor...
Fetching data for No Tumor...
Data scraping complete and saved to 'brain_tumor_mri_data.json'.


In [3]:
import requests
import xml.etree.ElementTree as ET
import json
import time

# Read back the per-class article metadata from the existing JSON file
with open("brain_tumor_mri_data.json", encoding="utf-8") as f:
    data = json.loads(f.read())

def fetch_abstract_by_title(title, timeout=30):
    """Search PubMed for ``title`` and return the abstract of the first hit.

    Args:
        title: Article title, sent unchanged as the ESearch ``term``.
        timeout: Seconds to wait for each of the two HTTP requests.

    Returns:
        The text of all ``<AbstractText>`` sections of the first ``<Abstract>``
        element joined with single spaces, or ``""`` when the search has no
        hit or the record has no abstract.

    Raises:
        requests.HTTPError: If either request answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the EFetch body is not valid XML.
    """
    # Step 1: Use ESearch to get PMID
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    search_params = {
        "db": "pubmed",
        "term": title,
        "retmode": "json"
    }
    r = requests.get(search_url, params=search_params, timeout=timeout)
    r.raise_for_status()
    id_list = r.json().get("esearchresult", {}).get("idlist", [])
    if not id_list:
        return ""  # No match found
    # NOTE(review): the first hit of a free-text title search is taken as-is;
    # nothing here checks that it is the article the title came from.
    pmid = id_list[0]

    # Step 2: Use EFetch to get abstract
    fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    fetch_params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml"
    }
    r = requests.get(fetch_url, params=fetch_params, timeout=timeout)
    r.raise_for_status()
    root = ET.fromstring(r.content)
    abstract_elem = root.find(".//Abstract")
    if abstract_elem is None:
        return ""
    # itertext() also collects text inside and after inline markup (<i>, <sup>, ...);
    # .text alone stops at the first child element and truncates such sections.
    sections = ("".join(p.itertext()) for p in abstract_elem.findall("AbstractText"))
    return " ".join(section for section in sections if section)

# Add abstracts to each article
for class_name, articles in data.items():
    print(f"Processing class: {class_name}")
    for article in articles:
        title = article.get("title")
        if not title:
            # Nothing to search for; the article keeps whatever keys it already has.
            continue
        try:
            article["abstract"] = fetch_abstract_by_title(title)
        except Exception as e:
            # Best effort: report the failure and carry on with an empty abstract.
            print(f"Error fetching abstract for: {title}\n{e}")
            article["abstract"] = ""
        finally:
            # Pause after every lookup, failed ones included, so an error is never
            # followed straight away by the next request.
            time.sleep(0.3)  # avoid hitting PubMed too fast

# Save enriched JSON
with open("articles_with_abstracts.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print("Enriched JSON saved to 'articles_with_abstracts.json'")


Processing class: Glioma
Processing class: Meningioma
Processing class: Pituitary Tumor
Error fetching abstract for: Ectopic Thyrotropin-Secreting Tumor in the Nasopharynx Causing Central Hyperthyroidism.
("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
Processing class: No Tumor
Error fetching abstract for: Characterising ongoing brain aging and baseline effects from cross-sectional data.
500 Server Error: Internal Server Error for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=Characterising+ongoing+brain+aging+and+baseline+effects+from+cross-sectional+data.&retmode=json
Enriched JSON saved to 'articles_with_abstracts.json'


In [5]:
import json
import re

# Input: the enriched JSON (articles with abstracts); passed to
# build_class_summaries as the file to read.
INPUT_FILE = "articles_with_abstracts.json"
# Output: passed to build_class_summaries as the file to write.
OUTPUT_FILE = "tumor_class_summaries.json"

# Bracketed numeric citations: [1], [12] and grouped forms such as [2, 3] or [4-6].
# The leading \s* takes the space in front of the citation with it, so that
# "common [1]." becomes "common." rather than "common .".
_NUMERIC_CITATION_RE = re.compile(r"\s*\[\d+(?:\s*[,\u2013-]\s*\d+)*\]")

# Month names are excluded as "author" names so date ranges such as
# "(January 2015 to December 2019)" are kept.
_MONTH_NAMES = (
    r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|"
    r"Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?"
)

# Parenthesised author-year citations, e.g. (Smith et al., 2021) or
# (Smith and Jones, 2019; Lee 2020a): a capitalised name directly followed by
# a 19xx/20xx year. Requiring the name keeps numeric parentheticals such as
# (p = 0.0001) or (n = 1234), which a bare "any four digits" test would delete.
_AUTHOR_YEAR_CITATION_RE = re.compile(
    r"\s*\([^()]*\b(?!(?:" + _MONTH_NAMES + r")\b)"
    r"[A-Z][a-z][A-Za-z'\u2019-]*(?:\s+et\s+al\.?)?,?\s+"
    r"(?:19|20)\d{2}[a-z]?\b[^()]*\)"
)


def clean_text(text: str) -> str:
    """Clean PubMed abstracts: remove citations, normalize spaces.

    Args:
        text: Abstract text; ``None`` or an empty string is accepted.

    Returns:
        ``text`` without bracketed numeric citations and parenthesised
        author-year citations, with every whitespace run collapsed to one
        space and the ends stripped. Falsy input yields ``""``.

    Known limitation: a citation whose first author is named like a month,
    e.g. "(May et al., 2019)", is left in place.
    """
    if not text:
        return ""
    # remove numeric citations like [1], [12], [2, 3]
    text = _NUMERIC_CITATION_RE.sub("", text)
    # remove inline references with years e.g. (Smith et al., 2021)
    text = _AUTHOR_YEAR_CITATION_RE.sub("", text)
    # collapse whitespace
    return re.sub(r"\s+", " ", text).strip()

def build_class_summaries(input_path, output_path):
    """Merge the cleaned abstracts of each class into a single text and save it.

    Reads a ``{class_name: [article, ...]}`` JSON file from ``input_path``,
    runs every article's ``"abstract"`` through ``clean_text``, drops the ones
    that come back empty, and writes to ``output_path`` a JSON object mapping
    each class to ``{"num_articles": <kept abstracts>, "summary_text":
    <kept abstracts joined by spaces>}``. Prints a confirmation when done.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        articles_by_class = json.load(source)

    summaries = {}
    for class_name, articles in articles_by_class.items():
        # Clean every abstract (a missing one counts as "") and keep the non-empty results.
        cleaned_abstracts = [
            cleaned
            for cleaned in (clean_text(article.get("abstract", "")) for article in articles)
            if cleaned
        ]
        summaries[class_name] = {
            "num_articles": len(cleaned_abstracts),
            # all kept abstracts of the class as one long string
            "summary_text": " ".join(cleaned_abstracts),
        }

    with open(output_path, "w", encoding="utf-8") as target:
        json.dump(summaries, target, indent=2, ensure_ascii=False)

    print(f"✅ Summaries written to {output_path}")

# Build the per-class summaries from INPUT_FILE and write them to OUTPUT_FILE
# when this cell is executed.
if __name__ == "__main__":
    build_class_summaries(INPUT_FILE, OUTPUT_FILE)


✅ Summaries written to tumor_class_summaries.json
