In [47]:
from Bio import Entrez
from Bio.Medline import parse
from io import StringIO
import pandas as pd

def fetch_pubmed_data(search_term, email, retmax=50):
    """
    Fetches data from PubMed related to a specific search term.
    Parameters:
        search_term (str): The term to search for in the PubMed database.
        email (str): The email address to be used for accessing PubMed's API.
        retmax (int, optional): The maximum number of results to retrieve. Defaults to 100.
    Returns:
        pandas.DataFrame: A DataFrame containing the details of the PubMed entries, including
                          PMID, PMC ID, Title, Authors, Abstract, Publication Date, Journal, Volume,
                          Issue, Pages, Affiliation, Article ID, E-Publication Date, Place of
                          Publication, Journal Abbreviation, Language, Publication Type, and MeSH Terms.
    """

    Entrez.email = email

    # Step 1: Search for articles in PubMed
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=retmax)
    record = Entrez.read(handle)
    handle.close()

    idlist = record["IdList"]
    
    # Step 2: Fetch the PubMed records
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = handle.read()
    handle.close()

    records = parse(StringIO(records))

    columns = ["PMID", "PMC ID", "Title", "Authors", "Abstract", "Publication Date", "Journal", "Volume", "Issue", "Pages", "Affiliation", "Article ID", "E-Publication Date", "Place of Publication", "Journal Abbreviation", "Language", "Publication Type", "MeSH Terms"]
    df = pd.DataFrame(columns=columns)

    # Step 3: Convert each PMID to PMC ID
    for record in records:
        pmid = record.get("PMID", "N/A")
        
        # Step 4: Fetch PMC ID for the PMID
        pmc_id = get_pmc_id(pmid)
        
        new_row = {
            "PMID": pmid,
            "PMC ID": pmc_id,
            "Title": record.get("TI", "N/A"),
            "Authors": ", ".join(record.get("AU", ["N/A"])),
            "Abstract": record.get("AB", "N/A"),
            "Publication Date": record.get("DP", "N/A"),
            "Journal": record.get("JT", "N/A"),
            "Volume": record.get("VI", "N/A"),
            "Issue": record.get("IP", "N/A"),
            "Pages": record.get("PG", "N/A"),
            "Affiliation": record.get("AD", "N/A"),
            "Article ID": ", ".join(record.get("AID", ["N/A"])),
            "E-Publication Date": record.get("DEP", "N/A"),
            "Place of Publication": record.get("PL", "N/A"),
            "Journal Abbreviation": record.get("TA", "N/A"),
            "Language": ", ".join(record.get("LA", ["N/A"])),
            "Publication Type": ", ".join(record.get("PT", ["N/A"])),
            "MeSH Terms": ", ".join(record.get("MH", ["N/A"])),
        }
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    return df

def get_pmc_id(pmid):
    """
    Converts a PubMed ID (PMID) to a PubMed Central ID (PMC ID) using Entrez.
    Parameters:
        pmid (str): The PubMed ID to convert.
    Returns:
        str: The corresponding PMC ID or "N/A" if not found.
    """
    handle = Entrez.elink(dbfrom="pubmed", id=pmid, db="pmc")
    link_record = Entrez.read(handle)
    handle.close()

    # Extract PMC ID from the link record
    if link_record and "LinkSetDb" in link_record[0]:
        for link_set in link_record[0]["LinkSetDb"]:
            if link_set["DbTo"] == "pmc":
                return link_set["Link"][0]["Id"]  # Return the first linked PMC ID

    return "N/A"

# Example usage
search_term = "Gut microbiome"
email = "your-email@example.com"
df = fetch_pubmed_data(search_term, email)

In [48]:
df

Unnamed: 0,PMID,PMC ID,Title,Authors,Abstract,Publication Date,Journal,Volume,Issue,Pages,Affiliation,Article ID,E-Publication Date,Place of Publication,Journal Abbreviation,Language,Publication Type,MeSH Terms
0,39324788,,Gill-associated ammonia oxidizers are widespre...,"Mes W, Lucker S, Jetten MS, Siepel H, Gorissen...",Recent advances in sequencing methods have gre...,2024 Sep 26,Microbiology spectrum,,,e0029524,"[Department of Microbiology, Radboud Institute...",10.1128/spectrum.00295-24 [doi],20240926.0,United States,Microbiol Spectr,eng,Journal Article,
1,39324491,,FMT rescues mice from DSS-induced colitis in a...,"Pu D, Yao Y, Zhou C, Liu R, Wang Z, Liu Y, Wan...",Fecal microbiota transplantation (FMT) is curr...,2024 Jan-Dec,Gut microbes,16.0,1,2397879,"[Department of Gastroenterology, The Second Af...",10.1080/19490976.2024.2397879 [doi],20240926.0,United States,Gut Microbes,eng,Journal Article,"Animals, *Colitis/therapy/chemically induced/i..."
2,39324257,,Seasonal variation in the stomach microbiota o...,"Yew WC, Adlard S, Dunn MJ, Alias SA, Pearce DA...",The gut microbiomes of Antarctic penguins are ...,2024 Sep,"Microbiology (Reading, England)",170.0,9,,"[Department of Applied Sciences, Faculty of He...",10.1099/mic.0.001503 [doi],,England,Microbiology (Reading),eng,Journal Article,"Animals, *Spheniscidae/microbiology, *Seasons,..."
3,39324223,,Identification of Key Genes in Fetal Gut Devel...,"Ma Q, Meng M, Zhou X, Guo W, Feng K, Huang T, ...",The study of fetal gut development is critical...,2024 Sep 26,Proteomics,,,e202400104,"[School of Life Sciences, Shanghai University,...",10.1002/pmic.202400104 [doi],20240926.0,Germany,Proteomics,eng,Journal Article,
4,39324143,,Role of cellular effectors in the induction an...,"Carreto-Binaghi LE, Sztein MB, Booth JS",The mucosal immune system is a critical first ...,2024,Frontiers in immunology,15.0,,1446072,[Center for Vaccine Development and Global Hea...,10.3389/fimmu.2024.1446072 [doi],20240911.0,Switzerland,Front Immunol,eng,"Journal Article, Review","Humans, Animals, *Immunoglobulin A/immunology,..."
5,39323893,,"Isolation, genomic analysis and functional cha...","Kim Y, Lee JH, Ha J, Cho EG",Probiotics and their derivatives offer signifi...,2024,Frontiers in microbiology,15.0,,1452127,"[Consumer Health 2 Center, CHA Advanced Resear...",10.3389/fmicb.2024.1452127 [doi],20240909.0,Switzerland,Front Microbiol,eng,Journal Article,
6,39323880,,Effect of intestinal microbiota transplantatio...,"Deng L, Guo X, Chen J, Li B, Liu N, Xia J, Ou ...",BACKGROUND: Research on the effects of intesti...,2024,Frontiers in microbiology,15.0,,1458754,"[Department of Infectious Diseases, the Fifth ...",10.3389/fmicb.2024.1458754 [doi],20240911.0,Switzerland,Front Microbiol,eng,Journal Article,
7,39323879,,"Causality investigation among gut microbiota, ...","Yue SY, Li WY, Xu S, Bai XX, Xu WL, Wang X, Di...",BACKGROUND: The gut microbiota has been demons...,2024,Frontiers in microbiology,15.0,,1445304,"[Department of Urology, The First Affiliated H...",10.3389/fmicb.2024.1445304 [doi],20240911.0,Switzerland,Front Microbiol,eng,Journal Article,
8,39323620,,Bioinformatics Approaches in the Development o...,"Ahlawat V, Sura K, Singh B, Dangi M, Chhillar AK",Fungal infections are considered a great threa...,2024,Current genomics,25.0,5,323-333,"[Centre for Biotechnology, M.D. University, Ro...","CG-25-323 [pii], 10.2174/011389202928160224042...",20240516.0,United Arab Emirates,Curr Genomics,eng,"Journal Article, Review",
9,39323363,,Probiotic therapy as a promising strategy for ...,"de Albuquerque Lemos DE, de Brito Alves JL, de...",INTRODUCTION: Gestational diabetes mellitus (G...,2024 Sep 26,Expert opinion on biological therapy,,,,"[Department of Nutrition, Health Sciences Cent...",10.1080/14712598.2024.2409880 [doi],20240926.0,England,Expert Opin Biol Ther,eng,"Journal Article, Review",


MONGO

In [178]:
from pymongo import MongoClient  # Import MongoDB client

def save_to_mongodb(articles):
    """Insert multiple articles into MongoDB."""
    try:
        client = MongoClient("mongodb+srv://261201brian:BrianLovesMongoDBAtlas.@knowledge.c44yv.mongodb.net/?retryWrites=true&w=majority&appName=Knowledge")
        client.admin.command('ping')
        print("Connection successful!")
    except Exception as e:
        print(f"Connection failed: {e}")
        return

    db = client['research_articles']  # Use 'research_articles' database
    # collection = db['trial_gut_microbiome']  # Use 'gut_microbiome' collection
    collection = db['machine learning']  # Use 'gut_microbiome' collection

    # Insert multiple documents into the collection
    result = collection.insert_many(articles)
    print(f"Inserted {len(result.inserted_ids)} documents with ids: {result.inserted_ids}")

In [181]:
from Bio import Entrez
import re
import json
import xml.etree.ElementTree as ET
import unicodedata

def extract_full_text(element):
    # Collect all text from the element and its children
    return ' '.join(element.itertext()).strip()

def clean_text(text):
    # Escape quotes and newlines
    text = text.replace('"', '\\"')  # Escape double quotes
    text = text.replace('\n', ' ')    # Replace newlines with space

    # Normalize Unicode characters (e.g., \u00f6 to ö, \u03bc to µ, \u00b0 to °, \u2212 to −)
    text = unicodedata.normalize("NFKD", text)

    # Remove in-text citations like (Author et al., Year) but keep scientific measurements
    text = re.sub(r'\(\s*[A-Za-z\s,;&-]+(?:et al\.)?,?\s*\d{4}[^\)]*\)', '', text)

    # Remove extra whitespace caused by citation removal
    text = re.sub(r'\s+', ' ', text)

    # Strip leading/trailing spaces
    text = text.strip()

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space and trim

    return text

def fetch_pmc_full_text(search_term, email, retmax=10):
    Entrez.email = email

    # Step 1: Search for articles in PMC
    handle = Entrez.esearch(db="pmc", term=search_term, retmax=retmax)
    record = Entrez.read(handle)
    handle.close()

    idlist = record["IdList"]

    # Step 2: Fetch the PMC records in full text format
    articles = []
    if idlist:  # Check if there are any IDs returned
        for pmc_id in idlist:
            handle = Entrez.efetch(db="pmc", id=pmc_id, rettype="xml", retmode="xml")
            # handle = Entrez.efetch(db="pmc", id=pmc_id, rettype="fulltext", retmode="xml")
            records = handle.read()  # Directly read the string response
            handle.close()

            # Parse the full text response
            parsed_data = parse_full_text(records)
            filled_fields_count = count_filled_fields(parsed_data)
            articles.append((filled_fields_count, parsed_data))
        
        # Sort articles by the number of filled fields in descending order
        articles.sort(key=lambda x: x[0], reverse=True)
        return [article[1] for article in articles]  # Return only the article data
    else:
        return [{"error": "No articles found."}]

def parse_full_text(records):
    # Use XML parser to handle the response
    root = ET.fromstring(records)

    # Extract article metadata
    pmc_id = clean_text(root.findtext('.//article-id[@pub-id-type="pmc"]', default='N/A'))
    doi = clean_text(root.findtext('.//article-id[@pub-id-type="doi"]', default='N/A'))

    # Extract article title including italicized parts
    title_element = root.find('.//article-title')
    title = ''
    if title_element is not None:
        title = clean_text(''.join(title_element.itertext()).strip())

    # Extract authors
    authors = [
        f"{clean_text(author.findtext('name/surname', default=''))} {clean_text(author.findtext('name/given-names', default=''))}".strip()
        for author in root.findall('.//contrib') if author.find('name/surname') is not None
    ]
    authors_list = ', '.join(authors)

    # Extract publication date
    pub_date = clean_text(root.findtext('.//pub-date/year', default='N/A')) + "-" + \
               clean_text(root.findtext('.//pub-date/month', default='N/A')).zfill(2) + "-" + \
               clean_text(root.findtext('.//pub-date/day', default='N/A')).zfill(2)

    # Extract abstract
    abstract_section = root.find('.//abstract')
    abstract = clean_text(' '.join(abstract_section.itertext())) if abstract_section is not None else "N/A"

    # Initialize a dictionary to hold article sections
    sections = {
        "PMC ID": pmc_id,
        "DOI": doi,
        "Title": title,
        "Authors": authors_list,
        "Publication Date": pub_date,
        "Abstract": abstract,
        "Introduction": "N/A",
        "Methods": "N/A",
        "Results": "N/A",
        "Discussion": "N/A",
        # "Conclusion": "N/A"
    }

    # Define keywords for various sections
    section_keywords = {
        "Introduction": ["introduction"],
        "Methods": ["materials and methods", "materials", "methods"],
        "Results": ["results", "findings"],
        "Discussion": ["discussion", "discuss"],
        # "Conclusion": ["conclusion", "summary", "final remarks", "conclu"]
    }

    # Check for sections and fill in the content
    for section in root.findall('.//sec'):
        sec_title = section.findtext('title')
        if sec_title:
            sec_title_text = sec_title.strip().lower()

            # Search for matching section based on keywords
            for section_name, keywords in section_keywords.items():
                if any(keyword in sec_title_text for keyword in keywords):
                    sections[section_name] = clean_text(extract_full_text(section))
                    break  # Once a match is found, stop checking other keywords

    return sections

    # # Check for additional sections like Methods, Results, Discussion, and Conclusion
    # for section in root.findall('.//sec'):
    #     sec_title = section.findtext('title')
    #     if sec_title:
    #         sec_title_text = sec_title.strip()
    #         # Ensure we are capturing the right sections
    #         if sec_title_text in sections:
    #             sections[sec_title_text] = clean_text(extract_full_text(section))
    #         # Check for Introduction-related titles
    #         elif "introduction" in sec_title_text.lower():
    #             sections["Introduction"] = clean_text(extract_full_text(section))
    #         # Check for Conclusion-related titles
    #         elif "conclus" in sec_title_text.lower() or "summary" in sec_title_text.lower():
    #             sections["Conclusion"] = clean_text(extract_full_text(section))

    # return sections

def count_filled_fields(article_data):
    """Count the number of non-empty sections in an article."""
    filled_count = 0
    for key, value in article_data.items():
        if value != "N/A" and value.strip():
            filled_count += 1
    return filled_count

# Example usage: Fetch and rank multiple articles by the number of filled fields
search_term = "machine learning"
email = "your-email@example.com"
articles = fetch_pmc_full_text(search_term, email, retmax=15)

# top_5_articles = articles[:5]
# save_to_mongodb(top_5_articles)

# Print each article's parsed data
for idx, article in enumerate(articles, start=1):
    print(f"Article {idx}:")
    print(json.dumps(article, indent=2))
    print("\n" + "="*80 + "\n")

Article 1:
{
  "PMC ID": "11424011",
  "DOI": "10.2196/47562",
  "Title": "Quality of Male and Female Medical Content on English-Language Wikipedia: Quantitative Content Analysis",
  "Authors": "Mavragani Amaryllis, Kimmerle Joachim, Uhawenimana Dr. Thierry Claudien, Faric\u030c Nus\u030ca, Potts Henry WW, Heilman James M",
  "Publication Date": "2024-09-12",
  "Abstract": "Background Wikipedia is the largest free online encyclopedia and the seventh most visited website worldwide, containing >45,000 freely accessible English-language medical articles accessed nearly 1.6 billion times annually. Concerns have been expressed about the balance of content related to biological sex on Wikipedia. Objective This study aims to categorize the top 1000 most-read (most popular) English-language Wikipedia health articles for June 2019 according to the relevance of the article topic to each sex and quality. Methods In the first step, Wikipedia articles were identified using WikiProject Medicine Popu