In [1]:
import pandas as pd
from Bio import Entrez
from datetime import datetime

In [2]:
def search_pubmed(email):
    """
    Search PubMed for recent articles on a fixed list of diseases.

    One esearch request is issued per disease, restricted to publication dates
    from five years before the current year through the current year, asking
    for at most 1000 IDs per disease.

    Args:
    - email (str): Email address for Entrez login.

    Returns:
    - list : List of unique Pubmed ID's, in the order they were first returned.
    """

    # Define email for entrez login
    Entrez.email = email

    # Setup Date range for past 5 years
    current_year = datetime.now().year
    date_range = f"{current_year - 5}[PDAT] : {current_year}[PDAT]"

    # Create top 5 list of diseases
    diseases = ["Diabetes", "Cardiovascular disease", "Cancer", "Alzheimer's", "Dementia"]

    # Initialize list to collect all Pubmed IDs, plus a set used to skip IDs
    # that an earlier disease query already returned (an article can match
    # several queries, which would otherwise be fetched and stored twice).
    pubmed_ids = []
    seen_ids = set()

    for disease in diseases:
        query = f"{disease} AND {date_range}"
        handle = Entrez.esearch(db='pubmed', term=query, retmax=1000)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response fails
            handle.close()

        # Append the not-yet-seen IDs for the current disease to the master list
        for pubmed_id in record['IdList']:
            id_key = str(pubmed_id)
            if id_key not in seen_ids:
                seen_ids.add(id_key)
                pubmed_ids.append(pubmed_id)

    # Return the collected list of Pubmed IDs after the loop
    return pubmed_ids


In [3]:
from http.client import IncompleteRead

def fetch_articles(email, ids_list, retries=3):
    """
    Fetch details for a list of PubMed IDs.

    The request is retried when the HTTP response is cut short
    (``IncompleteRead``); any other exception propagates immediately.

    Args:
    - email (str): Email address for Entrez login.
    - ids_list (list): List of PubMed IDs (strings or integers).
    - retries (int, optional): Maximum number of attempts. Default is 3.

    Returns:
    - The parsed efetch result as returned by ``Entrez.read``, or None when
      ``retries`` is less than 1 and no attempt is made.

    Raises:
    - IncompleteRead: If every attempt ends with an incomplete read.
    """
    # Coerce to str so integer IDs are accepted as well
    ids = ','.join(str(pubmed_id) for pubmed_id in ids_list)
    Entrez.email = email

    for attempt in range(1, retries + 1):
        handle = None
        try:
            handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
            return Entrez.read(handle)
        except IncompleteRead:
            print(f"Incomplete read error encountered. Attempt {attempt} of {retries}. Retrying...")
            if attempt == retries:
                print("Maximum retries reached. Raising last exception.")
                raise
        finally:
            # Release the connection on success and on failure alike, so a
            # failed attempt does not leak its handle before the retry.
            if handle is not None:
                handle.close()

    return None



In [4]:
def extract_article_details(paper):
    """
    Extract specific details from a PubMed article.

    Missing fields fall back to placeholder values ('No Title', 'No Abstract',
    'No Journal', 'No Language', 'No Data') instead of raising. Title, abstract,
    journal and affiliations taken from the record are lower-cased.

    Args:
    - paper (dict): Dictionary of article details.

    Returns:
    - tuple: (title, abstract, journal, language, year, month, authors,
      affiliations). ``authors`` and ``affiliations`` are ', '-joined strings
      holding one entry per named author, in the same order.
    """
    article = paper.get('MedlineCitation', {}).get('Article', {})
    journal_info = article.get('Journal', {})

    title = article.get('ArticleTitle', 'No Title').lower()

    # An abstract may be split into several sections; keep all of them rather
    # than only the first, and tolerate an empty section list.
    abstract_data = article.get('Abstract', {}).get('AbstractText', ['No Abstract'])
    if isinstance(abstract_data, list):
        abstract = ' '.join(str(section) for section in abstract_data).lower() or 'no abstract'
    else:
        abstract = abstract_data.lower()

    journal = journal_info.get('Title', 'No Journal').lower()

    # Guard against a present-but-empty language list
    languages = article.get('Language') or ['No Language']
    language = languages[0]

    pubdate = journal_info.get('JournalIssue', {}).get('PubDate', {})
    year = pubdate.get('Year', 'No Data')
    month = pubdate.get('Month', 'No Data')

    authors_list = []
    affiliations_list = []

    for author in article.get('AuthorList', []):
        # Build the display name; fall back to the last name alone or to a
        # collective (group) name so such authors are not silently dropped.
        if 'LastName' in author and 'ForeName' in author:
            author_name = f"{author['LastName']} {author['ForeName']}"
        elif 'LastName' in author:
            author_name = str(author['LastName'])
        elif 'CollectiveName' in author:
            author_name = str(author['CollectiveName'])
        else:
            # No usable name: skip the entry entirely so that the author and
            # affiliation lists stay aligned one-to-one.
            continue

        # Check if 'AffiliationInfo' exists and is not an empty list
        affiliation = 'No Affiliation'
        affiliation_info = author.get('AffiliationInfo')
        if affiliation_info and isinstance(affiliation_info, list) and affiliation_info[0]:
            affiliation = affiliation_info[0].get('Affiliation', 'No Affiliation').lower()

        authors_list.append(author_name)
        affiliations_list.append(affiliation)

    # Join the authors and affiliations into strings
    authors = ', '.join(authors_list)
    affiliations = ', '.join(affiliations_list)

    # Return the extracted information
    return title, abstract, journal, language, year, month, authors, affiliations


In [5]:
import re

def determine_study_type(abstract):
    """
    Classify an abstract by the first study-type keyword it contains.

    Study types are checked in a fixed order, so an abstract matching several
    types is labelled with the earliest one in the table below.

    Args:
    - abstract (str): Abstract text; matching is case-insensitive.

    Returns:
    - str: The matching study type, or "Unknown" when no pattern matches.
    """
    # (label, whole-word regex patterns) pairs, in priority order
    patterns_by_type = (
        ("Randomized Controlled Trial",
         (r"\brandomized controlled trial\b", r"\brandomly assigned\b", r"\bplacebo-controlled trial\b")),
        ("Observational Study",
         (r"\bobservational study\b", r"\bcross-sectional study\b", r"\blongitudinal study\b", r"\bcohort study\b")),
        ("Review",
         (r"\breview\b", r"\bmeta-analysis\b")),
        # ... add more study types and keywords as needed
    )

    text = abstract.lower()

    for label, patterns in patterns_by_type:
        if any(re.search(pattern, text) for pattern in patterns):
            return label

    return "Unknown"

In [6]:
# Run the PubMed searches defined in search_pubmed and keep the returned IDs
# for the fetch step in a later cell.
ids_list = search_pubmed("fhirshotlearning@gmail.com")

In [11]:
import pandas as pd

def create_dataframe(emails, ids_list, chunk_size=1000):
    """
    Build a DataFrame of PubMed article details.

    The IDs are fetched from PubMed in consecutive chunks; every returned
    article is reduced to its title, abstract, journal and related fields and
    becomes one row of the result.

    Args:
    - emails (str): Email address for Entrez login.
    - ids_list (list of str): List of PubMed IDs to fetch.
    - chunk_size (int, optional): The number of articles to fetch in each request. Default is 1000.

    Returns:
    - pandas.DataFrame: One row per article with the columns Title, Abstract,
      Journal, Language, Year, Month, Study_type, Authors and Affiliations.

    Raises:
    - Exception: Whatever fetching articles from PubMed or extracting article details raises.
    """
    column_names = (
        'Title', 'Abstract', 'Journal', 'Language', 'Year', 'Month',
        'Study_type', 'Authors', 'Affiliations',
    )
    # One growing list per output column
    collected = {name: [] for name in column_names}

    for chunk_i in range(0, len(ids_list), chunk_size):
        batch_ids = ids_list[chunk_i:chunk_i + chunk_size]
        fetched = fetch_articles(emails, batch_ids)

        # Skip chunks for which nothing usable came back
        if fetched is None or 'PubmedArticle' not in fetched:
            print(f"Warning: No data returned for chunk starting at index {chunk_i}")
            continue

        for paper in fetched["PubmedArticle"]:
            (title, abstract, journal, language,
             year, month, authors, affiliations) = extract_article_details(paper)

            # Values listed in the same order as column_names
            row_values = (
                title, abstract, journal, language, year, month,
                determine_study_type(abstract), authors, affiliations,
            )
            for name, value in zip(column_names, row_values):
                collected[name].append(value)

    # Turn the per-column lists into the final table
    return pd.DataFrame(collected)


In [12]:
# Fetch the article details for every collected ID (default chunk size of
# 1000 IDs per request) and assemble them into the working DataFrame.
pubmed_df= create_dataframe("fhirshotlearning@gmail.com",ids_list)

In [None]:
# Preview the first rows of the fetched article table.
pubmed_df.head()

In [None]:
# Show column dtypes and non-null counts for the fetched article table.
pubmed_df.info()

In [13]:
def merge_impact_factors(pubmed_df, impact_factor_csv_path, journal_col='Journal', impact_factor_col='Impact_Factor'):
    """
    Merge impact factors into the PubMed articles DataFrame, retain articles with impact factors,
    and drop columns that only contain NaN values.

    Journal titles are compared after stripping surrounding whitespace and
    lower-casing. The input DataFrame is left unmodified; the normalised
    titles appear only in the returned frame.

    Args:
    - pubmed_df (DataFrame): DataFrame containing PubMed articles.
    - impact_factor_csv_path (str): Path to the CSV file with impact factors.
    - journal_col (str): Column name for journal titles in both DataFrames.
    - impact_factor_col (str): Column name for impact factors in the CSV file.

    Returns:
    - DataFrame: The merged DataFrame with impact factors and without NaN-only columns.
      The impact factor column is named 'Impact_Factor'.
    """

    impact_factors_df = pd.read_csv(impact_factor_csv_path)

    # Format the journal titles consistently. Work on a copy so the caller's
    # DataFrame is not changed as a side effect of merging.
    articles_df = pubmed_df.copy()
    articles_df[journal_col] = articles_df[journal_col].str.strip().str.lower()
    impact_factors_df[journal_col] = impact_factors_df[journal_col].str.strip().str.lower()

    # Merge the DataFrames based on the journal title
    merged_df = articles_df.merge(impact_factors_df, on=journal_col, how='left')

    # Retain only the articles that have an impact factor
    merged_df = merged_df.dropna(subset=[impact_factor_col])

    # Rename the impact factor column if necessary
    merged_df = merged_df.rename(columns={impact_factor_col: 'Impact_Factor'})

    # Drop columns that only contain NaN values
    merged_df = merged_df.dropna(axis=1, how='all')

    return merged_df

In [14]:
# CSV with journal impact factors, given relative to the working directory.
impact_factor_path='impact_factor.csv'

In [15]:
# Attach impact factors by journal title; articles whose journal has no
# impact factor in the CSV are dropped from final_df.
final_df = merge_impact_factors(pubmed_df,impact_factor_path , 'Journal', 'Impact_Factor')

In [None]:
# Show column dtypes and non-null counts after the impact-factor merge.
final_df.info()

In [16]:
# Preview the first rows of the merged table.
final_df.head()

Unnamed: 0,Title,Abstract,Journal,Language,Year,Month,Study_type,Authors,Affiliations,Sl. No.,Impact_Factor
1,maternal serum xenin-25 levels in gestational ...,xenin-25 is a polypeptide having an insulinotr...,european review for medical and pharmacologica...,eng,2023,Oct,Unknown,"Kucukbas G N, Komuroglu A U, Dirik D, Korpe B,...","department of obstetrics and gynecology, perin...",3502.0,3.3
2,are nonfunctioning adrenal incidentalomas real...,"in patients with adrenal incidentaloma (ai), c...",european review for medical and pharmacologica...,eng,2023,Oct,Unknown,"Güneş E, Güneş M","department of endocrinology, health sciences u...",3502.0,3.3
3,brain abscess incidence and microbial etiology...,"brain abscess, a localized purulent central ne...",european review for medical and pharmacologica...,eng,2023,Oct,Unknown,"Korkmaz S, Korkmaz D","department of neurosurgery, faculty of medicin...",3502.0,3.3
4,weighted gene co-expression network analysis o...,this work aimed to explore the key targets and...,european review for medical and pharmacologica...,eng,2023,Oct,Unknown,"Zeng S-N, Li Y, Li Y-M-Q, Wang S-R","department of nephrology, the third hospital o...",3502.0,3.3
6,current perspectives on prevention of vascular...,the true global burden of vascular cognitive i...,expert review of neurotherapeutics,eng,2023,Nov,Unknown,"Kalaria Raj N, Akinyemi Rufus O, Paddick Stell...","translational and clinical research institute,...",2343.0,4.3


In [17]:
# Persist the merged table. The row index is only a leftover of the row
# filtering in the merge step (it has gaps and carries no meaning), so it is
# left out instead of being written as an unnamed first column.
final_df.to_csv("pubmed_data.csv", index=False)