In [1]:
import pandas as pd
from Bio import Entrez
from datetime import datetime

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
from Bio import Entrez
from datetime import datetime

def search_pubmed(email):
    """
    Args:
    - email (str): Email address for Entrez login.

    Returns:
    - list : List of PubMed IDs.
    """

    # Define email for entrez login
    Entrez.email = email

    # Setup Date range for past 5 years
    current_year = datetime.now().year
    date_range = f"{current_year - 5}[PDAT] : {current_year}[PDAT]"

    # Create top 5 list of diseases
    diseases = ["Diabetes", "Cardiovascular disease", "Cancer", "Alzheimer's", "Dementia"]

    # Initialize list to collect all PubMed IDs
    pubmed_ids = []

    for disease in diseases:
        query = f"{disease} AND {date_range}"
        handle = Entrez.esearch(db='pubmed', term=query, retmax=1000)
        record = Entrez.read(handle)
        handle.close()

        # Append the list of IDs for the current disease to the master list
        pubmed_ids.extend(record['IdList'])

    # Return the collected list of PubMed IDs after the loop
    return pubmed_ids


In [4]:
from http.client import IncompleteRead

def fetch_articles(email, ids_list, retries=3):
    """
    Fetch details for a list of PubMed IDs.

    Args:
    - email (str): Email address for Entrez login.
    - ids_list (list): List of PubMed IDs.

    Returns:
    - list: List of dictionaries with article details.
    """
    ids = ','.join(ids_list)
    Entrez.email = email
    attempt = 0
    while attempt < retries:
        try:
            # Fetch article details
            handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
            results = Entrez.read(handle)
            handle.close()
            
            # Add citation counts
            for paper in results['PubmedArticle']:
                pmid = paper['MedlineCitation']['PMID']
                #citation_count = get_citation_count(pmid, email)
                #paper['CitationCount'] = citation_count
            
            return results
        except IncompleteRead as e:
            print(f"Incomplete read error encountered. Attempt {attempt + 1} of {retries}. Retrying...")
            attempt += 1
            if attempt == retries:
                print("Maximum retries reached. Raising last exception.")
                raise


In [5]:
def extract_article_details(paper):
    """
    Extract specific details from a PubMed article, including citation count.

    Args:
    - paper (dict): Dictionary of article details.

    Returns:
    - tuple: Extracted article details, including citation count.
    """

    title = paper.get('MedlineCitation', {}).get('Article', {}).get('ArticleTitle', 'No Title').lower()
    abstract_data = paper.get('MedlineCitation', {}).get('Article', {}).get('Abstract', {}).get('AbstractText', ['No Abstract'])
    abstract = abstract_data[0].lower() if isinstance(abstract_data, list) else abstract_data.lower()
    journal = paper.get('MedlineCitation', {}).get('Article', {}).get('Journal', {}).get('Title', 'No Journal').lower()
    language = paper.get('MedlineCitation', {}).get('Article', {}).get('Language', ['No Language'])[0]
    pubdate = paper.get('MedlineCitation', {}).get('Article', {}).get('Journal', {}).get('JournalIssue', {}).get('PubDate', {})
    year = pubdate.get('Year', 'No Data')
    month = pubdate.get('Month', 'No Data')
    authors_data = paper.get('MedlineCitation', {}).get('Article', {}).get('AuthorList', [])
    authors_list = []
    affiliations_list = []

    for author in authors_data:
        # Initialize variables for each author
        author_name = None
        affiliation = 'No Affiliation'

        # Check for author name and concatenate if present
        if 'LastName' in author and 'ForeName' in author:
            author_name = f"{author['LastName']} {author['ForeName']}"
            authors_list.append(author_name)

            # Check if 'AffiliationInfo' exists and is not an empty list
            affiliation_info = author.get('AffiliationInfo')
            if affiliation_info and isinstance(affiliation_info, list) and affiliation_info[0]:
                affiliation = affiliation_info[0].get('Affiliation', 'No Affiliation').lower()

        # Append affiliation to the list
        affiliations_list.append(affiliation)

    # Get Citation Count
    #citation_count = paper.get('CitationCount', 'No Citation Count')

    # Join the authors and affiliations into strings
    authors = ', '.join(authors_list)
    affiliations = ', '.join(affiliations_list)

    # Return the extracted information
    return title, abstract, journal, language, year, month, authors, affiliations


In [6]:
import pandas as pd

def create_dataframe(email, ids_list, chunk_size=1000):
    """
    Create a DataFrame containing details of PubMed articles, including citation count.

    This function fetches articles from PubMed in chunks and extracts relevant details
    such as title, abstract, journal, etc., to populate a DataFrame.

    Args:
    - email (str): Email address for Entrez login.
    - ids_list (list of str): List of PubMed IDs to fetch.
    - chunk_size (int, optional): The number of articles to fetch in each request. Default is 1000.

    Returns:
    - pandas.DataFrame: A DataFrame where each row represents an article and columns
      contain details like title, abstract, journal, language, year, month, study type,
      authors, affiliations, and citation count.
    """
    pubmed_df = {
        'Title': [], 'Abstract': [], 'Journal': [], 'Language': [], 'Year': [], 'Month': [],
         'Authors': [], 'Affiliations': []
    }

    for chunk_i in range(0, len(ids_list), chunk_size):
        chunk = ids_list[chunk_i:chunk_i + chunk_size]
        papers = fetch_articles(email, chunk)

        if papers is None or 'PubmedArticle' not in papers:
            print(f"Warning: No data returned for chunk starting at index {chunk_i}")
            continue

        for paper in papers["PubmedArticle"]:
            # Extract article details from the paper
            title, abstract, journal, language, year, month, authors, affiliations = extract_article_details(paper)

            # Append the details to the respective lists in the dictionary
            pubmed_df['Title'].append(title)
            pubmed_df['Abstract'].append(abstract)
            pubmed_df['Journal'].append(journal)
            pubmed_df['Language'].append(language)
            pubmed_df['Year'].append(year)
            pubmed_df['Month'].append(month)
            pubmed_df['Authors'].append(authors)
            pubmed_df['Affiliations'].append(affiliations)

    # Convert the dictionary to a pandas DataFrame
    pubmed_df = pd.DataFrame(pubmed_df)

    return pubmed_df

In [7]:
ids_list = search_pubmed("fhirshotlearning@gmail.com")

In [8]:
pubmed_df= create_dataframe("fhirshotlearning@gmail.com",ids_list)

In [9]:
impact_factor_path='CopyofImpactFactor2024.csv'

In [10]:
import pandas as pd

def merge_impact_factors(pubmed_df, impact_factor_csv_path, journal_col='Journal'):
    """
    Merge impact factors into the PubMed articles DataFrame, retain articles with impact factors,
    and drop columns that only contain NaN values.

    Args:
    - pubmed_df (DataFrame): DataFrame containing PubMed articles.
    - impact_factor_csv_path (str): Path to the CSV file with impact factors.
    - journal_col (str): Column name for journal titles in the PubMed DataFrame.

    Returns:
    - DataFrame: The merged DataFrame with impact factors and without NaN-only columns.
    """

    # Load the impact factor CSV file
    impact_factors_df = pd.read_csv(impact_factor_csv_path)

    # Format the journal titles consistently (strip whitespaces and convert to lowercase)
    pubmed_df[journal_col] = pubmed_df[journal_col].str.strip().str.lower()
    impact_factors_df['Name'] = impact_factors_df['Name'].str.strip().str.lower()
    impact_factors_df['Abbr Name'] = impact_factors_df['Abbr Name'].str.strip().str.lower()

    # Attempt to merge based on multiple keys: Name, Abbreviated Name, ISSN, and EISSN
    merged_df = pubmed_df.merge(
        impact_factors_df,
        how='left',
        left_on=journal_col,
        right_on='Name'
    )

    # Attempt merging with additional identifiers if no matches are found
    if merged_df['JIF'].isna().all():
        merged_df = pubmed_df.merge(
            impact_factors_df,
            how='left',
            left_on=journal_col,
            right_on='Abbr Name'
        )
    elif merged_df['JIF'].isna().all() and 'ISSN' in pubmed_df.columns:
        merged_df = pubmed_df.merge(
            impact_factors_df,
            how='left',
            left_on='ISSN',
            right_on='ISSN'
        )
    elif merged_df['JIF'].isna().all() and 'EISSN' in pubmed_df.columns:
        merged_df = pubmed_df.merge(
            impact_factors_df,
            how='left',
            left_on='EISSN',
            right_on='EISSN'
        )

    # Rename relevant columns for clarity
    merged_df.rename(columns={
        'JIF': 'Impact_Factor',
        'JIF5Years': 'Impact_Factor_5Years',
        'Category': 'Journal_Category'
    }, inplace=True)

    # Retain only articles with available impact factors
    merged_df = merged_df.dropna(subset=['Impact_Factor'])

    # Drop columns that only contain NaN values
    merged_df = merged_df.dropna(axis=1, how='all')
    return merged_df

In [11]:
final_df = merge_impact_factors(pubmed_df,impact_factor_path , 'Journal')

In [12]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3267 entries, 0 to 4996
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Title                 3267 non-null   object
 1   Abstract              3267 non-null   object
 2   Journal               3267 non-null   object
 3   Language              3267 non-null   object
 4   Year                  3267 non-null   object
 5   Month                 3267 non-null   object
 6   Authors               3267 non-null   object
 7   Affiliations          3267 non-null   object
 8   Name                  3267 non-null   object
 9   Abbr Name             3267 non-null   object
 10  ISSN                  2299 non-null   object
 11  EISSN                 3252 non-null   object
 12  Impact_Factor         3267 non-null   object
 13  Impact_Factor_5Years  3199 non-null   object
 14  Journal_Category      3267 non-null   object
dtypes: object(15)
memory usage: 408.4+ KB


In [13]:
from gliner import GLiNER
import pandas as pd

# Load the GLiNER model
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

# Labels for entity prediction
labels_universities = ["Organization"]
labels_study_types = ["Study Type"]

# Function to extract universities from affiliations using GLiNER
def extract_universities_gliner(affiliation):
    """
    Extract universities from the affiliation string using GLiNER.

    Args:
    - affiliation (str): The affiliation string.

    Returns:
    - str: Extracted university names.
    """
    if not isinstance(affiliation, str) or affiliation.strip() == "":
        return "Unknown"

    # Perform entity prediction using GLiNER
    entities = model.predict_entities(affiliation, labels_universities, threshold=0.5)

    # Extract universities from the identified entities
    universities = [entity["text"] for entity in entities if entity["label"] == "Organization"]

    # Return universities as a comma-separated string or 'Unknown' if none found
    return ", ".join(universities) if universities else "Unknown"

# Function to extract study types from abstract using GLiNER
def extract_study_type_gliner(abstract):
    """
    Extract study types from the abstract text using GLiNER.

    Args:
    - abstract (str): Abstract of the study.

    Returns:
    - str: The type of study.
    """
    if not isinstance(abstract, str) or abstract.strip() == "":
        return "Unknown"

    # Perform entity prediction using GLiNER
    entities = model.predict_entities(abstract, labels_study_types, threshold=0.5)

    # Extract study type from the identified entities
    study_types = [entity["text"] for entity in entities if entity["label"] == "Study Type"]

    # Return the first matched study type or 'Unknown' if none found
    return study_types[0] if study_types else "Unknown"

# Apply the GLiNER extraction functions to the DataFrame
final_df['Universities'] = final_df['Affiliations'].apply(extract_universities_gliner)
final_df['Study_Type_Extracted'] = final_df['Abstract'].apply(extract_study_type_gliner)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
import re
def standardize_university_names(universities_column):
    standardized_names = []
    for university in universities_column:
        if university.lower() == 'unknown':
            standardized_names.append('Unknown')
            continue

        # Extract main university name using regex
        match = re.search(r'([a-zA-Z]+\s*(university|institute|college|academy|school))', university, re.IGNORECASE)
        if match:
            standardized_names.append(match.group(0).strip().lower())
        else:
            standardized_names.append('Unknown')

    return standardized_names

final_df['Standardized_University'] = standardize_university_names(final_df['Universities'])

In [16]:
import requests
import pandas as pd

# Function to fetch rankings from API and merge with final_df
def extract_and_merge_university_ranking(final_df, api_url):
    """
    Extracts university rankings from a given API and merges them with the existing DataFrame.

    Args:
        final_df (DataFrame): Existing DataFrame with a column named 'Standardized_University'.
        api_url (str): URL to the API that provides university rankings.

    Returns:
        DataFrame: Updated DataFrame containing 'Rank' and 'Research_Score' columns.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    try:
        # Sending GET request to the URL
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()  # Raise an error for bad responses
        data = response.json()

        # Extracting relevant data from the API response
        university_names = []
        ranks = []
        research_scores = []

        for university in data.get('data', []):
            uni_name = university.get('name')
            rank = university.get('rank')
            research_score = university.get('scores_research')

            university_names.append(uni_name.lower())  # Convert to lowercase for standardization
            ranks.append(rank)
            research_scores.append(research_score)

        # Creating DataFrame from extracted data
        ranking_df = pd.DataFrame({
            'University': university_names,
            'Rank': ranks,
            'Research_Score': research_scores
        })

        # Cleaning up rank values to remove symbols like '=' and converting to int
        ranking_df['Rank'] = ranking_df['Rank'].replace('=', '', regex=True).astype(str)

        # Standardizing 'Standardized_University' column in final_df to lowercase for matching
        final_df['Standardized_University'] = final_df['Standardized_University'].str.lower()

        # Merging rankings with the original DataFrame
        final_df = final_df.merge(ranking_df, left_on='Standardized_University', right_on='University', how='left', suffixes=('', '_Ranking'))
        return final_df

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching university rankings: {e}")
        return final_df

# Example usage
api_url = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2024_0__91239a4509dc50911f1949984e3fb8c5.json"


# Call the method and update final_df
pubmed_final = extract_and_merge_university_ranking(final_df, api_url)
print(pubmed_final.head())

                                               Title  \
0  corrigendum to "consensus report on glucagon-l...   
1  human primary macrophages can transmit coxsack...   
2  association between insulin-associated gene po...   
3  improvement of glycemia risk index and continu...   
4  transmucosal glucagon rapidly increases blood ...   

                                            Abstract  \
0                                        no abstract   
1                                        no abstract   
2  while statins are effective at managing lipid ...   
3  managing glycemia during ramadan is challengin...   
4  to evaluate the effect of transmucosal glucago...   

                                      Journal Language  Year Month  \
0  journal of diabetes science and technology      eng  2024   Nov   
1                 journal of medical virology      eng  2024   Dec   
2  european journal of clinical investigation      eng  2024   Nov   
3  journal of diabetes science and technology 

In [17]:
pubmed_final.to_csv('pubmed_data.csv')