In [None]:
pip install pandas metapub tqdm

In [None]:
import pandas as pd
import os
os.environ['NCBI_API_KEY'] = 'your_ncbi_api_key'
from functools import reduce
from metapub import PubMedFetcher
fetch = PubMedFetcher()
import time
from tqdm import tqdm
from xml.etree.ElementTree import ParseError
import logging

In [4]:
# Configure logging once: timestamped, level-tagged messages at INFO and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger used by the retrieval cells
logger = logging.getLogger(__name__)

In [5]:
# PubMed query: delirium detection/prediction combined with either "ICU" or
# the spelled-out "intensive care unit".
keyword = (
    "(((delirium) AND ((detection) OR (prediction))) AND (ICU))"
    " OR "
    "(((delirium) AND ((detection) OR (prediction))) AND (intensive care unit))"
)
# Passed as `retmax` to the PMID search
num_of_articles = 1_500

In [None]:
# Step 1: run the search and collect the matching PMIDs
logger.info("Fetching PMIDs from PubMed...")
try:
    pmids = fetch.pmids_for_query(keyword, retmax=num_of_articles)
    pmid_count = len(pmids)
    logger.info(f"Retrieved {pmid_count} PMIDs")
except Exception as exc:
    # Log the failure, then re-raise so the error is not silently swallowed
    logger.error(f"Error fetching PMIDs: {exc}")
    raise

In [None]:
# Retrieve the record for each PMID, retrying failed lookups with backoff
articles = {}        # pmid -> object returned by fetch.article_by_pmid
failed_pmids = []    # PMIDs that still failed after the last attempt
max_retries = 3

for pmid in tqdm(pmids, desc="Retrieving articles"):
    for attempt in range(1, max_retries + 1):
        try:
            articles[pmid] = fetch.article_by_pmid(pmid)
            break  # Success: stop retrying this PMID
        except Exception as e:
            # ParseError keeps its own wording so malformed XML stands out in the log
            problem = "XML parsing error" if isinstance(e, ParseError) else "Error retrieving article"
            logger.warning(f"{problem} for PMID {pmid} on attempt {attempt}: {e}")
            if attempt < max_retries:
                # Exponential backoff (1s, 2s, ...) only when another attempt follows;
                # sleeping after the final failure would only delay the next PMID.
                time.sleep(2 ** (attempt - 1))
    else:
        # for/else: reached only when no attempt ended in `break`
        logger.error(f"Failed to retrieve article for PMID {pmid} after {max_retries} attempts")
        failed_pmids.append(pmid)

In [None]:
# Report how the retrieval went
retrieved_count = len(articles)
failed_count = len(failed_pmids)
logger.info(f"Successfully retrieved {retrieved_count} articles")
logger.info(f"Failed to retrieve {failed_count} articles")

# Spell out the failed PMIDs so they can be inspected or retried by hand
if failed_count > 0:
    failed_listing = ", ".join(str(failed) for failed in failed_pmids)
    logger.info(f"Failed PMIDs: {failed_listing}")

In [10]:
def safe_get(article, attr):
    """Read ``attr`` from ``article`` without letting a bad record raise.

    Callable attributes are invoked with no arguments and their result is
    used; plain attributes are used as-is.  Two attributes get extra handling:

    * ``'citation'`` -- when ``article.book_accession_id`` is truthy, the
      citation is built by ``generate_book_citation`` instead.
    * ``'doi'`` -- the value is lowercased so the same DOI always has one
      spelling; an empty or missing DOI becomes ``None``.  This applies whether
      the DOI is a plain attribute or is produced by a callable.

    Args:
        article: Object to read the attribute from.
        attr: Name of the attribute to read.

    Returns:
        The attribute value; ``None`` when the lookup raises ``AttributeError``
        or ``IndexError``; or the string ``"Error: <message>"`` for any other
        exception.
    """
    try:
        value = getattr(article, attr)
        if attr == 'citation' and getattr(article, 'book_accession_id', None):
            return generate_book_citation(article)
        if callable(value):
            value = value()
        if attr == 'doi':
            # Normalise on both paths: previously only a callable DOI was lowercased,
            # so a DOI stored as a plain attribute kept its original casing.
            return value.lower() if value else None
        return value
    except (AttributeError, IndexError):
        return None
    except Exception as e:
        return f"Error: {str(e)}"

In [11]:
def generate_book_citation(book):
    """Format a book record as "Authors. Title. Publisher, Year.".

    Empty or missing field values fall back to "Untitled", "Unknown Author",
    "Unknown Year" and "Unknown Publisher".  If reading the record raises, the
    exception text is returned inside an "Error generating book citation: ..."
    string instead of propagating.
    """
    try:
        # Fields are read in the order title, authors, year, publisher
        fields = {
            "title": book.title or "Untitled",
            "authors": ", ".join(book.authors) if book.authors else "Unknown Author",
            "year": book.year or "Unknown Year",
            "publisher": book.publisher or "Unknown Publisher",
        }
        return "{authors}. {title}. {publisher}, {year}.".format(**fields)
    except Exception as err:
        return "Error generating book citation: " + str(err)

In [12]:
def generate_doi_link(doi):
    """Return the https://doi.org/ URL for ``doi``; None if ``doi`` is falsy or not a string."""
    if not doi or not isinstance(doi, str):
        return None
    return "https://doi.org/" + doi

In [13]:
# Extract the fields of interest from every retrieved article, keyed by PMID.
# Each dict below becomes one column of the final table.
titles = {}
abstracts = {}
authors = {}
years = {}
volumes = {}
issues = {}
journals = {}
citations = {}
links = {}
dois = {}
doi_links = {}


# Iterate over the articles that were actually retrieved rather than over `pmids`:
# a PMID that exhausted its retries has no entry in `articles`, so indexing
# `articles[pmid]` for it would raise KeyError.
for pmid, article in articles.items():
    titles[pmid] = safe_get(article, 'title')
    abstracts[pmid] = safe_get(article, 'abstract')
    author_names = safe_get(article, 'authors')
    if isinstance(author_names, str):
        # safe_get can return an "Error: ..." string; joining a string would put
        # ", " between its characters, so keep it as-is.
        authors[pmid] = author_names
    else:
        authors[pmid] = ', '.join(author_names or [])
    years[pmid] = safe_get(article, 'year')
    volumes[pmid] = safe_get(article, 'volume')
    issues[pmid] = safe_get(article, 'issue')
    journals[pmid] = safe_get(article, 'journal')
    citations[pmid] = safe_get(article, 'citation')
    links[pmid] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    dois[pmid] = safe_get(article, 'doi')
    doi_links[pmid] = generate_doi_link(dois[pmid])

In [14]:
# Build one two-column DataFrame (pmid + field) per extracted field
def _field_frame(values_by_pmid, column):
    """Turn a {pmid: value} dict into a DataFrame with columns ['pmid', column]."""
    return pd.DataFrame(list(values_by_pmid.items()), columns=['pmid', column])


Title = _field_frame(titles, 'Title')
Abstract = _field_frame(abstracts, 'Abstract')
Author = _field_frame(authors, 'Author')
Year = _field_frame(years, 'Year')
Volume = _field_frame(volumes, 'Volume')
Issue = _field_frame(issues, 'Issue')
Journal = _field_frame(journals, 'Journal')
Citation = _field_frame(citations, 'Citation')
Link = _field_frame(links, 'Link')
DOI = _field_frame(dois, 'DOI')
DOI_Link = _field_frame(doi_links, 'DOI_Link')

In [15]:
# Combine the per-field frames into one table, outer-joining on 'pmid'
data_frames = [Title, Abstract, Author, Year, Volume, Issue, Journal, Citation, Link, DOI, DOI_Link]
df_merged = data_frames[0]
for next_frame in data_frames[1:]:
    # Fold each remaining frame into the running result, left to right
    df_merged = pd.merge(df_merged, next_frame, on=['pmid'], how='outer')

In [None]:
# Preview the first few rows of the merged DataFrame. A bare last expression
# uses the notebook's rich table rendering instead of a plain-text dump.
df_merged.head()

In [None]:
import pandas as pd
import google.generativeai as genai
from pathlib import Path

In [None]:
def analyze_abstract_for_algorithm(abstract, model_name='gemini-pro'):
    """
    Ask the Gemini API whether an abstract reports a delirium prediction/detection algorithm.

    Args:
        abstract: Abstract text. Missing values (None/NaN) and blank strings are
            answered with False without calling the API.
        model_name: Name passed to ``genai.GenerativeModel``. Defaults to
            'gemini-pro', the model this function previously hard-coded.

    Returns:
        True when the first word of the model's reply is "yes" (ignoring case
        and surrounding punctuation), otherwise False. An exception raised
        while creating the model, calling it or reading the reply is printed
        and reported as False.
    """
    # Nothing to classify: skip the API round-trip for missing or blank abstracts
    if pd.isna(abstract) or not str(abstract).strip():
        return False

    # Prefer a key supplied through the GOOGLE_API_KEY environment variable so a
    # real key need not be written into the notebook; otherwise use the placeholder.
    genai.configure(api_key=os.environ.get('GOOGLE_API_KEY', 'your_google_genai_api_key'))
    
    prompt = f"""
    Analyze this abstract and determine if it reports a delirium prediction or detection algorithm.
    Answer only 'yes' if it describes developing or validating a prediction model, machine learning algorithm, 
    or detection system for delirium. Answer 'no' otherwise.
    
    Abstract: {abstract}
    """
    
    try:
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(prompt)
        # An exact == 'yes' comparison rejects replies such as "Yes." or "Yes, ...";
        # compare only the first word, stripped of quotes and punctuation.
        words = response.text.strip().lower().split()
        first_word = words[0].strip('\'"*.,;:!') if words else ''
        return first_word == 'yes'
    except Exception as e:
        print(f"Error analyzing abstract: {e}")
        return False

In [None]:
# Hook tqdm into pandas so the screening cell can call progress_apply
tqdm.pandas(desc="Analyzing abstracts")

abstract_total = len(df_merged)
print(f"Processing {abstract_total} abstracts...")

In [None]:
# Classify every abstract, then store the True/False verdicts as a new column
algorithm_flags = df_merged['Abstract'].progress_apply(analyze_abstract_for_algorithm)
df_merged['is_algorithm'] = algorithm_flags

In [21]:
# Keep only the rows whose 'is_algorithm' value equals True
is_algorithm_mask = df_merged['is_algorithm'] == True
algorithm_papers = df_merged.loc[is_algorithm_mask]

In [22]:
# Write the selected papers to CSV, leaving out the DataFrame index
output_file = "delirium_algorithm_papers_pubmed.csv"
algorithm_papers.to_csv(path_or_buf=output_file, index=False)

In [None]:
# Print summary
# `algorithm_papers` is the frame that was written to `output_file`; it is reused
# here rather than re-filtered so the count describes exactly what was saved.
print("\nAnalysis complete!")
print(f"Found {len(algorithm_papers)} papers describing delirium algorithms")
print(f"Results saved to {output_file}")

In [None]:
# Optional: count the selected papers per 'Year' value, ordered by that value
papers_per_year = algorithm_papers['Year'].value_counts()
year_count = papers_per_year.sort_index()
print("\nPapers by year:")
print(year_count)