In [39]:
# Imports the three libraries needed to (1) store/export results in a table (pandas), (2) query PubMed via NCBI E-utilities (Bio.Entrez), and (3) pause between requests (time).
import pandas as pd
from Bio import Entrez
import time

In [40]:
# Register a contact e-mail address on the Entrez module before any request is sent,
# to avoid potential issues with Entrez.
Entrez.email = "amjad.alghamyan@mail.utoronto.ca"

In [41]:
# Search terms; each one becomes its own clause of the PubMed query.
topics = [
    "Knee",
    "Arthroplasty",
    "Implant",
    "Cement",
]

In [42]:
# PubMed date filter on record creation date: lower bound, then upper bound.
date_range = (
    '("2023/01/01"[Date - Create] : '
    '"2026/01/20"[Date - Create])'
)

In [43]:
# Build one PubMed query clause per topic term, each searched in Title OR Abstract.
topic_queries = [f'({t})[Title/Abstract]' for t in topics]
# Combine every topic clause and the date filter with AND, so a returned paper must
# match *all* terms and fall inside the date range. Joining a single list (instead of
# concatenating ' AND ' by hand) avoids a dangling leading ' AND ' when `topics` is empty.
full_query = ' AND '.join(topic_queries + [date_range])

In [44]:
# Run the PubMed search with the query. retmax=400 limits how many PMIDs are retrieved in this call.
handle = Entrez.esearch(db='pubmed', retmax=400, term=full_query)
try:
    # Parse the XML response into a Python dict-like structure.
    record = Entrez.read(handle)
finally:
    # Release the connection even if parsing the response fails.
    handle.close()

# Extract the list of PubMed IDs (PMIDs) returned by the search.
id_list = record['IdList']

# 'Count' is the total number of matches; if it exceeds what was returned, the
# result set was cut off by retmax and the rest of the notebook sees a subset only.
total_hits = int(record.get('Count', len(id_list)))
if total_hits > len(id_list):
    print(
        f"Warning: query matched {total_hits} records but only {len(id_list)} "
        "PMIDs were retrieved; increase retmax or page with retstart."
    )

In [45]:
# Start from an empty table whose columns define the per-article output schema.
result_columns = [
    'PMID',
    'Title',
    'Abstract',
    'Authors',
    'Journal',
    'Mesh Terms',
    'URL',
    'Affiliations',
]
df = pd.DataFrame(columns=result_columns)

In [46]:
# Number of PMIDs requested per efetch call.
batch_size = 200


def _article_to_row(article):
    """Flatten one parsed ``PubmedArticle`` record into a dict keyed by output column.

    Args:
        article: One element of ``records['PubmedArticle']`` as parsed by ``Entrez.read``.

    Returns:
        dict with the keys 'PMID', 'Title', 'Abstract', 'Authors', 'Journal',
        'Mesh Terms', 'URL' and 'Affiliations'.

    Raises:
        KeyError: if the record lacks 'MedlineCitation', 'Article', 'ArticleTitle',
            the journal 'Title', or 'PMID'.
    """
    citation = article['MedlineCitation']
    article_info = citation['Article']

    title = article_info['ArticleTitle']

    # AbstractText is a list of sections; a record without an abstract yields "".
    abstract_parts = article_info.get('Abstract', {}).get('AbstractText', [])
    abstract = " ".join(map(str, abstract_parts))

    author_list = article_info.get('AuthorList', [])

    # Group authors carry a CollectiveName; individuals are rendered "LastName ForeName".
    names = []
    for author in author_list:
        if 'CollectiveName' in author:
            names.append(author['CollectiveName'])
        else:
            names.append(f"{author.get('LastName','')} {author.get('ForeName','')}".strip())
    authors = ", ".join(name for name in names if name)

    # Collect every affiliation across all authors, then drop repeats while keeping
    # first-seen order (dict keys preserve insertion order).
    all_affiliations = [
        info["Affiliation"]
        for author in author_list
        for info in author.get("AffiliationInfo", [])
        if "Affiliation" in info
    ]
    affiliations = "; ".join(dict.fromkeys(all_affiliations))

    journal = article_info['Journal']['Title']

    mesh = citation.get('MeshHeadingList', [])
    mesh_terms = ", ".join(str(m['DescriptorName']) for m in mesh) if mesh else ""

    pmid = str(citation['PMID'])
    url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"

    return {
        "PMID": pmid,
        "Title": title,
        "Abstract": abstract,
        "Authors": authors,
        "Journal": journal,
        "Mesh Terms": mesh_terms,
        "URL": url,
        "Affiliations": affiliations,
    }


# Accumulate plain dicts and build the DataFrame once at the end. This avoids the
# per-row cost of df.loc[len(df)] and makes the cell idempotent: re-running it
# rebuilds `df` instead of appending duplicate rows to the previous result.
rows = []
for start in range(0, len(id_list), batch_size):
    batch_pmids = id_list[start:start + batch_size]
    handle = Entrez.efetch(db="pubmed", id=",".join(batch_pmids), retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even if parsing the response fails.
        handle.close()
    # Short pause between consecutive efetch requests.
    time.sleep(0.34)

    # Only entries under 'PubmedArticle' are exported; any other top-level
    # groups in the response are ignored, as before.
    rows.extend(_article_to_row(article) for article in records['PubmedArticle'])

df = pd.DataFrame(
    rows,
    columns=['PMID', 'Title', 'Abstract', 'Authors', 'Journal', 'Mesh Terms', 'URL', 'Affiliations'],
)



In [47]:
# Export the collected records to a CSV file, leaving out the DataFrame index column.
output_csv = "PubMed_resultsx.csv"
df.to_csv(output_csv, index=False)