In [10]:
from Bio import Entrez
import xml.dom.minidom as m
import requests
import json

An example E-utilities search URL:
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=xml&retmax=10&term=Alzheimers

## Identify the PubMed IDs for 1000 Alzheimer's papers from 2022

In [11]:
def getid_from_term(num, term):
    """Return up to ``num`` PubMed IDs matching ``term`` via NCBI esearch.

    Parameters
    ----------
    num : int
        Maximum number of IDs to request (sent as ``retmax``).
    term : str
        Search term. It is interpolated into the URL unchanged, so it must
        already be in query-string form (e.g. ``'Alzheimer+AND+2022[pdat]'``).

    Returns
    -------
    list of str
        Text of each ``<Id>`` element in the response, in document order.
        Holds fewer than ``num`` entries when the search has fewer hits.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        f"esearch.fcgi?db=pubmed&retmode=xml&retmax={num}&term={term}"
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as XML.
    r.raise_for_status()
    doc = m.parseString(r.text)
    id_nodes = doc.getElementsByTagName("Id")
    # Walk the nodes that were actually returned: indexing with range(num)
    # raised IndexError whenever the search yielded fewer than num results.
    return [node.childNodes[0].wholeText for node in id_nodes[:num]]

# Ask PubMed for the IDs of Alzheimer papers with a 2022 publication date.
AlzheimersList = getid_from_term(
    num=1000,
    term='Alzheimer+AND+2022[pdat]',
)
print(f"The IDs for 1000 Alzheimer papers from 2022 are {AlzheimersList}")

The IDs for 1000 Alzheimer papers from 2022 are ['36309183', '36309087', '36308033', '36306920', '36306735', '36306540', '36306459', '36306458', '36306386', '36305541', '36305459', '36305148', '36305125', '36304998', '36304823', '36304723', '36304124', '36303331', '36302977', '36302665', '36302659', '36302488', '36302464', '36301043', '36299613', '36299608', '36298279', '36297317', '36297313', '36296980', '36296969', '36296692', '36296686', '36296677', '36296574', '36296397', '36295605', '36295535', '36295014', '36294010', '36293946', '36293666', '36293539', '36293528', '36293516', '36293327', '36293221', '36293147', '36293049', '36292947', '36292945', '36292933', '36292931', '36292674', '36292623', '36292114', '36291714', '36291679', '36291666', '36291661', '36291639', '36291618', '36291595', '36291553', '36291536', '36291224', '36291125', '36291068', '36291020', '36291017', '36290612', '36290138', '36289878', '36289859', '36289565', '36289458', '36289390', '36289355', '36288997', '36

## Identify the PubMed IDs for 1000 Cancer papers from 2022

In [12]:
def getid_from_term(num, term):
    """Return up to ``num`` PubMed IDs matching ``term`` via NCBI esearch.

    Parameters
    ----------
    num : int
        Maximum number of IDs to request (sent as ``retmax``).
    term : str
        Search term. It is interpolated into the URL unchanged, so it must
        already be in query-string form (e.g. ``'Cancer+AND+2022[pdat]'``).

    Returns
    -------
    list of str
        Text of each ``<Id>`` element in the response, in document order.
        Holds fewer than ``num`` entries when the search has fewer hits.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        f"esearch.fcgi?db=pubmed&retmode=xml&retmax={num}&term={term}"
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as XML.
    r.raise_for_status()
    doc = m.parseString(r.text)
    id_nodes = doc.getElementsByTagName("Id")
    # Walk the nodes that were actually returned: indexing with range(num)
    # raised IndexError whenever the search yielded fewer than num results.
    return [node.childNodes[0].wholeText for node in id_nodes[:num]]

# Ask PubMed for the IDs of Cancer papers with a 2022 publication date.
CancerList = getid_from_term(
    num=1000,
    term='Cancer+AND+2022[pdat]',
)
print(f"The IDs for 1000 Cancer papers from 2022 are {CancerList}")

The IDs for 1000 Cancer papers from 2022 are ['36309847', '36309839', '36309838', '36309837', '36309831', '36309829', '36309827', '36309820', '36309805', '36309761', '36309760', '36309745', '36309740', '36309731', '36309713', '36309712', '36309698', '36309694', '36309693', '36309691', '36309687', '36309678', '36309672', '36309669', '36309667', '36309666', '36309662', '36309655', '36309653', '36309646', '36309638', '36309636', '36309630', '36309626', '36309619', '36309616', '36309615', '36309613', '36309604', '36309603', '36309602', '36309571', '36309561', '36309560', '36309559', '36309558', '36309551', '36309550', '36309544', '36309522', '36309516', '36309510', '36309507', '36309506', '36309503', '36309502', '36309495', '36309486', '36309485', '36309484', '36309482', '36309474', '36309424', '36309418', '36309416', '36309415', '36309414', '36309413', '36309406', '36309401', '36309400', '36309397', '36309395', '36309391', '36309388', '36309387', '36309385', '36309384', '36309383', '36309

In [93]:
# PMIDs that appear in both result lists (an empty set means no overlap).
set(AlzheimersList).intersection(CancerList)

set()

# Create JSON files

In [82]:
def _inner_xml(node):
    """Serialize the children of ``node`` to XML, keeping inline markup such as ``<i>``."""
    return "".join(child.toxml() for child in node.childNodes)


def get_info(pmid, query):
    """Fetch title and abstract for PubMed articles via NCBI efetch.

    Parameters
    ----------
    pmid : str
        Value placed after ``id=`` in the efetch URL; the callers in this
        notebook pass a comma-separated list of PMIDs.
    query : str
        Label stored unchanged under the ``"query"`` key of every article.

    Returns
    -------
    dict
        Maps each article's PMID (str) to a dict with the keys
        ``"ArticleTitle"``, ``"AbstractText"`` and ``"query"``. Title and
        abstract are XML fragments, so inline markup is preserved.
        ``"AbstractText"`` is ``""`` when the article has no abstract.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        f"efetch.fcgi?db=pubmed&retmode=xml&id={pmid}"
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as XML.
    r.raise_for_status()
    doc = m.parseString(r.text)
    dict_articles = {}
    for article in doc.getElementsByTagName("PubmedArticle"):
        PMID = article.getElementsByTagName("PMID")[0].childNodes[0].wholeText
        # The title may contain inline elements (e.g. italics), so serialize
        # its children rather than reading a single text node.
        title = _inner_xml(article.getElementsByTagName("ArticleTitle")[0])

        # Explicit presence check instead of a bare ``except``: only a missing
        # abstract yields "", any other error is allowed to surface.
        abstract_blocks = article.getElementsByTagName("Abstract")
        if abstract_blocks:
            sections = abstract_blocks[0].getElementsByTagName("AbstractText")
        else:
            sections = []
        # If there is more than one AbstractText, concatenate them with a
        # single space between sections.
        abstract = " ".join(_inner_xml(section) for section in sections)

        dict_articles[PMID] = {"ArticleTitle": title,
                               "AbstractText": abstract,
                               "query": query}
    return dict_articles

# Fetch title/abstract records for every Alzheimer PMID and save them as JSON.
# IDs are requested 400 at a time, one efetch call per batch. Looping over the
# actual list length means a short list never produces a request with an
# empty id parameter.
BATCH_SIZE = 400
result = {}
for start in range(0, len(AlzheimersList), BATCH_SIZE):
    IDstr = ",".join(AlzheimersList[start:start + BATCH_SIZE])
    result.update(get_info(IDstr, "Alzheimer"))

with open("Alzheimers.json", "w") as outfile:
    json.dump(result, outfile)

In [83]:
# Fetch title/abstract records for every Cancer PMID and save them as JSON.
# IDs are requested 400 at a time, one efetch call per batch. Looping over the
# actual list length means a short list never produces a request with an
# empty id parameter.
BATCH_SIZE = 400
result = {}
for start in range(0, len(CancerList), BATCH_SIZE):
    IDstr = ",".join(CancerList[start:start + BATCH_SIZE])
    result.update(get_info(IDstr, "Cancer"))

with open("Cancer.json", "w") as outfile:
    json.dump(result, outfile)