In [12]:
import requests
import xml.dom.minidom as m
import time as time
import json
import xml.etree.ElementTree as ET


In [19]:
def get_PubMedIds(query, retmax=100):
    search_term = '&term=' + query
    search_retmax = "&retmax=" + str(retmax)
    search_rettype = '&retmode=json'
    search_url =  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed"+search_term+search_retmax+search_rettype
    r = requests.get(search_url)
    return r.json()['esearchresult']['idlist']

def get_ids():
    AlzheimerTerm = 'Alzheimers+AND+2022[pdat]'
    retmax = 1000
    alz_ids = get_PubMedIds(AlzheimerTerm, retmax)

    CancerTerm = 'Cancer+AND+2022[pdat]'
    cancer_ids = get_PubMedIds(CancerTerm, retmax)
    return alz_ids, cancer_ids

alz_ids, cancer_ids = get_ids()

In [20]:
print(len(alz_ids))
print(len(cancer_ids))
print(alz_ids)
print(cancer_ids)

1000
1000
['36328129', '36327964', '36327171', '36326951', '36326588', '36326095', '36325883', '36325840', '36325692', '36325483', '36324417', '36324414', '36324408', '36324405', '36324401', '36324176', '36324157', '36324151', '36323521', '36323061', '36322888', '36322800', '36322495', '36322470', '36321981', '36321927', '36321882', '36321654', '36321615', '36321363', '36321205', '36321194', '36320609', '36320346', '36319674', '36319270', '36319269', '36319136', '36319095', '36319045', '36318754', '36318594', '36318545', '36318372', '36317468', '36317413', '36316970', '36316783', '36316708', '36316501', '36316487', '36316461', '36316282', '36316035', '36315527', '36315115', '36314730', '36314503', '36314232', '36314212', '36314211', '36314210', '36314209', '36314208', '36314207', '36314206', '36314205', '36314204', '36314203', '36314202', '36314201', '36314200', '36314199', '36314055', '36313968', '36313967', '36313955', '36313229', '36312018', '36311713', '36311031', '36310167', '3630

In [21]:
overlap=(set(cancer_ids)&set(alz_ids))
print("overlap is:",overlap)

overlap is: {'36321615', '36321363'}


In [16]:
def get_info(pmid, deep = 0):
    time.sleep(1)
    r = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=" + pmid
    )
    
    tree = ET.fromstring(r.text)
    try:
        titles = tree.find('PubmedArticle/MedlineCitation/Article/ArticleTitle')
        title = ET.tostring(titles, method="text").decode()
    except:
        if deep > 5:
            print("Error: {} No tittle max deep reached".format(pmid))
            title = ""
        else:
            return get_info(pmid, deep + 1)
    abstracts = tree.find('PubmedArticle/MedlineCitation/Article/Abstract/AbstractText')
    try:
        abstract = ET.tostring(abstracts, method="text").decode()
    except:
        if deep > 5:
            print("Error: {} No abstract max deep reached".format(pmid))
            abstract = ""
        else:
            return get_info(pmid, deep + 1)
    return title, abstract

In [17]:
def json_output():
    dictionary = {}
    for i in alz_ids:
        title, abstract = get_info(i)
        dictionary[i] = {'ArticleTitle': title, 'ArticleAbstract': abstract, 'query': 'Alzheimer'}
    for i in cancer_ids:
        title, abstract = get_info(i)
        dictionary[i] = {'ArticleTitle': title, 'ArticleAbstract': abstract, 'query': 'Cancer'}
    json_obj = json.dumps(dictionary, indent=4)
    
    print(dictionary)
    with open("alz&cancer.json", "w") as outfile:
        outfile.write(json_obj)

In [18]:
json_output()

Error: 36319136 No abstract max deep reached
Error: 36316783 No abstract max deep reached
Error: 36314232 No abstract max deep reached
Error: 36310167 No abstract max deep reached
Error: 36306458 No abstract max deep reached
Error: 36304998 No abstract max deep reached
Error: 36299613 No abstract max deep reached
Error: 36284665 No abstract max deep reached
Error: 36284252 No abstract max deep reached
Error: 36284251 No abstract max deep reached
Error: 36281688 No abstract max deep reached
Error: 36281687 No abstract max deep reached
Error: 36281660 No abstract max deep reached
Error: 36281659 No abstract max deep reached
Error: 36281658 No abstract max deep reached
Error: 36278375 No abstract max deep reached
Error: 36278006 No abstract max deep reached
Error: 36275013 No abstract max deep reached
Error: 36271398 No abstract max deep reached
Error: 36270305 No abstract max deep reached
Error: 36268432 No abstract max deep reached
Error: 36266381 No abstract max deep reached
Error: 362

In [27]:
print("Overlap:",set(alz_ids) & set(cancer_ids))

Overlap: {'36321615', '36321363'}
