In [1]:
import os
import requests

from tqdm import tqdm


class Pubmed(object):
    """Small client for the NCBI E-utilities HTTP API that collects PubMed IDs.

    The public entry point is :meth:`search`, which runs one query per
    publication year and writes every distinct PMID it receives to a text
    file, one PMID per line.
    """

    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled connection cannot hang a multi-year run indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        self.root_url = "https://eutils.ncbi.nlm.nih.gov/entrez"

    def init_data(self):
        """Placeholder with no behavior; kept so existing callers keep working."""
        pass

    def __query(self, query='', eutils='esearch', db='pubmed', retmode='json', date_range=None, retmax=10, retstart=0, sort='pub_date', usehistory=None, query_key=None, webenv=None, medline = False):
        """Send one GET request to an E-utilities endpoint and return the response.

        Args:
            query: A single ``name=value`` fragment (for example
                ``'term=tooth[Title/Abstract]'``), or ``''`` for none.
            eutils: Endpoint name; ``<root_url>/eutils/<eutils>.fcgi`` is called.
            db: Value of the ``db`` parameter.
            retmode, retmax, retstart, sort, usehistory, query_key, webenv:
                Sent under the same name when truthy, omitted otherwise
                (so the integer ``0`` is omitted but the string ``'0'`` is sent).
            date_range: When given, ``<date_range>[dp]`` is AND-ed onto the
                ``term`` value.
            medline: When true, ``rettype=medline`` is added.

        Returns:
            The ``requests`` response object.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        endpoint = '{root}/eutils/{eutils}.fcgi'.format(root=self.root_url, eutils=eutils)

        # Hand the parameters to requests as a mapping so that it performs the
        # URL escaping; characters such as '&', '#' or '+' inside a search term
        # would otherwise corrupt a hand-concatenated query string.
        params = {'db': db}
        if query:
            name, _, value = query.partition('=')
            params[name] = value

        if date_range:
            date_clause = '{date_range}[dp]'.format(date_range=date_range)
            term = params.get('term')
            # Parenthesise the caller's term so the date filter applies to the
            # whole expression rather than only to its last operand.
            if term:
                params['term'] = '({term}) AND {clause}'.format(term=term, clause=date_clause)
            else:
                params['term'] = date_clause

        optional = (
            ('retmode', retmode),
            ('retmax', retmax),
            ('retstart', retstart),
            ('sort', sort),
            ('usehistory', usehistory),
            ('query_key', query_key),
            ('webenv', webenv),
        )
        for name, value in optional:
            if value:
                params[name] = value
        if medline:
            params['rettype'] = 'medline'

        response = requests.get(endpoint, params=params, timeout=self.REQUEST_TIMEOUT)
        # Fail here, with the HTTP status, instead of later with an unrelated
        # JSON or key error when the body of an error page is parsed.
        response.raise_for_status()
        return response

    def __search(self, query, date_range=None, sort='pub_date', retmax=9000):
        """Return the raw ID lines for ``query``, optionally limited to a date range.

        An ``esearch`` request with ``usehistory='y'`` is issued first to get
        the hit count and the ``querykey``/``webenv`` pair; the hits are then
        downloaded with ``efetch`` in pages of ``retmax``.

        Args:
            query: Search term.
            date_range: Optional date range string, forwarded to the request.
            sort: Sort order forwarded to ``esearch``.
            retmax: Page size used for the ``efetch`` requests.

        Returns:
            A list with the ``efetch`` response bodies split on newlines.
            Entries are not stripped and may include empty strings; an empty
            list is returned when the search has no hits.
        """
        response = self.__query(eutils='esearch',
                                db='pubmed',
                                query='term={query}'.format(query=query),
                                retmode='json',
                                date_range=date_range,
                                retmax='0',  # only the count and history keys are needed here
                                retstart=0,
                                sort=sort,
                                usehistory='y'
                                )
        # Decode the JSON body once instead of once per field.
        result = response.json()['esearchresult']

        count = int(result['count'])
        if count == 0:
            # Nothing to download; return before touching the history keys so
            # an empty result never fails on a missing 'querykey'/'webenv'.
            return []
        query_key = result['querykey']
        webenv = result['webenv']

        id_list = list()
        for offset in range(0, count, retmax):
            # NOTE(review): efetch is asked for retmode='json' while the body
            # is parsed as newline-separated text. Left unchanged because this
            # is what the notebook was run with; confirm against the
            # E-utilities documentation (rettype=uilist with retmode=text is
            # presumably the documented way to obtain a plain ID list).
            response = self.__query(eutils='efetch',
                                    db='pubmed',
                                    retmode='json',
                                    retmax=retmax,
                                    retstart=str(offset),
                                    query_key=query_key,
                                    webenv=webenv
                                    )
            id_list.extend(response.text.split('\n'))

        return id_list

    def search(self, query, query_name, sort='pub_date', min_year=2000, max_year=2025):
        """Collect the PMIDs matching ``query`` year by year and save them to a file.

        One search is run for every year in ``range(min_year, max_year)``
        (``max_year`` itself is not iterated), each limited to the window
        ``<year>/1/1:<year+1>/1/1``.

        Args:
            query: Search term.
            query_name: Label used in the progress bar and in the file name.
            sort: Sort order forwarded to the search request.
            min_year: First year to query.
            max_year: End of the year range (exclusive).

        Returns:
            The name of the file written, ``'<query_name>_pmids.txt'``, created
            relative to the current working directory. It holds one PMID per
            line, without duplicates, in the order first received.
        """
        seen = set()
        query_pmids = list()

        for year in tqdm(iterable=range(min_year, max_year), desc='Retrieving PMIDS for {q}'.format(q=query_name), unit='year'):

            year_pmid_list = self.__search(query=query,
                                           date_range='{y1}/1/1:{y2}/1/1'.format(y1=year, y2=year+1),
                                           sort=sort,
                                           )

            # Consecutive windows share the day <year+1>/1/1, so the same PMID
            # may come back for two different years; keep the first occurrence.
            for item in year_pmid_list:
                pmid = item.strip()
                if pmid and pmid not in seen:
                    seen.add(pmid)
                    query_pmids.append(pmid)

        output_name = '{query_name}_pmids.txt'.format(query_name=query_name)
        with open(output_name, 'w') as f:
            f.writelines('%s\n' % pmid for pmid in query_pmids)

        return output_name
    

if __name__ == "__main__":
    # Earlier run, kept for reference only (not executed):
    #   query='((nephro*[Title/Abstract] OR glomerul*[Title/Abstract]) AND (gene[Title/Abstract] OR *rna[Title/Abstract] OR transcript[Title/Abstract] OR protein[Title/Abstract]) AND English[Language] AND Medline[Filter])'
    #   query_name='nephron', sort='pub_date', min_year=2000, max_year=2025

    # Phrases searched in the Title/Abstract field; each one becomes a
    # parenthesised clause and the clauses are OR-ed together.
    dental_terms = [
        'tooth',
        'teeth',
        'dental',
        'dental enamel',
        'dentin',
        'periodontal ligament',
        'alveolar bone',
        'Periodontal Diseases',
        'oral health',
        'dentistry',
        'odontogenesis',
        'tooth development',
    ]
    teeth_query = ' OR '.join(
        '({term}[Title/Abstract])'.format(term=term) for term in dental_terms
    )

    pubmed = Pubmed()
    pubmed.search(query=teeth_query,
                  query_name='teeth',
                  sort='pub_date',
                  min_year=1980,
                  max_year=2025)

Retrieving PMIDS for teeth: 100%|██████████| 45/45 [02:13<00:00,  2.97s/year]
