# Some helpful functions for querying NCBI's REST API

In [1]:
import urllib
import urllib.parse
import urllib.request
from xml.dom import minidom

import xmltodict

# Query the Gene Expression Omnibus (GEO) using a key term

In [2]:
def esearch(term, db='gds'):
    """
    Query NCBI with the esearch E-utility and return the raw response body.

    The query parameters are URL-encoded, so terms containing spaces,
    brackets or '&' (e.g. 'eCLIP AND Homo sapiens[Organism]') produce a
    valid request instead of a malformed URL.

    Args:
        term: Search term passed to esearch.
        db: Name of the NCBI database to search. Defaults to 'gds' (GEO).

    Returns:
        The body of the HTTP response, as returned by ``response.read()``.
        get_esummary() parses the WebEnv and QueryKey elements out of it.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    # Same parameters, in the same order, as the original hand-built URL.
    query = urllib.parse.urlencode({
        'db': db,
        'term': term,
        'retmax': 5000,
        'usehistory': 'y',
    })
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{query}'
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(url) as response:
        return response.read()


def get_esummary(esearch_string, db='gds'):
    """
    Fetch esummary records for the result set described by an esearch response.

    Parses an esearch response in XML format to obtain its WebEnv and
    QueryKey tokens, then passes those tokens to the NCBI esummary E-utility
    (version 2.0) for the given database.

    Args:
        esearch_string: XML text (str or bytes) returned by esearch().
        db: Name of the NCBI database to summarise. Defaults to 'gds' (GEO).

    Returns:
        The body of the esummary HTTP response, or an empty string when the
        WebEnv/QueryKey tokens are missing or empty in ``esearch_string``.

    Raises:
        xml.parsers.expat.ExpatError: If ``esearch_string`` is not
            well-formed XML.
        urllib.error.URLError: If the esummary request cannot be completed.
    """
    xmldoc = minidom.parseString(esearch_string)
    try:
        # IndexError: the element is absent.
        # AttributeError: the element is present but empty (firstChild is None).
        webenv = xmldoc.getElementsByTagName('WebEnv')[0].firstChild.data
        querykey = xmldoc.getElementsByTagName('QueryKey')[0].firstChild.data
    except (IndexError, AttributeError) as e:
        print(f"Unparsable publication string ({e}, search={esearch_string})")
        return ""

    # Only token extraction is guarded above; network errors propagate.
    query = urllib.parse.urlencode({
        'db': db,
        'version': '2.0',
        'query_key': querykey,
        'WebEnv': webenv,
    })
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{query}'
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(url) as response:
        return response.read()

In [3]:
# Search GEO for the key term, then fetch summaries of the matching records.
key_term = "eCLIP"
esearch_string = esearch(key_term, db='gds')
summary_xml = get_esummary(esearch_string=esearch_string)
# force_list keeps 'DocumentSummary' a list even when the search has exactly
# one hit; by default xmltodict collapses a lone element into a dict, which
# would make the slice below fail.
result = xmltodict.parse(summary_xml, force_list=('DocumentSummary',))
result['eSummaryResult']['DocumentSummarySet']['DocumentSummary'][:3]  # get the first three studies

[{'@uid': '200199161',
  'Accession': 'GSE199161',
  'GDS': None,
  'title': 'Modulation of RNA splicing enhances response to BCL2 inhibition in leukemia',
  'summary': 'This SuperSeries is composed of the SubSeries listed below.',
  'GPL': '11154;24676',
  'GSE': '199161',
  'taxon': 'Homo sapiens',
  'entryType': 'GSE',
  'gdsType': 'Expression profiling by high throughput sequencing; Other',
  'ptechType': None,
  'valType': None,
  'SSInfo': None,
  'subsetInfo': None,
  'PDAT': '2022/10/31',
  'suppFile': 'BW',
  'Samples': {'Sample': [{'Accession': 'GSM5965526',
     'Title': 'eCLIP_MOLM13_RBM10_Replicate_3'},
    {'Accession': 'GSM5965512', 'Title': 'RNAseq_MOLM13_sgRBM10-1_Venetoclax'},
    {'Accession': 'GSM5965498',
     'Title': 'RNAseq_MOLM13_SM09419+Venetoclax_Replicate_2'},
    {'Accession': 'GSM5965515',
     'Title': 'RNAseq_MOLM13_sgROSA_Venetoclax_Replicate_1'},
    {'Accession': 'GSM5965518', 'Title': 'eCLIP_MOLM13_Rabbit_IgG_INPUT'},
    {'Accession': 'GSM5965495', 

# Return the record title (which includes the species name) for an accession ID:

In [4]:
# Look up a single nucleotide accession and show the Title of its summary.
term = "AP025035.1"
esearch_string = esearch(db='nucleotide', term=term)
summary_xml = get_esummary(db='nucleotide', esearch_string=esearch_string)
result = xmltodict.parse(summary_xml)
# The final bare expression is what the notebook displays as the cell output.
result['eSummaryResult']['DocumentSummarySet']['DocumentSummary']['Title']

'Citrobacter freundii RTE-E5 DNA, complete genome: sequence1'