In [1]:
import urllib
import urllib.error
import urllib.parse
import urllib.request
from collections import defaultdict
from xml.dom import minidom

import xmltodict
from tqdm import tnrange, tqdm_notebook

# Search PubMed for a term (eclip)

In [2]:
def esearch(pubmed, db='pubmed', retmax=1000):
    '''
    Query the NCBI ESearch utility and return the raw XML response.

    Parameters
    ----------
    pubmed : str
        Search term. It is URL-encoded before being sent, so spaces and
        characters such as '&' or '#' are safe. A literal '+' is left
        untouched so terms already written as 'a+b' keep working.
    db : str
        Entrez database to search (default 'pubmed').
    retmax : int
        Maximum number of IDs to return.

    Returns
    -------
    bytes
        The unparsed XML body of the ESearch response (the IDs are inside
        its <IdList>; parse it with e.g. xmltodict to get at them).

    Raises
    ------
    urllib.error.HTTPError
        If the request fails and the single retry fails as well.
    '''
    print(f"Starting esearch pubmed {pubmed}")
    host = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    # NOTE(review): the E-utilities docs describe mindate/maxdate as a pair;
    # mindate on its own may be ignored by the server -- confirm.
    query = urllib.parse.urlencode(
        {'db': db, 'term': pubmed, 'retmax': retmax, 'mindate': 2000},
        safe='+',
    )
    url = f'{host}esearch.fcgi?{query}'

    def _fetch(**urlopen_kwargs):
        # The context manager closes the HTTP connection once the body is read.
        with urllib.request.urlopen(url, **urlopen_kwargs) as response:
            return response.read()

    try:
        return _fetch()
    except urllib.error.HTTPError as e:
        print(e)
        # Retry exactly once, with the same 1-second timeout the notebook
        # has always used for the second attempt.
        return _fetch(timeout=1)

# Fetch the raw ESearch XML for 'eclip', then display it as a nested dict.
pubmed_search_xml = esearch('eclip')
xmltodict.parse(pubmed_search_xml)

Starting esearch pubmed eclip


{'eSearchResult': {'Count': '103',
  'RetMax': '103',
  'RetStart': '0',
  'IdList': {'Id': ['36085079',
    '35992092',
    '35927743',
    '35819951',
    '35767654',
    '35641157',
    '35622975',
    '35512829',
    '35507203',
    '35277503',
    '35263585',
    '35236841',
    '34976441',
    '34931239',
    '34857818',
    '34784346',
    '34752747',
    '34732726',
    '34694610',
    '34614161',
    '34503222',
    '34485935',
    '34406415',
    '34314509',
    '34313988',
    '34299096',
    '34235430',
    '34221536',
    '34216543',
    '34171463',
    '34108231',
    '34108230',
    '34075878',
    '34070162',
    '34056657',
    '34039271',
    '34038531',
    '34027039',
    '34017982',
    '33947766',
    '33863890',
    '33826604',
    '33789107',
    '33579825',
    '33471508',
    '33376190',
    '33334306',
    '33287884',
    '33268787',
    '33246929',
    '33242392',
    '33230114',
    '33179042',
    '32848060',
    '32821933',
    '32811564',
    '32728046',

# Search GEO for data


In [3]:
def esearch(term, db='gds', retmax=5000):
    """
    Queries NCBI using the esearch utility. GEO ('gds') database is used as default for search term.

    NOTE: this re-defines the `esearch` from the PubMed cell above; after this
    cell runs, `esearch` refers to this GEO-oriented version.

    Parameters
    ----------
    term : str
        Search term. It is URL-encoded before being sent; a literal '+' is
        left untouched so terms already written as 'a+b' keep working.
    db : str
        Entrez database to search (default 'gds').
    retmax : int
        Maximum number of IDs to return.

    Returns
    -------
    bytes
        The unparsed XML body of the response. The request sets
        usehistory=y, and get_esummary reads the WebEnv and QueryKey
        elements out of this XML.
    """
    print(f"Start esearch GDS ({term})")
    query = urllib.parse.urlencode(
        {'db': db, 'term': term, 'retmax': retmax, 'usehistory': 'y'},
        safe='+',
    )
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{query}'
    # The context manager closes the HTTP connection once the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read()

def get_esummary(esearch_string, db='gds'):
    """
    Parses a http response in XML format to obtain the webenv and querykey tokens.
    Uses NCBI eutils to transform these tokens into web summaries of GEO (db='gds') datasets.

    Parameters
    ----------
    esearch_string : bytes or str
        XML returned by an esearch request. It must contain <WebEnv> and
        <QueryKey> elements.
    db : str
        Entrez database the summaries are requested from (default 'gds').

    Returns
    -------
    bytes or str
        The raw XML body of the esummary response, or the empty string ""
        when either token is missing or empty in `esearch_string`.
    """
    print("Start esummary GDS")
    xmldoc = minidom.parseString(esearch_string)
    # Only the token extraction is guarded: IndexError means the element is
    # absent, AttributeError means it is present but empty (firstChild is None).
    try:
        webenv = xmldoc.getElementsByTagName('WebEnv')[0].firstChild.data
        querykey = xmldoc.getElementsByTagName('QueryKey')[0].firstChild.data
    except (IndexError, AttributeError) as e:
        print(f"Unparsable publication string ({e}, search={esearch_string})")
        return ""

    host = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    params = urllib.parse.urlencode(
        {'db': db, 'version': '2.0', 'query_key': querykey, 'WebEnv': webenv}
    )
    # The context manager closes the HTTP connection once the body is read.
    with urllib.request.urlopen(f'{host}?{params}') as response:
        return response.read()


# GEO search pipeline: esearch -> esummary -> dict.
# esearch here is the GEO ('gds') version defined in this cell; the XML it returns
# carries the WebEnv / QueryKey tokens that get_esummary reads.
search_string = esearch('eclip')  # raw esearch XML for GEO records matching 'eclip'
summaries = get_esummary(search_string)  # raw esummary XML for that search ("" if the tokens could not be read)
d = xmltodict.parse(summaries)  # transform into a python dictionary

Start esearch GDS (eclip)
Start esummary GDS


# Write GEO accession IDs that mention 'eclip' in their summaries.
- May need to filter the data further: I've seen some records mention 'eCLIP' in the abstract but not in the datasets themselves (e.g. a publication that has 'eCLIP' in its abstract, whose GEO data contains both RNA-seq and eCLIP entries). In that case the metadata would be buried somewhere in either the filename itself or in another metadata field.

In [4]:
# Collect the accession of every GEO record whose summary text mentions
# 'eclip' (case-insensitive), then write them to accessions.txt, one per line.
publications = set()

# xmltodict collapses a single <DocumentSummary> child into a dict instead of
# a one-element list, and leaves the key out entirely when there are none.
# Normalise both cases to a list so the loop always sees one record at a time.
document_summaries = d['eSummaryResult']['DocumentSummarySet'].get('DocumentSummary', [])
if not isinstance(document_summaries, list):
    document_summaries = [document_summaries]

for publication in document_summaries:
    # An empty or missing <summary> comes back as None; treat it as no text.
    summary_text = publication.get('summary') or ''
    if 'eclip' in summary_text.lower():
        # Original note here read "contains PMID" -- presumably the record
        # also carries PubMed IDs; verify against the esummary schema.
        publications.add(publication['Accession'])

# Sort before writing: set iteration order varies between kernel sessions,
# and the output file should be identical on every Restart & Run All.
with open('accessions.txt', 'w') as o:
    for p in sorted(publications):
        o.write(f'{p}\n')