# This notebook shows how we can query NCBI's API to obtain GEO (and SRA) metadata.

In [1]:
#!/usr/bin/env python

"""
testing a new GEO parser.
"""
import argparse
import xml.etree.ElementTree as et 
import os
import glob
from xml.dom import minidom
from collections import defaultdict
import yaml

from yaml.representer import Representer
yaml.add_representer(defaultdict, Representer.represent_dict)

import json
import pandas as pd
import urllib
import xmltodict

In [2]:
def esearch(term, db='gds'):
    """
    Queries NCBI using the esearch utility. GEO ('gds') database is used as default for search term.

    Args:
        term: Search term (e.g. a GEO accession). It is percent-encoded here,
            so pass it in plain, unencoded form.
        db: Name of the Entrez database to search.

    Returns:
        The raw body of the HTTP response (the bytes read from the connection).
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `request` and `parse` submodules have been loaded.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    # usehistory=y asks the server to keep the result set so that it can be
    # referenced later through the WebEnv/QueryKey tokens in the response.
    query = urlencode({'db': db, 'term': term, 'retmax': 5000, 'usehistory': 'y'})
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{query}'
    with urlopen(url) as response:  # close the connection once the body is read
        return response.read()

    
def get_esummary(esearch_string, db='gds'):
    """
    Parses a http response in XML format to obtain the webenv and querykey tokens.
    Uses NCBI eutils to transform these tokens into web summaries of GEO (db='gds') datasets.

    Args:
        esearch_string: XML text containing a WebEnv and a QueryKey element.
        db: Name of the Entrez database the summaries are requested from.

    Returns:
        The raw body of the esummary HTTP response (the bytes read from the connection).

    Raises:
        IndexError: If the XML has no WebEnv or QueryKey element.
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `request` and `parse` submodules have been loaded.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    xmldoc = minidom.parseString(esearch_string)
    webenv = xmldoc.getElementsByTagName('WebEnv')[0].firstChild.data
    querykey = xmldoc.getElementsByTagName('QueryKey')[0].firstChild.data

    host = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # urlencode escapes any reserved characters in the token values.
    params = urlencode({
        'db': db,
        'version': '2.0',
        'query_key': querykey,
        'WebEnv': webenv,
    })
    with urlopen(f'{host}?{params}') as response:  # close the connection once read
        return response.read()



In [14]:
def parse_geo_esummary(input_string):
    """
    Splits a GEO esummary response (XML) into series, sample and platform metadata.

    Args:
        input_string: XML text with an eSummaryResult/DocumentSummarySet
            structure containing one or more DocumentSummary elements.

    Returns:
        A tuple (series_metadata, sample_metadata, platform_metadata):
        series_metadata is a dict with 'accession', 'title' and 'description'
        (empty if no GSE entry is present); sample_metadata is a list of such
        dicts with an additional 'SRA' key ("" when no SRA relation is listed);
        platform_metadata is a list of such dicts for GPL entries.

    Raises:
        KeyError: If a sample has no ExtRelation entry or the entry is malformed.
        TypeError: If a sample's ExtRelations element is empty.
    """
    o = xmltodict.parse(input_string)
    summaries = o['eSummaryResult']['DocumentSummarySet']['DocumentSummary']
    # xmltodict only builds a list for repeated tags; a lone DocumentSummary
    # comes back as a mapping, and looping over it would yield its keys.
    if isinstance(summaries, dict):
        summaries = [summaries]

    series_metadata = {}  # should only be one series per xml string
    sample_metadata = []  # one or more samples
    platform_metadata = []  # one or more associated platforms

    for document_summary in summaries:
        acc = document_summary['Accession']
        record = {
            'accession': acc,
            'title': document_summary['title'],
            'description': document_summary['summary'],
        }
        if acc.startswith('GSE'):  # Series
            series_metadata = record
        elif acc.startswith('GSM'):  # Sample
            try:
                relations = document_summary['ExtRelations']['ExtRelation']
                # Same single-vs-repeated shape as above: normalise to a list
                # so that a sample with several relations is handled too.
                if isinstance(relations, dict):
                    relations = [relations]
                sra = next(
                    (rel['TargetObject'] for rel in relations if rel['RelationType'] == 'SRA'),
                    "",
                )
            except KeyError:
                print(f"Error parsing GEO Summary. No known SRA or malformed entry {acc}.")
                raise
            except TypeError:
                print(f"Error parsing GEO Summary {acc}. Make sure the GEO accession ID is not a superSeries.")
                raise
            sample_metadata.append({**record, 'SRA': sra})
        elif acc.startswith('GPL'):  # Platform
            platform_metadata.append(record)

    return series_metadata, sample_metadata, platform_metadata


def efetch(srx, db='sra'):
    """
    Queries NCBI using the efetch utility. SRA ('sra') database is used as default.

    Args:
        srx: Identifier sent as the `id` query parameter. It is percent-encoded
            here, so pass it in plain, unencoded form.
        db: Name of the Entrez database to fetch from.

    Returns:
        The raw body of the HTTP response (the bytes read from the connection).
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `request` and `parse` submodules have been loaded.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    host = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    query = urlencode({'db': db, 'id': srx})
    with urlopen(f'{host}efetch.fcgi?{query}') as response:  # close once read
        return response.read()




In [108]:
def _as_list(node):
    """Wrap a lone xmltodict node in a list; repeated tags already arrive as a list."""
    return node if isinstance(node, list) else [node]


def parse_srx(input_string):
    """
    Extracts the run accessions from an SRA efetch response in XML format.

    Args:
        input_string: XML text holding an EXPERIMENT_PACKAGE_SET document.

    Returns:
        A list of (label, srr) tuples, one per run, where label is the
        experiment title followed by ' (SRR: <srr>)'. The list is empty when
        xmltodict rejects the input with a TypeError.
    """
    accession_metadata = []
    try:
        o = xmltodict.parse(input_string)
    except TypeError as e:
        # Nothing was parsed, so there is nothing to walk: report the problem
        # and return instead of failing later on an unbound name.
        print(e)
        return accession_metadata

    for packages in o['EXPERIMENT_PACKAGE_SET'].values():
        for experiment in _as_list(packages):
            label = experiment['EXPERIMENT']['TITLE']
            for runs in experiment['RUN_SET'].values():
                # One RUN per RUN_SET yields a mapping, several yield a list.
                for run in _as_list(runs):
                    srr = run['IDENTIFIERS']['PRIMARY_ID']
                    accession_metadata.append((label + f' (SRR: {srr})', srr))
    return accession_metadata


In [109]:
# GEO series accession used for the lookup below; the commented-out lines are
# alternative accessions that can be swapped in.
term = 'GSE128781'
# term = 'GSE128854'
# term = 'GSE129609'
# term = 'GSE130119'  # superseries

In [110]:
def get_srr_from_geo_accession(geo_term):
    """
    Looks up a GEO term and collects the SRA run accessions of its samples.

    Args:
        geo_term: Search term passed to esearch against the 'gds' database.

    Returns:
        A list with one entry per sample that has an SRA id; each entry is
        whatever parse_srx returns for that sample's efetch response.
    """
    esummary = get_esummary(esearch(geo_term, db='gds'), db='gds')
    _, sample_metadata, _ = parse_geo_esummary(esummary)

    accessions = []
    for sample in sample_metadata:  # one for each GSM id
        srx = sample['SRA']
        if not srx:
            # No SRA id recorded for this sample, so there is nothing to fetch.
            continue
        try:
            accessions.append(parse_srx(efetch(srx=srx)))
        except IndexError:
            # Best effort: leave this sample out and carry on with the rest.
            continue

    return accessions

In [111]:
# Run the lookup for the accession selected in `term`.
get_srr_from_geo_accession(term)

[[('GSM3684893: NMG_5; Mus musculus; RNA-Seq (SRR: SRR11235407)',
   'SRR11235407')],
 [('GSM3684892: NMG_4; Mus musculus; RNA-Seq (SRR: SRR11235406)',
   'SRR11235406')],
 [('GSM3684891: NMG_3; Mus musculus; RNA-Seq (SRR: SRR11235405)',
   'SRR11235405')],
 [('GSM3684890: NMG_1; Mus musculus; RNA-Seq (SRR: SRR11235404)',
   'SRR11235404')],
 [('GSM3684889: NCPM_4; Mus musculus; RNA-Seq (SRR: SRR11235403)',
   'SRR11235403')],
 [('GSM3684888: NCPM_3; Mus musculus; RNA-Seq (SRR: SRR11235402)',
   'SRR11235402')],
 [('GSM3684887: NCPM_2; Mus musculus; RNA-Seq (SRR: SRR11235401)',
   'SRR11235401')],
 [('GSM3684886: NCPM_1; Mus musculus; RNA-Seq (SRR: SRR11235400)',
   'SRR11235400')],
 [('GSM3684885: PM_4; Mus musculus; RNA-Seq (SRR: SRR11235399)',
   'SRR11235399')],
 [('GSM3684884: PM_3; Mus musculus; RNA-Seq (SRR: SRR11235398)',
   'SRR11235398')],
 [('GSM3684883: PM_2; Mus musculus; RNA-Seq (SRR: SRR11235397)',
   'SRR11235397')],
 [('GSM3684882: PM_1; Mus musculus; RNA-Seq (SRR: SRR