In [1]:
import requests
import pandas as pd
import json
try:
    from StringIO import StringIO ## for Python 2
except ImportError:
    from io import StringIO ## for Python 3
    
import os
import glob
import numpy as np
import ast
from tqdm import tqdm_notebook as tq
import sys
sys.path.append("../Files/")
sys.path.append('../Code/')
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3

In [2]:
def makeConfDict(confPath='/Users/yovosko/public_html/Hegemon/explore.conf',skipTest=True,overWrite=False):
    """
    makeConfDict
        Parse one or more explore.conf files into a nested dictionary.

        Inputs:
            confPath: str or list (of strs), path to the explore.conf file with the wanted information,
                            can also be a list of paths to multiple explore.conf files
            skipTest: boolean, if True will skip the Test dataset (T1) in the confPath file
                    default: True
            overWrite: boolean, what to do when the same id appears more than once,
                            if True the newest section replaces the one already stored
                            if False the first section is kept and later ones are ignored
                            default: False
        Outputs:
            dictionary keyed by dataset id; each value is a dictionary holding whichever of
            'Name', 'Expr', 'Index', 'Survival', 'IndexHeader', 'Info', 'Keys', 'Source'
            were present in that dataset's section
    """
    # Option name (the text left of the first '=') -> key used in the output dictionary.
    # Matching the option name exactly keeps e.g. 'indexHeader' from also overwriting 'Index',
    # and keeps words inside a value (a path containing 'info', a title containing 'index')
    # from being mistaken for an option.
    fieldMap={'name':'Name','expr':'Expr','index':'Index','survival':'Survival',
              'indexHeader':'IndexHeader','info':'Info','key':'Keys','keys':'Keys',
              'source':'Source'}
    confDict={}
    if isinstance(confPath,(list,tuple)):
        confPathLst=list(confPath)
    else:
        confPathLst=[confPath]
    for confPathIn in confPathLst:
        # Section currently being filled; None means "ignore option lines" (no section seen
        # yet in this file, a skipped test section, or a duplicate that must not overwrite).
        current=None
        with open(confPathIn) as f:
            for line in f:
                stripped=line.strip()
                # Only a line that starts with '[' is a section header; a '[' inside a
                # value (e.g. in a dataset name) must not start a new section.
                if stripped.startswith('[') and ']' in stripped:
                    idd=stripped[1:stripped.index(']')]
                    if idd in confDict and not overWrite:
                        current=None
                    elif skipTest and idd=='T1':
                        current=None
                    else:
                        confDict[idd]={}
                        current=confDict[idd]
                    continue
                if current is None or '=' not in stripped:
                    continue
                # Split on the FIRST '=' so values that themselves contain '=' stay intact.
                option,_,rawValue=stripped.partition('=')
                field=fieldMap.get(option.strip())
                if field is not None:
                    current[field]=rawValue.strip()
    return confDict

def getValueFromConfDict(value='Source',idd='All',confDict={},returnType='dict',
                         confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getValueFromConfDict
        Inputs:
            value, str, value key that has the info wanted, default='Source'
            idd, str, Id of the dataset with the wanted info,
                default='All'
                if 'All' will return a dictionary or list of values for all datasets,
                    with the dataset Ids as the keys of the dictionary if returnType=='dict'
                    and a list of the unique values (in first-seen order) if returnType=='list'
            confDict, dictionary, dictionary with info for datasets in confPath file,
                default={}
                if confDict is empty will create a new one from confPath (will increase total runtime)
                to save runtime use makeConfDict() to create a confDict beforehand
            returnType, str, if idd=='All' then tells whether to return the info as a dictionary
                or as a list of the values,
                default='dict'
                must be either 'list' or 'dict'; ignored when idd is a single dataset Id
            confPath, str, path to explore.conf file to be used if not given confDict,
                default='/Users/yovosko/public_html/Hegemon/explore.conf'
        Output:
            the information for the selected value for the dataset selected in idd
            or a dictionary/list of those values if idd='All'
            (datasets that lack the value contribute an empty string when idd='All')
        Raises:
            KeyError if a single idd is requested and the id or the value is missing
            ValueError if idd=='All' and returnType is neither 'dict' nor 'list'
    """
    # The default {} is only ever rebound below, never mutated, so sharing it is harmless.
    if len(confDict)==0:
        confDict=makeConfDict(confPath)
    if idd!='All':
        return confDict[idd][value]
    if returnType not in ('dict','list'):
        # Fail with a clear message instead of an unbound-variable error further down.
        raise ValueError("returnType must be 'dict' or 'list', got "+repr(returnType))
    byDataset={}
    uniqueValues=[]
    seen=set()
    for datasetId in confDict:
        found=confDict[datasetId].get(value,'')
        byDataset[datasetId]=found
        # Track first-seen order so the 'list' result is reproducible between runs.
        if found not in seen:
            seen.add(found)
            uniqueValues.append(found)
    if returnType=='dict':
        return byDataset
    return uniqueValues

def getDatasetExpressionFile(idd,confDict={},returnType='dict',
                             confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetExpressionFile
        Shortcut for getValueFromConfDict with value='Expr'.
        Inputs:
            idd, confDict, returnType, confPath: forwarded unchanged to getValueFromConfDict
        Output:
            whatever getValueFromConfDict returns for the 'Expr' entry
    """
    lookup={'value':'Expr','idd':idd,'confDict':confDict,
            'returnType':returnType,'confPath':confPath}
    return getValueFromConfDict(**lookup)

def getDatasetIndexFile(idd,confDict={},returnType='dict',
                        confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetIndexFile
        Shortcut for getValueFromConfDict with value='Index'.
        Inputs:
            idd, confDict, returnType, confPath: forwarded unchanged to getValueFromConfDict
        Output:
            whatever getValueFromConfDict returns for the 'Index' entry
    """
    lookup={'value':'Index','idd':idd,'confDict':confDict,
            'returnType':returnType,'confPath':confPath}
    return getValueFromConfDict(**lookup)

def getDatasetSurvivalFile(idd,confDict={},returnType='dict',
                           confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetSurvivalFile
        Shortcut for getValueFromConfDict with value='Survival'.
        Inputs:
            idd, confDict, returnType, confPath: forwarded unchanged to getValueFromConfDict
        Output:
            whatever getValueFromConfDict returns for the 'Survival' entry
    """
    lookup={'value':'Survival','idd':idd,'confDict':confDict,
            'returnType':returnType,'confPath':confPath}
    return getValueFromConfDict(**lookup)

def getDatasetIndexHeaderFile(idd,confDict={},returnType='dict',
                              confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetIndexHeaderFile
        Shortcut for getValueFromConfDict with value='IndexHeader'.
        Inputs:
            idd, confDict, returnType, confPath: forwarded unchanged to getValueFromConfDict
        Output:
            whatever getValueFromConfDict returns for the 'IndexHeader' entry
    """
    lookup={'value':'IndexHeader','idd':idd,'confDict':confDict,
            'returnType':returnType,'confPath':confPath}
    return getValueFromConfDict(**lookup)

def getDatasetInfoFile(idd,confDict={},returnType='dict',
                       confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetInfoFile
        Shortcut for getValueFromConfDict with value='Info'.
        Inputs:
            idd, confDict, returnType, confPath: forwarded unchanged to getValueFromConfDict
        Output:
            whatever getValueFromConfDict returns for the 'Info' entry
    """
    lookup={'value':'Info','idd':idd,'confDict':confDict,
            'returnType':returnType,'confPath':confPath}
    return getValueFromConfDict(**lookup)

def getDatasetKeys(idd,confDict={},returnType='dict',
                   confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetKeys
        Shortcut for getValueFromConfDict with value='Keys'.
        Inputs:
            idd, confDict, returnType, confPath: forwarded unchanged to getValueFromConfDict
        Output:
            whatever getValueFromConfDict returns for the 'Keys' entry
    """
    lookup={'value':'Keys','idd':idd,'confDict':confDict,
            'returnType':returnType,'confPath':confPath}
    return getValueFromConfDict(**lookup)

def getDatasetSource(idd,confDict={},returnType='dict',
                     confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetSource
        Shortcut for getValueFromConfDict with value='Source'.
        Inputs:
            idd, confDict, returnType, confPath: forwarded unchanged to getValueFromConfDict
        Output:
            whatever getValueFromConfDict returns for the 'Source' entry
    """
    lookup={'value':'Source','idd':idd,'confDict':confDict,
            'returnType':returnType,'confPath':confPath}
    return getValueFromConfDict(**lookup)

def getDatasetIdFromSource(source,confDict={},confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetIdFromSource
        Find every dataset whose 'Source' entry lists the given source.
        Inputs:
            source, str, source whose dataset Id(s) are wanted
            confDict, dictionary, dictionary with info for datasets in confPath file,
                default={}
                an empty confDict is rebuilt from confPath (slower);
                build one beforehand with makeConfDict() to save runtime
            confPath, str, path to explore.conf file, only read when confDict is empty,
                default='/Users/yovosko/public_html/Hegemon/explore.conf'
        Output:
            list of the dataset Ids whose space-separated 'Source' entry contains source
            (so multi-source datasets are included); an empty list, plus a printed hint,
            when nothing matches
    """
    if len(confDict)<1:
        confDict=makeConfDict(confPath)
    # Membership is tested against the space-separated tokens, not as a substring.
    matches=[datasetId for datasetId,entry in confDict.items()
             if 'Source' in entry and source in entry['Source'].split(' ')]
    if not matches:
        print('Source '+source+' not in this explore.conf file, check if you gave the correct confPath, and try setting confDict={}')
    return matches

In [3]:
def getHegemonDataset(dbid, urlbase=None):
    """
    getHegemonDataset
        Request the 'getdatasetjson' record for one dataset and decode it.
        Inputs:
            dbid, str, dataset Id
            urlbase, str, URL of the explore.php endpoint to query,
                default=None
                if None, a notebook-level variable named `urlbase` is used when one exists
                (this is what the function originally relied on), otherwise
                'http://hegemon.ucsd.edu/Tools/explore.php'
        Output:
            the decoded JSON response
    """
    if urlbase is None:
        # Originally `urlbase` was read as an undefined global, so every call raised
        # NameError unless the caller had happened to define it beforehand.
        urlbase = globals().get('urlbase', 'http://hegemon.ucsd.edu/Tools/explore.php')
    url = urlbase + "?go=getdatasetjson&id=" + dbid
    response = requests.get(url)
    obj = json.loads(response.text)
    return  obj

def getHegemonDataExpression(dbid, urlbase):
    """
    getHegemonDataExpression
        POST a 'dataDownload' request for one dataset and parse the reply as a table.
        Inputs:
            dbid, str, dataset Id
            urlbase, str, URL the request is posted to
        Output:
            pandas DataFrame read from the tab-separated response text
    """
    form = dict(go='dataDownload', id=dbid)
    reply = requests.post(urlbase, form)
    return pd.read_csv(StringIO(reply.text), sep="\t")

def getHegemonPatientInfo(dbid, urlbase):
    """
    getHegemonPatientInfo
        Request the 'getpatientinfojson' record for one dataset and decode it.
        Inputs:
            dbid, str, dataset Id
            urlbase, str, URL of the endpoint to query
        Output:
            the decoded JSON response (callers in this notebook treat it as a list of
            annotation column names)
    """
    query = "?go=getpatientinfojson" + "&id=" + dbid
    reply = requests.get(urlbase + query)
    return json.loads(reply.text)

def getHegemonPatientData(dbid, name, urlbase):
    """
    getHegemonPatientData
        Fetch the values of one annotation column for a dataset.
        Inputs:
            dbid, str, dataset Id
            name, str, annotation column wanted; must be one of the entries returned by
                getHegemonPatientInfo for this dataset
            urlbase, str, URL of the endpoint to query
        Output:
            the decoded JSON response of the 'getpatientdatajson' request,
            or None when name is not among the dataset's annotation columns
    """
    header = getHegemonPatientInfo(dbid, urlbase)
    if name not in header:
        return None
    # The server is addressed by the column's position in the header, not by its name.
    position = header.index(name)
    query = "?go=getpatientdatajson" + "&id=" + dbid + "&clinical=" + str(position)
    reply = requests.get(urlbase + query)
    return json.loads(reply.text)

def getHegemonDataSurvival(dbid,urlbase):
    """
    getHegemonDataSurvival
        Assemble every annotation column of a dataset into a single table.
        Inputs:
            dbid, str, dataset Id
            urlbase, str, URL of the endpoint to query
        Output:
            pandas DataFrame with an 'ArrayId' column plus one column per annotation,
            outer-merged on 'ArrayId'; None when there is no annotation column besides
            the 'Select Patient Information' placeholder
    """
    merged=None
    for col in getHegemonPatientInfo(dbid,urlbase):
        if col=='Select Patient Information':
            continue
        column=pd.DataFrame(getHegemonPatientData(dbid=dbid,name=col,urlbase=urlbase)).T
        column.columns=['ArrayId',col]
        # Rows 0 and 1 are discarded (presumably header rows of the payload - TODO confirm).
        column=column.drop([0,1])
        merged=column if merged is None else merged.merge(column,how='outer',on='ArrayId')
    return merged

def getHegemonData(source,save=True,verbose=True):
    """
    getHegemonData
        Download the expression table and the annotation table for one source.
        Inputs:
            source, str, source Id (e.g. a GSE Id) looked up in the explore.conf files
                './Files/Yeti_explore.conf' and './Files/Debashis_explore.conf'
            save, boolean, if True also writes both tables as CSV into './Files/Downloads/'
                (the folder is created when missing),
                default: True
            verbose, boolean, if True prints progress messages,
                default: True
        Output:
            (expression DataFrame, annotation DataFrame)
        Raises:
            ValueError if no dataset in the explore.conf files lists source;
            any error from reading the conf files or downloading the data is printed
            and then re-raised, instead of surfacing later as an unrelated
            unbound-variable error
    """
    if verbose:
        print('Finding Dataset')
    try:
        confPathLst=['./Files/Yeti_explore.conf','./Files/Debashis_explore.conf']
        confDict=makeConfDict(confPathLst)
        matches=getDatasetIdFromSource(source,confDict)
        if len(matches)==0:
            raise ValueError('no dataset found for source '+str(source))
        dbid=matches[0]
    except Exception as e:
        print('Error with given source:',e)
        # Nothing below can work without confDict/dbid, so stop with the real error.
        raise
    if 'Yotam' in confDict[dbid]['Expr']:
        urlbase='http://hegemon.ucsd.edu/~yovosko/Hegemon/explore.php'
    else:
        urlbase='http://hegemon.ucsd.edu/Tools/explore.php'
    if verbose:
        print('Getting Data Files')
    try:
        df=getHegemonDataExpression(dbid,urlbase)
    except Exception as e:
        print('Error getting the Expression Data:',e)
        raise
    try:
        survival=getHegemonDataSurvival(dbid,urlbase)
    except Exception as e:
        print('Error getting the Annotation Data:',e)
        raise
    if save:
        if verbose:
            print('Saving to /Files/Downloads/')
        outDir='./Files/Downloads/'
        # Create the target folder up front so a finished download is not lost to a
        # missing directory.
        if not os.path.isdir(outDir):
            os.makedirs(outDir)
        df.to_csv(outDir+source+'_expression_data.csv',index=False)
        survival.to_csv(outDir+source+'_annotation_data.csv',index=False)
    if verbose:
        print('Finished')
    return df,survival

In [4]:
# To download data, pass a GSE Id from the MacrophageExercise_DatasetsInfo.csv file as `source`.
# With save=True the expression and annotation tables are also written as CSV files
# to the Files/Downloads/ folder; both tables are returned either way.
expr,annot=getHegemonData(source='GSE165630',save=True)

Finding Dataset
Getting Data Files
Saving to /Files/Downloads/
Finished


In [5]:
# Show the expression table returned by getHegemonData (ProbeID, Name, then one column per GSM sample).
expr

Unnamed: 0,ProbeID,Name,GSM5047132,GSM5047133,GSM5047134,GSM5047135,GSM5047136,GSM5047137,GSM5047138,GSM5047139,GSM5047140,GSM5047141,GSM5047142,GSM5047143,GSM5047144,GSM5047145
0,ENSG00000261103,RP11-298D21.3,0.270013,0.329700,0.000000,0.331363,0.000000,0.114259,0.105969,0.122967,0.419784,0.098124,0.340350,0.251091,0.151185,0.000000
1,ENSG00000110514,MADD,5.640445,5.714369,5.650003,5.800607,5.440543,6.200241,6.184283,6.015788,6.325475,6.115614,6.154632,6.287033,6.315827,6.068513
2,ENSG00000268358,FLJ11235,0.782715,1.158603,1.209941,0.755774,1.096935,0.657123,0.383904,0.236272,0.815482,0.357860,0.529511,0.464891,0.288018,0.278081
3,ENSG00000086015,MAST2,6.841908,6.802308,7.287654,7.495455,7.275259,8.425004,8.676994,8.292189,8.509356,8.487953,8.348518,8.198981,8.647643,8.438690
4,ENSG00000211769,TRBJ2-5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57226,ENSG00000268351,AP001888.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57227,ENSG00000182872,RBM10,5.752090,5.183054,5.267993,5.302757,5.998413,5.315403,4.937739,5.172133,5.157910,5.177538,5.382244,5.191797,5.026691,5.411412
57228,ENSG00000182873,RP11-181G12.2,0.693726,0.944340,0.976496,1.140739,1.096935,1.312402,0.753564,0.530932,0.419784,0.707903,0.529511,0.735846,0.634505,0.802428
57229,ENSG00000182870,GALNT9,0.000000,0.141042,0.000000,0.000000,0.000000,0.114259,0.105969,0.122967,0.000000,0.000000,0.000000,0.131001,0.000000,0.278081


In [6]:
# Show the annotation table returned by getHegemonData (one row per ArrayId).
annot

Unnamed: 0,ArrayId,time,status,c title,c source_name (ch1),c organism (ch1),c taxid (ch1),c gender (ch1),c age (years) (ch1),c % of resistance training (ch1),c % of endurance training (ch1),c average weekly hours of training (ch1),c molecule (ch1),c group (ch1)
0,GSM5047132,,,muscle of sedentary aged subject # 1,muscle vastus lateralis,Homo sapiens,9606,male,70.0,0.0,0.0,0.0,total RNA,sedentary
1,GSM5047133,,,muscle of sedentary aged subject # 2,muscle vastus lateralis,Homo sapiens,9606,male,77.0,0.0,0.0,0.0,total RNA,sedentary
2,GSM5047134,,,muscle of sedentary aged subject # 3,muscle vastus lateralis,Homo sapiens,9606,male,72.0,0.0,0.0,0.0,total RNA,sedentary
3,GSM5047135,,,muscle of sedentary aged subject # 4,muscle vastus lateralis,Homo sapiens,9606,male,75.0,0.0,0.0,0.0,total RNA,sedentary
4,GSM5047136,,,muscle of sedentary aged subject # 5,muscle vastus lateralis,Homo sapiens,9606,male,71.0,0.0,0.0,0.0,total RNA,sedentary
5,GSM5047137,,,muscle of well resistance trained aged subject #1,muscle vastus lateralis,Homo sapiens,9606,male,69.5,100.0,0.0,4.5,total RNA,resistance trained
6,GSM5047138,,,muscle of well resistance trained aged subject #2,muscle vastus lateralis,Homo sapiens,9606,male,66.0,89.0,11.0,9.0,total RNA,resistance trained
7,GSM5047139,,,muscle of well resistance trained aged subject #3,muscle vastus lateralis,Homo sapiens,9606,male,71.6,100.0,0.0,9.0,total RNA,resistance trained
8,GSM5047140,,,muscle of well resistance trained aged subject #4,muscle vastus lateralis,Homo sapiens,9606,male,70.3,65.0,35.0,6.9,total RNA,resistance trained
9,GSM5047141,,,muscle of well endurance trained aged subject #1,muscle vastus lateralis,Homo sapiens,9606,male,69.2,12.5,87.5,16.0,total RNA,endurance trained
