In [90]:
import requests
import pandas as pd
import json
try:
    from StringIO import StringIO ## for Python 2
except ImportError:
    from io import StringIO ## for Python 3
    
import os
import glob
import numpy as np
import ast
from tqdm import tqdm_notebook as tq
import sys
sys.path.append("../Files/")
sys.path.append('../Code/')
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3

In [91]:
def makeConfDict(confPath='/Users/yovosko/public_html/Hegemon/explore.conf',skipTest=True,overWrite=False):
    """
    makeConfDict
        Parse one or more explore.conf files into a nested dictionary.

        Inputs:
            confPath: str or list (of strs), path to the explore.conf file with the wanted information,
                            can also be a list of multiple paths to multiple explore.conf files
            skipTest: boolean, if True will skip the Test dataset (section id 'T1') in the confPath file
                    default: True
            overWrite: boolean, controls what happens with multiple instances of the same id,
                            if True the newest section replaces the earlier one
                            if False the first section is kept and later duplicates are ignored
                            default: False
        Outputs:
            dictionary keyed by dataset id; each value is a dictionary holding whichever of
            'Name', 'Expr', 'Index', 'Survival', 'IndexHeader', 'Info', 'Keys', 'Source'
            were set for that dataset in the file(s)
        Notes:
            Option names are matched exactly (text left of the first '='), so an
            'indexHeader' line no longer also overwrites 'Index', and a value that
            itself contains '=' is kept whole.
    """
    # Option name as written in the file -> key used in the returned dictionary.
    optionToField={'name':'Name','expr':'Expr','index':'Index','survival':'Survival',
                   'indexHeader':'IndexHeader','info':'Info','key':'Keys','keys':'Keys',
                   'source':'Source'}
    confDict={}
    if isinstance(confPath,(list,tuple)):
        confPathLst=list(confPath)
    else:
        confPathLst=[confPath]
    for confPathIn in confPathLst:
        # Section currently being filled; None means "ignore option lines"
        # (no header seen yet, skipped test dataset, or ignored duplicate).
        current=None
        with open(confPathIn) as f:
            for line in f:
                stripped=line.strip()
                if stripped.startswith('['):
                    idd=stripped.split(']')[0].split('[')[1]
                    if (skipTest and idd=='T1') or (idd in confDict and not overWrite):
                        current=None
                    else:
                        confDict[idd]={}
                        current=confDict[idd]
                    continue
                if current is None or '=' not in stripped:
                    continue
                # Split on the FIRST '=' only so values such as URLs survive intact.
                option,_,optValue=stripped.partition('=')
                field=optionToField.get(option.strip())
                if field is not None:
                    current[field]=optValue.strip()
    return confDict

def getValueFromConfDict(value='Source',idd='All',confDict={},returnType='dict',
                         confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getValueFromConfDict
        Inputs:
            value, str, value key that has the info wanted, default='Source'
            idd, str, Id of dataset with the wanted info,
                default='All'
                if 'All' will return a dictionary or list of values for all datasets,
                    with the dataset Ids as the keys of the dictionary if returnType=='dict'
                    and a list of the unique values if returnType=='list'
            confDict, dictionary, dictionary with info for datasets in confPath file,
                default={}
                if confDict is empty will create a new one from confPath (will increase total runtime)
                to save runtime use makeConfDict() to create a confDict beforehand
                (the shared default dictionary is never modified here)
            returnType, str, if idd=='All' then tells if to return info as a dictionary or a list of the values,
                default='dict'
                must be either 'list' or 'dict'
            confPath, str, path to explore.conf file to be used if not given confDict,
                default='/Users/yovosko/public_html/Hegemon/explore.conf'
        Output:
            the information for the selected value for the dataset selected in idd
            or a dictionary/list of those values if idd='All'
            (datasets that lack the value contribute '' when idd='All')
        Raises:
            KeyError if a specific idd, or the value for that idd, is missing
            ValueError if idd=='All' and returnType is neither 'dict' nor 'list'
    """
    if not confDict:
        confDict=makeConfDict(confPath)
    if idd!='All':
        return confDict[idd][value]
    if returnType not in ('dict','list'):
        # Previously this fell through to an UnboundLocalError on the return statement.
        raise ValueError("returnType must be 'dict' or 'list', got "+repr(returnType))
    perDataset={}
    for datasetId in confDict:
        perDataset[datasetId]=confDict[datasetId].get(value,'')
    if returnType=='dict':
        return perDataset
    # Unique values in first-seen order (deterministic, unlike list(set(...))).
    seen=set()
    uniqueVals=[]
    for val1 in perDataset.values():
        if val1 not in seen:
            seen.add(val1)
            uniqueVals.append(val1)
    return uniqueVals

def getDatasetExpressionFile(idd,confDict={},returnType='dict',
                             confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Return the 'Expr' entry for dataset `idd` (or for every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded unchanged to getValueFromConfDict
    with value='Expr'.
    """
    return getValueFromConfDict('Expr',idd,confDict,returnType,confPath)

def getDatasetIndexFile(idd,confDict={},returnType='dict',
                        confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Return the 'Index' entry for dataset `idd` (or for every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded unchanged to getValueFromConfDict
    with value='Index'.
    """
    return getValueFromConfDict('Index',idd,confDict,returnType,confPath)

def getDatasetSurvivalFile(idd,confDict={},returnType='dict',
                           confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Return the 'Survival' entry for dataset `idd` (or for every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded unchanged to getValueFromConfDict
    with value='Survival'.
    """
    return getValueFromConfDict('Survival',idd,confDict,returnType,confPath)

def getDatasetIndexHeaderFile(idd,confDict={},returnType='dict',
                              confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Return the 'IndexHeader' entry for dataset `idd` (or for every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded unchanged to getValueFromConfDict
    with value='IndexHeader'.
    """
    return getValueFromConfDict('IndexHeader',idd,confDict,returnType,confPath)

def getDatasetInfoFile(idd,confDict={},returnType='dict',
                       confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Return the 'Info' entry for dataset `idd` (or for every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded unchanged to getValueFromConfDict
    with value='Info'.
    """
    return getValueFromConfDict('Info',idd,confDict,returnType,confPath)

def getDatasetKeys(idd,confDict={},returnType='dict',
                   confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Return the 'Keys' entry for dataset `idd` (or for every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded unchanged to getValueFromConfDict
    with value='Keys'.
    """
    return getValueFromConfDict('Keys',idd,confDict,returnType,confPath)

def getDatasetSource(idd,confDict={},returnType='dict',
                     confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Return the 'Source' entry for dataset `idd` (or for every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded unchanged to getValueFromConfDict
    with value='Source'.
    """
    return getValueFromConfDict('Source',idd,confDict,returnType,confPath)

def getDatasetIdFromSource(source,confDict={},confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetIdFromSource
        Inputs:
            source, str, source which you want to get the Id for
            confDict, dictionary, dictionary with info for datasets in confPath file,
                default={}
                if confDict is empty will create a new one from confPath (will increase total runtime)
                to save runtime use makeConfDict() to create a confDict beforehand
            confPath, str, path to explore.conf file to be used if not given confDict,
                default='/Users/yovosko/public_html/Hegemon/explore.conf'
        Output:
            returns a list of all dataset Ids that have data from that source
            will include datasets which have data from multiple sources if source is one of them
            (a 'Source' entry is treated as a whitespace-separated list of source ids;
            any run of spaces or tabs counts as one separator)
            returns an empty list, and prints a hint, when nothing matches
    """
    if not confDict:
        confDict=makeConfDict(confPath)
    out=[]
    for datasetId in confDict:
        if 'Source' in confDict[datasetId]:
            # split() instead of split(' ') so tab- or multi-space-separated ids still match.
            if source in confDict[datasetId]['Source'].split():
                out.append(datasetId)
    if len(out)==0:
        print('Source '+str(source)+' not in this explore.conf file, check if you gave the correct confPath, and try setting confDict={}')
    return out

In [136]:
def getHegemonDataset(dbid, urlbase=None):
    """Fetch the 'getdatasetjson' record for dataset `dbid` and return the decoded JSON.

    Inputs:
        dbid, str, dataset id appended to the query string
        urlbase, str or None, address of the explore.php endpoint.
            When None, a notebook-level variable named `urlbase` is used if one
            exists (this is what the function silently relied on before);
            otherwise 'http://hegemon.ucsd.edu/Tools/explore.php' is used.
    Output:
        whatever object the JSON reply decodes to
    """
    if urlbase is None:
        # Previously `urlbase` was read as an undeclared global, which raised
        # NameError unless some other cell happened to define it.
        urlbase = globals().get('urlbase', 'http://hegemon.ucsd.edu/Tools/explore.php')
    url = urlbase + "?go=getdatasetjson&id=" + dbid
    response = requests.get(url)
    obj = json.loads(response.text)
    return  obj

def getHegemonDataExpression(dbid, urlbase):
    """POST a 'dataDownload' request for `dbid` to `urlbase` and parse the reply.

    The response body is read as tab-separated text and returned as a
    pandas DataFrame.
    """
    payload = dict(go='dataDownload', id=dbid)
    reply = requests.post(urlbase, data=payload)
    return pd.read_csv(StringIO(reply.text), sep="\t")

def getHegemonPatientInfo(dbid, urlbase):
    """GET the 'getpatientinfojson' reply for `dbid` and return the decoded JSON.

    Callers in this notebook iterate the result and use its entries as
    column names, so it is presumably a list of header strings.
    """
    query = "".join([urlbase, "?go=getpatientinfojson", "&id=", dbid])
    reply = requests.get(query)
    return json.loads(reply.text)

def getHegemonPatientData(dbid, name, urlbase):
    """Fetch the patient-data column called `name` for dataset `dbid`.

    The header is downloaded first via getHegemonPatientInfo; the position of
    `name` in that header is sent as the `clinical` query parameter.
    Returns the decoded JSON reply, or None when `name` is not in the header.
    """
    header = getHegemonPatientInfo(dbid, urlbase)
    if name not in header:
        return None
    position = header.index(name)
    query = "".join([urlbase, "?go=getpatientdatajson",
                     "&id=", dbid, "&clinical=", str(position)])
    reply = requests.get(query)
    return json.loads(reply.text)

def getHegemonDataSurvival(dbid,urlbase):
    """Download every patient-information column of `dbid` and merge them into one table.

    Inputs:
        dbid, str, dataset id
        urlbase, str, address of the explore.php endpoint
    Output:
        DataFrame with an 'ArrayId' column plus one column per header entry
        (outer-merged on 'ArrayId'), or None when there is nothing to download.

    The header is fetched once and each column is requested by its position in
    that header. Earlier this went through getHegemonPatientData, which
    re-downloaded the header for every single column.
    """
    colLst=getHegemonPatientInfo(dbid,urlbase)
    df=None
    for clinical,col in enumerate(colLst):
        # Placeholder entry of the header, not a real data column.
        if col=='Select Patient Information':
            continue
        url=urlbase+"?go=getpatientdatajson"+"&id="+dbid+"&clinical="+str(clinical)
        obj=json.loads(requests.get(url).text)
        if not obj:
            # Nothing came back for this column; skip it instead of crashing below.
            continue
        # NOTE(review): assumes the reply is a pair of equally long lists
        # (array ids, values) — TODO confirm against the server output.
        df1=pd.DataFrame(obj).T
        df1.columns=['ArrayId',col]
        # Rows labelled 0 and 1 are dropped; presumably they carry header
        # entries rather than samples — verify.
        df1=df1.drop([0,1])
        if df is None:
            df=df1
        else:
            df=df.merge(df1,how='outer',on='ArrayId')
    return df

def getHegemonData(source,save=True,verbose=True,confPathLst=None,saveDir='../Files/Downloads/'):
    """Download the expression and annotation tables for the dataset matching `source`.

    Inputs:
        source, str, source id looked up in the 'Source' entries of the conf files
        save, boolean, if True both tables are written as CSV into saveDir
            as <source>_expression_data.csv and <source>_annotation_data.csv
            default: True
        verbose, boolean, if True progress messages are printed
            default: True
        confPathLst, list of str or None, explore.conf files to search
            default: None, which means
            ['../Files/Yeti_explore.conf','../Files/Debashis_explore.conf']
        saveDir, str, directory the CSV files are written to (created if missing)
            default: '../Files/Downloads/'
    Output:
        tuple (df, survival): the expression DataFrame and the annotation
        DataFrame (survival is None if the dataset has no annotation columns)
    Raises:
        ValueError if no dataset id is found for `source`; any error raised
        while reading the conf files or downloading is printed and re-raised.
        Earlier such errors were only printed and the function then died with
        an unrelated UnboundLocalError.
    """
    if confPathLst is None:
        confPathLst=['../Files/Yeti_explore.conf','../Files/Debashis_explore.conf']
    if verbose:
        print('Finding Dataset')
    try:
        confDict=makeConfDict(confPathLst)
        dbidLst=getDatasetIdFromSource(source,confDict)
        if len(dbidLst)==0:
            raise ValueError('no dataset id found for source '+str(source))
        # When several datasets list this source, the first one found is used.
        dbid=dbidLst[0]
    except Exception as e:
        print('Error with given source:',e)
        raise
    # Datasets whose expression path mentions 'Yotam' are served from the
    # personal Hegemon instance, everything else from the shared one.
    if 'Yotam' in confDict[dbid].get('Expr',''):
        urlbase='http://hegemon.ucsd.edu/~yovosko/Hegemon/explore.php'
    else:
        urlbase='http://hegemon.ucsd.edu/Tools/explore.php'
    if verbose:
        print('Getting Data Files')
    try:
        df=getHegemonDataExpression(dbid,urlbase)
    except Exception as e:
        print('Error getting the Expression Data:',e)
        raise
    try:
        survival=getHegemonDataSurvival(dbid,urlbase)
    except Exception as e:
        print('Error getting the Annotation Data:',e)
        raise
    if save:
        if verbose:
            print('Saving to '+saveDir)
        if not os.path.isdir(saveDir):
            os.makedirs(saveDir)
        df.to_csv(os.path.join(saveDir,source+'_expression_data.csv'),index=False)
        if survival is not None:
            survival.to_csv(os.path.join(saveDir,source+'_annotation_data.csv'),index=False)
    if verbose:
        print('Finished')
    return df,survival

In [137]:
# Scratch cell: build the combined configuration dictionary from the two local
# explore.conf copies and take the first dataset id whose 'Source' lists `i`.
# NOTE(review): `i` is not assigned in this cell or in any cell before it; it
# appears to be the loop variable left over from `for i in tq(data.index)` in a
# later cell, so this cell depends on out-of-order execution.
confPathLst=['../Files/Yeti_explore.conf','../Files/Debashis_explore.conf']
confDict=makeConfDict(confPathLst)
dbid=getDatasetIdFromSource(i,confDict)[0]

In [138]:
# Scratch cell: repeat the id lookup for `i`.
# The result of the first call is discarded (it is not the last expression of
# the cell); only the second call's first id is kept in `dbid`.
# NOTE(review): relies on `i` and `confDict` left in the kernel by other cells.
getDatasetIdFromSource(i,confDict)
dbid=getDatasetIdFromSource(i,confDict)[0]

In [139]:
# Scratch cell: the same four statements as the body of getHegemonDataExpression,
# run by hand for the current `dbid`.
# NOTE(review): `urlbase` is not assigned in any visible notebook-level cell
# (it is only a local inside getHegemonData), so this depends on hidden kernel state.
opt = {'go': 'dataDownload', 'id': dbid}
response = requests.post(urlbase, opt)
data = StringIO(response.text)
df = pd.read_csv(data, sep="\t")

ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

In [122]:
# Load the dataset-information table; the first CSV column becomes the index,
# which the download loop later iterates over as the list of source ids.
# NOTE(review): this rebinds `data`, which the manual-download cell uses for a StringIO buffer.
data=pd.read_csv('../Files/MacrophageExercise_DatasetsInfo.csv',index_col=0)

In [155]:
# Display the dataset-information table.
data

Unnamed: 0_level_0,Number Samples,Species,Experiment Type,Tissues,Genders,Ages,Citation,Exercise Type
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
E-MEXP-740,32,Homo sapiens,transcription profiling by array,{},{'Male': 32},{'69 to 73': 32},https://doi.org/10.1093/gerona/62.10.1088,Resistance/Strength
GSE104079,58,Mus musculus,Expression profiling by array,"{'Soleus': 27, 'Liver': 31}",{'Male': 58},{'6 month old': 58},https://doi.org/10.1096/fj.201701378RR,Aerobic/Endurance
GSE104999,24,Homo sapiens,Expression profiling by array,{'Vastus': 24},{'Male': 24},{},https://doi.org/10.1152/ajpregu.00452.2017,Immobilization
GSE109657,22,Homo sapiens,Expression profiling by array,{'Vastus': 22},{'Male': 22},{},https://doi.org/10.1038/s41598-018-35115-x,HITT
GSE110747,44,Mus musculus,Expression profiling by array,{'Liver': 44},{'Male': 44},{'11-12 weeks': 44},https://doi.org/10.3390/nu10050547,Aerobic/Endurance
...,...,...,...,...,...,...,...,...
GSE87748,20,Homo sapiens,Expression profiling by high throughput sequen...,{'Skeletal': 20},{},{},https://doi.org/10.3389/fendo.2016.00165,Resistance/Strength
GSE9103,40,Homo sapiens,Expression profiling by array,{},{},{},https://doi.org/10.2337/db08-0349,Aerobic/Endurance
GSE9405,21,Homo sapiens,Expression profiling by array,{},{},{},https://doi.org/10.1249/MSS.0b013e31818c6be9,Mixed
GSE97084,103,Homo sapiens,Expression profiling by high throughput sequen...,{},{},"{'Young': 57, 'Old': 46}",https://doi.org/10.1016/j.cmet.2017.02.009,Mixed


In [157]:
# Source ids that already have at least one file in the download folder.
# File names have the form <source>_expression_data.csv / <source>_annotation_data.csv,
# so the id is the part of the file name before the first underscore.
# os.path.basename is used instead of splitting on '\\' so the directory part
# is also stripped on non-Windows systems; otherwise nothing is ever recognised
# as done and every dataset is downloaded again.
done=glob.glob('../Files/Downloads/*')
done=set(os.path.basename(d).split('_')[0] for d in done)
for i in tq(data.index):
    # NOTE(review): GSE155933 is skipped explicitly; the reason is not recorded
    # in this notebook — presumably its download fails. Confirm and document.
    if i not in done and i!='GSE155933':
        print(i)
        df,survival=getHegemonData(i,verbose=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tq(data.index):


  0%|          | 0/75 [00:00<?, ?it/s]

GSE165630
GSE16907
GSE17190
GSE178262
GSE1786
GSE179394
GSE1832
GSE19062
GSE19420
GSE198266
GSE199225
GSE21496
GSE221210
GSE230102
GSE236600
GSE23697
GSE24235
GSE242354
GSE250122
GSE252357
GSE27285
GSE27536
GSE28422
GSE28498
GSE28998
GSE33603
GSE33886
GSE34788
GSE3606
GSE40551
GSE41769
GSE4252
GSE43219
GSE43760
GSE43856
GSE44051
GSE467
GSE51216
GSE53598
GSE58249
GSE59088
GSE59363
GSE60591
GSE68072
GSE68585
GSE71972
GSE72462
GSE7286
GSE83352
GSE83578
GSE8479
GSE87748
GSE9103
GSE9405
GSE97084
GSE99963


In [156]:
# Inspect the current value of the loop variable `i`.
i

'GSE155933'