In [1]:
import os
import glob
import pandas as pd
import numpy as np
import requests
import ast
from tqdm import tqdm_notebook as tq
import YetiUtils as yeti
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3
reload(yeti)

<module 'YetiUtils' from '/mnt/booleanfs2/sahoo/Data/BooleanLab/Yotam/YetiUtils.py'>

In [2]:
# Default explore.conf location; the same literal is the default confPath of the functions defined below
confPath='/Users/yovosko/public_html/Hegemon/explore.conf'

In [3]:
def makeConfDict(confPath='/Users/yovosko/public_html/Hegemon/explore.conf',skipTest=True,overWrite=False):
    """
    makeConfDict
        Parse one or more explore.conf files into a dictionary keyed by dataset id.
        Inputs:
            confPath: str or list/tuple (of strs), path to the explore.conf file,
                            can also be a list of paths to several explore.conf files
                            (read in the given order)
            skipTest, boolean, if True will skip the Test dataset (T1) in the confPath file
                    default: True
            overWrite: boolean, what to do when the same id appears more than once,
                            if True the later section replaces the earlier one
                            if False the first section is kept and later ones are ignored
                            default: False
        Outputs:
            dictionary {id: {field: value}} where field is one of
            'Name','Expr','Index','Survival','IndexHeader','Info','Keys','Source'
        Notes:
            - a section starts on a line whose first non-blank character is '['
            - the field is chosen from the text BEFORE the first '=' only, so a value such as
              'Gene expression ...' can no longer be mistaken for an 'expr' line and an
              'indexHeader' line no longer overwrites 'Index'
            - the value is the text after the LAST '=' on the line (unchanged behaviour)
            - field lines that appear before any section header are ignored
    """
    # Longest prefix first so that 'indexHeader' is never matched as 'index'.
    # Prefix (not exact) matching keeps spellings such as 'keys' working.
    fieldPrefixes=(('indexHeader','IndexHeader'),('survival','Survival'),('source','Source'),
                   ('index','Index'),('name','Name'),('expr','Expr'),('info','Info'),('key','Keys'))
    confDict={}
    if isinstance(confPath,(list,tuple)):
        confPathLst=list(confPath)
    else:
        confPathLst=[confPath]
    for confPathIn in confPathLst:
        idd=None  # None means "no active section": following field lines are dropped
        with open(confPathIn) as f:
            for line in f:
                stripped=line.strip()
                if stripped.startswith('['):
                    idd=stripped[1:].split(']')[0]
                    skipped=skipTest and idd=='T1'
                    duplicate=idd in confDict and not overWrite
                    if skipped or duplicate:
                        # do not create a placeholder entry; just ignore the whole section
                        idd=None
                    else:
                        confDict[idd]={}
                    continue
                if idd is None or '=' not in stripped:
                    continue
                key=stripped.split('=')[0].strip()
                value=stripped.split('=')[-1].strip()
                for prefix,field in fieldPrefixes:
                    if key.startswith(prefix):
                        confDict[idd][field]=value
                        break
    return confDict

def getValueFromConfDict(value='Source',idd='All',confDict={},returnType='dict',
                         confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getValueFromConfDict
        Inputs:
            value, str, field name holding the wanted info (e.g. 'Source', 'Expr'), default='Source'
            idd, str, Id of dataset with the wanted info,
                default='All'
                if 'All' will return the values for all datasets:
                    a dictionary keyed by dataset Id if returnType=='dict'
                    a list of the unique values (in no particular order) if returnType=='list'
                    datasets that lack the field contribute the empty string ''
            confDict, dictionary, dictionary with info for datasets in confPath file,
                default={}
                if confDict is empty a new one is created from confPath (will increase total runtime)
                to save runtime use makeConfDict() to create a confDict beforehand
            returnType, str, only used when idd=='All'; must be 'dict' or 'list',
                default='dict'
            confPath, str, path to explore.conf file to be used if not given confDict,
                default='/Users/yovosko/public_html/Hegemon/explore.conf'
        Output:
            the information for the selected value for the dataset selected in idd
            or a dictionary/list of those values if idd='All'
        Raises:
            ValueError if idd=='All' and returnType is neither 'dict' nor 'list'
            KeyError if a specific idd (or its field) is not present in confDict
    """
    if len(confDict)==0:
        confDict=makeConfDict(confPath)
    if idd!='All':
        return confDict[idd][value]
    # Fail early with a clear message instead of an unbound-variable error further down
    if returnType not in ('dict','list'):
        raise ValueError("returnType must be 'dict' or 'list', got "+repr(returnType))
    perDataset={i:confDict[i].get(value,'') for i in confDict}
    if returnType=='dict':
        return perDataset
    return list(set(perDataset.values()))

def getDatasetExpressionFile(idd,confDict={},returnType='dict',
                             confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Look up the 'Expr' field for dataset idd (every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded to getValueFromConfDict with value='Expr'.
    """
    lookup=dict(value='Expr',idd=idd,confDict=confDict,returnType=returnType,confPath=confPath)
    return getValueFromConfDict(**lookup)

def getDatasetIndexFile(idd,confDict={},returnType='dict',
                        confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Look up the 'Index' field for dataset idd (every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded to getValueFromConfDict with value='Index'.
    """
    lookup=dict(value='Index',idd=idd,confDict=confDict,returnType=returnType,confPath=confPath)
    return getValueFromConfDict(**lookup)

def getDatasetSurvivalFile(idd,confDict={},returnType='dict',
                           confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Look up the 'Survival' field for dataset idd (every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded to getValueFromConfDict with value='Survival'.
    """
    lookup=dict(value='Survival',idd=idd,confDict=confDict,returnType=returnType,confPath=confPath)
    return getValueFromConfDict(**lookup)

def getDatasetIndexHeaderFile(idd,confDict={},returnType='dict',
                              confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Look up the 'IndexHeader' field for dataset idd (every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded to getValueFromConfDict with value='IndexHeader'.
    """
    lookup=dict(value='IndexHeader',idd=idd,confDict=confDict,returnType=returnType,confPath=confPath)
    return getValueFromConfDict(**lookup)

def getDatasetInfoFile(idd,confDict={},returnType='dict',
                       confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Look up the 'Info' field for dataset idd (every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded to getValueFromConfDict with value='Info'.
    """
    lookup=dict(value='Info',idd=idd,confDict=confDict,returnType=returnType,confPath=confPath)
    return getValueFromConfDict(**lookup)

def getDatasetKeys(idd,confDict={},returnType='dict',
                   confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Look up the 'Keys' field for dataset idd (every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded to getValueFromConfDict with value='Keys'.
    """
    lookup=dict(value='Keys',idd=idd,confDict=confDict,returnType=returnType,confPath=confPath)
    return getValueFromConfDict(**lookup)

def getDatasetSource(idd,confDict={},returnType='dict',
                     confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """Look up the 'Source' field for dataset idd (every dataset when idd=='All').

    Thin wrapper: all arguments are forwarded to getValueFromConfDict with value='Source'.
    """
    lookup=dict(value='Source',idd=idd,confDict=confDict,returnType=returnType,confPath=confPath)
    return getValueFromConfDict(**lookup)

In [4]:
def getDatasetIdFromSource(source,confDict={},confPath='/Users/yovosko/public_html/Hegemon/explore.conf'):
    """
    getDatasetIdFromSource
        Find every dataset whose 'Source' field lists the given source.
        Inputs:
            source, str, source accession to search for
            confDict, dictionary, dataset info as produced by makeConfDict,
                default={}
                an empty confDict is rebuilt from confPath (slower);
                build it once with makeConfDict() and pass it in to avoid that
            confPath, str, explore.conf file used only when confDict is empty,
                default='/Users/yovosko/public_html/Hegemon/explore.conf'
        Output:
            list of dataset Ids whose 'Source' value, split on single spaces, contains source
            (so datasets built from several sources are included); an empty list, plus a
            printed hint, when nothing matches
    """
    if not len(confDict):
        confDict=makeConfDict(confPath)
    matches=[datasetId for datasetId,entry in confDict.items()
             if 'Source' in entry and source in entry['Source'].split(' ')]
    if not matches:
        print('Source '+source+' not in this explore.conf file, check if you gave the correct confPath, and try setting confDict={}')
    return matches

In [5]:
def getEMtabData(emtab,confDict,verbose=True):
    """getEMtabData
        Download the BioStudies JSON record for an accession and scrape summary fields from its text.
        Inputs:
            emtab, str, accession id for a uk biostudies dataset
            confDict, dictionary, dataset info passed on to getTissSexAge
            verbose, boolean, passed on to getTissSexAge
                    default: True
        Output:
            tuple in this order:
            (accession id, # of samples, species, experiment type,
             tissue dict, gender dict, age dict, vo2 dict, disease dict,
             time dict, training dict, citation)
            citation is a doi.org URL, or 'Missing' when no PMID is found and the
            accession has no hard-coded citation below
    """
    urlBase='https://www.ebi.ac.uk/biostudies/files/'
    url=urlBase+emtab+'/'+emtab+'.json'
    r = requests.get(url)
    text = r.text
    species=text.split('"Organism",\n')[1].split('\n')[0].split(':')[-1].strip().replace('"','')
    samNum=int(text.split('Sample count",\n')[-1].split('\n')[0].split(':')[-1].strip().replace('"',''))
    if 'PMID' in text:
        ref=text.split('PMID",\n')[-1].split('\n')[0].split(':')[-1].strip().replace('"','')

        # resolve the PubMed id to a DOI by scraping the PubMed page
        refId=ref.split('/')[-1]
        urlBase2='https://pubmed.ncbi.nlm.nih.gov/'
        url=urlBase2+refId
        r2 = requests.get(url)
        text2 = str(r2.content)
        doi=text2.split('doi:')[1].split(',')[0]
        citation = 'https://doi.org/'+doi
    else:
        # manually curated citations for records without a PMID
        if emtab=='E-MTAB-1788':
            citation='https://doi.org/10.1186/s13395-015-0059-1'
        elif emtab=='E-MEXP-740':
            citation='https://doi.org/10.1093/gerona/62.10.1088'
        else:
            citation='Missing'
    exprType=text.split('"Study type",\n')[1].split('\n')[0].split(':')[-1].strip().replace('"','').replace(',','')
    # getTissSexAge returns (tissues, ages, genders, ...): unpack in THAT order,
    # then return in the (tissues, genders, ages, ...) order callers expect
    tissues,ages,genders,vo2,diseases,times,trains=getTissSexAge(emtab,confDict,verbose=verbose)
    return emtab,samNum,species,exprType,tissues,genders,ages,vo2,diseases,times,trains,citation



def getGSEData(gse,confDict={},verbose=True):
    """getGSEData
        Inputs:
            gse, str, accession id for a gene expression omnibus or uk biostudies dataset
                    ('GSE...' is scraped from the GEO accession page,
                     'E-MTAB...'/'E-MEXP...' is delegated to getEMtabData)
            confDict, dictionary, dictionary of processed datasets; for GSE accessions an
                    empty confDict means the tissue/age/... dictionaries are returned empty
                    default: {}
            verbose, boolean, if True will print out non-error messages
                    default:True
        Output:
            tuple in this order:
            (accession id, # of samples, species, experiment type,
             tissue dict, gender dict, age dict, vo2 dict, disease dict,
             time dict, training dict, citation)
        Raises:
            TypeError if the accession is neither GSE nor E-MTAB/E-MEXP
    """
    if 'GSE' in gse:
        urlBase='https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='
        url=urlBase+gse
        r = requests.get(url)
        text = str(r.content)
        numSamples=getNumSamplesGSE(text,gse)
        species = text.split('geoaxema_organismus')[-1].split('>')[1].split('<')[0]
        try:
            experType=text.split('Experiment type')[1].split('>')[2].split('<')[0]
        except IndexError:
            # page has no (parsable) 'Experiment type' row
            experType='unk'
        if len(confDict)>0:
            tissues,ages,genders,vo2,diseases,times,trains=getTissSexAge(gse,confDict=confDict,verbose=verbose)
        else:
            tissues={};genders={};ages={};vo2={};diseases={};times={};trains={}
        citation=getCitationGSE(gse,text)
    elif 'E-MTAB' in gse or 'E-MEXP' in gse:
        (gse,numSamples,species,experType,tissues,genders,ages,
         vo2,diseases,times,trains,citation)=getEMtabData(gse,confDict=confDict,verbose=verbose)
    else:
        raise TypeError('Non GSE files not currently supported')
    return gse,numSamples,species,experType,tissues,genders,ages,vo2,diseases,times,trains,citation

def getNumSamplesGSE(text,gse):
    """Return the number of samples for a GEO series.

    Two accessions have a hard-coded count; for every other accession the count is the
    integer found between '(' and ')' after the last occurrence of 'Samples' in text.
    """
    manualCounts={'GSE123878':87,'GSE151066':57}
    if gse in manualCounts:
        return manualCounts[gse]
    afterLabel=text.split('Samples')[-1]
    insideParens=afterLabel.split(')')[0].split('(')[1]
    return int(insideParens)

def getCitationGSE(gse,text):
    """Work out a citation string for a GEO series from the text of its accession page.

    Inputs:
        gse, str, GEO accession id
        text, str, page text; everything is parsed from what follows the first 'Citation'
    Output:
        - page says ' missing': a hand-curated DOI URL for known accessions, else 'Missing'
        - first href after 'Citation' contains 'pubmed': the PubMed page is fetched and the
          text after 'doi:' is turned into a https://doi.org/ URL
        - href contains 'http': the href itself with '/dx.' replaced by '/'
        - otherwise, if 'doi' occurs anywhere in text: https://doi.org/ + the text after
          'doi:' in the citation section
        - otherwise the raw href
    """
    citationSection=text.split('Citation')[1]
    if citationSection.split('<')[0]==' missing':
        manualCitations={
            'GSE68585':'https://doi.org/10.1371/journal.pone.0160327',
            'GSE126001':'https://doi.org/10.1126/science.aat3987',
            'GSE83578':'http://doi.org/10.1186/s12974-016-0758-5',
            'GSE4252':'https://doi.org/10.1096/fj.04-3149fje',
            'GSE23697':'https://doi.org/10.1249/01.MSS.0000384251.51943.52',
            'GSE111212':'https://doi.org/10.3390/cells11233864',
            'GSE93546':'https://doi.org/10.1096/fj.201900453R',
            'GSE221210':'https://doi.org/10.1038/s42255-023-00891-y',
            'GSE111555':'https://doi.org/10.1002/lipd.12155',
        }
        return manualCitations.get(gse,'Missing')
    link=citationSection.split('href=')[1].split('"')[1]
    if 'pubmed' in link:
        pubmedId=link.split('/')[-1]
        page=requests.get('https://pubmed.ncbi.nlm.nih.gov/'+pubmedId)
        doi=str(page.content).split('doi:')[1].split(',')[0]
        return 'https://doi.org/'+doi
    if 'http' in link:
        return link.replace('/dx.','/')
    if 'doi' in text:
        return 'https://doi.org/'+citationSection.split('doi:')[1].split('<')[0]
    return link

def getDataDf(dataFiles,save=False,saveName='TodoDatasets.csv',confDict={},verbose=True):
    """getDataDf
        Collect getGSEData() results for many accessions into one dataframe.
        Inputs:
            dataFiles, iterable of str, accession ids; ids that contain none of
                    'GSE','E-MTAB','EMTAB','EMEXP','E-MEXP' are reported and skipped
            save, boolean, if True the dataframe is written to saveName
                    default: False
            saveName, str, output file; a name ending in '.xlsx' is written with to_excel,
                    anything else with to_csv
                    default: 'TodoDatasets.csv'
            confDict, dictionary, passed on to getGSEData
                    default: {}
            verbose, boolean, passed on to getGSEData
                    default: True
        Output:
            dataframe indexed by accession id, sorted by 'ID', with one column per value
            returned by getGSEData (12 columns, see `columns` below).
            Accessions whose lookup raises are printed and left out (best effort);
            if nothing succeeds an empty dataframe with the same columns is returned
    """
    # one name per element of the tuple returned by getGSEData, in the same order
    columns=['ID','Number Samples','Species','Experiment Type','Tissues','Genders','Ages','Vo2',
             'Diseases','Time Points','Trainings','Citation']
    supported=('GSE','E-MTAB','EMTAB','EMEXP','E-MEXP')
    dfDict={}
    for dataId in tq(dataFiles):
        if any(tag in dataId for tag in supported):
            try:
                dfDict[dataId]=list(getGSEData(dataId,confDict=confDict,verbose=verbose))
            except Exception as error:
                # deliberate best effort: report the failure and carry on with the next id
                print('Error on '+str(dataId),error)
        else:
            print("Didn't try: "+dataId+', not currently supported')
    if dfDict:
        df=pd.DataFrame(dfDict).T
        df.columns=columns
    else:
        df=pd.DataFrame(columns=columns)
    df=df.sort_values(by='ID',ascending=True)
    if save:
        if str(saveName).lower().endswith('.xlsx'):
            df.to_excel(saveName)
        else:
            df.to_csv(saveName)
    return df

def __shouldIRunTissGenderAge(surv):
    """
    __shouldIRunTissGenderAge
        Scan the column names of surv for tissue/age/gender/vo2/disease/time/training columns.
        Inputs:
            surv, dataframe, survival file with per-sample annotations
        Output:
            tuple (run, tissueCol, ageCol, genderCol, vo2Col, diseaseCol, timeCol, trainCol)
            run is True when at least one column was recognised; each *Col is the original
            column name, or '' when no such column was found
        Notes:
            matching is by substring on a partially lower-cased name, so e.g. any name
            containing 'age' counts as the age column and any name containing 'max' counts
            as the vo2 column; when several columns match the same slot the last one wins
    """
    # spelling normalisations, applied one after another in this order
    spellings=(('Age','age'),('Tissue','tissue'),('Sex','sex'),('Gender','gender'),
               ('VO2','vo2'),('Vo2','vo2'),('vO2','vo2'),('Disease','disease'),
               ('VO 2','vo2'),('vO 2','vo2'),('Vo 2','vo2'),
               ('Time','time'),('Training','training'))
    # (keyword, slot): the first keyword found decides the slot and ends the checks for that column
    exclusive=(('age','age'),('tissue','tissue'),('sex','gender'),('gender','gender'),
               ('max','vo2'),('vo2','vo2'),('disease','disease'))
    found={'tissue':'','age':'','gender':'','vo2':'','disease':'','time':'','training':''}
    for colName in surv.columns:
        normalised=colName
        for old,new in spellings:
            normalised=normalised.replace(old,new)
        slot=next((target for keyword,target in exclusive if keyword in normalised),None)
        if slot is not None:
            found[slot]=colName
            continue
        # the time and training checks are not exclusive: one column may fill both slots
        if 'time' in normalised and 'c' in normalised:
            found['time']=colName
        if 'training' in normalised:
            found['training']=colName
    ordered=(found['tissue'],found['age'],found['gender'],found['vo2'],
             found['disease'],found['time'],found['training'])
    shouldRun=any(col!='' for col in ordered)
    return (shouldRun,)+ordered

def getTissSexAge(gse,confDict,verbose=True):
    """getTissSexAge
        Count tissue/age/gender/vo2/disease/time/training annotations for one source accession.
        Inputs:
            gse, str, source accession to look up in confDict
            confDict, dictionary, dataset info as produced by makeConfDict
            verbose, boolean, if True prints the ids when several datasets share the source
                    default: True
        Output:
            tuple of seven dictionaries {value: number of samples}, in this order:
            (tissues, ages, genders, vo2, diseases, times, trainings)
            every dictionary is empty when the source is not in confDict or the survival
            file has no recognisable annotation columns
        Notes:
            when several datasets share the source, datasets whose id contains 'MUSCLE' are
            skipped (unless that would leave none) and the survival file with the most
            distinct sample ids is used; ties keep the first one
    """
    ids=getDatasetIdFromSource(gse,confDict)
    if len(ids)==0:
        # unknown source: getDatasetIdFromSource has already printed a hint
        return {},{},{},{},{},{},{}
    candidates=ids
    if len(ids)>1:
        if verbose:
            print('multi id',gse,ids)
        candidates=[idd for idd in ids if 'MUSCLE' not in idd]
        if len(candidates)==0:
            # every id is a MUSCLE dataset: fall back to all of them rather than failing
            candidates=ids
    surv=None
    maxSams=-1
    for idd in candidates:
        survival_file=getDatasetSurvivalFile(idd,confDict=confDict)
        candidateSurv=pd.read_csv(survival_file,sep='\t',index_col=0)
        numSams=len(set(candidateSurv.index))
        if numSams>maxSams:
            maxSams=numSams
            surv=candidateSurv
    run,tissueCol,ageCol,genderCol,vo2Col,diseaseCol,timeCol,trainCol = __shouldIRunTissGenderAge(surv)
    if not run:
        return {},{},{},{},{},{},{}
    return __getTissGenderAgeHelper(surv,
                                    tissueCol=tissueCol,
                                    ageCol=ageCol,
                                    genderCol=genderCol,
                                    vo2Col=vo2Col,
                                    diseaseCol=diseaseCol,
                                    timeCol=timeCol,
                                    trainCol=trainCol)

def __getTissGenderAgeHelper(surv,tissueCol,ageCol,genderCol,vo2Col,diseaseCol,timeCol,trainCol):
    """
    __getTissGenderAgeHelper
        Count how many rows of surv carry each distinct value in the given columns.
        Inputs:
            surv, dataframe, survival file with per-sample annotations
            tissueCol, ageCol, genderCol, vo2Col, diseaseCol, timeCol, trainCol,
                str, column name for each annotation, '' to skip that annotation
        Output:
            tuple of seven dictionaries {value: row count}, in this order:
            (tissues, ages, genders, vo2, diseases, times, trainings)
        Notes:
            - tissue and disease strings are capitalised before counting
            - gender strings are reduced to their first space-separated word, capitalised,
              and 'M'/'F' are expanded to 'Male'/'Female'; other words are left alone
            - missing values (float NaN) are never normalised and are all counted under
              the single key np.nan
    """
    def isMissing(val):
        # NaN is the only float that differs from itself
        return isinstance(val,float) and val!=val

    def normaliseGender(raw):
        word=raw.split(' ')[0].capitalize()
        return {'M':'Male','F':'Female'}.get(word,word)

    def tally(col,normalise=None):
        # single pass over the column; '' means "no such column"
        counts={}
        if col=='':
            return counts
        for raw in surv[col]:
            if isMissing(raw):
                key=np.nan
            elif normalise is not None and isinstance(raw,str):
                key=normalise(raw)
            else:
                key=raw
            counts[key]=counts.get(key,0)+1
        return counts

    tissDict=tally(tissueCol,lambda raw: raw.capitalize())
    ageDict=tally(ageCol)
    genderDict=tally(genderCol,normaliseGender)
    vo2Dict=tally(vo2Col)
    diseaseDict=tally(diseaseCol,lambda raw: raw.capitalize())
    timeDict=tally(timeCol)
    trainDict=tally(trainCol)
    return tissDict,ageDict,genderDict,vo2Dict,diseaseDict,timeDict,trainDict

def readDataFrameFile(file,index_col=None,header=0,sep=None,convertDicts=False,dictLst=['Tissues','Ages','Genders']):
    """Inputs:
        file, str, name of path to input file
        index_col, int/None, number of column to be used as index, default = None
        header, int, number of row to be used as header/first row, default = 0
        sep, str, separator to use when it does not match the file extension
            example: file is .txt but separator is ' ', default = None
            (.csv defaults to ',', .tsv/.txt default to a tab; ignored for .xlsx)
        convertDicts, boolean, if True the columns named in dictLst are parsed from their
            text form (e.g. "{'Muscle': 3, nan: 1}") back into dictionaries, default = False
            a bare nan token in that text becomes the empty-string key ''
            cells that are not text (e.g. empty cells) are left unchanged
        dictLst, list of str, columns to parse when convertDicts is True,
            default = ['Tissues','Ages','Genders']
    Output:
        pandas dataframe of the inputted file (currently only supports csv, tsv/txt and xlsx files)
    Raises:
        ValueError if the file name contains none of .csv, .tsv, .txt, .xlsx"""
    import re
    if '.xlsx' in file:
        df=pd.read_excel(file,index_col=index_col,header=header)
    else:
        if '.csv' in file:
            sepF=','
        elif '.tsv' in file or '.txt' in file:
            sepF='\t'
        else:
            raise ValueError('File type currently not supported, please use either a .csv, .tsv, .txt or .xlsx file')
        if sep is None:
            sep=sepF
        df=pd.read_csv(file,sep=sep,index_col=index_col,header=header)
    if convertDicts:
        # only a stand-alone nan token (not part of a word, not directly inside quotes) is
        # rewritten, so keys such as 'Pennant' survive the round trip
        bareNan=re.compile(r"(?<![\w'\"])nan(?![\w'\"])")

        def parseCell(cell):
            if not isinstance(cell,str):
                return cell
            return ast.literal_eval(bareNan.sub('""',cell))

        for col in dictLst:
            df[col]=[parseCell(cell) for cell in df[col]]
    return df

In [6]:
# explore.conf files to merge; makeConfDict reads them in the order listed
confPathLst=['/Users/yovosko/public_html/Hegemon/explore.conf','/booleanfs2/sahoo/Hegemon/explore.conf',
             "/Users/aglina/public_html/Hegemon/explore.conf"]

# one dictionary of dataset metadata built from all three conf files
confDict=makeConfDict(confPathLst)
# unique 'Source' values over every dataset in confDict
gsmLst=getDatasetSource(idd='All',confDict=confDict,returnType='list')
# sources that do not contain a '/'
conf=[c for c in gsmLst if not '/' in c]
# ...the same list without the empty-string source
conf2=[c for c in conf if c!='']
# sources that do contain a '/'
conf3=[c for c in gsmLst if '/' in c]

In [7]:
# Two tab-separated tables read from the working directory; the first column becomes the index
prePost=pd.read_csv('Macrophages_Pre_Post.txt',sep='\t',index_col=0)
sedEx=pd.read_csv('Macrophages_Sed_Ex.txt',sep='\t',index_col=0)

# Tag every row with the table it came from (this adds a column to prePost and sedEx in place)
prePost['Exercise Type']='Immediate'
sedEx['Exercise Type']='Training'

# Stack the two tables; an index label present in both tables appears twice
exerciseDf=pd.concat([prePost,sedEx])

# Index labels of the combined table, used as the accession list by the following cells
gses=list(exerciseDf.index)

In [8]:
# Curate the accession list.
# Written as a filter (not list.remove) so that the cell can be re-run without raising and so
# that an excluded accession is dropped even when it occurs more than once in gses.
excludedGses={'GSE86931','GSE1718','GSE5105'}
gses=[i for i in gses if i not in excludedGses]
gses.append('GSE3606')
# rewrite compact ids (e.g. 'EMTAB1788') in the hyphenated form ('E-MTAB-1788')
gses=[i.replace('EMTAB','E-MTAB-').replace('EMEXP','E-MEXP-') for i in gses]
# drop duplicates (order is not preserved)
gses=list(set(gses))

In [108]:
# Recompute the per-dataset annotation dictionaries and store them as columns of dataDf.
# NOTE(review): dataDf is not created by any cell above this one in the file; it is assigned
# in the cells with execution counts In [9] / In [39] further down, so this cell depends on
# one of those having been run first.
tissLst=[];ageLst=[];gendLst=[];vo2Lst=[];disLst=[];timeLst=[];trainLst=[]
for gse in tq(dataDf.index,'GSES'):
    # getTissSexAge returns (tissues, ages, genders, vo2, diseases, times, trainings)
    tissues,ages,genders,vo2s,diseases,times,trains=getTissSexAge(gse,confDict=confDict,verbose=True)
    tissLst.append(tissues)
    ageLst.append(ages)
    gendLst.append(genders)
    vo2Lst.append(vo2s)
    disLst.append(diseases)
    timeLst.append(times)
    trainLst.append(trains)
    
# one dictionary per row, in dataDf.index order
dataDf['Tissues']=tissLst
dataDf['Ages']=ageLst
dataDf['Genders']=gendLst
dataDf['Vo2']=vo2Lst
dataDf['Diseases']=disLst
dataDf['Time Points']=timeLst
dataDf['Trainings']=trainLst
    
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, description='GSES', max=76.0, style=ProgressStyle(description_width='i…

multi id E-MTAB-1788 ['MSL1', 'MSL2']
multi id GSE104079 ['M6', 'MUSCLE8']
multi id GSE11803 ['M21-0', 'MUSCLE9']
multi id GSE126296 ['M2', 'MUSCLE11']
multi id GSE139258 ['M33', 'MUSCLE12']
multi id GSE155933 ['M002', 'MUSCLE3', 'M1-2']
multi id GSE230102 ['M105-0', 'M105-1']
multi id GSE242354 ['MUSL79-0', 'MUSL79-1']
multi id GSE44051 ['M23', 'MUSCLE7']
multi id GSE68072 ['MB7', 'MUSCLE16']



In [111]:
# Write dataDf (including the annotation columns) to an Excel workbook in the working directory
dataDf.to_excel('MacrophageExercise_DatasetsInfo.xlsx')

In [9]:
# Reload the saved dataset table; the listed columns are parsed back into dictionaries
infoFile='MacrophageExercise_DatasetsInfo.xlsx'
# NOTE(review): the try/except fallback that rebuilt the table with getDataDf is commented
# out, so this cell fails if infoFile does not exist yet
#try:
dataDf=readDataFrameFile(infoFile,convertDicts=True,index_col=0,
                         dictLst=['Tissues','Ages','Genders','Time Points','Vo2','Diseases','Trainings'])
#except:
#    dataDf=getDataDf(gses,save=True,saveName=infoFile,confDict=confDict)

In [10]:
# Write dataDf back to the Excel workbook in the working directory
dataDf.to_excel('MacrophageExercise_DatasetsInfo.xlsx')

In [118]:
def _datasetsWithMoreThan(df,column,minCount=1):
    """Return the index labels of df whose dictionary in `column` has more than minCount entries."""
    return [i for i in df.index if len(df.at[i,column])>minCount]

# For each annotation, save the list of datasets that have more than one distinct value
gendLst=_datasetsWithMoreThan(dataDf,'Genders')
yeti.saveLst(gendLst,'GendersDatasets.txt')

ageLst=_datasetsWithMoreThan(dataDf,'Ages')
yeti.saveLst(ageLst,'AgesDatasets.txt')

vo2Lst=_datasetsWithMoreThan(dataDf,'Vo2')
yeti.saveLst(vo2Lst,'Vo2Datasets.txt')

disLst=_datasetsWithMoreThan(dataDf,'Diseases')
yeti.saveLst(disLst,'DiseaseDatasets.txt')

tissLst=_datasetsWithMoreThan(dataDf,'Tissues')
yeti.saveLst(tissLst,'TissueDatasets.txt')

# NOTE(review): time points use a threshold of more than 2 (all others use more than 1);
# kept as in the original cell — confirm this is intentional
timeLst=_datasetsWithMoreThan(dataDf,'Time Points',minCount=2)
yeti.saveLst(timeLst,'TimeDatasets.txt')

trainLst=_datasetsWithMoreThan(dataDf,'Trainings')
# save trainLst here (the original cell wrote tissLst into the trainings file)
yeti.saveLst(trainLst,'TrainingsDatasets.txt')

'TrainingsDatasets.txt'

In [1]:
# NOTE(review): bare reference to `dbid`; the recorded output is a NameError, so this looks
# like leftover scratch — confirm and delete the cell
dbid

NameError: name 'dbid' is not defined

In [39]:
# Build the dataset table for every accession in gses and save it under infoFile.
# getGSEData performs web requests per accession, so this cell needs network access; the
# recorded output below shows a connection error followed by a KeyboardInterrupt.
infoFile='MacrophageExercise_DatasetsInfo.xlsx'
dataDf=getDataDf(gses,save=True,saveName=infoFile,confDict=confDict)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=77.0), HTML(value='')))

Error on GSE27536 HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /geo/query/acc.cgi?acc=GSE27536 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f4530890748>: Failed to establish a new connection: [Errno 101] Network is unreachable',))



KeyboardInterrupt: 

In [150]:
def sumDict(lst):
    """sumDict
        Add up several {key: count} dictionaries into one.
        Inputs:
            lst, iterable of dictionaries {key: number}
        Output:
            dictionary with the summed numbers, where
            - keys that are NaN (any float NaN, not only the np.nan object) are skipped
            - string keys are upper-cased and reduced to their first character
              (the empty string stays '')
            - all other keys are used unchanged
    """
    allDict={}
    for dic in lst:
        for key,num in dic.items():
            # NaN is the only float that differs from itself; an identity test against
            # np.nan would miss NaN objects created elsewhere
            if isinstance(key,float) and key!=key:
                continue
            if isinstance(key,str):
                key=key.upper()[:1]
            allDict[key]=allDict.get(key,0)+num
    return allDict


In [16]:
# For every accession saved in AgesDatasets.txt, print the dataset ids in confDict that list it as a source
lst=yeti.readLst('AgesDatasets.txt')
for i in lst:
    print(i,getDatasetIdFromSource(i,confDict))


E-MTAB-1788 ['MSL1', 'MSL2']
GSE111555 ['MB2']
GSE117525 ['M66']
GSE120862 ['M5']
GSE126296 ['M2', 'MUSCLE11']
GSE140089 ['MUSL100']
GSE144304 ['MUSL98']
GSE155933 ['M002', 'MUSCLE3', 'M1-2']
GSE165630 ['M28']
GSE1786 ['M47']
GSE19062 ['M55']
GSE19420 ['M72']
GSE198266 ['MUSL104']
GSE199225 ['M35']
GSE236600 ['MUSL18']
GSE23697 ['M56']
GSE252357 ['MUSL97']
GSE28422 ['M58']
GSE28498 ['MB3']
GSE33603 ['M41']
GSE41769 ['M17']
GSE44051 ['M23', 'MUSCLE7']
GSE46075 ['MB4']
GSE51216 ['MB6']
GSE53598 ['M13']
GSE59363 ['M44']
GSE68585 ['M45']
GSE72462 ['M52']
GSE83352 ['M73']
GSE8479 ['M71']
GSE97084 ['M75']
