In [1]:
import glob
import os
import random

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm_notebook as tq

In [61]:
# Rows read from Pattern1_Data.txt are labelled 'Acute'.
df1=pd.read_csv('Pattern1_Data.txt',sep='\t')
df1['Exercise Type']='Acute'
# One extra dataset is written by hand at row label 25.
# NOTE(review): if the file already has a row labelled 25 this overwrites it
# instead of appending -- confirm against the file's row count.
df1.loc[25]=['GSE104079',58,0.3,0.0156,'Acute']

# Rows read from Pattern2_Data.txt are labelled 'Chronic'.
df2=pd.read_csv('Pattern2_Data.txt',sep='\t')
df2['Exercise Type']='Chronic'

# Outer merge on the shared columns keeps every row of both tables.
df=df1.merge(df2,how='outer')
# Name/p-value snapshot taken before the derived columns are added.
dfPval=df.loc[:,['Sample Name','Pval']].copy()

# Derived plotting columns: -log10(p) and a 0.05 significance flag.
df=df.assign(**{'-Log10 P value':-np.log10(df['Pval']),
                'Significant':df['Pval']<=0.05})

# Index by sample name while keeping 'Sample Name' available as a column.
df=df.set_index('Sample Name',drop=False)

In [62]:
# Add the p-values from ExtraPvals.txt to the name/p-value table, derive the
# same -log10(p) and significance columns used for `df`, and move
# 'Sample Name' from a column into the index.
# NOTE: this rebinds `dfPval`, so re-running the cell on its own does not
# start from the same input.
df3=pd.read_csv('ExtraPvals.txt',sep='\t')
dfPval=(
    dfPval
    .merge(df3,how='outer')
    .assign(**{
        '-Log10 P value':lambda tbl:-np.log10(tbl['Pval']),
        'Significant':lambda tbl:tbl['Pval']<=0.05,
    })
    .set_index('Sample Name')
)

In [63]:
# Sub-folder names searched under ../Figs/ by the glob cell below.
folders=[
    'Athlete',
    'Bed_Rest',
    'Immindiate_Post_Train',
    'Long_Term_Training',
    'Supplementation',
]

In [64]:
# Collect the accession prefix (text before the first '_') of every file whose
# name contains 'GSE', keeping the 'Athlete' folder separate from the rest.
athleteGSE=set();otherGSE=set()
for fold in folders:
    files=glob.glob('../Figs/'+fold+'/*GSE*')
    # basename() removes the directory part with the platform's own separator.
    # Splitting on '\\' only did that on Windows; elsewhere the '_' split then
    # cut inside the folder path (e.g. '../Figs/Bed' for 'Bed_Rest').
    files=[os.path.basename(f).split('_')[0] for f in files]

    if fold=='Athlete':
        athleteGSE.update(files)
    else:
        otherGSE.update(files)

# De-duplicated, and sorted so the lists come out in the same order every run.
athleteGSE=sorted(athleteGSE)
otherGSE=sorted(otherGSE)

In [4]:
def makeDotPlot(df,saveName='DotPlot.png',title='Macrophage Exercise Score, Post v Pre',
                x='ROC-AUC',y='Sample Name',color='Exercise Type',size='-Log10 P value',opac='Significant',rev=True,save=True):
    """Build an Altair circle chart from ``df`` and optionally save it.

    Parameters
    ----------
    df : table handed to ``alt.Chart``.
    saveName : path passed to ``chart.save`` when ``save`` is true.
    title : chart title.
    x, y, color, size, opac : field names bound to the x, y, colour, size and
        opacity channels. The x scale is fixed to the domain [0, 1].
    rev : passed as ``reverse`` to the size scale (size range 100-500).
    save : when true the chart is written to ``saveName``.

    Returns
    -------
    The chart object.
    """
    xAxis=alt.X(x,scale=alt.Scale(domain=[0, 1]))
    dotSize=alt.Size(size,scale=alt.Scale(range=[100,500],reverse=rev))

    chart=alt.Chart(df).mark_circle().encode(x=xAxis,
                                             y=y,
                                             color=color,
                                             opacity=opac,
                                             size=dotSize)
    chart=chart.properties(title=title)

    if save:
        chart.save(saveName)
    return chart

In [5]:
def getGSEData(gse):
    """Download the GEO accession page for ``gse`` and pull out series metadata.

    The page text is parsed by splitting on fixed marker strings, so the
    result depends on the page layout staying as it is.

    Parameters
    ----------
    gse : str
        GEO series accession appended to the ``acc.cgi?acc=`` URL.

    Returns
    -------
    tuple
        ``(dupNum, numSamples, species, experType, exerType, citation)``.
        ``experType`` falls back to ``'array'`` when the
        'Expression profiling by ' text cannot be located. ``dupNum`` is 0
        unless the page mentions 'SuperSeries' and ``gse`` is one of the
        accessions listed below.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not answer within the timeout.
    IndexError, ValueError
        If one of the other expected markers is missing from the page.
    """
    urlBase='https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='
    url=urlBase+gse

    # A timeout stops a stalled connection from hanging the notebook.
    r = requests.get(url,timeout=30)
    # str() of the bytes gives their repr ("b'...'"); the markers searched for
    # below are plain ASCII, so they are found in it unchanged.
    test = str(r.content)

    numSamples=getNumSamples(test)
    species = test.split('geoaxema_organismus')[-1].split('>')[1].split('<')[0]
    try:
        experType=test.split('Experiment type')[1].split('>')[2].split('<')[0].split('Expression profiling by ')[1]
    except IndexError:
        # Only a missing marker (IndexError from the chained splits) is
        # expected here; anything else, including Ctrl-C, should propagate.
        experType='array'

    # Hand-maintained duplicate counts for specific SuperSeries accessions.
    tripleSeries=('GSE155933','GSE58249','GSE111555')
    doubleSeries=('GSE11803','GSE165630','GSE4252','GSE183239','GSE99963','GSE87748')
    dupNum=0
    if 'SuperSeries' in test:
        if gse in tripleSeries:
            dupNum=3
        if gse in doubleSeries:
            dupNum=2
    citation=getCitation(gse,test)
    exerType=getExerciseType(gse,test)
    return dupNum,numSamples,species,experType,exerType,citation

def getNumSamples(test):
    """Return the integer written in parentheses after the last 'Samples'.

    Looks at the text after the final occurrence of 'Samples' (the whole
    string if it never occurs), stops at the first ')' and converts what
    follows the first '(' to ``int``.

    Raises ``IndexError`` if there is no '(' in that span and ``ValueError``
    if the captured text is not an integer.
    """
    afterLabel=test.rpartition('Samples')[2]
    beforeClose=afterLabel.partition(')')[0]
    countText=beforeClose.split('(')[1]
    return int(countText)

def getCitation(gse,test):
    """Work out a citation link for ``gse`` from its GEO page text.

    Parameters
    ----------
    gse : str
        Series accession; used only to look up hand-entered DOIs when the
        page reports the citation as missing.
    test : str
        Text of the GEO accession page.

    Returns
    -------
    str
        * ``'Missing'`` or a hand-entered DOI link when the text after
          'Citation' starts with ' missing'.
        * ``'https://doi.org/<doi>'`` built from the PubMed page when the
          first link in the citation section points at PubMed.
        * The link itself (with '/dx.' collapsed to '/') when it contains
          'http'.
        * ``'https://doi.org/<doi>'`` when the citation section has a 'doi:'
          entry.
        * Otherwise the raw link target.

    Raises
    ------
    IndexError
        If 'Citation', the link, or the PubMed 'doi:' marker is not found.
    requests.exceptions.RequestException
        If the PubMed request fails or times out.
    """
    # DOIs entered by hand for series whose page lists no citation.
    manualDois={
        'GSE68585':'https://doi.org/10.1371/journal.pone.0160327',
        'GSE126001':'https://doi.org/10.1126/science.aat3987',
        'GSE83578':'http://doi.org/10.1186/s12974-016-0758-5',
        'GSE4252':'https://doi.org/10.1096/fj.04-3149fje',
        'GSE23697':'https://doi.org/10.1249/01.MSS.0000384251.51943.52',
    }

    # Text between the first 'Citation' and the next one (or end of page).
    citeSection=test.split('Citation')[1]
    if citeSection.split('<')[0] == ' missing':
        return manualDois.get(gse,'Missing')

    # Target of the first link in the citation section.
    ref=citeSection.split('href=')[1].split('"')[1]
    if 'pubmed' in ref:
        refId=ref.split('/')[-1]
        urlBase2='https://pubmed.ncbi.nlm.nih.gov/'
        # A timeout stops a stalled connection from hanging the notebook.
        r2 = requests.get(urlBase2+refId,timeout=30)
        test2 = str(r2.content)
        doi=test2.split('doi:')[1].split(',')[0]
        return 'https://doi.org/'+doi
    if 'http' in ref:
        return ref.replace('/dx.','/')
    # Test for the exact marker that is split on, in the same text. The old
    # check ('doi' anywhere on the page) let the split raise IndexError when
    # the section had no 'doi:' entry.
    if 'doi:' in citeSection:
        doi=citeSection.split('doi:')[1].split('<')[0]
        return 'https://doi.org/'+doi
    return ref

def getExerciseType(gse,test):
    """Classify the exercise modality from keywords in the page text.

    Each label is assigned when any of its keywords occurs as a substring of
    ``test``; a few accessions get labels regardless of the text.

    Returns ``'Multi'`` when more than one distinct label applies, ``'Unk'``
    when none does, otherwise the single label (``'resistance'``,
    ``'endurance'`` or ``'EPS'``).
    """
    keywordRules=(
        ('resistance',('resistance','loading','extension exercise',
                       'lengthening contractions','shortening contractions',
                       'strength exercise','strength training')),
        ('endurance',('endurance','cyclists','cycling','running','runners',
                      'aerobic exercise','aerobic training')),
        ('EPS',('electrical stimulation',)),
    )
    labels={label for label,words in keywordRules
            if any(word in test for word in words)}

    # Per-accession overrides that do not depend on the page text.
    if gse=='GSE34788':
        labels.update(('resistance','endurance'))
    if gse in ('GSE111555','GSE1786'):
        labels.add('endurance')

    if len(labels)>1:
        return 'Multi'
    if not labels:
        return 'Unk'
    return next(iter(labels))
        

In [6]:
def getGSEDataDf(df,allIds=True):
    """Return a copy of ``df`` with per-dataset metadata columns added.

    For every index label containing 'GSE' the metadata comes from
    ``getGSEData``. Any other label gets the fixed values hard-coded below.

    Parameters
    ----------
    df : pandas.DataFrame
        Table indexed by dataset id. The input is not modified.
    allIds : bool
        When true, rows whose 'ID.1' value contains 'MB' have 'Method' set
        to 'Blood Draw'. Requires an 'ID.1' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'N Samples', 'Species', 'Experiment Type',
        'Exercise Type', 'DOI', 'Duplicates' and 'Method' set, and the
        'Notes' column removed if it was present.
    """
    data=df.copy()
    sampleNums=[];specs=[];experTypes=[];cites=[];duplicates=[];exerTypes=[]
    for gse in tq(data.index):
        if 'GSE' in gse:
            dupNum,numSamples,species,experType,exerType,cite=getGSEData(gse)
        else:
            # Fixed metadata used for any id that is not a GSE accession.
            numSamples=32;species='Homo sapiens';cite='https://doi.org/10.1093/gerona/62.10.1088';experType='array';dupNum=0
            exerType='resistance'
        sampleNums.append(numSamples)
        specs.append(species)
        experTypes.append(experType)
        cites.append(cite)
        duplicates.append(dupNum)
        exerTypes.append(exerType)

    data['N Samples']=sampleNums
    data['Species']=specs
    data['Experiment Type']=experTypes
    data['Exercise Type']=exerTypes
    data['DOI']=cites
    data['Duplicates']=duplicates
    data['Method']='Muscle Biopsy'
    if allIds:
        # na=False treats a missing 'ID.1' as "not blood"; without it the NaN
        # in the mask makes boolean indexing raise.
        isBlood=data['ID.1'].str.contains('MB',na=False)
        # A row mask also stays correct if index labels repeat.
        data.loc[isBlood,'Method']='Blood Draw'
    if 'Notes' in data.columns:
        data=data.drop(columns=['Notes'])
    return data

In [11]:
# Processed-datasets table; its first column becomes the index.
data=pd.read_csv(
    '../Figs/Datasets Processed.tsv',
    sep='\t',
    index_col=0,
)

In [12]:
# Keep the sample names that are also in the index of `data`; print the rest.
sampleNames=list(df['Sample Name'])
goodlst=[i for i in sampleNames if i in data.index]
for i in sampleNames:
    if i not in data.index:
        print(i)

In [13]:
# Annotate the matched datasets with GEO metadata (one web lookup per GSE id).
data2=getGSEDataDf(data.loc[goodlst],allIds=True)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for gse in tq(data.index):


  0%|          | 0/47 [00:00<?, ?it/s]

In [73]:
# Rodent datasets: all 'Mus musculus' ids first, then 'Rattus norvegicus'.
mouseLst=[]
for rodent in ('Mus musculus','Rattus norvegicus'):
    mouseLst.extend(data2[data2['Species']==rodent].index.tolist())

In [48]:
# Ids of every dataset whose species is 'Homo sapiens'.
isHuman=data2['Species']=='Homo sapiens'
humanLst=data2[isHuman].index.tolist()

In [53]:
import random

# Draw 5 distinct human datasets at random, at most 3 from `df1` and at most
# 3 from everything else.
# `df1` is read without an index column, so its index holds row numbers.
# Membership is therefore tested against its 'Sample Name' column; testing
# against `df1.index` never matched a GSE name and left the loop unable to
# finish once the second quota was full.
acuteNames=set(df1['Sample Name'])

humanDf=[]
seen=set()          # positions in humanLst that were picked
ones=0;twos=0       # accepted picks per group
# Walking a shuffled order visits every candidate once, so the loop always
# ends even when fewer than 5 datasets satisfy the quotas.
order=list(range(len(humanLst)))
random.shuffle(order)
for j in order:
    if len(humanDf)==5:
        break
    sample=humanLst[j]
    if sample in humanDf:
        # The same id can appear twice in humanLst; plot it only once.
        continue
    if sample in acuteNames:
        if ones==3:
            continue
        ones+=1
    else:
        if twos==3:
            continue
        twos+=1
    seen.add(j)
    humanDf.append(sample)
humanDf

['GSE34788', 'GSE44051', 'GSE1832', 'GSE28498', 'GSE1718']

In [74]:
# Dot plot of every dataset in `df`.
dfIn=df
save=True
saveName='MacrophageExerciseDotPlot_All_log10Pval.png'
title='Macrophage Exercise Score, Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [75]:
# Dot plot restricted to the ids in `mouseLst`.
dfIn=df.loc[mouseLst]
save=True
saveName='MacrophageExerciseDotPlot_Mouse_log10Pval.png'
title='Macrophage Exercise Score, Mouse Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [76]:
# Dot plot of the randomly drawn human datasets in `humanDf`.
dfIn=df.loc[humanDf]
save=True
saveName='MacrophageExerciseDotPlot_Human_log10Pval.png'
title='Macrophage Exercise Score, Human Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [77]:
# Dot plot of the datasets whose experiment type is 'high throughput sequencing'.
isRnaSeq=data2['Experiment Type']=='high throughput sequencing'
rnaSeqLst=data2[isRnaSeq].index.tolist()

dfIn=df.loc[rnaSeqLst]
save=True
saveName='MacrophageExerciseDotPlot_RNASeq_log10Pval.png'
title='Macrophage Exercise Score, RNA-Seq Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [58]:
# `df1`/`df2` are read without an index column, so their indexes hold row
# numbers. Membership is tested against the 'Sample Name' columns; testing
# against `.index` never matched a GSE name and left both pools empty.
df1Names=set(df1['Sample Name'])
df2Names=set(df2['Sample Name'])

arrayLst=list(data2[data2['Experiment Type']=='array'].index)
df1Array=[i for i in arrayLst if i in df1Names]
df2Array=[i for i in arrayLst if i in df2Names]

# Draw up to 7 distinct microarray datasets, alternating between the two
# tables (df1 on even turns, df2 on odd turns). Picks are tracked by dataset
# id: a list position means a different dataset in each pool, so the old
# position-based `seen` set (also left over from another cell) rejected the
# wrong candidates.
arrayDf=[]
for i in range(7):
    pool=df1Array if i%2==0 else df2Array
    remaining=[s for s in pool if s not in arrayDf]
    if not remaining:
        # Nothing new in this table: skip the turn rather than retry forever.
        continue
    arrayDf.append(random.choice(remaining))
arrayDf

['GSE16907',
 'GSE28498',
 'GSE44051',
 'GSE117525',
 'GSE122671',
 'GSE9103',
 'GSE104079']

In [78]:

# Dot plot of the randomly drawn microarray datasets in `arrayDf`.
dfIn=df.loc[arrayDf]
save=True
saveName='MacrophageExerciseDotPlot_MicroArray_log10Pval.png'
title='Macrophage Exercise Score, Microarray Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [79]:
# Dot plot of the datasets whose 'Method' is 'Blood Draw'.
isBloodDraw=data2['Method']=='Blood Draw'
bloodLst=data2[isBloodDraw].index.tolist()

dfIn=df.loc[bloodLst]
save=True
saveName='MacrophageExerciseDotPlot_BloodSample_log10Pval.png'
title='Macrophage Exercise Score, PBMC Blood Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [63]:
# `df1`/`df2` are read without an index column, so their indexes hold row
# numbers. Membership is tested against the 'Sample Name' columns; testing
# against `.index` never matched a GSE name and left both pools empty.
df1Names=set(df1['Sample Name'])
df2Names=set(df2['Sample Name'])

muscleLst=list(data2[data2['Method']=='Muscle Biopsy'].index)
df1Mus=[i for i in muscleLst if i in df1Names]
df2Mus=[i for i in muscleLst if i in df2Names]

# Draw up to 7 distinct muscle-biopsy datasets, alternating between the two
# tables (df2 on even turns, df1 on odd turns). `seen` holds dataset ids
# rather than list positions, because a position means a different dataset
# in each pool.
musDf=[]
seen=set()
for i in range(7):
    musLst=df2Mus if i%2==0 else df1Mus
    remaining=[s for s in musLst if s not in seen]
    if not remaining:
        # Nothing new in this table: skip the turn rather than retry forever.
        continue
    sample=random.choice(remaining)
    seen.add(sample)
    musDf.append(sample)
musDf

['GSE9103',
 'GSE71972',
 'GSE99963',
 'GSE1718',
 'GSE165630',
 'GSE104079',
 'GSE97084']

In [80]:
# Dot plot of the randomly drawn muscle-biopsy datasets in `musDf`.
dfIn=df.loc[musDf]
save=True
saveName='MacrophageExerciseDotPlot_MuscleSample_log10Pval.png'
title='Macrophage Exercise Score, Muscle Biopsy Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [40]:
# Dot plot of the datasets classified as 'resistance' exercise.
isResistance=data2['Exercise Type']=='resistance'
resLst=data2[isResistance].index.tolist()

dfIn=df.loc[resLst]
save=True
saveName='MacrophageExerciseDotPlot_Resistance_Training_log10Pval.png'
title='Macrophage Exercise Score, Resistance Exercise Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart

In [39]:
# Dot plot of the datasets classified as 'endurance' exercise.
isEndurance=data2['Exercise Type']=='endurance'
endurLst=data2[isEndurance].index.tolist()

dfIn=df.loc[endurLst]
save=True
saveName='MacrophageExerciseDotPlot_Endurance_Training_log10Pval.png'
title='Macrophage Exercise Score, Endurance Exercise Post v Pre'

chart=makeDotPlot(dfIn,title=title,saveName=saveName,save=save,rev=False)
chart