In [1]:
import os,sys,glob
from matplotlib import colors as mcolors
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import seaborn as sns
from itertools import combinations
from sklearn.metrics import mean_squared_error
from scipy import stats

In [5]:
def checkForDuplicates(obj,finish_dates,subjects,sessions,paths):
    """Remove stored records that are superseded by `obj`.

    A stored record is superseded when it shares both the subject and the
    session of `obj` and its finish date is not later than
    `obj['finish_date']`.

    Parameters
    ----------
    obj : dict
        Record exposing obj['output']['meta']['subject'],
        obj['output']['meta']['session'] and obj['finish_date'].
    finish_dates, subjects, sessions, paths : list or numpy.ndarray
        Parallel sequences describing the records kept so far; position i of
        each sequence belongs to the same record.

    Returns
    -------
    tuple
        (finish_dates, subjects, sessions, paths). When nothing is superseded
        the four inputs are returned untouched; otherwise numpy arrays with
        the superseded positions deleted are returned.
    """
    subject = obj['output']['meta']['subject']
    session = obj['output']['meta']['session']
    finish_date = obj['finish_date']

    # Scan position by position so that zero matches and several matches are
    # both handled. Indexing with np.where and testing the result in an `if`
    # fails when the match is not exactly one element (ambiguous truth value).
    superseded = [
        i for i, (s, e, d) in enumerate(zip(subjects, sessions, finish_dates))
        if s == subject and e == session and d <= finish_date
    ]

    if superseded:
        subjects = np.delete(subjects,superseded)
        paths = np.delete(paths,superseded)
        sessions = np.delete(sessions,superseded)
        finish_dates = np.delete(finish_dates,superseded)

    return finish_dates, subjects, sessions, paths

def addSubjectsSessions(subject,session,path,data):
    """Tag every row of `data` with a subject and a session identifier.

    Parameters
    ----------
    subject, session
        Identifiers written (as strings) into the 'subjectID' and 'sessionID'
        columns. A column that already exists in `data` is left as it is.
    path
        Accepted for interface compatibility; not used by this function.
    data : pandas.DataFrame
        Table to label. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `data` object that was passed in.
    """
    for column, identifier in (('subjectID', subject), ('sessionID', session)):
        if column in data.keys():
            continue
        # one copy of the identifier per existing row
        data[column] = [str(identifier)] * len(data)

    return data

def appendData(subjects,sessions,paths,finish_dates,obj,filename):
    """Register `obj` in the parallel record sequences, keeping one record per subject/session.

    Parameters
    ----------
    subjects, sessions, paths, finish_dates : list or numpy.ndarray
        Parallel sequences describing the records kept so far.
    obj : dict
        Record exposing obj['output']['meta']['subject'],
        obj['output']['meta']['session'], obj['finish_date'] and obj['path'].
    filename : str
        File name joined onto the record's path as
        "input/" + obj["path"] + "/" + filename.

    Returns
    -------
    tuple
        (finish_dates, subjects, sessions, paths) as numpy arrays once a
        record has been appended. Note the order differs from the parameter
        order.
    """
    subject = obj['output']['meta']['subject']
    session = obj['output']['meta']['session']

    # check for duplicates. if so, remove
    finish_dates, subjects, sessions, paths = checkForDuplicates(obj,finish_dates,subjects,sessions,paths)

    # checkForDuplicates only removes stored records that finished no later
    # than obj. A record still present for this subject/session is therefore
    # newer than obj, so obj is the stale one and must not be added next to it.
    if any(s == subject and e == session for s, e in zip(subjects, sessions)):
        return finish_dates, subjects, sessions, paths

    subjects = np.append(subjects,subject)
    sessions = np.append(sessions,session)
    paths = np.append(paths,"input/"+obj["path"]+"/"+filename)
    finish_dates = np.append(finish_dates,obj['finish_date'])

    return finish_dates, subjects, sessions, paths

def compileData(paths,subjects,sessions,data):
    """Read every file in `paths` and stack the tables underneath `data`.

    Paths containing '.json.gz' are read with pandas.read_json
    (orient='index', index discarded); every other path is read with
    pandas.read_csv. Each table is passed through addSubjectsSessions with the
    subject and session found at the same position as its path.

    Parameters
    ----------
    paths, subjects, sessions : sequence
        Parallel sequences; position i describes one file.
    data : pandas.DataFrame
        Table the loaded files are concatenated onto. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The combined table, in which cells made up solely of whitespace
        characters are replaced by NaN.
    """
    frames = []
    for i, path in enumerate(paths):
        if '.json.gz' in path:
            tmpdata = pd.read_json(path,orient='index').reset_index(drop=True)
        else:
            tmpdata = pd.read_csv(path)
        frames.append(addSubjectsSessions(subjects[i],sessions[i],path,tmpdata))

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the growing table on every iteration.
    if frames:
        data = pd.concat([data] + frames,ignore_index=True)

    # replace whitespace-only cells with nans
    data = data.replace(r'^\s+$', np.nan, regex=True)

    return data
    
def _tagsSatisfied(wanted,available):
    """Return True when `available` passes the tag filter `wanted`.

    Rules, applied in order:
    1. an empty/falsy filter accepts everything;
    2. a filter whose entries are all contained in `available` is accepted;
    3. otherwise the filter can only pass if it contains '!'-prefixed entries:
       it is rejected when all of the negated tags (with '!' stripped) are
       present in `available`, and accepted when, in addition, every
       non-negated entry is present in `available`.
    """
    if not wanted:
        return True
    if set(wanted).issubset(available):
        return True
    if '!' not in str(wanted):
        return False

    negated = [ f for f in wanted if '!' in str(f) ]
    required = [ f for f in wanted if f not in negated ]
    if set([ f.replace('!','') for f in negated ]).issubset(available):
        return False
    return set(required).issubset(available)

def collectData(datatype,datatype_tags,tags,filename,outPath):
    """Collect one file per subject/session from a brainlife project and compile them into a CSV.

    The list of objects is fetched from the brainlife warehouse for the
    project named by the PROJECT_ID environment variable. Objects are kept
    when their datatype name equals `datatype` and both tag filters pass;
    appendData then registers them. The selected files are loaded with
    compileData and the resulting table is written to `outPath`.

    Parameters
    ----------
    datatype : str
        Compared against obj['datatype']['name'].
    datatype_tags : list of str, or ''
        Filter applied to obj['output']['datatype_tags']. Falsy means no
        filtering. Entries containing '!' are negations (see _tagsSatisfied).
    tags : list of str, or ''
        Filter applied to obj['output']['tags'], same rules as `datatype_tags`.
    filename : str
        Name of the file to load from each selected object (passed to appendData).
    outPath : str
        Destination of the compiled table, written with to_csv(index=False).

    Returns
    -------
    pandas.DataFrame
        The compiled table.

    Raises
    ------
    KeyError
        If PROJECT_ID is not set in the environment.
    requests.HTTPError
        If the warehouse request returns an error status.
    """
    # local import: the import cell at the top of the notebook does not bring in requests
    import requests

    # grab path and data objects
    response = requests.get('https://brainlife.io/api/warehouse/secondary/list/%s'%os.environ['PROJECT_ID'])
    # fail here with a clear HTTP error instead of later on an unexpected payload
    response.raise_for_status()
    objects = response.json()

    # subjects and paths
    subjects = []
    sessions = []
    paths = []
    finish_dates = []

    for obj in objects:
        if obj['datatype']['name'] != datatype:
            continue

        output = obj['output']
        # both filters follow the same rules; a record without the key is treated as having no tags
        if not _tagsSatisfied(datatype_tags,output.get('datatype_tags') or []):
            continue
        if not _tagsSatisfied(tags,output.get('tags') or []):
            continue

        finish_dates, subjects, sessions, paths = appendData(subjects,sessions,paths,finish_dates,obj,filename)

    # order the records by subject, then session. the triples are sorted once
    # so that each path stays attached to its own subject and session.
    ordered = sorted(zip(subjects,sessions,paths))
    subjects = [ s for s,_,_ in ordered ]
    sessions = [ e for _,e,_ in ordered ]
    paths = [ p for _,_,p in ordered ]

    # compile data, starting from an empty table
    data = compileData(paths,subjects,sessions,pd.DataFrame())

    # output data structure for records and any further analyses
    data.to_csv(outPath,index=False)

    return data

In [6]:
# Collect the data and save it wherever you want.
# collectData takes 5 input parameters:
# 1. datatype: the full name of the datatype you want to grab.
# 2. datatype_tags: a list of datatype tags; only objects carrying these datatype tags are kept.
#    Set this to '' to skip filtering on datatype tags.
# 3. tags: a list of tags; only objects carrying these tags are kept.
#    Set this to '' to skip filtering on tags.
# 4. filename: the name of the specific file within the datatype.
# 5. outPath: the path where the compiled data is saved. example: './roi_diffusion.csv'


# Example: grab the tractmeasures data with datatype_tags 'cleaned' and 'macro' and with tags 'bl_paper' and 'macro'.
# The file we're interested in is 'output_FiberStats.csv', and the compiled table is saved to './test.csv' in the current directory.
tract_data = collectData(
    datatype='neuro/tractmeasures',
    datatype_tags=['cleaned','macro'],
    tags=['bl_paper','macro'],
    filename='output_FiberStats.csv',
    outPath='./test.csv',
)

In [7]:
# display the DataFrame returned by collectData in the previous cell
tract_data

Unnamed: 0,TractName,StreamlineCount,volume,avgerageStreamlineLength,streamlineLengthStdev,averageFullDisplacement,fullDisplacementStdev,ExponentialFitA,ExponentialFitB,StreamlineLengthTotal,...,stdevOfEndpointDistanceFromCentroid1,stdevEndpointDistanceFromCentroid2,MidpointDensity,averageMidpointDistanceFromCentroid,stDevOfMidpointDistanceFromCentroid,TotalVolumeProportion,TotalCountProportion,TotalWiringProportion,subjectID,sessionID
0,wbfg,3000000,895851,67.126547,37.982942,40.086872,22.960855,0.021252,-0.020306,2.013796e+08,...,,,,,,1.000000,1.000000,1.000000,3_038,1
1,forcepsMinor,4921,55628,105.684566,14.258453,44.123196,19.527367,,,5.200738e+05,...,5.777661,5.836439,1.765698,7.370342,3.501676,0.062095,0.001640,0.002583,3_038,1
2,forcepsMajor,10175,79337,137.978398,23.594155,54.358919,21.518185,,,1.403930e+06,...,7.439675,7.566302,2.692511,8.676504,4.634782,0.088560,0.003392,0.006972,3_038,1
3,parietalCC,21676,77323,112.440640,16.655272,31.635558,10.941237,,,2.437263e+06,...,6.990855,6.792900,3.647930,8.440904,4.029586,0.086312,0.007225,0.012103,3_038,1
4,middleFrontalCC,34336,119200,71.813391,31.610578,28.813819,12.507721,,,2.465785e+06,...,,,,,,0.133058,0.011445,0.012244,3_038,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,rightContraAnterioFrontoCerebellar,85,11395,170.836113,34.305296,114.504239,21.631204,,,1.452107e+04,...,8.285051,5.862982,1.024096,13.024434,7.540757,0.015365,0.000028,0.000073,4_074,1
1793,leftVOF,1605,12488,51.998776,15.904268,33.036614,9.859701,,,8.345804e+04,...,5.850438,4.490725,1.459091,9.395261,4.247937,0.016839,0.000535,0.000419,4_074,1
1794,rightVOF,2638,12863,56.804778,15.201160,38.760445,12.125481,,,1.498510e+05,...,5.420596,3.811942,1.998485,7.161866,3.599057,0.017345,0.000879,0.000752,4_074,1
1795,leftCST,3386,41120,111.518161,24.903917,85.655431,21.248894,,,3.776005e+05,...,7.048529,7.109425,1.453219,10.806145,4.862979,0.055448,0.001129,0.001895,4_074,1
