In [2]:
from pathlib import Path
from bids import BIDSLayout
import pandas as pd
import numpy as np

Failed to import duecredit due to No module named 'duecredit'


In [3]:
# Set a bunch of paths
mbdu_bids_root = Path('/data/MBDU/ABCD/BIDS/NKI_script/MID')
dsst_bids_root = Path('/data/ABCD_DSST/bids_20190215/')
mriqc_outdir = Path('/data/ABCD_DSST/bids_20190215/derivatives/mriqc')
container_path = Path('/data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg')
swarm_file = Path('/data/ABCD_DSST/swarms/mriqc_swarm/mriqc_swarm')
swarm_log = Path('/data/ABCD_DSST/swarms/mriqc_swarm/logs')
# TODO: use semantic versioning or something to find the latest release
release_dir = Path('/data/ABCD_DSST/releases/1.1/ABCDstudyDEAP')
nprocs = 20

In [4]:
# make sure that there aren't any subjects 
# in DSST bids root that aren't in MBDU bids root

mbdu_subs = sorted(list(mbdu_bids_root.glob('sub-*')))
mbdu_subs = set([ss.parts[-1] for ss in mbdu_subs])
dsst_subs = sorted(list(dsst_bids_root.glob('sub-*')))
dsst_subs = set([ss.parts[-1] for ss in dsst_subs])

print(f"{len(dsst_subs.difference(mbdu_subs))} are in DSST that aren't in MBDU")
if len(dsst_subs.difference(mbdu_subs)) > 0:
    for ss in dsst_subs.difference(mbdu_subs):
        assert ss != ''
        ! rm -rf {dsst_bids_root / ss}
    
mbdu_subs = sorted(list(mbdu_bids_root.glob('sub-*')))
mbdu_subs = set([ss.parts[-1] for ss in mbdu_subs])
dsst_subs = sorted(list(dsst_bids_root.glob('sub-*')))
dsst_subs = set([ss.parts[-1] for ss in dsst_subs])
assert len(dsst_subs.difference(mbdu_subs)) == 0

0 are in DSST that aren't in MBDU


In [5]:
subs = sorted(list(dsst_bids_root.glob('sub-*')))

In [6]:
bad_subs = []
for sub in subs:
    tmp = (sub / 'tmp')
    rest = (sub / 'ses-1' / 'rest')
    if (tmp.exists()) & (rest.exists()):
        bad_subs.append(sub)
        
# make sure no subjects have the tmp or rest directories
assert len(bad_subs) == 0

In [7]:
# get the list of subjects in the most recent release
# We're assuming that all subjects are in the site table
site_df = pd.read_csv(release_dir / 'abcd_lt01.txt',
                      skiprows = [1],
                      header = 0,
                      sep='\t')

In [45]:
# Get a list of all the jsons that have been produced by MRIQC
jsons = sorted(list(mriqc_outdir.glob('**/*.json')))

json_df = []
for sub_json in jsons:
    row = {}
    row['scan_name'] = sub_json.parts[-1].split('.')[0]
    row['path'] = sub_json
    json_df.append(row)
json_df = pd.DataFrame(json_df)

In [46]:
# get a list of all the nifitis and find out which have corresponding html files
niis = sorted(list(dsst_bids_root.glob('**/*.nii.gz')))

nii_df = []
for nii in niis:
    row = {}
    row['subject'] = nii.parts[4]
    row['participant_label'] = row['subject'].split('-')[-1]
    row['session'] = nii.parts[5]
    row['modality'] = nii.parts[6]
    row['scan_name'] = nii.parts[7].split('.')[0]
    row['path'] = nii
    nii_df.append(row)
nii_df = pd.DataFrame(nii_df)

In [47]:
# Figure out which subjects have run, and which had problems
mriqc_res_df = nii_df.merge(json_df, how='left', on='scan_name',
                            suffixes={'_nii', '_json'}, indicator=True)
# _merge variable created as categorical, cast it back to string
mriqc_res_df['_merge'] = mriqc_res_df._merge.astype('str')

# Make sure that there aren't rows getting duplicated
assert len(mriqc_res_df) == len(nii_df)

subj_df = pd.DataFrame(mriqc_res_df.groupby('subject').agg({'_merge':['unique', 'nunique']}))

problem_subjects = subj_df.index[(subj_df['_merge','nunique'] > 1)].values

unrun_subjects = subj_df.index[
    (subj_df['_merge','unique'].str[0] == 'left_only')
    & (subj_df['_merge','nunique'] == 1)].values

finished_subjects = subj_df.index[
    (subj_df['_merge','unique'].str[0] == 'both') 
    & (subj_df['_merge','nunique'] == 1)].values

# Make sure that all subjects are either problem, unrun or finished
assert len(subj_df) == (len(problem_subjects) 
                        + len(unrun_subjects)
                        + len(finished_subjects))

In [48]:
len(unrun_subjects)

3005

In [49]:
# Find the unfinished subjects that are in our most recent release
site_df['participant_label'] = site_df.subjectkey.str.replace('_', '')

unrun_participant_labels = set([urs[4:] for urs in unrun_subjects])
finished_participant_labels = set([urs[4:] for urs in finished_subjects])
problem_participant_labels = set([urs[4:] for urs in problem_subjects])

unrun_released_participants = unrun_participant_labels.intersection(set(site_df.participant_label))
finished_released_participants = finished_participant_labels.intersection(set(site_df.participant_label))
problem_released_participants = problem_participant_labels.intersection(set(site_df.participant_label))

print(f"There are {len(unrun_released_participants)} unrun released particpants,"
      + f" {len(finished_released_participants)} finished released participants,"
      + f" and {len(problem_released_participants)} released participants with problems")

There are 22 unrun released particpants, 4170 finished released participants, and 164 released participants with problems


In [50]:
unrun_unreleased_participants = unrun_participant_labels.difference(set(site_df.participant_label))

In [23]:
# If you don't already have your singularity bind path set,
# append this to the front of your command:
# export SINGULARITY_BINDPATH=/gs3,/gs4,/gs5,/gs6,/gs7,/gs8,/gs9,/gs10,/gs11,/spin1,/scratch,/fdb,/data,/lscratch &&

In [51]:
cmds = []
for sub in unrun_released_participants:
    subj_dir = dsst_bids_root / ('sub-' + sub)
    cmd = ('mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids '
           + f' && mkdir -p /lscratch/$SLURM_JOB_ID/mriqc_work_{sub} '
           + f' && rsync -ach {subj_dir} /lscratch/$SLURM_JOB_ID/tmp_bids/ ' 
           + f' && singularity run {container_path} --participant_label={sub} --nprocs={nprocs}'
           + f' -w /lscratch/$SLURM_JOB_ID/mriqc_work_{sub}'
           + f' /lscratch/$SLURM_JOB_ID/tmp_bids/ {mriqc_outdir} participant')
    cmds.append(cmd)
    
for sub in unrun_unreleased_participants:
    subj_dir = dsst_bids_root / ('sub-' + sub)
    cmd = ('mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids '
           + f' && mkdir -p /lscratch/$SLURM_JOB_ID/mriqc_work_{sub} '
           + f' && rsync -ach {subj_dir} /lscratch/$SLURM_JOB_ID/tmp_bids/ ' 
           + f' && singularity run {container_path} --participant_label={sub} --nprocs={nprocs}'
           + f' -w /lscratch/$SLURM_JOB_ID/mriqc_work_{sub}'
           + f' /lscratch/$SLURM_JOB_ID/tmp_bids/ {mriqc_outdir} participant')
    cmds.append(cmd)

In [52]:
cmds_to_run = cmds[:1000]

In [53]:
swarm_file.write_text('\n'.join(cmds_to_run))
swarm_file.read_text().split('\n')

['mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids  && mkdir -p /lscratch/$SLURM_JOB_ID/mriqc_work_NDARINVTDLKZKEB  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINVTDLKZKEB /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINVTDLKZKEB --nprocs=20 -w /lscratch/$SLURM_JOB_ID/mriqc_work_NDARINVTDLKZKEB /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids  && mkdir -p /lscratch/$SLURM_JOB_ID/mriqc_work_NDARINVGXEX8Z13  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINVGXEX8Z13 /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINVGXEX8Z13 --nprocs=20 -w /lscratch/$SLURM_JOB_ID/mriqc_work_NDARINVGXEX8Z13 /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 '

In [54]:
assert len(swarm_file.read_text().split('\n')) <= 1000

In [55]:
!swarm -m singularity,webproxy -f {swarm_file} -g 24 -t {nprocs} --maxrunning 100 --partition norm,quick --logdir {swarm_log} --time 1:00:00 --gres=lscratch:100

20839124
