In [112]:
from pathlib import Path
from bids import BIDSLayout
import pandas as pd

In [82]:
mbdu_bids_root = Path('/data/MBDU/ABCD/BIDS/NKI_script/MID')
dsst_bids_root = Path('/data/ABCD_DSST/bids_20190215/')
mriqc_outdir = Path('/data/ABCD_DSST/bids_20190215/derivatives/mriqc')
container_path = Path('/data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg')
swarm_file = Path('/data/ABCD_DSST/swarms/mriqc_swarm/mriqc_swarm')
swarm_log = Path('/data/ABCD_DSST/swarms/mriqc_swarm/logs')
nprocs = 20

In [40]:
# make sure that there aren't any subjects 
# in DSST bids root that aren't in MBDU bids root

mbdu_subs = sorted(list(mbdu_bids_root.glob('sub-*')))
mbdu_subs = set([ss.parts[-1] for ss in mbdu_subs])
dsst_subs = sorted(list(dsst_bids_root.glob('sub-*')))
dsst_subs = set([ss.parts[-1] for ss in dsst_subs])

print(f"{len(dsst_subs.difference(mbdu_subs))} are in DSST that aren't in MBDU")
if len(dsst_subs.difference(mbdu_subs)) > 0:
    for ss in dsst_subs.difference(mbdu_subs):
        assert ss != ''
        ! rm -rf {dsst_bids_root / ss}
    
mbdu_subs = sorted(list(mbdu_bids_root.glob('sub-*')))
mbdu_subs = set([ss.parts[-1] for ss in mbdu_subs])
dsst_subs = sorted(list(dsst_bids_root.glob('sub-*')))
dsst_subs = set([ss.parts[-1] for ss in dsst_subs])
assert len(dsst_subs.difference(mbdu_subs)) == 0

In [54]:
subs = sorted(list(dsst_bids_root.glob('sub-*')))

In [55]:
bad_subs = []
for sub in subs:
    tmp = (sub / 'tmp')
    rest = (sub / 'ses-1' / 'rest')
    if (tmp.exists()) & (rest.exists()):
        bad_subs.append(sub)
        
# make sure no subjects have the tmp or rest directories
assert len(bad_subs) == 0

In [23]:
# If you don't already have your singularity bind path set,
# append this to the front of your command:
# export SINGULARITY_BINDPATH=/gs3,/gs4,/gs5,/gs6,/gs7,/gs8,/gs9,/gs10,/gs11,/spin1,/scratch,/fdb,/data,/lscratch &&

In [84]:
cmds = []
for sub in subs:
    participant_label = sub.parts[-1].split('-')[-1]
    cmd = (f'singularity run {container_path} --participant_label={participant_label} --nprocs={nprocs}' 
           + f' {dsst_bids_root} {mriqc_outdir} participant')
    cmds.append(cmd)

In [72]:
len(cmds)

8985

In [88]:
# test swarm command with two subjects
swarm_file.write_text('\n'.join(cmds[28:]))
swarm_file.read_text().split('\n')

['singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV02RCED7D --nprocs=20 /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV02UVMTY7 --nprocs=20 /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV02WP3TP6 --nprocs=20 /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV030W95VP --nprocs=20 /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-2

In [89]:
!swarm -m singularity,webproxy -f {swarm_file} -g 24 -t {nprocs} -b 10 --partition norm --logdir {swarm_log} --time 01:00:00

20603102


In [90]:
# a bunch of those jobs got killed by an admin due to thrashing the file system, I guess I'll need to bundle more

In [96]:
# get a list of all the nifitis and find out which have corresponding html files
niis = sorted(list(dsst_bids_root.glob('**/*.nii.gz')))

In [243]:
htmls = sorted(list(mriqc_outdir.glob('*.html')))

In [244]:
html_df = []
for html in htmls:
    row = {}
    row['scan_name'] = html.parts[-1].split('.')[0]
    row['path'] = html
    html_df.append(row)
html_df = pd.DataFrame(html_df)

In [245]:
nii_df = []
for nii in niis:
    row = {}
    row['subject'] = nii.parts[4]
    row['participant_label'] = row['subject'].split('-')[-1]
    row['session'] = nii.parts[5]
    row['modality'] = nii.parts[6]
    row['scan_name'] = nii.parts[7].split('.')[0]
    row['path'] = nii
    nii_df.append(row)
nii_df = pd.DataFrame(nii_df)

In [246]:
mriqc_res_df = nii_df.merge(html_df, how='left', on='scan_name', suffixes={'_nii', '_html'}, indicator=True)

In [247]:
mriqc_res_df['_merge'] = mriqc_res_df._merge.astype('str')

In [248]:
mriqc_res_df.groupby('_merge').count()

Unnamed: 0_level_0,modality,participant_label,path_nii,scan_name,session,subject,path_html
_merge,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
both,24410,24410,24410,24410,24410,24410,24410
left_only,61992,61992,61992,61992,61992,61992,0


In [249]:
mriqc_res_df.dtypes

modality             object
participant_label    object
path_nii             object
scan_name            object
session              object
subject              object
path_html            object
_merge               object
dtype: object

In [250]:
subj_df = pd.DataFrame(mriqc_res_df.groupby('subject').agg({'_merge':['unique', 'nunique']}))

In [165]:
subj_df['_merge','nunique'].unique()

array([1, 2])

In [251]:
unfinished_subjects = subj_df.index[(subj_df['_merge','unique'].str[0] != 'both') | (subj_df['_merge','nunique'] != 1)].values

In [252]:
finished_subjects = subj_df.index[(subj_df['_merge','unique'].str[0] == 'both') & (subj_df['_merge','nunique'] == 1)].values

In [253]:
us_labels = [sub.split('-')[-1] for sub in unfinished_subjects]

In [254]:
len(unfinished_subjects)

6445

In [256]:
cmds = []
per_chunk=20
for usi in range(0,len(us_labels),per_chunk):
    chunk = ' '.join(us_labels[usi:usi+per_chunk])
    cmd = (f'singularity run {container_path} --participant_label {chunk} --nprocs=30'
           + ' -w /lscratch/$SLURM_JOB_ID'
           + f' {dsst_bids_root} {mriqc_outdir} participant')
    cmds.append(cmd)

In [257]:
# test swarm command with two subjects
swarm_file.write_text('\n'.join(cmds[0:10]))
swarm_file.read_text().split('\n')

['singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label NDARINV0AAKGYA2 NDARINV0MPBK7TU NDARINV0RA4PBPV NDARINV10MWD99M NDARINV138RG20T NDARINV15MX84A5 NDARINV16RVU5PF NDARINV170X8DA0 NDARINV173ZYYKX NDARINV174DUV2F NDARINV174LD3GC NDARINV19JV1ZX5 NDARINV1A8C7PRA NDARINV1AYCA8E3 NDARINV1AYVNP3L NDARINV1AYXT588 NDARINV1B4T30A9 NDARINV1BUXN6LH NDARINV1BWHMRFY NDARINV1CRC8UTU --nprocs=30 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label NDARINV1CTLWW8V NDARINV1D8KBZHP NDARINV1E35BEZ6 NDARINV1E4F8UMB NDARINV1ECBLYGG NDARINV1ETGGL9R NDARINV1JW5HLBW NDARINV1K1285TU NDARINV1K3LDK3L NDARINV1KBEC5AT NDARINV1KHXW139 NDARINV1KP8VFVR NDARINV1KWBW4RD NDARINV1KX54GYB NDARINV1KXK7MDF NDARINV1LMYDNRY NDARINV1LT8UZYC NDARINV1NV489TJ NDARINV1NY2HJJM NDARINV1P4JN8JH --npr

In [258]:
!swarm -m singularity,webproxy -f {swarm_file} -g 45 -t 32 --partition norm --logdir {swarm_log} --time 12:00:00 --gres=lscratch:400

20679847


In [194]:
len(unfinished_subjects)

6708

In [196]:
chunk10 = 'NDARINV0UPVEC1J NDARINV0VVG7LYB NDARINV0VXEC29A NDARINV0WXLR6V1 NDARINV0X02CUCY NDARINV0XTVAGV2 NDARINV0XU7Z6RH NDARINV0XVGNCYR NDARINV0YVKYMJX NDARINV0Z5FF0JL'.split(' ')

In [198]:
sscmds = []
for sub in chunk10:
    cmd = (f'singularity run {container_path} --participant_label={sub} --nprocs={nprocs}'
           + ' -w /lscratch/$SLURM_JOB_ID'
           + f' {dsst_bids_root} {mriqc_outdir} participant')
    sscmds.append(cmd)

In [199]:
sscmds

['singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0UPVEC1J --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0VVG7LYB --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0VXEC29A --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0WXLR6V1 --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190

In [200]:
# test swarm command with two subjects
swarm_file.write_text('\n'.join(sscmds))
swarm_file.read_text().split('\n')

['singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0UPVEC1J --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0VVG7LYB --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0VXEC29A --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0WXLR6V1 --nprocs=20 -w /lscratch/$SLURM_JOB_ID /data/ABCD_DSST/bids_20190215 /data/ABCD_DSST/bids_20190

In [201]:
!swarm -m singularity,webproxy -f {swarm_file} -g 24 -t {nprocs} --partition norm,quick --logdir {swarm_log} --time 1:00:00 --gres=lscratch:100

20652567


In [212]:
chunk11='NDARINV0ZAYELPK NDARINV10FDVE0L NDARINV10HWA6YU NDARINV10J5M8LB NDARINV10K9CVX2 NDARINV10MWD99M NDARINV1147ELD2 NDARINV114VT9DX NDARINV1164KR5T NDARINV11E1FCZ0'.split(' ')

In [213]:
ssrscmds = []
for sub in chunk11:
    subj_dir = dsst_bids_root / ('sub-' + sub)
    cmd = ('mkdir /lscratch/$SLURM_JOB_ID/tmp_bids '
           + f' && rsync -ach {subj_dir} /lscratch/$SLURM_JOB_ID/tmp_bids/ ' 
           + f' && singularity run {container_path} --participant_label={sub} --nprocs={nprocs}'
           + ' -w /lscratch/$SLURM_JOB_ID'
           + f' /lscratch/$SLURM_JOB_ID/tmp_bids/ {mriqc_outdir} participant')
    ssrscmds.append(cmd)

In [214]:
ssrscmds

['mkdir /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV0ZAYELPK /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0ZAYELPK --nprocs=20 -w /lscratch/$SLURM_JOB_ID /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'mkdir /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV10FDVE0L /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV10FDVE0L --nprocs=20 -w /lscratch/$SLURM_JOB_ID /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'mkdir /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV10HWA6YU /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/pol

In [216]:
swarm_file.write_text('\n'.join(ssrscmds))
swarm_file.read_text().split('\n')

['mkdir /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV0ZAYELPK /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV0ZAYELPK --nprocs=20 -w /lscratch/$SLURM_JOB_ID /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'mkdir /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV10FDVE0L /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV10FDVE0L --nprocs=20 -w /lscratch/$SLURM_JOB_ID /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'mkdir /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV10HWA6YU /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/pol

In [217]:
!swarm -m singularity,webproxy -f {swarm_file} -g 24 -t {nprocs} --partition norm,quick --logdir {swarm_log} --time 1:00:00 --gres=lscratch:100

20661604


In [259]:
chunks = ' '.join([cc.split('--participant_label ')[-1].split('--nprocs=30')[0].strip() for cc in cmds[10:20]]).split(' ')

In [263]:
ssrscmds = []
for sub in chunks:
    subj_dir = dsst_bids_root / ('sub-' + sub)
    cmd = ('mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids '
           + f' && rsync -ach {subj_dir} /lscratch/$SLURM_JOB_ID/tmp_bids/ ' 
           + f' && singularity run {container_path} --participant_label={sub} --nprocs={nprocs}'
           + ' -w /lscratch/$SLURM_JOB_ID'
           + f' /lscratch/$SLURM_JOB_ID/tmp_bids/ {mriqc_outdir} participant')
    ssrscmds.append(cmd)

In [264]:
ssrs_swarm_file = Path('/data/ABCD_DSST/swarms/mriqc_swarm/mriqc_swarm_ssrs')
swarm_log = Path('/data/ABCD_DSST/swarms/mriqc_swarm/logs')

In [266]:
ssrs_swarm_file.write_text('\n'.join(ssrscmds))
ssrs_swarm_file.read_text().split('\n')

['mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV49D4EDYG /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV49D4EDYG --nprocs=20 -w /lscratch/$SLURM_JOB_ID /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV49EG90M3 /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/containers/poldracklab_mriqc-2018-08-21-8efddd374773.simg --participant_label=NDARINV49EG90M3 --nprocs=20 -w /lscratch/$SLURM_JOB_ID /lscratch/$SLURM_JOB_ID/tmp_bids/ /data/ABCD_DSST/bids_20190215/derivatives/mriqc participant',
 'mkdir -p /lscratch/$SLURM_JOB_ID/tmp_bids  && rsync -ach /data/ABCD_DSST/bids_20190215/sub-NDARINV49YTZ5Z4 /lscratch/$SLURM_JOB_ID/tmp_bids/  && singularity run /data/ABCD_DSST/conta

In [268]:
!swarm -m singularity,webproxy -f {ssrs_swarm_file} -g 24 -t {nprocs} --maxrunning 10 --partition norm --logdir {swarm_log} --time 1:00:00 --gres=lscratch:100

20693355


In [267]:
!swarm --help

Usage: swarm [swarm options] [sbatch options]

  -f,--file [file]       name of file with list of command lines to execute,
                         with a single command line per subjob

  -g,--gb-per-process    gb per process (can be fractions of GB, e.g. 3.5)
  [float]

  -t,                    threads per process (can be an integer or the word
  --threads-per-process  auto).  This option is only valid for multi-
  [int]/"auto"           threaded swarms (-p 1).

  -p,                    processes per subjob (default = 1).  This option is
  --processes-per-subjob only valid for single-threaded swarms (-t 1).
  [int]                  
                      
  --noht                 don't use hyperthreading, equivalent to slurm option
                         --threads-per-core=1

  -b,--bundle [int]      bundle more than one command line per subjob and run
                         sequentially (this automatically multiplies the time
                         needed