arguments

**NOTE: subject `662551` is not in `HCP_1200`, so it is excluded from `subjects` below.**

`s3://profile-hcp-west/hcp_reliability/single_shell/hcp_1200_afq/`

In [20]:
# Subject IDs to process. Every subject is combined with every session and
# bundle below to form the job argument list.
subjects = [
    '103818', '105923', '111312', '114823', '115320',
    '122317', '125525', '130518', '135528', '137128',
    '139839', '143325', '144226', '146129', '149337',
    '149741', '151526', '158035', '169343', '172332',
    '175439', '177746', '185442', '187547', '192439',
    '194140', '195041', '200109', '200614', '204521',
    '250427', '287248', '341834', '433839', '562345',
    '599671', '601127', '627549', '660951', # '662551' left out: not in HCP_1200
    '783462', '859671', '861456', '877168', '917255'
]
# Session names; subbundle() lower-cases these to build the S3 input prefix.
session_names = ['HCP_1200', 'HCP_Retest']
# Bundles currently being processed.
bundle_names = ['SLF_L', 'SLF_R']
# Alternative settings kept for reference (wider bundle set, cluster counts):
# bundle_names = ['SLF_L', 'SLF_R', 'ARC_L', 'ARC_R', 'CST_L', 'CST_R']
# n_clusters = [2, 3, 4]
# n_clusters = [3]

In [30]:
import itertools
# One argument tuple per (subject, session, bundle) combination.
# itertools.product advances its rightmost input fastest, so the bundle varies
# fastest and the subject slowest. job_args() selects entries of this list by
# position, so changing any input list changes what a given position refers to.
args = list(itertools.product(subjects, session_names, bundle_names))
# Variant that also sweeps the number of clusters:
# args = list(itertools.product(subjects, session_names, bundle_names, n_clusters))

In [31]:
def job_args(ids, all_args=None):
    """Return the argument tuples found at the given positions.

    Parameters
    ----------
    ids : iterable of int
        Positions to select. Python ints and numpy integers (for example the
        elements of an integer array) are both accepted.
    all_args : sequence of tuples, optional
        Sequence to select from. Defaults to the notebook-level ``args`` list.

    Returns
    -------
    list of tuple
        The selected tuples, in the order given by ``ids``, with every field
        keeping its original Python type.
    """
    if all_args is None:
        all_args = args

    # Index the list directly. Going through np.array(all_args) would coerce
    # mixed-type tuples to a single string dtype, turning an integer field
    # such as n_clusters into the string '3'.
    return [tuple(all_args[i]) for i in ids]
    
# TODO use knot to determine failed and running jobs
# may want to stop running and rerun these with larger memory and disk allocations
def failed_jobs():
    """Return the argument tuples of the jobs recorded as failed.

    The job ids are hard-coded; they are shifted by 8 and then looked up
    through ``job_args``.
    """
    import numpy as np

    # An earlier version of this list also contained 148 and 149.
    recorded_ids = [19, 22, 81, 101]
    # NOTE(review): the +8 shift presumably maps the recorded job ids onto
    # positions in `args` -- confirm where the offset comes from.
    shifted_ids = np.add(recorded_ids, 8)
    return job_args(shifted_ids)

def running_jobs():
    """Return the argument tuples of the jobs recorded as still running.

    The job ids are hard-coded; they are shifted by 8 and then looked up
    through ``job_args``.
    """
    import numpy as np

    recorded_ids = [
        1, 3, 13, 15, 35, 59, 83, 117,
        119, 131, 137, 139, 141, 143, 147, 163,
    ]
    # NOTE(review): the +8 shift presumably maps the recorded job ids onto
    # positions in `args` -- confirm where the offset comes from.
    shifted_ids = np.add(recorded_ids, 8)
    return job_args(shifted_ids)

def rerun_targets():
    """Return the argument tuples of the jobs that should be resubmitted.

    The candidates are the failed jobs followed by the running jobs; the
    hard-coded positions below pick the ones marked as "failed twice".

    Returns
    -------
    list of tuple
        Tuples of plain Python values, the same element types that
        ``job_args`` returns.
    """
    candidates = [*failed_jobs(), *running_jobs()]

    # failed twice
    failed_twice = [0, 5, 6, 7, 10, 12]

    # Select from the Python list instead of fancy-indexing a numpy array:
    # rows of a numpy string array yield np.str_ elements, which differ in
    # type from what job_args returns and print as np.str_('...') on NumPy >= 2.
    return [tuple(candidates[i]) for i in failed_twice]

# Show which argument tuples will be resubmitted.
# To list every failed and running job instead:
# print([*failed_jobs(), *running_jobs()])
print(rerun_targets())

[('125525', 'HCP_Retest', 'SLF_R'), ('111312', 'HCP_Retest', 'SLF_R'), ('122317', 'HCP_1200', 'SLF_R'), ('122317', 'HCP_Retest', 'SLF_R'), ('185442', 'HCP_Retest', 'SLF_R'), ('287248', 'HCP_Retest', 'SLF_R')]


all-in-one: `subbundle3`, `subbundle4`, and `subbundle5`

In [None]:
def subbundle(subject, session, bundle_name, n_clusters):
    """Cluster one bundle of one subject/session into sub-bundles and upload the results.

    All intermediate files are written to the current working directory.

    1. Download the DTI FA volume and the cleaned bundle tractogram from
       ``profile-hcp-west/hcp_reliability/single_shell/{session.lower()}_afq``.
    2. Resample every streamline to 100 points and sample FA along it.
    3. Build the pairwise DTW-warped FA profiles and their R^2 adjacency.
    4. Build the inverse-scaled MDF adjacency.
    5. Blend the two adjacencies for alpha = 0.0, 0.1, ..., 1.0.
    6. Spectral-cluster every blended adjacency, and k-means-cluster a MASE
       embedding of the two unblended adjacencies.
    7. Upload every ``*.npy`` in the working directory to
       ``hcp-subbundle/{session}/{bundle_name}/{subject}/{n_clusters}``.

    Parameters
    ----------
    subject : str
        Subject ID, used in the S3 input and output paths.
    session : str
        Session name; lower-cased for the input prefix, used as-is in the
        output prefix.
    bundle_name : str
        Bundle name, used in the tractogram file name and the output prefix.
    n_clusters : int
        Number of clusters for SpectralClustering and KMeansCluster.

    Returns
    -------
    None
        Returns early, before computing or uploading any result, when the
        tractogram contains no streamlines.

    Notes
    -----
    The blended files are named
    ``adjacency_wt_{a}_pairwise_warped_fa_r2_{b}_is_mdf.npy`` with
    ``a + b == 10``. Previously ``b`` was ``int((1 - alpha) * 10)``, which
    truncates floating-point error and labelled alpha = 0.8 as ``8/1`` and
    alpha = 0.9 as ``9/0``. Those two are now ``8/2`` and ``9/1``; anything
    that reads the old names must be updated. The array contents are
    unchanged.
    """
    # NOTE(review): imports are kept inside the function, presumably so the
    # function is self-contained when cloudknot ships it to a remote worker --
    # confirm before hoisting them to module level.
    import s3fs
    import numpy as np
    import pandas as pd
    from dipy.io.streamline import load_tractogram
    from dipy.tracking.streamline import set_number_of_points, values_from_volume, bundles_distances_mdf
    from sklearn.metrics import r2_score
    import nibabel as nib
    from fastdtw import fastdtw
    from sklearn.cluster import SpectralClustering
    from graspologic.embed import MultipleASE as MASE
    from graspologic.cluster import KMeansCluster

    ### fractional anisotropy scalar file ###

    fs = s3fs.S3FileSystem()

    scalar_filename = 'FA.nii.gz'

    fs.get(
        (
            f'profile-hcp-west/hcp_reliability/single_shell/'
            f'{session.lower()}_afq/sub-{subject}/ses-01/'
            f'sub-{subject}_dwi_model-DTI_FA.nii.gz'
        ),
        f'{scalar_filename}'
    )

    scalar_data = nib.load(scalar_filename).get_fdata()

    ### clean single shell deterministic bundle tractography ###

    tractogram_filename = f'{bundle_name}.trk'

    fs.get(
        (
            f'profile-hcp-west/hcp_reliability/single_shell/'
            f'{session.lower()}_afq/sub-{subject}/ses-01/'
            f'clean_bundles/sub-{subject}_dwi_space-RASMM_model-DTI_desc-det-afq-{bundle_name}_tractography.trk'
        ),
        f'{tractogram_filename}'
    )

    tractogram = load_tractogram(tractogram_filename, 'same')

    ### streamline profile ###

    n_points = 100

    fgarray = set_number_of_points(tractogram.streamlines, n_points)

    # Nothing to cluster for an empty bundle.
    if len(fgarray) == 0:
        return

    # One FA profile per streamline.
    values = np.array(values_from_volume(scalar_data, fgarray, tractogram.affine))
    np.save('streamline_profile_fa.npy', values)

    ### pairwise warped streamline profile ###

    # dtw_values[i, j] holds profile i re-sampled along its DTW alignment to profile j.
    dtw_values = np.zeros((values.shape[0], values.shape[0], values.shape[1]))

    for i, a in enumerate(values):
        for j, b in enumerate(values):
            _, path = fastdtw(a,b)
            path = np.array(path)
            # For each position where the index into b (path column 1) changes,
            # take the index into a (path column 0) at that step, then append
            # the final index so the selection has one more entry than the
            # number of changes.
            dtw_value = a[np.append(path[np.where(path[:,1][:-1] != path[:,1][1:]),0][0], len(values.T)-1)]
            dtw_values[i,j] = dtw_value

    dtw_pairwise_warped_filename = 'streamline_profile_dtw_pairwise_warped_fa.npy'
    np.save(dtw_pairwise_warped_filename, dtw_values)

    ### inverse scaled mdf (minimum average direct-flip) ###

    mdf = bundles_distances_mdf(fgarray, fgarray)

    # enforce symmetry
    mdf = (mdf + mdf.T) / 2

    # inverse scale: turn distances into similarities whose largest value is 1
    is_mdf = (mdf.max() - mdf)
    is_mdf = is_mdf / is_mdf.max()

    np.save('adjacency_is_mdf.npy', is_mdf)

    ### pairwise warped fa r2 ###

    unwarped_fa_values = values
    dtw_pairwise_warped_fa_values = dtw_values

    # To calculate a NxN for the R^2, compare each warped streamline profile
    # to the unwarped target streamline
    # NOTE(review): the reference passed to r2_score is unwarped profile i,
    # the same profile that was warped, not profile j -- confirm that this is
    # the intended "target".
    dtw_pairwise_warped_fa_r2 = pd.DataFrame(
        [
            [
                r2_score(dtw_pairwise_warped_fa_values[i][j], unwarped_fa_values[i])
                 for j in np.ndindex(dtw_pairwise_warped_fa_values.shape[1])
            ]
            for i in np.ndindex(dtw_pairwise_warped_fa_values.shape[0])
        ]
    )

    # enforce symmetric
    dtw_pairwise_warped_fa_r2 += dtw_pairwise_warped_fa_r2.T
    dtw_pairwise_warped_fa_r2 = dtw_pairwise_warped_fa_r2/2

    np.save('adjacency_pairwise_warped_fa_r2.npy', dtw_pairwise_warped_fa_r2)

    ### weighted adjacencies ###

    alphas = np.linspace(0,10,11)/10

    def weighted_name(alpha):
        # Derive both labels from one rounded integer so they always sum to 10.
        # int((1 - alpha) * 10) truncates, e.g. (1 - 0.9) * 10 is
        # 0.9999999999999998 and would be labelled 0 instead of 1.
        fa_weight = int(round(alpha * 10))
        return f'wt_{fa_weight}_pairwise_warped_fa_r2_{10 - fa_weight}_is_mdf'

    for alpha in alphas:
        wt_dtw_pairwise_warped_fa_r2_is_mdf = alpha * dtw_pairwise_warped_fa_r2 + (1 - alpha) * is_mdf
        np.save(
            f'adjacency_{weighted_name(alpha)}.npy',
            wt_dtw_pairwise_warped_fa_r2_is_mdf
        )

    ### spectral clustering ###

    sc = SpectralClustering(affinity="precomputed", n_clusters=n_clusters)

    for alpha in alphas:
        name = weighted_name(alpha)

        adjacency = np.load(f'adjacency_{name}.npy')

        # Absolute value is taken before fitting, so negative entries of the
        # blended adjacency are passed to the clusterer as positive weights.
        sc_idx = sc.fit(np.absolute(adjacency)).labels_

        np.save(f'sc_{name}_idx.npy', sc_idx)

    ### mase ###

    embedder = MASE()

    tissue = np.load('adjacency_pairwise_warped_fa_r2.npy')
    distance = np.load('adjacency_is_mdf.npy')

    V_hat = embedder.fit_transform([tissue, distance])
    clusterer = KMeansCluster(n_clusters)
    mase_idx = clusterer.fit_predict(V_hat)
    np.save('mase_pairwise_warped_fa_r2_is_mdf_idx.npy', mase_idx)

    ### upload everything to s3 ###
    # Uploads every .npy file in the working directory, not only the ones
    # written above.
    fs.put('*.npy', f'hcp-subbundle/{session}/{bundle_name}/{subject}/{n_clusters}')

In [13]:
import cloudknot as ck
ck.set_region('us-west-2')

In [14]:
# Only a name is passed here, no function.
# NOTE(review): this presumably re-attaches to an already-created knot of
# that name rather than building a new one -- confirm against the cloudknot docs.
knot = ck.Knot(name='hcp-subbundle-2021-01-11T11-56-26')

In [32]:
# Earlier submissions, kept for reference:
# result_futures = knot.map([*failed_jobs(), *running_jobs()], starmap=True)
# result_futures = knot.map(args, starmap=True)
# Resubmit only the jobs selected by rerun_targets().
# NOTE(review): starmap=True presumably unpacks each tuple into positional
# arguments. The tuples are (subject, session, bundle_name) triples, while
# subbundle() as defined in this notebook also requires n_clusters -- confirm
# that the function registered with this knot accepts three arguments.
result_futures = knot.map(rerun_targets(), starmap=True)

In [37]:
knot.view_jobs()

Job ID              Name                        Status   
---------------------------------------------------------
d7a903f1-9f85-4ccb-8858-cba546ebcee7        hcp-subbundle-2021-01-11T11-56-26-2        PENDING  
9d202b03-024b-4cf4-b921-cbffd71a3e60        hcp-subbundle-2021-01-11T11-56-26-0        PENDING  
c3b85839-2d23-40da-a6db-e1773a4050c9        hcp-subbundle-2021-01-11T11-56-26-1        FAILED   


In [40]:
knot.jobs[0].status

{'status': 'FAILED',
 'statusReason': 'Array Child Job failed',
 'attempts': [],
 'arrayProperties': {'statusSummary': {'STARTING': 0,
   'FAILED': 7,
   'RUNNING': 0,
   'SUCCEEDED': 13,
   'RUNNABLE': 0,
   'SUBMITTED': 0,
   'PENDING': 0},
  'size': 20}}

In [None]:
# knot.clobber(clobber_pars=True, clobber_repo=True, clobber_image=True)