arguments

**NOTE: subject `662551` is not in `HCP_1200`, so it is left out of the subject list below.**

`s3://profile-hcp-west/hcp_reliability/single_shell/hcp_1200_afq/`

In [1]:
# Subject IDs, written as one whitespace-separated block and split into a
# list of strings. 662551 is deliberately absent: it is not in HCP_1200.
subjects = """
    103818 105923 111312 114823 115320
    122317 125525 130518 135528 137128
    139839 143325 144226 146129 149337
    149741 151526 158035 169343 172332
    175439 177746 185442 187547 192439
    194140 195041 200109 200614 204521
    250427 287248 341834 433839 562345
    599671 601127 627549 660951
    783462 859671 861456 877168 917255
""".split()

# Both scan sessions are processed for every subject.
session_names = ['HCP_1200', 'HCP_Retest']

# Bundles to process. Alternative selections:
#   ['SLF_L', 'SLF_R']
#   ['SLF_L', 'SLF_R', 'ARC_L', 'ARC_R', 'CST_L', 'CST_R']
bundle_names = ['SLF_L']

# Cluster counts to run. Alternative sweep: [2, 3, 4]
n_clusters = [3]

In [2]:
import itertools

# One job per (subject, session, bundle, cluster count) combination; the
# right-most list varies fastest.
args = [
    *itertools.product(subjects, session_names, bundle_names, n_clusters)
]

In [3]:
# Peek at the first four argument tuples to check their layout and ordering.
args[:4]

[('103818', 'HCP_1200', 'SLF_L', 3),
 ('103818', 'HCP_Retest', 'SLF_L', 3),
 ('105923', 'HCP_1200', 'SLF_L', 3),
 ('105923', 'HCP_Retest', 'SLF_L', 3)]

useful for when individual jobs fail

all-in-one: `subbundle3`, `subbundle4`, and `subbundle5`

**Spectral Clustering (SC)** and **Multiple Adjacency Spectral Embedding (MASE)**

- UNWARPED **FA Coefficient of Determination (FA R2)** 

- **Inverse Scaled Minimum average Direct-Flip (IS MDF) distance**

**Multiple Adjacency Spectral Embedding (MASE)**

- UNWARPED **FA Coefficient of Determination (FA R2)** 

- UNWARPED **MD Coefficient of Determination (MD R2)** 

- **Inverse Scaled Minimum average Direct-Flip (IS MDF) distance**

In [15]:
def subbundle(subject, session, bundle_name, n_clusters):
    """
    Split one bundle of one subject/session into sub-bundle clusters.

    Steps, in order:

    1. Download the DTI FA and MD maps and the cleaned bundle tractogram
       from S3 into the working directory.
    2. Resample every streamline to 100 points and sample FA and MD along
       each streamline.
    3. Build three streamline-by-streamline adjacency matrices: inverse
       scaled MDF distance, FA R2 and MD R2.
    4. Embed (ASE for a single matrix, MASE for several), cluster the
       embedding, and save labels, per-cluster tractograms and per-cluster
       density maps for three models: ``mase_is_mdf``,
       ``mase_fa_r2_is_mdf`` and ``mase_fa_r2_md_r2_is_mdf``.
    5. Upload every ``*.npy``, ``*.nii.gz`` and ``*.trk`` file found in the
       working directory to
       ``hcp-subbundle/{session}/{bundle_name}/{subject}/{n_clusters}``.

    Parameters
    ----------
    subject : str
        Subject identifier, e.g. ``'103818'``.
    session : str
        Session name, e.g. ``'HCP_1200'``; it is lower-cased to build the
        source S3 prefix and used verbatim in the upload destination.
    bundle_name : str
        Bundle name, e.g. ``'SLF_L'``.
    n_clusters : int
        Value passed to ``KMeansCluster``.

    Returns
    -------
    None
        Returns early, before computing or uploading any results, when the
        bundle contains no streamlines.
    """
    # Imports are kept inside the function so that it is self-contained when
    # handed to cloudknot (it is passed as ``func=subbundle``).  The import
    # list is left exactly as it was, including pandas, which is not used
    # below.
    import time
    import s3fs
    import numpy as np
    import pandas as pd
    from dipy.io.streamline import load_tractogram, save_tractogram
    from dipy.io.stateful_tractogram import StatefulTractogram
    from dipy.io.utils import create_nifti_header, get_reference_info
    from dipy.tracking.streamline import set_number_of_points, values_from_volume, bundles_distances_mdf
    import dipy.tracking.utils as dtu
    import nibabel as nib
    from graspologic.embed import MultipleASE as MASE
    from graspologic.embed import AdjacencySpectralEmbed as ASE
    from graspologic.cluster import KMeansCluster

    def coeff_of_determination(data, model, axis=-1):
        """
        Pairwise coefficient of determination between rows.

        http://en.wikipedia.org/wiki/Coefficient_of_determination

                        sum of the squared residuals
        R^2 =  1 - ---------------------------------------
                   sum of the squared mean-subtracted data

        Parameters
        ----------
        data : ndarray, shape (n, p)
            One profile per row; supplies the denominator (total sum of
            squares of each row).
        model : ndarray, shape (m, p)
            One profile per row, compared against every row of ``data``.
        axis : int
            Axis along which sums and means are taken.

        Returns
        -------
        ndarray, shape (n, m)
            Entry ``[ii, jj]`` is the R^2 of ``model[jj]`` as a prediction
            of ``data[ii]``.  Rows whose ``data[ii]`` has zero variance
            (for example all zeros) are NaN, because R^2 is undefined
            there.
        """
        X = np.full((data.shape[0], model.shape[0]), np.nan)
        demeaned_data = data - np.mean(data, axis=axis)[..., np.newaxis]
        ss_tot = np.sum(demeaned_data ** 2, axis=axis)
        for ii in range(X.shape[0]):
            # Don't divide by 0: leave the whole row as NaN.
            if ss_tot[ii] == 0.0:
                continue
            for jj in range(X.shape[1]):
                residuals = data[ii] - model[jj]
                ss_err = np.sum(residuals ** 2, axis=axis)
                X[ii, jj] = 1 - (ss_err / ss_tot[ii])
        return X

    def relabel_clusters(cluster_labels):
        """
        Renumber labels by cluster size: 0 is the largest cluster, 1 the
        next largest, and so on.  Returns a new array; the input is not
        modified.
        """
        # Label ids sorted by descending member count, restricted to the
        # labels that actually occur.
        from_values = np.flip(np.argsort(np.bincount(cluster_labels))[-(np.unique(cluster_labels).size):])
        to_values = np.arange(from_values.size)

        new_cluster_labels = np.copy(cluster_labels)

        # Masks are computed on the original labels so that already
        # renumbered entries are never remapped a second time.
        for old_label, new_label in zip(from_values, to_values):
            new_cluster_labels[cluster_labels == old_label] = new_label

        return new_cluster_labels

    def density_map(tractogram):
        """
        Build an 8-bit NIfTI streamline-count image for ``tractogram``.

        The identity affine is passed to ``dtu.density_map``, so the caller
        is expected to have moved the tractogram to voxel space first (the
        only caller does this with ``to_vox()``).
        """
        affine, vol_dims, voxel_sizes, _ = get_reference_info(tractogram)
        tractogram_density = dtu.density_map(tractogram.streamlines, np.eye(4), vol_dims)
        # Saturate at 255 before narrowing: a plain uint8 cast wraps modulo
        # 256, which would make the densest voxels look nearly empty.
        tractogram_density = np.clip(tractogram_density, 0, 255).astype(np.uint8)
        nifti_header = create_nifti_header(affine, vol_dims, voxel_sizes)
        density_map_img = nib.Nifti1Image(tractogram_density, affine, nifti_header)

        return density_map_img

    def save_cluster_tractograms_and_density_maps(bundle_tractogram, model_prefix, cluster_labels):
        """
        For every label, write ``{model_prefix}_cluster_{label}.trk`` and
        ``{model_prefix}_cluster_{label}_density_map.nii.gz`` to the working
        directory.
        """
        for cluster_label in np.unique(cluster_labels):
            # tractogram
            f_name = f'{model_prefix}_cluster_{cluster_label}.trk'
            cluster_indicies = np.array(np.where(cluster_labels == cluster_label)[0])
            tg = StatefulTractogram.from_sft(bundle_tractogram.streamlines[cluster_indicies], bundle_tractogram)
            save_tractogram(tg, f_name, bbox_valid_check=False)
            print('saving cluster tractogram: ', f_name)

            # density map -- 8-bit unsigned int and gz
            f_name = f'{model_prefix}_cluster_{cluster_label}_density_map.nii.gz'
            tg.to_vox()
            nib.save(density_map(tg), f_name)
            print('saving cluster density map: ', f_name)

            # TODO save cluster afq_profile

    def cluster_and_save(embedder, graphs, model_prefix):
        """
        Embed ``graphs``, cluster the embedding, and save the relabelled
        cluster indices plus per-cluster tractograms and density maps.
        """
        V_hat = embedder.fit_transform(graphs)
        # NOTE(review): graspologic documents KMeansCluster's first argument
        # as the *maximum* number of clusters to consider -- confirm that
        # exactly n_clusters clusters is what comes back.
        clusterer = KMeansCluster(n_clusters)
        labels = relabel_clusters(clusterer.fit_predict(V_hat))

        f_name = f'{model_prefix}_idx.npy'
        np.save(f_name, labels)
        print('saving: ', f_name)

        save_cluster_tractograms_and_density_maps(tractogram, model_prefix, labels)

    print("begin", subject, session, bundle_name, n_clusters)

    fs = s3fs.S3FileSystem()

    # Common S3 prefix of every input file of this subject/session.
    s3_prefix = (
        f'profile-hcp-west/hcp_reliability/single_shell/'
        f'{session.lower()}_afq/sub-{subject}/ses-01/'
    )

    ### fractional anisotropy scalar file ###
    fa_scalar_filename = 'FA.nii.gz'
    print('loading FA scalar file: ', fa_scalar_filename)
    tic = time.perf_counter()

    fs.get(s3_prefix + f'sub-{subject}_dwi_model-DTI_FA.nii.gz', fa_scalar_filename)

    fa_scalar_data = nib.load(fa_scalar_filename).get_fdata()
    toc = time.perf_counter()
    print(f'scalar file: {toc - tic:0.4f} seconds')

    ### mean diffusivity scalar file ###
    md_scalar_filename = 'MD.nii.gz'
    print('loading scalar file: ', md_scalar_filename)
    tic = time.perf_counter()

    fs.get(s3_prefix + f'sub-{subject}_dwi_model-DTI_MD.nii.gz', md_scalar_filename)

    md_scalar_data = nib.load(md_scalar_filename).get_fdata()
    toc = time.perf_counter()
    print(f'scalar file: {toc - tic:0.4f} seconds')

    ### clean single shell deterministic bundle tractography ###
    tractogram_filename = f'{bundle_name}.trk'
    print('loading tractogram: ', tractogram_filename)
    tic = time.perf_counter()

    fs.get(
        s3_prefix
        + f'clean_bundles/sub-{subject}_dwi_space-RASMM_model-DTI_desc-det-afq-{bundle_name}_tractography.trk',
        tractogram_filename
    )

    tractogram = load_tractogram(tractogram_filename, 'same')
    toc = time.perf_counter()
    print(f'tractogram file: {toc - tic:0.4f} seconds')

    # TODO save bundle_density map

    # TODO save bundle afq_profile/values_from_volume

    ### streamline profiles ###
    print('calculating streamline profiles')
    tic = time.perf_counter()
    n_points = 100

    fgarray = set_number_of_points(tractogram.streamlines, n_points)

    if len(fgarray) == 0:
        # Nothing to cluster; say so instead of exiting silently.
        print("no streamlines, skipping", subject, session, bundle_name, n_clusters)
        return

    # TODO compare to afq_profile

    # FA Values
    fa_values = np.array(values_from_volume(fa_scalar_data, fgarray, tractogram.affine))
    f_name = 'streamline_profile_fa.npy'
    np.save(f_name, fa_values)
    print('saving: ', f_name)

    print('fa values:', fa_values.shape)

    # MD Values
    md_values = np.array(values_from_volume(md_scalar_data, fgarray, tractogram.affine))
    f_name = 'streamline_profile_md.npy'
    np.save(f_name, md_values)
    print('saving: ', f_name)
    toc = time.perf_counter()

    print('md values:', md_values.shape)

    print(f'streamline profile: {toc - tic:0.4f} seconds')

    ### Inverse Scaled MDF (Minimum average Direct-Flip) ###
    print('calculating mdf')
    tic = time.perf_counter()
    mdf = bundles_distances_mdf(fgarray, fgarray)

    # enforce symmetry
    mdf = (mdf + mdf.T) / 2

    # inverse scale: turn distances into similarities in [0, 1]
    is_mdf = (mdf.max() - mdf)
    is_mdf = is_mdf / is_mdf.max()

    f_name = 'adjacency_is_mdf.npy'
    np.save(f_name, is_mdf)
    print('saving: ', f_name)
    toc = time.perf_counter()

    print('is_mdf:', is_mdf.shape)
    print(f'mdf {toc - tic:0.4f} seconds')

    ### streamline r2 ###
    print('calculating streamline r2')
    tic = time.perf_counter()

    # calculate FA R2 and enforce symmetry
    fa_r2 = coeff_of_determination(fa_values, fa_values)
    fa_r2 = (fa_r2 + fa_r2.T) / 2

    # save file
    f_name = 'adjacency_fa_r2.npy'
    np.save(f_name, fa_r2)
    print('saving: ', f_name)

    print('adjacency_fa_r2:', fa_r2.shape)

    # calculate MD R2 and enforce symmetry
    md_r2 = coeff_of_determination(md_values, md_values)
    md_r2 = (md_r2 + md_r2.T) / 2

    # save file
    f_name = 'adjacency_md_r2.npy'
    np.save(f_name, md_r2)
    print('saving: ', f_name)

    print('adjacency_md_r2:', md_r2.shape)

    toc = time.perf_counter()

    print(f'streamline r2: {toc - tic:0.4f} seconds')

    ### multiple adjacency spectral embedding (mase) ###
    print('mase')
    tic = time.perf_counter()

    # The adjacency matrices are used straight from memory; the saved .npy
    # files hold exactly the same arrays.

    # MDF (single matrix, so plain ASE)
    cluster_and_save(ASE(), is_mdf, 'mase_is_mdf')

    # FA MDF
    cluster_and_save(MASE(), [fa_r2, is_mdf], 'mase_fa_r2_is_mdf')

    # FA MD MDF
    cluster_and_save(MASE(), [fa_r2, md_r2, is_mdf], 'mase_fa_r2_md_r2_is_mdf')

    toc = time.perf_counter()
    print(f'mase {toc - tic:0.4f} seconds')

    ### upload everything to s3 ###
    destination = f'hcp-subbundle/{session}/{bundle_name}/{subject}/{n_clusters}'
    fs.put('*.npy', destination)
    fs.put('*.nii.gz', destination)
    fs.put('*.trk', destination)

    print("end", subject, session, bundle_name, n_clusters)

test locally before running on AWS

In [5]:
import cloudknot as ck
# Point cloudknot at the us-west-2 AWS region before any knot is created.
ck.set_region('us-west-2')

Reconnect to existing knot

In [16]:
from datetime import datetime

# Knot that runs `subbundle` remotely, named with a per-second timestamp so
# each run gets a fresh name (colons are replaced by dashes).
# isoformat(timespec='seconds') always yields YYYY-MM-DDTHH:MM:SS, whereas
# slicing the default isoformat() with [:-7] cuts into the time itself
# whenever the microsecond field happens to be 0 and is therefore omitted.
knot = ck.Knot(
    name='hcp-subbundle-' + datetime.now().isoformat(timespec='seconds').replace(':', '-'),
    func=subbundle,
    base_image='python:3.8',
    pars_policies=('AmazonS3FullAccess',),  # the job reads from and writes to S3
    memory=32000,  # in MB
    volume_size=50,  # in GB
    bid_percentage=105)



Reuse existing Docker Image

In [21]:
# Exactly one of the three submissions below should be active at a time.

# run subset as trial (first four argument tuples only)
# result_futures = knot.map(args[:4], starmap=True)

# run all: submit every argument tuple in `args`
# NOTE(review): starmap=True presumably unpacks each tuple into subbundle's
# four positional parameters -- confirm against the cloudknot docs.
result_futures = knot.map(args, starmap=True)

# rerun failed
# NOTE(review): rerun_targets() must be defined in an earlier cell before
# this line is enabled.
# result_futures = knot.map(rerun_targets(), starmap=True)

In [22]:
# List the jobs submitted through this knot with their id, name and status.
knot.view_jobs()

Job ID              Name                        Status   
---------------------------------------------------------
c914a06f-8d45-4e28-967a-630824743dc2        hcp-subbundle-2021-01-22T15-56-32-1        PENDING  
afa39eb3-3381-45c3-a0dc-30a5a61e2d7d        hcp-subbundle-2021-01-22T15-56-32-0        SUCCEEDED


In [24]:
# Detailed status of the job at index 1 in knot.jobs, including the
# per-state counts of its array children.
# NOTE(review): index 1 is presumably the full run submitted above -- confirm.
knot.jobs[1].status

{'status': 'SUCCEEDED',
 'statusReason': None,
 'attempts': [],
 'arrayProperties': {'statusSummary': {'STARTING': 0,
   'FAILED': 0,
   'RUNNING': 0,
   'SUCCEEDED': 88,
   'RUNNABLE': 0,
   'SUBMITTED': 0,
   'PENDING': 0},
  'size': 88}}