## Experiment

KMeans clustering using MASE joint embedding (FA_R2, MD_R2, IS_MDF)
 
##### Determine the number of clusters per bundle based on population data

**NOTE `662551` not in `HCP_1200`**

```
s3://profile-hcp-west/hcp_reliability/single_shell/hcp_1200_afq/
s3://profile-hcp-west/hcp_reliability/single_shell/hcp_1200_afq_CSD/
```

In [None]:
# Experiment label(s); one job is generated per entry.
experiment_names = ["MASE_FA_Sklearn_KMeans"]

# Subject IDs to process.
# "662551" is intentionally left out of the list.
subjects = [
    "103818", "105923", "111312", "114823", "115320",
    "122317", "125525", "130518", "135528", "137128",
    "139839", "143325", "144226", "146129", "149337",
    "149741", "151526", "158035", "169343", "172332",
    "175439", "177746", "185442", "187547", "192439",
    "194140", "195041", "200109", "200614", "204521",
    "250427", "287248", "341834", "433839", "562345",
    "599671", "601127", "627549", "660951",
    "783462", "859671", "861456", "877168", "917255",
]

# Sessions to process.
# Alternative (single session): ["HCP_1200"]
session_names = ["HCP_1200", "HCP_Retest"]

# Bundles to cluster. Alternatives previously used:
#   ["SLF_L"]
#   ["SLF_L", "SLF_R"]
#   ["SLF_L", "SLF_R", "ARC_L", "ARC_R", "CST_L", "CST_R"]
#   ["ATR_L", "ATR_R", "CGC_L", "CGC_R", "CST_L", "CST_R",
#    "IFO_L", "IFO_R", "ILF_L", "ILF_R", "SLF_L", "SLF_R",
#    "ARC_L", "ARC_R", "UNC_L", "UNC_R", "FA", "FP"]
bundle_names = ["ARC_L", "ARC_R"]

# Each entry is one set of scalars handed to a single job.
scalars = [["DTI_FA"]]

# Candidate numbers of clusters (K = 2, 3, 4).
# Alternative (wider sweep): range(2, 10)
range_n_clusters = list(range(2, 5))

In [None]:
import itertools

# Full job grid: one tuple per
# (experiment, subject, session, bundle, n_clusters, scalars) combination.
args = [
    *itertools.product(
        experiment_names,
        subjects,
        session_names,
        bundle_names,
        range_n_clusters,
        scalars,
    )
]
args

useful for when individual jobs fail

all-in-one: `subbundle3`, `subbundle4`, and `subbundle5`

**Multiple Adjacency Spectral Embedding (MASE)** 

- UNWARPED **Fractional Anisotropy Coefficient of Determination (FA R2)** 

- UNWARPED **Mean Diffusivity Coefficient of Determination (MD R2)** 

- **Inverse Scaled Minimum average Direct-Flip (IS MDF) distance**

In [None]:
def subbundle(experiment_name, subject, session, bundle_name, n_clusters, scalars, clean_bundles=True):
    """
    Run clustering for K=`n_clusters` on `subject`, `session`, `bundle_name`
    using `clean_bundles` CSD tractography

    Features:
    - Scalars
      - Tissue - Fractional Anisotropy Coefficient of Determination (FA R^2)
      - Tissue - Mean Diffusivity Coefficient of Determination (MD R^2)
    - Distance - Inverse Scaled Minimum average Direct-Flip distance (IS_MDF)

    Generates:
    - Joint graph adjacency spectral embeddings using `MASE`
    - Clusters using `KMeans`

    Parameters
    ----------
    experiment_name : str
        First path component of the upload destination in the
        `hcp-subbundle` bucket.
    subject : str
        Subject id, used to build the S3 source paths.
    session : str
        Session name. It is lower-cased to form the `<session>_afq_CSD`
        source folder and used verbatim in the upload destination.
    bundle_name : str
        Bundle whose tractogram is downloaded and clustered.
    n_clusters : int
        Number of clusters passed to `KMeans`.
    scalars : container of str
        Tested for membership of 'DTI_FA' and 'DTI_MD'; each one present adds
        the matching R^2 adjacency matrix to the embedding input.
    clean_bundles : bool, optional
        When true the tractogram is read from the `clean_bundles` folder,
        otherwise from `bundles`.

    Returns
    -------
    None
        All results are written as files into the current working directory
        and then uploaded to S3. If the bundle has no streamlines the function
        returns before the adjacency/clustering/upload steps.
    """
    import logging
    from os.path import exists
    import s3fs
    import glob
    import numpy as np
    import nibabel as nib
    from dipy.io.streamline import load_tractogram, save_tractogram
    from dipy.io.stateful_tractogram import StatefulTractogram
    from dipy.io.utils import create_nifti_header, get_reference_info
    from dipy.tracking.streamline import set_number_of_points, values_from_volume, bundles_distances_mdf
    from dipy.stats.analysis import afq_profile, gaussian_weights
    from AFQ.segmentation import clean_bundle
    from graspologic.embed import MultipleASE as MASE
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score

    logger = logging.getLogger("s3fs")
    logger.setLevel(logging.DEBUG)

    def coeff_of_determination(data, model, axis=-1):
        """
        Pairwise coefficient of determination between the rows of `data`
        and the rows of `model`.

        http://en.wikipedia.org/wiki/Coefficient_of_determination

                        sum of the squared residuals
        R^2 =  1 - ---------------------------------------
                   sum of the squared mean-subtracted data

        Returns an array `X` of shape `(data.shape[0], model.shape[0])` where
        `X[ii, jj]` is the R^2 of `model[jj]` taken as a prediction of
        `data[ii]` (a fraction, not a percentage). Rows of `data` with zero
        variance have an undefined R^2 and are filled with NaN.
        """
        X = np.empty((data.shape[0], model.shape[0]))
        demeaned_data = data - np.mean(data, axis=axis)[..., np.newaxis]
        ss_tot = np.sum(demeaned_data ** 2, axis=axis)

        # Don't divide by 0:
        if np.all(ss_tot == 0.0):
            X[:, :] = np.nan
            return X

        for ii in range(X.shape[0]):
            # A constant row (this includes an all-zero row) has no variance
            # to explain, so the whole row of scores is undefined.
            if ss_tot[ii] == 0.0:
                X[ii, :] = np.nan
                continue

            # Residuals of this data row against every model row at once.
            residuals = data[ii] - model
            ss_err = np.sum(residuals ** 2, axis=axis)
            X[ii, :] = 1 - (ss_err / ss_tot[ii])
        return X

    def relabel_clusters(cluster_labels):
        """
        Arrange cluster labels by number of streamlines: the largest cluster
        becomes label 0, the next largest label 1, and so on.
        """
        n_labels = np.unique(cluster_labels).size
        counts = np.bincount(cluster_labels)
        # the `n_labels` most populated labels, in descending order of size
        from_values = np.flip(np.argsort(counts)[-n_labels:])
        to_values = np.arange(from_values.size)

        new_cluster_labels = np.copy(cluster_labels)

        # always compare against the original labels so that an already
        # remapped streamline is never remapped a second time
        for k, v in zip(from_values, to_values):
            new_cluster_labels[cluster_labels == k] = v

        return new_cluster_labels

    def save_profile(scalar_data, tractogram, f_name):
        """
        Compute the gaussian-weighted 100-node tract profile of `scalar_data`
        along `tractogram` and save it to `f_name`.
        """
        profile = afq_profile(
            scalar_data,
            tractogram.streamlines,
            tractogram.affine,
            n_points=100,
            weights=gaussian_weights(
                tractogram.streamlines,
                n_points=100
            )
        )
        np.save(f_name, profile)
        print('saving:', f_name, flush=True)

    def cluster_tractograms(model_prefix, bundle_tractogram, cluster_labels, filtered_cluster_labels, filtred_bundle_tractogram):
        """
        For each cluster create `cluster_tractogram`s

        Three tractograms are written per cluster: `_model` (all streamlines
        assigned to the cluster), `_filtered` (streamlines that survived the
        silhouette filter) and `_clean` (the filtered cluster after
        `clean_bundle`). Tract profiles are saved for the clean clusters.
        """

        for cluster_label in np.unique(cluster_labels):
            #  save `model_cluster_tractogram` using `cluster_labels`
            f_name = f'{model_prefix}_cluster_{cluster_label}_model.trk'
            cluster_indicies = np.where(cluster_labels == cluster_label)[0]
            model_cluster_tractogram = StatefulTractogram.from_sft(bundle_tractogram.streamlines[cluster_indicies], bundle_tractogram)
            print(f'model cluster {cluster_label} tractogram streamlines len', len(model_cluster_tractogram), flush=True)
            save_tractogram(model_cluster_tractogram, f_name, bbox_valid_check=False)
            print('saving model cluster tractogram:', f_name, flush=True)

        for cluster_label in np.unique(filtered_cluster_labels):
            # save `filtered_cluster_tractogram` using `filtered_cluster_labels`
            f_name = f'{model_prefix}_cluster_{cluster_label}_filtered.trk'
            cluster_indicies = np.where(filtered_cluster_labels == cluster_label)[0]
            filtered_cluster_tractogram = StatefulTractogram.from_sft(filtred_bundle_tractogram.streamlines[cluster_indicies], filtred_bundle_tractogram)
            print(f'filtered cluster {cluster_label} tractogram streamlines len', len(filtered_cluster_tractogram), flush=True)
            save_tractogram(filtered_cluster_tractogram, f_name, bbox_valid_check=False)
            print('saving filtered cluster tractogram:', f_name, flush=True)

            # clean `filtered_cluster_tractogram` these are the final clusters
            clean_cluster_tractogram = clean_bundle(filtered_cluster_tractogram)
            sft = StatefulTractogram.from_sft(clean_cluster_tractogram.streamlines, filtred_bundle_tractogram)
            print(f'clean cluster {cluster_label} tractogram streamlines len', len(sft), flush=True)
            f_name = f'{model_prefix}_cluster_{cluster_label}_clean.trk'
            print(f'saving clean cluster tractogram: {f_name}', flush=True)
            save_tractogram(sft, f_name, bbox_valid_check=False)

            if 'DTI_FA' in scalars:
                save_profile(fa_scalar_data, clean_cluster_tractogram, f'cluster_{cluster_label}_profile_fa.npy')

            if 'DTI_MD' in scalars:
                save_profile(md_scalar_data, clean_cluster_tractogram, f'cluster_{cluster_label}_profile_md.npy')

    def cluster(clusterer, embedding, model_prefix):
        """
        Run `fit_predict`, relabel according to number of streamlines in each cluster, and
        save classifications.

        The cluster_labels file contains a label from `0` to `n_clusters - 1` for each streamline in the
        tractogram.
        """
        cluster_labels = clusterer.fit_predict(embedding)

        # sort clusters by size, for convenience;
        # will need to resort by cross-subject similarity
        cluster_labels = relabel_clusters(cluster_labels)

        f_name = f'{model_prefix}_cluster_labels.npy'
        np.save(f_name, cluster_labels)
        print('saving:', f_name, flush=True)

        return cluster_labels

    def filter_streamlines(bundle_tractogram, embedding, cluster_labels, model_prefix):
        """
        filter the streamlines using the silhouette score.
        keep any streamlines above the average silhouette score.

        Returns the labels of the kept streamlines and a tractogram holding
        only those streamlines.
        """
        print('cleaning streamlines using average silhouette score', flush=True)
        average_silhouette_score = silhouette_score(embedding, cluster_labels)
        sample_silhouette_scores = silhouette_samples(embedding, cluster_labels)

        # one boolean mask shared by the embedding, the labels and the streamlines
        keep = sample_silhouette_scores > average_silhouette_score

        # saving the embeddings and cluster_labels for generating the silhouette plots and pair plots
        filtered_embedding = embedding[keep]
        f_name = f'{model_prefix}_embeddings_filtered.npy'
        np.save(f_name, filtered_embedding)
        print('saving:', f_name, flush=True)

        filtered_cluster_labels = cluster_labels[keep]
        f_name = f'{model_prefix}_cluster_labels_filtered.npy'
        np.save(f_name, filtered_cluster_labels)
        print('saving:', f_name, flush=True)

        # get filtered_bundle_tractogram so the cluster tractograms can be generated from the filtered_cluster_labels
        filtred_bundle_tractogram = StatefulTractogram.from_sft(
            bundle_tractogram.streamlines[keep],
            bundle_tractogram
        )

        return filtered_cluster_labels, filtred_bundle_tractogram

    def run_cluster_algos(bundle_tractogram, embedder_name, embedding, embedding_name):
        """
        Run clustering algorithms, now only using `KMeans`.
        """
        model_prefix = f'{embedder_name}_kmeans_{embedding_name}'
        clusterer = KMeans(n_clusters)
        cluster_labels = cluster(clusterer, embedding, model_prefix)

        filtered_cluster_labels, filtred_bundle_tractogram = filter_streamlines(
            bundle_tractogram, embedding, cluster_labels, model_prefix
        )

        cluster_tractograms(
            model_prefix, bundle_tractogram, cluster_labels, filtered_cluster_labels, filtred_bundle_tractogram
        )

    def run_embeddings(bundle_tractogram, features, feature_name, embedding_dimension=None):
        """
        multiple adjacency spectral embedding (mase)

        Embeds the list of adjacency matrices `features` jointly, saves the
        embedding and hands it to the clustering step.
        """
        print('mase', flush=True)

        embedder = MASE(n_components=embedding_dimension)
        embedder_name = 'mase'

        embedding = embedder.fit_transform(features)
        model_prefix = f'{embedder_name}_kmeans_{feature_name}'
        f_name = f'{model_prefix}_embeddings.npy'
        np.save(f_name, embedding)
        print('saving:', f_name, flush=True)

        print(embedder_name, feature_name, 'embedding dimension', embedding.shape, flush=True)
        run_cluster_algos(bundle_tractogram, embedder_name, embedding, feature_name)

    print("begin", experiment_name, subject, session, bundle_name, n_clusters, scalars, flush=True)

    fs = s3fs.S3FileSystem()

    # S3 folder holding this subject/session's pyAFQ outputs
    source_prefix = (
        f'profile-hcp-west/hcp_reliability/single_shell/'
        f'{session.lower()}_afq_CSD/sub-{subject}/ses-01/'
    )

    def fetch(remote_name, local_name):
        """
        Download `remote_name` from the source folder unless `local_name`
        already exists in the working directory.
        """
        # NOTE: local names are not subject specific, so a stale file left by
        # a previous run is reused; clear the working directory between runs.
        if not exists(local_name):
            fs.get(f'{source_prefix}{remote_name}', local_name)

    ### fractional anisotropy scalar file ###
    if 'DTI_FA' in scalars:
        fa_scalar_filename = 'FA.nii.gz'
        print('loading FA scalar file:', fa_scalar_filename, flush=True)

        fetch(f'sub-{subject}_dwi_model-DTI_FA.nii.gz', fa_scalar_filename)

        fa_scalar_data = nib.load(fa_scalar_filename).get_fdata()

    ### mean diffusivity scalar file ###
    if 'DTI_MD' in scalars:
        md_scalar_filename = 'MD.nii.gz'
        print('loading scalar file: ', md_scalar_filename, flush=True)

        fetch(f'sub-{subject}_dwi_model-DTI_MD.nii.gz', md_scalar_filename)

        md_scalar_data = nib.load(md_scalar_filename).get_fdata()

    ### single shell deterministic bundle tractography ###
    bundle_tractogram_filename = f'{bundle_name}.trk'
    print('loading bundle tractogram:', bundle_tractogram_filename, flush=True)

    bundle_folder = 'bundles'

    if clean_bundles:
        bundle_folder = 'clean_' + bundle_folder

    fetch(
        f'{bundle_folder}/sub-{subject}_dwi_space-RASMM_model-CSD_desc-det-afq-{bundle_name}_tractography.trk',
        bundle_tractogram_filename
    )

    bundle_tractogram = load_tractogram(bundle_tractogram_filename, 'same')

    ### bundle profile ###
    if 'DTI_FA' in scalars:
        save_profile(fa_scalar_data, bundle_tractogram, 'bundle_profile_fa.npy')

    if 'DTI_MD' in scalars:
        save_profile(md_scalar_data, bundle_tractogram, 'bundle_profile_md.npy')

    ### streamline profiles ###
    n_points = 100

    fgarray = set_number_of_points(bundle_tractogram.streamlines, n_points)

    # nothing to cluster; note that nothing is uploaded in this case
    if len(fgarray) == 0:
        return

    # FA Values
    if 'DTI_FA' in scalars:
        fa_values = np.array(values_from_volume(fa_scalar_data, fgarray, bundle_tractogram.affine))
        f_name = 'streamline_profile_fa.npy'
        np.save(f_name, fa_values)
        print('saving:', f_name, flush=True)

        print('fa values:', fa_values.shape, flush=True)

    # MD Values
    if 'DTI_MD' in scalars:
        md_values = np.array(values_from_volume(md_scalar_data, fgarray, bundle_tractogram.affine))
        f_name = 'streamline_profile_md.npy'
        np.save(f_name, md_values)
        print('saving: ', f_name, flush=True)

        print('md values:', md_values.shape, flush=True)

    ### Inverse Scaled MDF (Minimum average Direct-Flip) ###
    mdf = bundles_distances_mdf(fgarray, fgarray)

    # enforce symmetry
    mdf = (mdf + mdf.T) / 2

    # inverse scale: turn the distance into a similarity in [0, 1]
    is_mdf = (mdf.max() - mdf)
    is_mdf = is_mdf / is_mdf.max()

    f_name = 'adjacency_is_mdf.npy'
    np.save(f_name, is_mdf)
    print('saving:', f_name, flush=True)

    print('is_mdf:', is_mdf.shape, flush=True)

    ### streamline r2 ###
    # adjacency matrices handed to the embedding, in the order FA, MD, distance
    features = []
    feature_names = []

    if 'DTI_FA' in scalars:
        # calculate FA R2
        fa_r2 = coeff_of_determination(fa_values, fa_values)

        # enforce symmetry
        fa_r2 = (fa_r2 + fa_r2.T) / 2

        # save file
        f_name = 'adjacency_fa_r2.npy'
        np.save(f_name, fa_r2)
        print('saving:', f_name, flush=True)

        print('adjacency_fa_r2:', fa_r2.shape, flush=True)

        features.append(fa_r2)
        feature_names.append('fa_r2')

    if 'DTI_MD' in scalars:
        # calculate MD R2
        md_r2 = coeff_of_determination(md_values, md_values)

        # enforce symmetry
        md_r2 = (md_r2 + md_r2.T) / 2

        # save file
        f_name = 'adjacency_md_r2.npy'
        np.save(f_name, md_r2)
        print('saving: ', f_name, flush=True)

        print('adjacency_md_r2:', md_r2.shape, flush=True)

        features.append(md_r2)
        feature_names.append('md_r2')

    ### clustering ###
    features.append(is_mdf)
    feature_names.append('is_mdf')
    feature_name = '_'.join(feature_names)

    run_embeddings(bundle_tractogram, features, feature_name)

    ### upload everything to s3 ###
    destination = f'hcp-subbundle/{experiment_name}/{session}/{bundle_name}/{subject}/{n_clusters}/'
    print(f'uploading to s3://{destination}', flush=True)

    nii_files = glob.glob('*.nii.gz')

    for nii_file in nii_files:
        fs.put(nii_file, f'{destination}{nii_file}')

    fs.put('*.trk', destination)
    fs.put('*.npy', destination)

    print("end", experiment_name, subject, session, bundle_name, n_clusters, flush=True)

test locally before running on AWS

In [None]:
def clean(delete=True):
    """
    Remove generated artefacts from the current working directory.

    Run between jobs while testing locally; otherwise files left over from an
    earlier run would be uploaded as corrupted data.

    Every matching file name is printed. When `delete` is false the files are
    only listed, not removed.
    """
    import glob
    import os

    patterns = ("*.nii.gz", "*.trk", "*.npy", "*.pkl", "*.png")

    for pattern in patterns:
        for path in glob.glob(pattern):
            print('removing', path)
            if not delete:
                continue
            os.remove(path)

In [None]:
# Delete leftover *.nii.gz, *.trk, *.npy, *.pkl and *.png files from the
# working directory before starting a local run.
clean()

In [None]:
# Local test run over every subject except the first one.
local_args = list(itertools.product(experiment_names, subjects[1:], session_names, bundle_names, range_n_clusters, scalars))

# Each job is kept as a single tuple instead of being unpacked into
# `experiment_name, subject, ..., scalars`: unpacking would rebind the
# notebook-level `scalars` list to the last job's value, so re-running the
# cells that build the job grid afterwards would produce different jobs.
for job_args in local_args:
    subbundle(*job_args)
    # remove this job's files so they are not picked up by the next job
    clean()

## AWS

Reconnect to existing knot

Reuse existing Docker Image

run

status

delete everything associated with the knot