# Subbundles Part 5: Clustering

**Subbundle** - a subgroup of streamlines with a set of common properties

In [None]:
from utils import *

import time

import numpy as np
import pandas as pd

from pyclustertend import hopkins, assess_tendency_by_metric

from sklearn.cluster import MeanShift, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

import scipy.cluster.hierarchy as spc
from scipy.cluster.hierarchy import dendrogram


import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

## Streamline Profiles (from Part 3)

In [None]:
# Dataset identifier passed to the subject, AFQ and directory helpers below.
# Swap which of the two lines is commented to switch dataset.
dataset_name = 'HCP_retest'
# dataset_name = 'HCP'

In [None]:
# Subjects to process; exactly one of the get_subjects* variants is active.
# NOTE(review): the helpers presumably come from `utils` and differ in how
# many subjects they return -- confirm against utils.
# subjects = get_subjects(dataset_name)
subjects = get_subjects_small(dataset_name)
# subjects = get_subjects_medium(dataset_name)

In [None]:
# AFQ handle for the selected dataset; used below by get_iloc and
# get_dir_name. Its data_frame is displayed for inspection.
myafq = get_afq(dataset_name)
display(myafq.data_frame)

In [None]:
# Bundles to cluster. The commented lines are alternative selections
# (all bundles known to myafq, or smaller subsets); keep one active.
# bundle_names = [*myafq.bundle_dict]
# bundle_names = ['SLF_L', 'SLF_R']
# bundle_names = ['ARC_L', 'ARC_R', 'CST_L', 'CST_R', 'FP'] 
bundle_names = ['SLF_L', 'SLF_R', 'ARC_L', 'ARC_R', 'CST_L', 'CST_R', 'FP']

In [None]:
# Containers for FA/MD values; declared here but not populated in this cell.
fa_values, warped_fa_values = {}, {}
md_values, warped_md_values = {}, {}

# target_dirs[subject][bundle_name] -> directory returned by get_dir_name.
target_dirs = {}

for subject in subjects:
    # Row location of this subject as resolved by get_iloc.
    loc = get_iloc(myafq, subject)

    target_dirs[subject] = {
        bundle_name: get_dir_name(myafq, dataset_name, bundle_name, loc)
        for bundle_name in bundle_names
    }

## Adjacencies (From Part 4)

In [None]:
# adjacencies[subject][bundle_name]       -> result of get_adjacencies
# adjacencies_names[subject][bundle_name] -> result of get_adjacencies_names
# Both are looked up from the bundle's target directory. A previously used
# variant passed the pattern '*wt*pairwise*' as a second argument to both
# helpers.
adjacencies, adjacencies_names = {}, {}

for subject in subjects:
    subject_adjacencies = adjacencies[subject] = {}
    subject_names = adjacencies_names[subject] = {}

    for bundle_name in bundle_names:
        target_dir = target_dirs[subject][bundle_name]
        subject_adjacencies[bundle_name] = get_adjacencies(target_dir)
        subject_names[bundle_name] = get_adjacencies_names(target_dir)

In [None]:
# Tabulate the adjacency names: one row per (subject, bundle) pair, one
# column per position in that pair's list of adjacency names.
adjacencies_df = pd.DataFrame.from_dict(
    {
        (subject, bundle_name): pd.Series(names)
        for subject, bundles in adjacencies_names.items()
        for bundle_name, names in bundles.items()
    },
    orient='index',
)

# None means "do not truncate cell contents". The former -1 sentinel is
# deprecated since pandas 1.0 and rejected by newer releases.
with pd.option_context('display.max_colwidth', None):
    display(adjacencies_df)

# os.makedirs(op.join('subbundles', dataset_name), exist_ok=True)
# f_name = op.join('subbundles', dataset_name, f'adjacencies_names.csv')
# print(f_name)
# adjacencies_df.to_csv(f_name)

## [Cluster Tendency Metrics](https://en.wikipedia.org/wiki/Cluster_analysis#Cluster_Tendency)

- [Hopkins Statistics](https://en.wikipedia.org/wiki/Hopkins_statistic)

A statistical hypothesis test where the null hypothesis is that the data is generated by a Poisson point process (and are thus uniformly randomly distributed)

**Scores range between 0 and 1; a score around 0.5 expresses no clusterability and a score tending to 0 expresses a high cluster tendency.**


- [Silhouette _(using Kmeans)_](https://en.wikipedia.org/wiki/Silhouette_(clustering))

The **silhouette** is a measure of how similar an object is to its own cluster (*cohesion*) compared to other clusters (*separation*).

Assumes a minimum of two clusters. 

**Returns the recommended number of clusters based on silhouette score, where the best score is 1 and worst is -1. Note that values near 0 indicate overlapping clusters.**

## Permute/reorder matrix by similarity

- this becomes a convex optimization problem

- many clustering algorithms would work for this setup

### [Dimensionality Reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction)

Dimensionality reduction can be thought of as a neutral clustering model

##### <span style="color:red">NOTE: There are many [dimensionality reduction techniques](https://www.analyticsvidhya.com/blog/2018/08/dimensionality-reduction-techniques-python/) available to choose from.</span>

- <span style="color:red">**Question: Which others make sense to explore? and how to compare?**</span>


- [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis) (*Principal Component Analysis*)

- ICA (*Independent Component Analysis*)

- t-SNE (*t-distributed Stochastic Neighbor Embedding*)

- ...

### PCA

##### <span style="color:red">NOTE: `n_components` is hardcoded</span>

- at the moment focused on SLF, which is believed from the literature to have 3 subbundles

#### NOTE that the eigenvectors are very short and hard to see in this

#### WARNING: Using clustering results from below to color code clusters

## Clustering

- For every streamline assign class given collection in unsupervised manner

  - Then label clusters
    
    - To identify number of clusters use empirical test like F-test

###  [Spectral Clustering](https://en.wikipedia.org/wiki/Spectral_clustering)

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html

In [None]:
# Spectral clustering of every adjacency matrix.
#
# For each subject / bundle / adjacency this cell:
#   * fits SpectralClustering with the absolute adjacency as the precomputed
#     affinity,
#   * stores the labels in sc_idxs and saves them to
#     <target_dir>/sc_<name>_idx.npy,
#   * stores one silhouette score per adjacency in sc_scores.
#
# sc_names, sc_idxs and sc_scores are parallel lists: entry k of each refers
# to the same adjacency.
sc_idxs = {}
sc_names = {}
sc_scores = {}

# NOTE: the number of clusters is hard-coded to 3.
sc = SpectralClustering(affinity="precomputed", n_clusters=3)

for subject in subjects:
    sc_idxs[subject] = {}
    sc_names[subject] = {}
    sc_scores[subject] = {}

    for bundle_name in bundle_names:
        bundle_idxs = sc_idxs[subject][bundle_name] = []
        bundle_labels = sc_names[subject][bundle_name] = []
        bundle_scores = sc_scores[subject][bundle_name] = []

        target_dir = target_dirs[subject][bundle_name]

        for name, adjacency in zip(adjacencies_names[subject][bundle_name], adjacencies[subject][bundle_name]):
            tic = time.perf_counter()
            # Absolute value keeps the precomputed affinity non-negative.
            idx = sc.fit(np.absolute(adjacency)).labels_
            toc = time.perf_counter()
#             print(dataset_name, subject, bundle_name, name, f'{toc - tic:0.4f} seconds')

            bundle_idxs.append(idx)
            bundle_labels.append(name)

            f_name = op.join(target_dir, f'sc_{name}_idx.npy')
            np.save(f_name, idx)

            # The silhouette needs at least two clusters. Record NaN for a
            # single cluster so that scores stay aligned with names;
            # skipping the entry would shift every later score onto the
            # wrong name when the two lists are zipped together.
            if np.unique(idx).size > 1:
                score = silhouette_score(adjacency, idx)
            else:
                score = np.nan
            bundle_scores.append(score)

In [None]:
import os

# Flatten the nested name/score dictionaries into one row per adjacency.
sc_data = [
    [subject, bundle_name, name, score]
    for subject, bundles in sc_names.items()
    for bundle_name, names in bundles.items()
    for name, score in zip(names, sc_scores[subject][bundle_name])
]
sc_df = pd.DataFrame(sc_data, columns=['subject', 'bundle_name', 'name', 'score'])

# display(sc_df)

# savefig does not create missing directories, so make sure it exists.
out_dir = op.join('subbundles', dataset_name)
os.makedirs(out_dir, exist_ok=True)

for subject in sc_names:
    for bundle_name in sc_names[subject]:
        # Rows of this subject/bundle, selected once and reused below.
        selected = sc_df.loc[(sc_df['subject'] == subject) & (sc_df['bundle_name'] == bundle_name)]
        selected.plot(figsize=(10, 5), title=f'{subject} {bundle_name} spectral clustering silhouette scores', use_index=False)
        xlabels = selected['name']
        plt.xticks(range(len(xlabels)), xlabels, rotation='vertical')
        # NOTE(review): the file name has no subject component, so every
        # subject overwrites the previous subject's figure for this bundle.
        f_name = op.join(out_dir, f'sc_{bundle_name}_silhouette_scores.png')
        print(f_name)
        plt.savefig(f_name, bbox_inches="tight")

### Distribution model: Gaussian mixture models (GMM)

https://scikit-learn.org/stable/modules/mixture.html#mixture

##### <span style="color:red">NOTE: Despite BIC and AIC results, manually setting `n_components=3` to the desired number of clusters.</span>

In [None]:
# Gaussian mixture clustering of every adjacency matrix.
#
# For each subject / bundle / adjacency this cell:
#   * fits the mixture on the adjacency and predicts a component per row,
#   * stores the labels in gmm_idxs and saves them to
#     <target_dir>/gmm_<name>_idx.npy,
#   * stores one silhouette score per adjacency in gmm_scores.
#
# gmm_names, gmm_idxs and gmm_scores are parallel lists: entry k of each
# refers to the same adjacency.
gmm_idxs = {}
gmm_names = {}
gmm_scores = {}

# NOTE: the number of components is hard-coded to 3.
gmm = GaussianMixture(n_components=3)

for subject in subjects:
    gmm_idxs[subject] = {}
    gmm_names[subject] = {}
    gmm_scores[subject] = {}

    for bundle_name in bundle_names:
        bundle_idxs = gmm_idxs[subject][bundle_name] = []
        bundle_labels = gmm_names[subject][bundle_name] = []
        bundle_scores = gmm_scores[subject][bundle_name] = []

        target_dir = target_dirs[subject][bundle_name]

        for name, adjacency in zip(adjacencies_names[subject][bundle_name], adjacencies[subject][bundle_name]):

            tic = time.perf_counter()
            gmm.fit(adjacency)
            idx = gmm.predict(adjacency)
            toc = time.perf_counter()
            print(dataset_name, subject, bundle_name, name, f'{toc - tic:0.4f} seconds')

            bundle_idxs.append(idx)
            bundle_labels.append(name)

            f_name = op.join(target_dir, f'gmm_{name}_idx.npy')
            np.save(f_name, idx)
#             print(dataset_name, subject, bundle_name, name, 'saving', f_name)

            # The silhouette needs at least two clusters. Record NaN for a
            # single cluster so that scores stay aligned with names;
            # skipping the entry would shift every later score onto the
            # wrong name when the two lists are zipped together.
            if np.unique(idx).size > 1:
                score = silhouette_score(adjacency, idx)
            else:
                score = np.nan
            bundle_scores.append(score)

In [None]:
import os

# Flatten the nested name/score dictionaries into one row per adjacency.
gmm_data = [
    [subject, bundle_name, name, score]
    for subject, bundles in gmm_names.items()
    for bundle_name, names in bundles.items()
    for name, score in zip(names, gmm_scores[subject][bundle_name])
]
# Build the frame from the GMM rows. It was previously built from sc_data,
# so the "gaussian mixture model" figures showed spectral-clustering scores.
gmm_df = pd.DataFrame(gmm_data, columns=['subject', 'bundle_name', 'name', 'score'])

# display(gmm_df)

# savefig does not create missing directories, so make sure it exists.
out_dir = op.join('subbundles', dataset_name)
os.makedirs(out_dir, exist_ok=True)

# Iterate over the GMM results themselves rather than sc_names.
for subject in gmm_names:
    for bundle_name in gmm_names[subject]:
        # Rows of this subject/bundle, selected once and reused below.
        selected = gmm_df.loc[(gmm_df['subject'] == subject) & (gmm_df['bundle_name'] == bundle_name)]
        selected.plot(figsize=(10, 5), title=f'{subject} {bundle_name} gaussian mixture model silhouette scores', use_index=False)
        xlabels = selected['name']
        plt.xticks(range(len(xlabels)), xlabels, rotation='vertical')
        # NOTE(review): the file name has no subject component, so every
        # subject overwrites the previous subject's figure for this bundle.
        f_name = op.join(out_dir, f'gmm_{bundle_name}_silhouette_scores.png')
        print(f_name)
        plt.savefig(f_name, bbox_inches="tight")

### Centroid Model: MeanShift

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html#sklearn.cluster.MeanShift

##### <span style="color:red">NOTE: There is a `bandwidth` parameter taking default, so bandwidth is estimated using `sklearn.cluster.estimate_bandwidth`.</span>

In [None]:
# Mean-shift clustering of every adjacency matrix.
#
# For each subject / bundle / adjacency this cell:
#   * fits MeanShift (default parameters) on the adjacency,
#   * stores the labels in ms_idxs and saves them to
#     <target_dir>/ms_<name>_idx.npy,
#   * stores one silhouette score per adjacency in ms_scores.
#
# ms_names, ms_idxs and ms_scores are parallel lists: entry k of each refers
# to the same adjacency.
ms_idxs = {}
ms_names = {}
ms_scores = {}

ms = MeanShift()

for subject in subjects:
    ms_idxs[subject] = {}
    ms_names[subject] = {}
    ms_scores[subject] = {}

    for bundle_name in bundle_names:
        bundle_idxs = ms_idxs[subject][bundle_name] = []
        bundle_labels = ms_names[subject][bundle_name] = []
        bundle_scores = ms_scores[subject][bundle_name] = []

        target_dir = target_dirs[subject][bundle_name]

        for name, adjacency in zip(adjacencies_names[subject][bundle_name], adjacencies[subject][bundle_name]):

            tic = time.perf_counter()
            idx = ms.fit(adjacency).labels_
            toc = time.perf_counter()
            print(dataset_name, subject, bundle_name, name, f'{toc - tic:0.4f} seconds')

            bundle_idxs.append(idx)
            bundle_labels.append(name)

            f_name = op.join(target_dir, f'ms_{name}_idx.npy')
            np.save(f_name, idx)
#             print(dataset_name, subject, bundle_name, name, 'saving', f_name)

            # The silhouette needs at least two clusters. Record NaN for a
            # single cluster so that scores stay aligned with names;
            # skipping the entry would shift every later score onto the
            # wrong name when the two lists are zipped together.
            if np.unique(idx).size > 1:
                score = silhouette_score(adjacency, idx)
            else:
                score = np.nan
            bundle_scores.append(score)

In [None]:
import os

# Flatten the nested name/score dictionaries into one row per adjacency.
ms_data = [
    [subject, bundle_name, name, score]
    for subject, bundles in ms_names.items()
    for bundle_name, names in bundles.items()
    for name, score in zip(names, ms_scores[subject][bundle_name])
]
ms_df = pd.DataFrame(ms_data, columns=['subject', 'bundle_name', 'name', 'score'])

# display(ms_df)

# savefig does not create missing directories, so make sure it exists.
out_dir = op.join('subbundles', dataset_name)
os.makedirs(out_dir, exist_ok=True)

# Iterate over the mean-shift results themselves rather than sc_names.
for subject in ms_names:
    for bundle_name in ms_names[subject]:
        # Rows of this subject/bundle, selected once and reused below.
        selected = ms_df.loc[(ms_df['subject'] == subject) & (ms_df['bundle_name'] == bundle_name)]
        selected.plot(figsize=(10, 5), title=f'{subject} {bundle_name} mean shift silhouette scores', use_index=False)
        xlabels = selected['name']
        plt.xticks(range(len(xlabels)), xlabels, rotation='vertical')
        # NOTE(review): the file name has no subject component, so every
        # subject overwrites the previous subject's figure for this bundle.
        f_name = op.join(out_dir, f'ms_{bundle_name}_silhouette_scores.png')
        print(f_name)
        plt.savefig(f_name, bbox_inches="tight")

### Connectivity Model: Hierarchical Clusters

Following example from:

https://stackoverflow.com/questions/52787431/create-clusters-using-correlation-matrix-in-python/52787518#52787518

Using [`scipy.cluster` Hierarchical Clustering package](https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html#module-scipy.cluster.hierarchy)

- see https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html#module-scipy.cluster.hierarchy

##### <span style="color:red">NOTE: The dendrogram can be used to manually tune the threshold to desired number of clusters.</span>

- Here setting `p` level to `1` since we expect the SLF bundle to have 2 or 3 subbundles

In [None]:
# pdists[subject][bundle_name]   -> list of condensed pairwise-distance arrays
# linkages[subject][bundle_name] -> list of complete-linkage results
# Both lists follow the order of the bundle's adjacencies.
pdists = {}
linkages = {}

for subject in subjects:
    subject_pdists = pdists[subject] = {}
    subject_linkages = linkages[subject] = {}

    for bundle_name in bundle_names:
        bundle_pdists = subject_pdists[bundle_name] = []
        bundle_linkages = subject_linkages[bundle_name] = []

        for name, adjacency in zip(adjacencies_names[subject][bundle_name], adjacencies[subject][bundle_name]):
            # Distances between the rows of the adjacency matrix.
            condensed = spc.distance.pdist(adjacency)
            bundle_pdists.append(condensed)

            # Agglomerative clustering using complete linkage.
            bundle_linkages.append(spc.linkage(condensed, method='complete'))

            # To inspect the tree while tuning the number of clusters:
            #     plt.figure()
            #     plt.title(f'{dataset_name} {subject} {bundle_name} {name} dendrogram')
            #     dendrogram(bundle_linkages[-1], p=1, truncate_mode='level', show_leaf_counts=True)
            #     plt.show()

##### <span style="color:red">NOTE: Manually setting threshold based on dendrogram</span>

- However, it would be better to learn threshold

In [None]:
# Flat clusters cut from the precomputed hierarchical linkages.
#
# For each subject / bundle / adjacency this cell:
#   * cuts the matching linkage into at most 3 flat clusters,
#   * stores the labels in hier_idxs and saves them to
#     <target_dir>/hier_<name>_idx.npy,
#   * stores one silhouette score per adjacency in hier_scores.
#
# hier_names, hier_idxs and hier_scores are parallel lists: entry k of each
# refers to the same adjacency.
hier_idxs = {}
hier_names = {}
hier_scores = {}

for subject in subjects:
    hier_idxs[subject] = {}
    hier_names[subject] = {}
    hier_scores[subject] = {}

    for bundle_name in bundle_names:
        bundle_idxs = hier_idxs[subject][bundle_name] = []
        bundle_labels = hier_names[subject][bundle_name] = []
        bundle_scores = hier_scores[subject][bundle_name] = []

        target_dir = target_dirs[subject][bundle_name]
        bundle_linkages = linkages[subject][bundle_name]

        # enumerate gives the position directly. Looking it up with
        # list.index(name) is O(n) per item and returns the first match,
        # which selects the wrong linkage if two adjacencies share a name.
        for i, (name, adjacency) in enumerate(zip(adjacencies_names[subject][bundle_name], adjacencies[subject][bundle_name])):

            # Alternative (disabled): cut at a cophenetic-distance threshold
            # instead of a fixed cluster count, e.g.
            #     spc.fcluster(bundle_linkages[i], cluster_threshold, 'distance')

            tic = time.perf_counter()
            idx = spc.fcluster(bundle_linkages[i], t=3, criterion='maxclust')
            toc = time.perf_counter()
            print(dataset_name, subject, bundle_name, name, f'{toc - tic:0.4f} seconds')

            bundle_idxs.append(idx)
            bundle_labels.append(name)

            f_name = op.join(target_dir, f'hier_{name}_idx.npy')
            np.save(f_name, idx)
#             print(dataset_name, subject, bundle_name, name, 'saving', f_name)

            # The silhouette needs at least two clusters. Record NaN for a
            # single cluster so that scores stay aligned with names;
            # skipping the entry would shift every later score onto the
            # wrong name when the two lists are zipped together.
            if np.unique(idx).size > 1:
                score = silhouette_score(adjacency, idx)
            else:
                score = np.nan
            bundle_scores.append(score)

In [None]:
import os

# Flatten the nested name/score dictionaries into one row per adjacency.
hier_data = [
    [subject, bundle_name, name, score]
    for subject, bundles in hier_names.items()
    for bundle_name, names in bundles.items()
    for name, score in zip(names, hier_scores[subject][bundle_name])
]
hier_df = pd.DataFrame(hier_data, columns=['subject', 'bundle_name', 'name', 'score'])

# display(hier_df)

# savefig does not create missing directories, so make sure it exists.
out_dir = op.join('subbundles', dataset_name)
os.makedirs(out_dir, exist_ok=True)

for subject in hier_names:
    for bundle_name in hier_names[subject]:
        # Rows of this subject/bundle, selected once and reused below.
        selected = hier_df.loc[(hier_df['subject'] == subject) & (hier_df['bundle_name'] == bundle_name)]
        selected.plot(figsize=(10, 5), title=f'{subject} {bundle_name} hierarchical clustering silhouette scores', use_index=False)
        xlabels = selected['name']
        plt.xticks(range(len(xlabels)), xlabels, rotation='vertical')
        # NOTE(review): the file name has no subject component, so every
        # subject overwrites the previous subject's figure for this bundle.
        f_name = op.join(out_dir, f'hier_{bundle_name}_silhouette_scores.png')
        print(f_name)
        plt.savefig(f_name, bbox_inches="tight")

## Resort the matrix based on cluster

This is just a visualization of clusters. It is meant as a baseline sanity check for clustering, and does not save output. Additional visualization and statistics are calculated in part 6.

In [None]:
def resort_cluster_ids(idx):
    """Relabel clusters by size: 0 for the largest, 1 for the next, and so on.

    Parameters
    ----------
    idx : numpy.ndarray
        One non-negative integer cluster label per item.

    Returns
    -------
    numpy.ndarray
        Copy of ``idx`` with each original label replaced by its size rank.
        The input array is not modified.
    """
    n_clusters = np.unique(idx).size
    sizes = np.bincount(idx)

    # argsort is ascending, so the populated labels sit at the tail; keep
    # those and reverse them to get largest-first order.
    labels_by_size = np.argsort(sizes)[-n_clusters:][::-1]

    relabeled = np.copy(idx)
    for rank, old_label in enumerate(labels_by_size):
        # Build the mask from the untouched input so that labels already
        # rewritten cannot be rewritten a second time.
        relabeled[idx == old_label] = rank
    return relabeled

In [None]:
# load adjacencies into dataframes to simplify sorting
# dfs[subject][bundle_name] -> list of DataFrames, one per adjacency matrix,
# in the same order as adjacencies[subject][bundle_name]. Wrapping the
# matrices in DataFrames makes the later row/column reordering simpler.
dfs = {
    subject: {
        bundle_name: [
            pd.DataFrame.from_records(adjacency)
            for adjacency in adjacencies[subject][bundle_name]
        ]
        for bundle_name in bundle_names
    }
    for subject in subjects
}

In [None]:
# Visual check of the clustering results.
#
# For every subject / bundle / adjacency: optionally show the original
# matrix, then for each model plot the cluster-size bar chart and the
# adjacency matrix reordered so that members of a cluster are contiguous.
model_idxs = [sc_idxs, gmm_idxs, ms_idxs, hier_idxs]
# model_idxs = [sc_idxs]
model_names = ['SC', 'GMM', 'MS', 'Hier']
# model_names = ['SC']

# Display toggles (constant for the whole run, so set once up front).
show_original = True
show_sorted = True

for subject in subjects:
    for bundle_name in bundle_names:
        target_dir = target_dirs[subject][bundle_name]

        # enumerate gives the position directly. Looking it up with
        # list.index(name) is O(n) per item and returns the first match,
        # which selects the wrong entry if two adjacencies share a name.
        for i, (name, adjacency) in enumerate(zip(adjacencies_names[subject][bundle_name], adjacencies[subject][bundle_name])):
            print(subject, bundle_name, name)

            # Should be the same as the output from notebook 4; shown again
            # for convenience when comparing.
            if show_original:
                plt.figure()
                plt.title(f'{dataset_name} {subject} {bundle_name} {name} original')
                plt.imshow(adjacency, cmap='hot', interpolation='nearest')
                plt.xlabel('streamline index')
                plt.ylabel('streamline index')
                plt.colorbar()
                plt.show()

            adjacency_df = dfs[subject][bundle_name][i]

            for model_name, model_idx in zip(model_names, model_idxs):
                # TODO should show cluster labels, not always clear where delineated

                # Relabel so the largest cluster is 0, computed once and
                # reused for both the ordering and the bar chart.
                sorted_labels = resort_cluster_ids(model_idx[subject][bundle_name][i])

                # Apply the same permutation to rows and columns so that
                # streamlines of one cluster are adjacent on both axes.
                order = np.argsort(sorted_labels)
                clust_df = adjacency_df.iloc[order, order]

                labels, counts = np.unique(sorted_labels, return_counts=True)
                plt.figure()
                plt.title(f'{dataset_name} {subject} {bundle_name} {name} {model_name} custer freq')
                plt.bar(labels, counts, align='center')
                plt.gca().set_xticks(labels)
                plt.ylabel('streamline frequency')
                plt.xlabel('cluster label')
                f_name = op.join(target_dir, f'{model_name}_{name}_hist.png')
                print(f_name)
                plt.savefig(f_name)
                plt.show()

                if show_sorted:
                    plt.figure()
                    plt.title(f'{dataset_name} {subject} {bundle_name} {name} {model_name} sorted')
                    plt.imshow(clust_df, cmap='hot', interpolation='nearest')
                    plt.xlabel('streamline index')
                    plt.ylabel('streamline index')
                    plt.colorbar()
                    f_name = op.join(target_dir, f'{model_name}_{name}.png')
                    print(f_name)
                    plt.savefig(f_name)
                    plt.show()
                    

<span style="color:blue">**TODO add test retest check**</span>

Note these are going to be of different sizes and will have different streamlines, but would like to get some sense of how clusters changed.

e.g., High level check: are there similar numbers (or ratios) of streamlines per cluster? Is there some way to determine if 'similar' streamlines are classified differently? Is there a way to identify these streamlines?