# Extracting Arrays

In [None]:
import pickle
import re
import random
import numpy as np
import matplotlib.pyplot as plt
import nibabel as nib
import pandas as pd
import PIL
from sklearn.metrics import f1_score, confusion_matrix

In [None]:
from PIL import Image
from PIL import GifImagePlugin
from numpy import asarray
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, datasets
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, homogeneity_score, completeness_score
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from sklearn.preprocessing import StandardScaler

In [None]:
# Global random seed; split_train_test passes it to train_test_split as random_state.
seed = 42

In [None]:
# Resolve the base directories used to build file paths in this notebook.
# The three paths were pickled together in Step 0.

with open('pickledHomeScratchShared.pickle', 'rb') as paths_file:
    baseHomePath, baseScratchPath, baseSharedPath = pickle.load(paths_file)

In [None]:
# Load the OASIS-1 label table and derive a binary dementia target from the CDR score.
labels_csv = '{}/milestone_II_project/data/oasis_labelled_data/oasis_1_labelled_data.csv'.format(baseHomePath)
df = pd.read_csv(labels_csv)
# A missing CDR is treated as 0.
df['CDR'] = df['CDR'].fillna(0)
# Any CDR above 0 counts as demented (1); everything else is 0.
df['demented'] = (df['CDR'] > 0).astype('int64')
df.head(5)

In [None]:
# Standardize image arrays
def standardizeImg(img_16frames):
    """Standardize an image array along its last axis.

    The array is flattened to two dimensions (all leading axes collapsed,
    last axis kept), passed through a freshly fitted ``StandardScaler``, and
    reshaped back to the input shape.

    Args:
        img_16frames: Array whose last axis indexes the columns to scale.

    Returns:
        Array of the same shape holding the scaled values.
    """
    original_shape = img_16frames.shape
    # One row per position in the leading axes, one column per last-axis entry.
    flattened = img_16frames.reshape(-1, original_shape[-1])
    return StandardScaler().fit_transform(flattened).reshape(original_shape)

In [None]:
# # Function to  Display images
# def showImg(ndarr):
#     return plt.imshow(ndarr, cmap=plt.cm.gray_r, interpolation="nearest")

For local use of extracting array. 

In [None]:
# IDs from the label table, in dataframe row order.
files = list(df.ID)

### Load our Coronal pickle file

In [None]:
coronalDict = 'processed_img_c_dict.pickle'

# The pickled dictionary lives in the shared base directory.
coronal_pickle_path = "{}/{}".format(baseSharedPath, coronalDict)
with open(coronal_pickle_path, "rb") as coronal_file:
    arr_dict_c = pickle.load(coronal_file)

# CNN model

Recreate our Best CNN model using Coronal images

In [None]:
def split_train_test(array):
    """Build a stratified train/test split from a dict of image arrays.

    Labels come from the module-level ``df.demented``. When the dict keys are
    exactly the values of ``df.ID``, each image is paired with its label by
    ID; otherwise the images are taken in dict order, which must then already
    match the row order of ``df``.

    Args:
        array: Mapping of ID to image array (presumably one 2-D slice per
            patient; a trailing channel axis is appended here).

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, y_test_file_idx)``.
        ``y_train`` and ``y_test`` are column vectors; ``y_test_file_idx`` is
        the ``df`` index of the rows that landed in the test split.
    """
    # Keep y as a Series so the index of the test rows survives the split.
    y = df.demented
    patient_ids = list(df.ID)
    if set(patient_ids) == set(array):
        # Pair every image with its label by ID instead of relying on the
        # dict happening to be in dataframe order.
        images = [array[pid] for pid in patient_ids]
    else:
        # Keys are not the df IDs: keep the positional pairing.
        images = list(array.values())
    X = np.array([img.reshape(img.shape + (1,)) for img in images])
    # NOTE(review): the scaler is fitted on the full dataset before the
    # split, so test-set statistics influence the training inputs. Confirm
    # whether that is acceptable for the reported scores.
    X = standardizeImg(X)
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, stratify=y
    )
    # Remember which df rows form the test set for later visualisation.
    y_test_file_idx = y_test.index
    # The model is fitted on column vectors.
    y_train = np.array(y_train).reshape(-1, 1)
    y_test = np.array(y_test).reshape(-1, 1)
    return x_train, x_test, y_train, y_test, y_test_file_idx

In [None]:
# update hyper parameters
def generate_model(view):
    """Build and compile the 2-D CNN classifier for one MRI view.

    The network is four L2-regularised 3x3 convolutions (128, 64, 64, 64
    filters) with max pooling after the first three and dropout after the
    second and third, followed by a 16-unit dense layer named ``Dense_1`` and
    a 2-unit logit output. It is compiled with Adam (learning rate 0.001) and
    sparse categorical cross-entropy on logits.

    Args:
        view: One of ``'Transverse'``, ``'Coronal'`` or ``'Sagittal'``;
            selects the input height and width.

    Returns:
        The compiled, untrained ``Sequential`` model.

    Raises:
        ValueError: If ``view`` is not one of the supported names.
    """
    input_sizes = {
        'Transverse': (208, 176),
        'Coronal': (176, 176),
        'Sagittal': (176, 208),
    }
    # Fail with a clear message instead of an unbound-variable error later on.
    if view not in input_sizes:
        raise ValueError(
            "view must be one of {}; got {!r}".format(sorted(input_sizes), view)
        )
    rows, cols = input_sizes[view]

    def l2_penalty():
        # Each layer gets its own regularizer instance.
        return tf.keras.regularizers.L2(0.005)

    tf.random.set_seed(42)
    model = models.Sequential()
    model.add(layers.Conv2D(filters=128, kernel_size=3,
                            kernel_regularizer=l2_penalty(), activation='relu',
                            input_shape=(rows, cols, 1), name="C_2d_1"))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(filters=64, kernel_size=3,
                            kernel_regularizer=l2_penalty(), activation='relu',
                            name="C_2d_2"))
    model.add(layers.Dropout(0.4))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(filters=64, kernel_size=3,
                            kernel_regularizer=l2_penalty(), activation='relu',
                            name="C_2d_3"))
    model.add(layers.Dropout(0.4))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(filters=64, kernel_size=3,
                            kernel_regularizer=l2_penalty(), activation='relu',
                            name="C_2d_4"))
    model.add(layers.Flatten())
    # Dense_1 is looked up by name later to extract clustering features.
    model.add(layers.Dense(16, kernel_regularizer=l2_penalty(),
                           activation='relu', name="Dense_1"))
    # Two raw logits; the loss below is configured with from_logits=True.
    model.add(layers.Dense(2))
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    return model

In [None]:
def gen_scores(model, X_test, y_test):
    """Score a two-logit classifier with macro F1 and a confusion matrix.

    Args:
        model: Fitted model whose ``predict`` returns one row of two scores
            per sample (class 0 score, class 1 score).
        X_test: Inputs to predict on.
        y_test: True binary labels, any shape that flattens to one label per
            sample.

    Returns:
        Tuple ``(f1_S, conf_matx)``: the macro-averaged F1 score and a 2x2
        DataFrame with actual classes as rows and predicted classes as
        columns.
    """
    logits = np.asarray(model.predict(X_test))
    # Compare the raw scores. Rounding them first could turn a clear
    # prediction into an artificial tie and flip the predicted class.
    # Exact ties go to class 1.
    y_pred_binary = (logits[:, 1] >= logits[:, 0]).astype(int)
    y_test_binary = np.asarray(y_test).reshape(-1)
    f1_S = f1_score(y_test_binary, y_pred_binary, average='macro')

    # Fix the label set so the matrix stays 2x2 even when a class is missing
    # from both the truth and the predictions.
    conf_matx = pd.DataFrame(
        confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1]),
        index=['neg', 'pos'],
        columns=['neg', 'pos'],
    )
    conf_matx.columns.name = 'Predicted'
    conf_matx.index.name = 'Actual'
    return f1_S, conf_matx

In [None]:
# Build the compiled, untrained CNN sized for coronal slices.
model_c = generate_model('Coronal')

In [None]:
# Stratified split of the coronal images; the last value is the df index of the test rows.
x_train_c, x_test_c, y_train_c, y_test_c, y_test_file_idx_c = split_train_test(arr_dict_c)

In [None]:
# Train for 11 epochs.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent check.
model_c.fit(x_train_c,  
            y_train_c, 
            epochs=11, 
            validation_data=(x_test_c, y_test_c))

In [None]:
# Loss and accuracy on the test split.
model_c.evaluate(x_test_c, y_test_c)

In [None]:
# Macro F1 score and confusion matrix on the test split.
gen_scores(model_c, x_test_c, y_test_c)

# Clustering Analysis

First we will make a copy of our overall dataframe but we will only keep records of the files that are in our test set. We will record the cluster groups predicted by our algorithms on to this dataframe.

In [None]:
# y_test_file_idx_c holds index *labels* taken from df, so select with .loc.
# Positional .iloc gives the same rows only while df keeps its default
# 0..n-1 index.
testset_df = df.loc[y_test_file_idx_c].copy()

In [None]:
# Use the activations of the 'Dense_1' layer as the feature vector to cluster on.
layer_name = 'Dense_1'
layer = model_c.get_layer(name=layer_name)

In [None]:
# Sub-model that shares model_c's input and stops at the selected layer's output.
activation_model_c = tf.keras.models.Model(inputs = model_c.input, outputs =layer.output)

In [None]:
# Compute the selected layer's activations for every test image.
features = activation_model_c.predict(x_test_c)

In [None]:
# Sanity check on the feature matrix shape: one row per test patient and one
# column per Dense_1 unit (109 patients and 16 features).
features.shape

In [None]:
# Our features are very dense, therefore we should not use pca
# NOTE(review): the prose further down describes these features as sparse;
# confirm which description is correct.
features[0]

### hierarchical clustering to find optimal clusters 

The first clustering approach we will use is Agglomerative Clustering. With this method, we do not need to specify the number of clusters we wish to generate. In addition, we can plot a tree diagram, or dendrogram, to visually identify the optimal number of clusters.

In [None]:
def plot_dendrogram(model):
    """Plot the dendrogram of a fitted agglomerative clustering model.

    Args:
        model: Fitted estimator exposing ``children_``, ``distances_`` and
            ``labels_``.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples underneath each merge node.
    subtree_sizes = np.zeros(merges.shape[0])
    for merge_idx, children in enumerate(merges):
        size = 0
        for child in children:
            if child >= n_leaves:
                # The child is itself an earlier merge; reuse its size.
                size += subtree_sizes[child - n_leaves]
            else:
                # The child is an original sample.
                size += 1
        subtree_sizes[merge_idx] = size

    # Columns: the two merged children, the merge distance, the subtree size.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    plt.figure(figsize=(10, 5))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('MRI Nodes')
    plt.ylabel('Euclidean Distance')
    dendrogram(linkage_matrix)
    plt.show()

In [None]:
# n_clusters=None with distance_threshold=0 builds the full merge tree, and
# compute_distances=True stores the merge distances the dendrogram needs.
# Euclidean distance is the estimator's default, so it is not passed
# explicitly: the `affinity` keyword was renamed to `metric` and no longer
# exists in newer scikit-learn releases.
dendro_tree = AgglomerativeClustering(n_clusters=None, distance_threshold=0, compute_distances=True)
dendro_tree.fit(features)

In [None]:
# Draw the dendrogram of the full merge tree.
plot_dendrogram(dendro_tree)

Based on the dendrogram, two clusters look best. The distance between these two clusters is very large compared to the distances within them. We will use 2 clusters for our clustering analysis.

While we are still using Agglomerative clustering, lets make a model that clusters our test set by 2 clusters and extract their labels. The features we extracted from the CNN will be our inputs for this model. Note: we do not apply PCA on these features as the data is sparse and PCA does not perform well with sparse data.

In [None]:
def cluster_scores(features, cluster_labels):
    """Compute three internal clustering metrics.

    Args:
        features: Feature matrix that was clustered.
        cluster_labels: Cluster label for each row of ``features``.

    Returns:
        Tuple ``(silhouette, calinski_harabasz, davies_bouldin)``.
    """
    return (
        silhouette_score(features, cluster_labels, random_state=42),
        calinski_harabasz_score(features, cluster_labels),
        davies_bouldin_score(features, cluster_labels),
    )

In [None]:
# Score a 2-cluster agglomerative model for each linkage strategy.
# Rows: silhouette, calinski_harabasz, davies_bouldin; one column per linkage.
linkages = ['ward', 'complete', 'average', 'single']
scores = np.zeros((3, len(linkages)))

for col_idx, link in enumerate(linkages):
    # Euclidean distance is the default, so it is not passed explicitly: the
    # `affinity` keyword no longer exists in newer scikit-learn releases.
    agg_clus = AgglomerativeClustering(n_clusters=2, compute_distances=True, linkage=link)
    agg_clus.fit(features)
    agg_labels = agg_clus.labels_
    sil_score, cal_har_score, dav_bou_score = cluster_scores(features, agg_labels)
    scores[:, col_idx] = round(sil_score, 2), round(cal_har_score, 2), round(dav_bou_score, 2)

In [None]:
# Label the score matrix: one row per metric, one column per linkage.
linkage_names = ['ward', 'complete', 'average', 'single']
metric_names = ['silhouette', 'calinski_harabasz', 'davies_bouldin']
agg_scores = pd.DataFrame(scores, index=metric_names, columns=linkage_names)
print(agg_scores)

Complete seems to do the best for silhouette and calinski harabasz. We will use 'complete' for visualization purposes and for ground truth evaluations.

In [None]:
# Final agglomerative model: two clusters with complete linkage.
# Euclidean distance is the default, so it is not passed explicitly: the
# `affinity` keyword no longer exists in newer scikit-learn releases.
agg_clus = AgglomerativeClustering(n_clusters=2, compute_distances=True, linkage='complete')
agg_clus.fit(features)
agg_labels = agg_clus.labels_

In [None]:
# Plot colour per sample: cluster 0 is orange, everything else is green.
agg_palette = {0: 'orange'}
agg_cmap = [agg_palette.get(label, 'green') for label in agg_labels]

### K-Means

Next we will turn to a very basic clustering algorithm, K-Means. We will follow the same procedure as for Agglomerative clustering.

In [None]:
# `algorithm` is left at its default because the 'auto' value no longer
# exists in newer scikit-learn releases. n_init is pinned to 10, the
# long-standing default, so the result does not depend on the installed
# version's default.
kmeans = KMeans(n_clusters=2, random_state=42, max_iter=300, n_init=10)
kmeans.fit(features)
k_means_labels = kmeans.labels_
# Plot colour per sample: cluster 0 blue, cluster 1 green, anything else red.
k_means_cmap = ['blue' if l == 0 else 'green' if l == 1 else 'red' for l in k_means_labels]

In [None]:
from sklearn.metrics import pairwise_distances_argmin_min
def get_three_center_nodes(features):
    """Return the IDs of the three scans nearest each K-Means centroid.

    Uses the module-level fitted ``kmeans`` model and ``testset_df``. The
    rows of ``features`` are assumed to be in the same order as the rows of
    ``testset_df`` (both come from the test split).

    In each of three rounds the scan nearest to each of the two centroids is
    recorded, and both scans are removed from the candidate pool before the
    next round.

    Args:
        features: 2-D array with one feature vector per test scan.

    Returns:
        Tuple ``(c1_files, c2_files)``: three IDs for the first centroid and
        three for the second, nearest first.
    """
    features = np.asarray(features)
    # Row positions still available for selection. Candidates are tracked by
    # their position in `features` so that removing a row never shifts the
    # meaning of the indices recorded in earlier rounds.
    remaining = np.arange(len(features))
    closest_nodes_c1 = []
    closest_nodes_c2 = []
    for _ in range(3):
        closest, _dists = pairwise_distances_argmin_min(
            kmeans.cluster_centers_, features[remaining]
        )
        # Translate positions within the reduced pool back to original rows.
        picked = remaining[closest]
        closest_nodes_c1.append(picked[0])
        closest_nodes_c2.append(picked[1])
        remaining = remaining[~np.isin(remaining, picked)]

    # The positions index the test set, so look the IDs up in testset_df.
    c1_files = list(testset_df.iloc[closest_nodes_c1].ID)
    c2_files = list(testset_df.iloc[closest_nodes_c2].ID)
    return c1_files, c2_files

In [None]:
# IDs of the three scans selected for each of the two K-Means centroids.
closest_c1, closest_c2 = get_three_center_nodes(features)

In [None]:
# Display the IDs selected for the first centroid.
closest_c1

In [None]:
# 2x3 grid of coronal images: the top row shows the three scans selected for
# cluster 1 and the bottom row the three selected for cluster 2.
fig = plt.figure(figsize=(8, 6))
panel_ids = [ids[rank] for ids in (closest_c1, closest_c2) for rank in range(3)]

axes = []
for slot, file_id in enumerate(panel_ids, start=1):
    ax = fig.add_subplot(2, 3, slot)
    ax.imshow(arr_dict_c[file_id])
    # Hide the ticks and keep the pixels square.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect('equal')
    axes.append(ax)
ax1, ax2, ax3, ax4, ax5, ax6 = axes

# Row labels go on the first panel of each row.
ax1.set_ylabel('Cluster 1', size=20)
ax4.set_ylabel('Cluster 2', size=20)
# Column titles go on the top row only.
column_titles = ('1st closest to centroid', '2nd closest to centroid', '3rd closest to centroid')
for top_ax, column_title in zip((ax1, ax2, ax3), column_titles):
    top_ax.set_title(column_title)

plt.subplots_adjust(wspace=0, hspace=0)

### DBSCAN

Our last clustering algorithm will be DBSCAN, a deterministic method. 

In [None]:
# Grid search over DBSCAN settings: record how many clusters (noise excluded)
# each (min_samples, eps) pair produces. Rows are min_samples, columns are eps.
eps_grid = np.linspace(0.5, 1.5, 11)
min_samples_grid = np.linspace(15, 22, 8).astype(int)
dbs_params = np.zeros((8, 11))
for col_idx, eps in enumerate(eps_grid):
    for row_idx, samps in enumerate(min_samples_grid):
        dbs = DBSCAN(eps=eps, min_samples=samps)
        dbs.fit(features)
        dbs_labels = dbs.labels_
        # Label -1 marks outliers and is not counted as a cluster.
        found_clusters = [lab for lab in np.unique(dbs_labels) if lab != -1]
        dbs_params[row_idx, col_idx] = len(found_clusters)

In [None]:
# Wrap the cluster counts in a labelled table: rows are min_samples, columns are eps.
samples_axis = pd.Index(np.linspace(15, 22, 8).astype(int), name='samples')
eps_axis = pd.Index(np.linspace(0.5, 1.5, 11), name='eps')
dbs_hyp_params = pd.DataFrame(data=dbs_params, index=samples_axis, columns=eps_axis)
dbs_hyp_params

We will use an eps of 1.1 and test sample values 17 to 21.

In [None]:
# Score DBSCAN at eps=1.1 for min_samples 17..21.
# Rows: silhouette, calinski_harabasz, davies_bouldin; one column per setting.
scores = np.zeros((3, 5))

for col_idx, samp in enumerate(range(17, 22)):
    dbs = DBSCAN(eps=1.1, min_samples=samp)
    dbs.fit(features)
    dbs_labels = dbs.labels_
    sil_score, cal_har_score, dav_bou_score = cluster_scores(features, dbs_labels)
    scores[:, col_idx] = [round(sil_score, 2), round(cal_har_score, 2), round(dav_bou_score, 2)]

In [None]:
# Label the DBSCAN score matrix: one row per metric, one column per min_samples value.
dbs_metric_names = ['silhouette', 'calinski_harabasz', 'davies_bouldin']
dbs_scores = pd.DataFrame(scores, index=dbs_metric_names, columns=range(17, 22))

dbs_scores

17 samples appears to be best. Let's train a DBSCAN model that uses eps = 1.1 and min_samples = 17.

In [None]:
# Final DBSCAN model with the chosen settings.
dbs = DBSCAN(eps=1.1, min_samples=17).fit(features)
dbs_labels = dbs.labels_
# Plot colour per sample; any label outside 0-2 is drawn in black.
dbs_palette = {0: 'red', 1: 'green', 2: 'blue'}
dbs_cmap = [dbs_palette.get(label, 'black') for label in dbs_labels]

In [None]:
# Inspect the DBSCAN label assigned to each test sample.
dbs_labels

### Correlation Heatmap

Since we compute 2 clusters, we need to make sense of what each of those clusters represent. It may be easy for us to think that our clustering models will capture demented and non-demented groups; however, we need to consider other possibilities for our cluster differences. Besides the grouping of demented and non-demented, the clustering algorithms may group patients by gender or perhaps age groups.

We will compute two binary variables, male and elder. Male will be 1 if the patient is male, 0 otherwise. Elder will be 1 if the patient is aged 60 or older, 0 otherwise. When clustering, we need to consider how the algorithms group these patients: besides demented versus non-demented, the algorithm may cluster by gender or by age group. These variables will serve as our ground truth labels.

In [None]:
# Derive two binary ground-truth columns for the test patients:
# male is 1 for 'M', elder is 1 for an age of 60 or more.
testset_df['male'] = (testset_df['M/F'] == 'M').astype('int64')
testset_df['elder'] = (testset_df['Age'] >= 60).astype('int64')

We will also import our clustering labels for our dataframe.

In [None]:
# Attach each algorithm's cluster labels to the test-set table as new columns.
testset_df['k_mean_clus'] = k_means_labels
testset_df['dbs_clus'] = dbs_labels
testset_df['agg_clus'] = agg_labels
testset_df.head(5)

Let's create a correlation coefficient table for our cluster labels and our ground truth labels. We will then plot a heatmap to see how our cluster labels correlate with each ground truth label.

In [None]:
# Absolute correlation between each ground-truth label (rows) and each
# algorithm's cluster labels (columns).
truth_cols = ['demented', 'male', 'elder']
cluster_cols = ['agg_clus', 'k_mean_clus', 'dbs_clus']
corr = testset_df[truth_cols + cluster_cols].corr().abs()
corr = corr.loc[truth_cols, cluster_cols]
corr

In [None]:
# Heatmap of the correlation table; darker blue means stronger correlation.
f = plt.figure(figsize=(10, 10))
plt.matshow(corr, fignum=f.number, cmap=plt.cm.Blues)
column_positions = range(len(corr.columns))
row_positions = range(len(corr.index))
plt.xticks(column_positions, corr.columns, size=20)
plt.yticks(row_positions, corr.index, size=20)
plt.title('Cluster and Variable Correlation', size=30, pad=20)
cb = plt.colorbar(shrink=0.8)

It appears that most of the clustering algorithms have their highest correlation with the patient's age group (elder). The variable with the second highest correlation is dementia. Lastly, our clustering labels do not seem to correlate much with gender.

# Manifold Learning t-SNE

Let's visualize our data and clustering in a two-dimensional space. We have 16 features generated by our CNN, and we cannot really interpret each of these features individually due to the nature of neural networks. We can, however, visualize the features in two dimensions by using t-SNE.

Once mapped onto a 2-dimensional space, we can color code our datapoints through cluster labels and ground truth labels.

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Two-dimensional t-SNE with random initialisation, perplexity 10 and a fixed seed.
tsne = TSNE(n_components = 2, init = 'random', perplexity= 10, random_state = 42)

In [None]:
# Embed the CNN feature vectors into two dimensions.
tsne_2d = tsne.fit_transform(features)

In [None]:
# Split the embedding into its two coordinate vectors for plotting.
tsne_x, tsne_y = tsne_2d.transpose()

In [None]:
# Plot colour per test sample: label 0 is blue, anything else is red.
demented_cmap = ['red' if label != 0 else 'blue' for label in y_test_c.ravel()]

In [None]:
# t-SNE scatter coloured by dementia label.
plt.scatter(tsne_x, tsne_y, c=demented_cmap)
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
# The axis labels are reset straight away and the ticks are removed.
plt.xlabel(None)
plt.ylabel(None)
plt.xticks([])
plt.yticks([])
plt.title('Demented Labels mapped on t-SNE', size=14)
legend_spec = (('r', 'demented'), ('b', 'Non-demented'))
legend_elements = [
    Line2D([0], [0], marker='o', color=colour, lw=0, label=text,
           markerfacecolor=colour, markersize=10)
    for colour, text in legend_spec
]
plt.legend(handles=legend_elements);

In [None]:
# t-SNE scatter coloured by gender: male is blue, everything else is red.
gender_cmap = ['red' if is_male != 1 else 'blue' for is_male in testset_df.male]
plt.scatter(tsne_x, tsne_y, c=gender_cmap)
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
# The axis labels are reset straight away and the ticks are removed.
plt.xlabel(None)
plt.ylabel(None)
plt.xticks([])
plt.yticks([])
plt.title('Gender Labels mapped on t-SNE', size=14)
legend_spec = (('r', 'female'), ('b', 'male'))
legend_elements = [
    Line2D([0], [0], marker='o', color=colour, lw=0, label=text,
           markerfacecolor=colour, markersize=10)
    for colour, text in legend_spec
]
plt.legend(handles=legend_elements);

In [None]:
# t-SNE scatter coloured by age group: elder is red, everything else is blue.
elder_cmap = ['blue' if is_elder != 1 else 'red' for is_elder in testset_df.elder]
plt.scatter(tsne_x, tsne_y, c=elder_cmap)
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
# The axis labels are reset straight away and the ticks are removed.
plt.xlabel(None)
plt.ylabel(None)
plt.xticks([])
plt.yticks([])
plt.title('Elder Labels mapped on t-SNE', size=14)
legend_spec = (('r', 'elder(age >= 60)'), ('b', 'non-elder(age < 60)'))
legend_elements = [
    Line2D([0], [0], marker='o', color=colour, lw=0, label=text,
           markerfacecolor=colour, markersize=10)
    for colour, text in legend_spec
]
plt.legend(handles=legend_elements);

In [None]:
# t-SNE scatter coloured by DBSCAN label.
plt.scatter(tsne_x, tsne_y, c=dbs_cmap)
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
# The axis labels are reset straight away and the ticks are removed.
plt.xlabel(None)
plt.ylabel(None)
plt.xticks([])
plt.yticks([])
plt.title('DBSCAN Cluster Labels mapped on t-SNE', size=14)
# dbs_cmap draws label 0 in red and label 1 in green. As in the K-Means and
# Agglomerative legends, label 0 is presented as "cluster 1", so red must be
# listed as cluster 1 and green as cluster 2.
legend_spec = (
    ('r', 'DBSCAN cluster 1'),
    ('g', 'DBSCAN cluster 2'),
    ('k', 'Outliers'),
)
legend_elements = [
    Line2D([0], [0], marker='o', color=colour, lw=0, label=text,
           markerfacecolor=colour, markersize=10)
    for colour, text in legend_spec
]
plt.legend(handles=legend_elements);

In [None]:
# t-SNE scatter coloured by K-Means cluster.
plt.scatter(tsne_x, tsne_y, c=k_means_cmap)
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
# The axis labels are reset straight away and the ticks are removed.
plt.xlabel(None)
plt.ylabel(None)
plt.xticks([])
plt.yticks([])
plt.title('K-Means Cluster Labels mapped on t-SNE', size=14)
legend_spec = (('b', 'K-Means cluster 1'), ('g', 'K-Means cluster 2'))
legend_elements = [
    Line2D([0], [0], marker='o', color=colour, lw=0, label=text,
           markerfacecolor=colour, markersize=10)
    for colour, text in legend_spec
]
plt.legend(handles=legend_elements);

In [None]:
# t-SNE scatter coloured by Agglomerative cluster.
plt.scatter(tsne_x, tsne_y, c=agg_cmap)
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
# The axis labels are reset straight away and the ticks are removed.
plt.xlabel(None)
plt.ylabel(None)
plt.xticks([])
plt.yticks([])
plt.title('Agglomerative Cluster Labels mapped on t-SNE', size=14)
legend_spec = (('orange', 'Agglomerative cluster 1'), ('g', 'Agglomerative cluster 2'))
legend_elements = [
    Line2D([0], [0], marker='o', color=colour, lw=0, label=text,
           markerfacecolor=colour, markersize=10)
    for colour, text in legend_spec
]
plt.legend(handles=legend_elements);

### Evaluation

By looking at the t-SNE charts, we can see that our clustering models separate the age groups fairly well. In addition, our clusters seem to capture one group of patients that are not demented and another group that mixes demented and non-demented patients. This could be due to the nature of the age groups, since dementia is highly correlated with age.

Let's evaluate our models through a series of metrics. Because we have ground truth labels for demented and age, we will use ground truth metrics as well (completeness score and homogeneity score).

In [None]:
# silhouette_score- near 1 for good, near -1 for bad
# calinski_harabasz_score - higher the score the better
# davies_bouldin_score - lower the score the better
# completeness and homogeneity scores- near 1 for good, near 0 for bad.

In [None]:
def unsupervised_eval(features, y_test, cluster_labels):
    """Collect internal and ground-truth clustering metrics, rounded to 3 places.

    Args:
        features: Feature matrix that was clustered.
        y_test: Ground-truth label for each row of ``features``.
        cluster_labels: Cluster label for each row of ``features``.

    Returns:
        List ``[silhouette, calinski_harabasz, davies_bouldin, completeness,
        homogeneity]``, each rounded to three decimals.
    """
    raw_scores = (
        # Internal metrics: need only the features and the cluster labels.
        silhouette_score(features, cluster_labels, random_state=42),
        calinski_harabasz_score(features, cluster_labels),
        davies_bouldin_score(features, cluster_labels),
        # Ground-truth metrics: compare the cluster labels with y_test.
        completeness_score(y_test, cluster_labels),
        homogeneity_score(y_test, cluster_labels),
    )
    return [round(score, 3) for score in raw_scores]

Lets use demented and elder as ground truth labels.

In [None]:
# Metric names (table columns) and comparison names (table rows) for the results table.
columns = ['silhouette_score', 'calinski_harabasz_score', 'davies_bouldin_score', 'completeness_score', 'homogeneity_score']
index = ['Agg_vs_Demented', 'Kmean_vs_Demented', 'DBS_vs_Demented', 'Agg_vs_Elder', 'Kmean_vs_Elder', 'DBS_vs_Elder']
# Empty placeholder frame with only the column headers.
final_results_df = pd.DataFrame(columns=columns)

In [None]:
# Score matrix with six rows and five columns, filled in below.
final_scores = np.zeros((6,5))

In [None]:
# Rows 0-2 score the three clusterings against the dementia labels;
# rows 3-5 score them against the elder labels.
ground_truths = (y_test_c.flatten(), testset_df.elder)
clusterings = (agg_labels, k_means_labels, dbs_labels)
truth_cluster_pairs = [(truth, labels) for truth in ground_truths for labels in clusterings]
for row, (truth, labels) in enumerate(truth_cluster_pairs):
    final_scores[row, :] = unsupervised_eval(features, truth, labels)

In [None]:
# Wrap the score matrix in a labelled table and display it.
final_results_df = pd.DataFrame(data = final_scores.round(3), columns=columns, index = index)
final_results_df

We can see that K-Means performs the best for metrics that do not require ground truth labels. For our ground truth metrics, we compared the clusters to demented groups and age groups. K-Means performs best for both labels with scores that are relatively great at capturing age groups and scores that are not so great at capturing demented groups.

### K-Means Sensitivity Analysis

K-Means does not have many tunable hyper-parameters. The main concern with K-Means is a bad choice of initial centroids. Therefore, we will run 3 different random initialisations and observe how they affect the scores.

In [None]:
def display_scores(features, y_test, cluster_labels):
    """Print internal and ground-truth clustering metrics, one per line.

    Args:
        features: Feature matrix that was clustered.
        y_test: Ground-truth label for each row of ``features``.
        cluster_labels: Cluster label for each row of ``features``.
    """
    # Compute everything first so nothing is printed if a metric fails.
    report = [
        ('silhouette_score: ', silhouette_score(features, cluster_labels, random_state=42)),
        ('calinski_harabasz_score: ', calinski_harabasz_score(features, cluster_labels)),
        ('davies_bouldin_score: ', davies_bouldin_score(features, cluster_labels)),
        ('completeness_score: ', completeness_score(y_test, cluster_labels)),
        ('homogeneity_score: ', homogeneity_score(y_test, cluster_labels)),
    ]
    for caption, value in report:
        print(caption, value)

In [None]:
# Re-fit K-Means from three randomly seeded random initialisations and print
# the scores against the elder labels.
for _ in range(3):
    # A dedicated name keeps the notebook-wide `seed` (used for the
    # train/test split) from being overwritten.
    init_seed = np.random.randint(1000)
    print("seed number: ", init_seed)
    # `algorithm` is left at its default because the 'auto' value no longer
    # exists in newer scikit-learn releases. n_init=10 is the long-standing
    # default.
    kmeans = KMeans(n_clusters=2, init='random', n_init=10, random_state=init_seed, max_iter=300)
    kmeans.fit(features)
    k_means_labels = kmeans.labels_
    display_scores(features, testset_df.elder, k_means_labels)
    print('-----------')

In addition, let's try the 'elkan' algorithm instead of 'lloyd' (the default).

In [None]:
# Same experiment with the 'elkan' algorithm: three randomly seeded random
# initialisations, scored against the elder labels.
for _ in range(3):
    # A dedicated name keeps the notebook-wide `seed` (used for the
    # train/test split) from being overwritten.
    init_seed = np.random.randint(1000)
    print("seed number: ", init_seed)
    # n_init=10 is the long-standing default.
    kmeans = KMeans(n_clusters=2, init='random', n_init=10, random_state=init_seed, max_iter=300, algorithm='elkan')
    kmeans.fit(features)
    k_means_labels_elkan = kmeans.labels_
    display_scores(features, testset_df.elder, k_means_labels_elkan)
    print('-----------')

Nothing really changes. Based on the structure of the data as visualized through t-SNE, the data seems to be structured as a chain. Regardless of where the initial points start, the results always end up as the same.

What if we used pca to squash down the 16 features to two? How will that affect our K-mean model?

In [None]:
# Two-component PCA with a fixed seed.
pca = PCA(n_components = 2, random_state =42)

In [None]:
# NOTE(review): fitting on the transpose makes PCA treat each CNN feature as
# a sample and each patient as a variable. If the aim is a 2-D representation
# of the patients, PCA would normally be fitted on `features` itself; confirm
# the intent.
pca.fit(features.transpose())

In [None]:
# Fraction of the variance captured by each of the two components.
pca.explained_variance_ratio_

In [None]:
# Unpack the two fitted component vectors.
pc1, pc2 = pca.components_

In [None]:
# Project the patients (rows of `features`) onto their first two principal
# components and cluster that 2-D representation. A fresh PCA fitted on
# `features` is used because the component vectors of a PCA fitted on the
# transposed matrix are not the patients' principal-component coordinates.
features_2d = PCA(n_components=2, random_state=42).fit_transform(features)
kmeans = KMeans(n_clusters=2, init='random', random_state=42, max_iter=300, algorithm='elkan')
kmeans.fit(features_2d)
k_means_labels_pca = kmeans.labels_
# The scores are still computed in the original feature space so they stay
# comparable with the other K-Means runs.
display_scores(features, testset_df.elder, k_means_labels_pca)

In [None]:
# Score matrix with three rows and five columns, filled in below.
sens_analysis = np.zeros((3,5))

In [None]:
# Score each K-Means variant against the elder labels, one row per variant.
kmeans_variants = (k_means_labels, k_means_labels_elkan, k_means_labels_pca)
for row, variant_labels in enumerate(kmeans_variants):
    sens_analysis[row, :] = unsupervised_eval(features, testset_df.elder, variant_labels)

In [None]:
# Wrap the sensitivity scores in a labelled table: one row per K-Means variant.
# Note that this rebinds `index` and replaces the `sens_analysis` array with a DataFrame.
index = ['Normal K-Means', 'K-Means with Elkan', 'K-Means with PCA']
sens_analysis = pd.DataFrame(data = sens_analysis, columns = columns, index = index)
sens_analysis

It does a bit worse in terms of using ground truth labels. Overall our K-Means model is very insensitive.