In [2]:
# Initialization code that runs before all other cells
import marimo as mo
from preprocessing import preprocess, get_labels
from extraction import feature_extraction
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.cluster import hierarchy as h
from scipy.spatial.distance import pdist,squareform
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Debug aid: print the notebook's working directory, which the relative CSV path used when loading the data is resolved against.
!pwd

/home/isaac/dev/sfu/cmpt459/CMPT-459-Project/src/isaac/notebooks


# Plot Hierarchical Clustering of Dataset
* Load Data
* Preprocess and Apply Wavelet transform
* Compute Distance Matrix
* Visualize Distance matrix clusterplot
* Show visualization of 2D point with highest cluster score

In [5]:
# Load the raw plant-health CSV; the path is relative to this notebook's working directory.
df = pd.read_csv("../../../data/plant_health_data.csv")
# `preprocess` is a project helper whose implementation is not visible here.
# NOTE(review): judging by the displayed output it returns standardized numeric
# features indexed by Timestamp -- confirm, and confirm what `attr=[]` selects.
cleaned = preprocess(df,attr=[])
# Bare last expression so the notebook renders the resulting frame.
cleaned

Unnamed: 0_level_0,Soil_Moisture,Ambient_Temperature,Soil_Temperature,Humidity,Light_Intensity,Soil_pH,Nitrogen_Level,Phosphorus_Level,Potassium_Level,Chlorophyll_Content,Electrochemical_Signal
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-10-03 10:54:53.407995,0.278321,-0.511285,0.662825,0.049963,-0.247408,-1.620166,-1.746638,1.355983,0.768579,0.108796,-0.080646
2024-10-03 16:54:53.407995,-1.184139,-0.666361,-0.435676,1.035845,-0.072300,1.051744,0.052635,-0.424888,-1.043216,-0.770972,-1.431377
2024-10-03 22:54:53.407995,-0.924655,-0.819209,-1.557519,1.478694,-0.094261,-1.491368,-0.066879,-0.234739,0.479665,1.015220,0.163452
2024-10-04 04:54:53.407995,-1.126428,-0.408656,0.962382,0.380088,-1.626583,-1.615750,-1.141667,-0.356286,-0.330470,0.352435,0.344986
2024-10-04 10:54:53.407995,1.696462,1.433049,-0.633556,1.015672,-0.736748,-1.039349,-1.664853,0.666060,0.646682,1.542359,1.082045
...,...,...,...,...,...,...,...,...,...,...,...
2024-11-01 04:54:53.493508,0.525571,1.048262,-1.561354,-0.021064,0.166063,-1.390877,-0.009874,-1.339201,-0.733189,-1.195476,-0.475251
2024-11-01 10:54:53.493508,-1.108602,-0.549694,-1.616173,0.727684,0.684081,-0.971854,1.316460,-0.440430,0.156786,-0.435783,0.800634
2024-11-01 16:54:53.493508,-0.185748,-0.674071,-1.521070,-1.612468,-1.397725,1.539182,0.716310,1.604562,-0.140515,0.407937,-0.187423
2024-11-01 22:54:53.493508,0.676122,-0.284769,0.674590,-1.533501,-0.524421,-1.149304,1.676496,-0.560153,-0.888438,1.286945,-1.118791


In [None]:
# Alias (not a copy) of the preprocessed frame; X is the feature matrix used by the clustering cells below.
X = cleaned
X

# Agglomerative Clustering
There does seem to be some block structure in our dataset and its features. We now cluster the samples and
visualize the results, comparing single-, complete- (farthest-), and average-linkage clustering.

### Single Linkage

In [None]:
# Heatmap of the feature matrix with rows (samples) reordered by single-linkage
# clustering; col_cluster=False keeps the feature columns in their original order.
sns.clustermap(
    X,
    method='single',
    cmap='mako',
    col_cluster=False,
)
plt.show()

### Average Linkage

In [None]:
# Heatmap of the feature matrix with rows (samples) reordered by average-linkage
# clustering; col_cluster=False keeps the feature columns in their original order.
sns.clustermap(
    X,
    method='average',
    cmap='mako',
    col_cluster=False,
)
plt.show()

### Complete/Farthest Linkage

In [None]:
# Heatmap of the feature matrix with rows (samples) reordered by complete-linkage
# (farthest-neighbour) clustering; col_cluster=False keeps the feature columns
# in their original order.
sns.clustermap(
    X,
    method='complete',
    cmap='mako',
    col_cluster=False,
)
plt.show()

In [None]:
# Condensed vector of pairwise distances between samples (pdist defaults to the
# Euclidean metric). The linkage cells below reuse this vector.
d_mat = pdist(X)

# squareform expands the condensed vector into the full symmetric matrix so it
# can be drawn as a heatmap.
sns.heatmap(squareform(d_mat), cmap='mako')
plt.title("Distance between Plant Samples")
plt.xlabel("Sample index")
plt.ylabel("Sample index")
# Render explicitly, like every other plotting cell, so the cell output is the
# figure rather than the repr of the last matplotlib call.
plt.show()

In [None]:
# Build one linkage tree per strategy from the same condensed distance vector.
# 'single' is scipy's default method; it is spelled out here to match the others.
single = h.linkage(d_mat, method='single')
average = h.linkage(d_mat, method='average')
complete = h.linkage(d_mat, method='complete')

### Single Link Dendrogram

In [None]:
# Very wide canvas: the dendrogram draws one leaf per sample along the x-axis.
plt.figure(figsize=(50, 8))
h.dendrogram(single)
plt.title("Single Link Dendrogram")
# The y-axis of a dendrogram is the distance at which clusters were merged.
plt.ylabel("Linkage distance")
plt.show()

### Average Link Dendrogram

In [None]:
# Very wide canvas: the dendrogram draws one leaf per sample along the x-axis.
plt.figure(figsize=(50, 8))
h.dendrogram(average)
plt.title("Average Link Dendrogram")
# The y-axis of a dendrogram is the distance at which clusters were merged.
plt.ylabel("Linkage distance")
plt.show()

### Complete Link Dendrogram

In [None]:
# Very wide canvas: the dendrogram draws one leaf per sample along the x-axis.
plt.figure(figsize=(50, 8))
h.dendrogram(complete)
plt.title("Complete Link Dendrogram")
# The y-axis of a dendrogram is the distance at which clusters were merged.
plt.ylabel("Linkage distance")
plt.show()

Of the three methods, single linkage produces the worst-looking dendrogram: most of its clusters form near the leaves of the tree, implying many small clusters. Complete and average linkage were able to find much larger clusters, which suggests they perform better on this dataset.

### Average Linkage on the Distance Matrix

In [None]:
# Draw the square distance matrix with both rows and columns reordered by the
# precomputed average-linkage tree.
sns.clustermap(
    squareform(d_mat),
    row_linkage=average,
    col_linkage=average,
    cmap='mako',
)
plt.show()

### Complete Linkage on the Distance Matrix

In [None]:
# Draw the square distance matrix with both rows and columns reordered by the
# precomputed complete-linkage tree.
sns.clustermap(
    squareform(d_mat),
    row_linkage=complete,
    col_linkage=complete,
    cmap='mako',
)
plt.show()

Neither method produces a strong block structure, suggesting that there are no sub-regions of the distance matrix that are markedly more similar than the rest. The complete method creates slightly stronger structure. Therefore, I'll use that method to compute the best flat clustering.

In [None]:
# Cut the complete-linkage tree into at most t flat clusters for t = 2..19 and
# record the mean silhouette coefficient of each cut (scores[i] belongs to t = i + 2).
scores = []
for t in range(2, 20):
    labels = h.fcluster(complete, t=t, criterion='maxclust')
    scores.append(silhouette_score(X, labels))

In [None]:
# Silhouette score against the requested number of clusters (2..19, matching
# the range scanned when `scores` was built).
cluster_counts = np.arange(2, 20)
plt.plot(cluster_counts, scores, linestyle='-', marker='o', color='orange')
plt.title("Silhouette Score")
plt.xlabel("# Clusters")
plt.ylabel("Average Score")
plt.grid()
plt.show()

The silhouette score gets steadily worse as the number of clusters grows from 2 to 5. However, there is a peak at 10. Since a low number of clusters is less interesting, we chose 10 as the number of clusters. Furthermore, our data shows little to no clustering behaviour: the silhouette score is essentially 0.

In [None]:
def hierarchical(X: pd.DataFrame, method: str = 'complete', n_clusters: int = 10,
                 metric: str = 'euclidean') -> np.ndarray:
    '''
        Perform Hierarchical Clustering on Dataset

        X: the extracted features from the dataset, one row per sample
        method: which scipy linkage method to use (e.g. 'single', 'average', 'complete')
        n_clusters: maximum number of flat clusters to cut the tree into
        metric: distance metric forwarded to scipy's pdist; the default,
            'euclidean', is also pdist's own default

        Returns a 1-D array holding one integer cluster label per row of X.
        Labels are produced by scipy's fcluster with the 'maxclust' criterion,
        so they start at 1 and there may be fewer than n_clusters distinct labels.

        Raises ValueError if n_clusters is smaller than 1.
    '''
    # A cut into fewer than one cluster is meaningless; fail loudly instead of
    # handing an invalid threshold to scipy.
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be at least 1, got {n_clusters}")

    # Condensed pairwise-distance vector between the rows of X.
    d = pdist(X, metric=metric)
    # Agglomerative merge tree built from those distances.
    tree = h.linkage(d, method=method)
    # 'maxclust' picks the cut that yields no more than n_clusters flat clusters.
    return h.fcluster(tree, t=n_clusters, criterion='maxclust')