# Spectral calustering

### Important libraries

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse.linalg import eigs
from scipy.sparse import csgraph
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from DataMatrix import generate_data_matrix
from sklearn.neighbors import kneighbors_graph
import warnings
warnings.filterwarnings("ignore")

-  **Create an object from DataMatrix class to use it for loading the data in the two solution methods**

## Spectral clustering implementation

In [2]:
def similarity_matrix(X, sigma=0.1):
    pairwise_sq_dists = np.square(pairwise_distances(X, metric='euclidean'))
    return np.exp(-pairwise_sq_dists / (2. * sigma ** 2))

In [18]:
def knn_similarity_matrix(X, n_neighbors=10):
    # Compute the KNN graph
    knn_graph = kneighbors_graph(X, n_neighbors=n_neighbors, mode='connectivity', include_self=False)
    
    # Convert the KNN graph to a similarity matrix
    knn_similarity = 0.5 * (knn_graph + knn_graph.T)  # Make the graph undirected
    return knn_similarity

In [12]:
def laplacian_a_matrix(W):
    D = np.diag(np.sum(W, axis=1))
    D_inv = np.linalg.inv(D)
    I = np.eye(len(D))
    return I - np.dot(D_inv, W)


In [20]:
def spectral_clustering(X, n_clusters=19,sim="ecu" ,sigma=0.1):
    if sim == "ecu":
        W = similarity_matrix(X, sigma)
    else:
        W = knn_similarity_matrix(X)
    W = similarity_matrix(X, sigma)
    L = laplacian_a_matrix(W)
    eigenvalues, eigenvectors = np.linalg.eig(L)
    idx = eigenvalues.argsort()  # Sort eigenvalues in ascending order
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    eigenvectors = np.real(eigenvectors)
    norm_eigenvectors = normalize(eigenvectors[:, :n_clusters], norm='l2', axis=1)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(norm_eigenvectors)
    return kmeans

### load the data in the first solution method `Mean Method`

In [17]:
x_train,y_train,x_test,y_test = generate_data_matrix(method="mean")
print("X_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", x_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (7296, 45)
y_train shape:  (7296, 1)
X_test shape:  (1824, 45)
y_test shape:  (1824, 1)


In [21]:
kmean = spectral_clustering(x_train, n_clusters=19,sim="Knn")

In [None]:
from sklearn.metrics import confusion_matrix

def precision_recall_f1_entropy(true_labels, cluster_labels, n_clusters):
    # Compute confusion matrix
    cm = confusion_matrix(true_labels, cluster_labels)
    
    # Compute precision, recall, and F1 score
    precision = np.zeros(n_clusters)
    recall = np.zeros(n_clusters)
    f1_score = np.zeros(n_clusters)
    entropy = 0.0
    
    for cluster in range(n_clusters):
        tp = cm[cluster, cluster]  # True Positives
        fp = np.sum(cm[:, cluster]) - tp  # False Positives
        fn = np.sum(cm[cluster, :]) - tp  # False Negatives
        
        precision[cluster] = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall[cluster] = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1_score[cluster] = 2 * precision[cluster] * recall[cluster] / (precision[cluster] + recall[cluster]) if (precision[cluster] + recall[cluster]) > 0 else 0.0
    
    # Compute entropy
    total_samples = np.sum(cm)
    for cluster in range(n_clusters):
        cluster_prob = np.sum(cm[cluster]) / total_samples
        if cluster_prob > 0:
            entropy += -cluster_prob * np.log2(cluster_prob)
    
    # Compute average precision, recall, F1 score, and normalize entropy
    avg_precision = np.mean(precision)
    avg_recall = np.mean(recall)
    avg_f1_score = np.mean(f1_score)
    normalized_entropy = entropy / np.log2(n_clusters)  # Normalize entropy
    
    return avg_precision, avg_recall, avg_f1_score, normalized_entropy

# Compute evaluation metrics
precision, recall, f1_score, entropy = precision_recall_f1_entropy(y_train, kmean.labels_, n_clusters=19)
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1_score))
print("Entropy: {:.4f}".format(entropy))

In [15]:
l=kmean.labels_

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

def spectral_clustering_evaluation(X_train, y_train, X_test, y_test, n_clusters=19, sigma=0.1):
    # Apply spectral clustering on the training data
    kmeans = spectral_clustering(X_train, n_clusters, sigma)
    train_labels = kmeans.labels_
    
    # Predict cluster labels for the test data
    test_labels = kmeans.predict(X_test)
    
    # Compute clustering evaluation metrics
    silhouette = silhouette_score(X_test, test_labels)
    davies_bouldin = davies_bouldin_score(X_test, test_labels)
    rand_index = adjusted_rand_score(y_test, test_labels)  # Adjusted Rand index requires true labels
    
    return silhouette, davies_bouldin, rand_index

# Generate data matrices using method 1
# X_train1, y_train1, X_test1, y_test1 = generate_data_matrix(method="mean")

# Evaluate spectral clustering using method 1
silhouette1, davies_bouldin1, rand_index1 = spectral_clustering_evaluation(X_train1, y_train1, X_test1, y_test1)
print("Method 1 - Silhouette Score:", silhouette1)
print("Method 1 - Davies-Bouldin Index:", davies_bouldin1)
print("Method 1 - Adjusted Rand Index:", rand_index1)

# Generate data matrices using method 2
# X_train2, y_train2, X_test2, y_test2 = generate_data_matrix(method="flatten")

# Evaluate spectral clustering using method 2
silhouette2, davies_bouldin2, rand_index2 = spectral_clustering_evaluation(X_train2, y_train2, X_test2, y_test2)
print("Method 2 - Silhouette Score:", silhouette2)
print("Method 2 - Davies-Bouldin Index:", davies_bouldin2)
print("Method 2 - Adjusted Rand Index:", rand_index2)


### load the data in the second solution method `Flatten Method`

In [None]:
# x_train, x_test, y_train, y_test= generate_data_matrix(method="flatten")
# print("X_train shape: ", x_train.shape)
# print("y_train shape: ", y_train.shape)
# print("X_test shape: ", x_test.shape)
# print("y_test shape: ", y_test.shape)

## Measurement method 

In [28]:
def construct_clusters(true_labels,labels):
    clusters = {}
    for i in range(len(true_labels)):
        if labels[i] in clusters:
            clusters[labels[i]].append(true_labels[i])
        else:
            clusters[labels[i]]=[true_labels[i]]
    return clusters

In [29]:
clusters=construct_clusters(y_train.flatten(),l)
clusters

{2: [1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1

### Entropy method

In [30]:
# function to compute the conditional entropy of the clusters
def conditional_entropy(clusters,n_samples):
    H = 0
    for cluster in clusters.values():
        n = len(cluster)
        s=-sum([(cluster.count(label)/n) * np.log2(cluster.count(label)/n) for label in set(cluster)])
        H += (n/n_samples) * s
    return H

### Precision & Recall method

In [31]:
def precision_recall(clusters):
    precision = 0
    recall = 0
    for cluster in clusters.values():
        mx=0
        val=0
        for i in set(cluster):
            x=cluster.count(i)
            if x > mx:
                mx = x
                val = i
        p = mx/len(cluster)
        num_val=mx
        for cluster2 in clusters.values():
            if cluster != cluster2:
                num_val+=cluster2.count(val)
        r = mx/num_val
        precision += p
        recall += r
        
    return precision/len(clusters),recall/len(clusters)

### F1 score method

In [32]:
def f_measure( clusters):
    f = 0
    num_clusters = len(clusters)
    for cluster in clusters.values():
        mx=0
        val=0
        for i in set(cluster):
            x=cluster.count(i)
            if x > mx:
                mx = x
                val = i
        p = mx/len(cluster)
        num_val=mx
        for cluster2 in clusters.values():
            if cluster != cluster2:
                num_val+=cluster2.count(val)
        r = mx/num_val
        f += (2*p*r)/(p+r)
    return f/num_clusters

In [34]:
# get the acuracy, conditional entropy, precision, recall and f-measure
def get_metrics(true_labels,labels):
    clusters=construct_clusters(true_labels,labels)
    n_samples = len(true_labels)
    H = conditional_entropy(clusters,n_samples)
    precision,recall = precision_recall(clusters)
    f = f_measure(clusters)
    return H,precision,recall,f

print(get_metrics(y_train.flatten(),l))

(2.985917746059928, 0.36443196993658133, 0.24616228070175433, 0.2484947275903121)
