# Clustering Time Series Data

## importing modules

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics.cluster import *
import random
import operator
import math

In [2]:
from tqdm.notebook import trange,tqdm
import random
import statistics

## datasets

In [3]:
datasets = [("Colposcopy.csv",6),("ECG200.csv",2),("Lightning2.csv",2),("SharePriceIncrease.csv",2),("Wafer.csv",2)]

## implementation of time series clustering methods

### K means Euclidean:

In [4]:
class KMeansClusteringE:
    
    def __init__(self, X, num_clusters, metric="Euclidean", max_iter=100):
        self.K = num_clusters
        self.max_iterations = max_iter
        self.num_examples = X.shape[0]
        self.num_features = X.shape[1]
        self.metric=metric

    """random centroids are being initialized
    input: dataset
    returns: initial centroids"""
    def initialize_random_centroids(self, X):
        centroids = np.zeros((self.K, self.num_features))
        for k in range(self.K):
            centroid = X[np.random.choice(range(self.num_examples))]  #generating random index
            centroids[k] = centroid                                   #assigning random examples as the centroids
        return centroids
    
    """clusters are being created based on euclidean distance
    input: dataset, centroids
    returns: clusters"""
    def create_clusters(self, X, centroids):
        clusters = [[] for _ in range(self.K)]                        #points associated with specific cluster        
        for point_idx, point in enumerate(X):                     #loop through each point and check the closest centroid
            closest_centroid = np.argmin(
                np.sqrt(np.sum((point - centroids) ** 2, axis=1))     #Euclidean distance using numpy broadcasting
            )
            clusters[closest_centroid].append(point_idx)
        return clusters

    """calculating new centroids from the available clusters
    input: clusters, dataset
    returns: new centroids calculated"""
    def calculate_new_centroids(self, clusters, X):
        centroids = np.zeros((self.K, self.num_features))
        for idx, cluster in enumerate(clusters):
            new_centroid = np.mean(X[cluster], axis=0)                #calculating new centroid for each cluster by averaging
            centroids[idx] = new_centroid
        return centroids

    """final prediction of class labels for each point
    input: clusters, dataset
    returns: predicted labels for each point"""
    def predict_cluster(self, clusters, X):
        y_pred = np.zeros(self.num_examples)
        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                y_pred[sample_idx] = cluster_idx+1                     #assigning class label to each point 
        return y_pred

    """basically this is the k-means function named as fit
    input: dataset
    returns: predicted labels for each point obtained from cluster_prediction function"""
    def fit(self, X):
        centroids = self.initialize_random_centroids(X)
        #print(centroids)
        for it in range(self.max_iterations):
            clusters = self.create_clusters(X, centroids)
            previous_centroids = centroids
            centroids = self.calculate_new_centroids(clusters, X)
            diff = centroids - previous_centroids
            if not diff.any():
                break               
        # Get label predictions
        y_pred = self.predict_cluster(clusters, X)
        return y_pred
    
#O(mnk)-->O(n)

In [5]:
print("K-means Clustering with Euclidean Distance:")
print("--------------------------------------------")

res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
kEARI,kEAMI,kEFMS,kEH,kEC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    np.random.seed(10)
    num_clusters=data[1]
    KMeans=KMeansClusteringE(x,num_clusters)
    y_pred=KMeans.fit(x)
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    kEARI.append(ARI)
    kEAMI.append(AMI)
    kEFMS.append(FMI)
    kEH.append(HOMOGENITY)
    kEC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

K-means Clustering with Euclidean Distance:
--------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.090810  0.100538  0.288167    0.134597  1.419133e-01
1              ECG200  0.119568  0.054011  0.666569    0.051059  6.798699e-02
2          Lightning2  0.011740  0.011933  0.513127    0.018270  1.774633e-02
3  SharePriceIncrease -0.001123  0.000173  0.753539    0.000627  4.777712e-02
4               Wafer  0.000009 -0.001076  0.687585    0.000001  6.664530e-07


In [6]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(kEARI),2))
print("std:",round(statistics.stdev(kEARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(kEAMI),2))
print("std:",round(statistics.stdev(kEAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(kEFMS),2))
print("std:",round(statistics.stdev(kEFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(kEH),2))
print("std:",round(statistics.stdev(kEH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(kEC),2))
print("std:",round(statistics.stdev(kEC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.04
std: 0.06
AMI
------
avg: 0.03
std: 0.04
FMS
------
avg: 0.58
std: 0.19
HOMOGENITY
------
avg: 0.04
std: 0.06
COMPLETENESS
------
avg: 0.06
std: 0.06


### K means DTW

In [7]:
'''Dynamic Time warping distance measure for k means
   input: two time series and window size
   returns: distance between two time series for given window size'''

def DTWDistance(s1, s2, w):
    DTW={}    # stores the similarity measures 
    w = max(w, abs(len(s1)-len(s2)))    
    for i in range(-1,len(s1)):
        for j in range(-1,len(s2)):
            DTW[(i, j)] = float('inf')
    DTW[(-1, -1)] = 0  
    for i in range(len(s1)):
        for j in range(max(0, i-w), min(len(s2), i+w)):
            dist= (s1[i]-s2[j])**2
            # DTW[i, j] is the distance between s1[1:i] and s2[1:j] with the best alignment.
            DTW[(i, j)] = dist + min(DTW[(i-1, j)],      # increment
                                     DTW[(i, j-1)],      # decrement
                                     DTW[(i-1, j-1)])    # match
    return np.sqrt(DTW[len(s1)-1, len(s2)-1])

class KMeansClusteringD:
    
    def __init__(self, X, num_clusters, max_iter=100):
        self.K = num_clusters
        self.max_iterations = max_iter
        self.num_examples = X.shape[0]
        self.num_features = X.shape[1]

    """random centroids are being initialized
    input: dataset
    returns: initial centroids"""
    def initialize_random_centroids(self, X):
        centroids = np.zeros((self.K, self.num_features))
        for k in range(self.K):
            centroid = X[np.random.choice(range(self.num_examples))]  #generating random index
            centroids[k] = centroid                                   #assigning random examples as the centroids
        return centroids
    
    """clusters are being created based on euclidean distance
    input: dataset, centroids
    returns: clusters"""
    def create_clusters(self, X, centroids):
        clusters = [[] for _ in range(self.K)]                        #points associated with specific cluster
        for idx,point in enumerate(X):                #loop through each point and check the closest centroid
            min_dis=float('inf')
            w=len(point)*0.05
            for idx1,centroid in enumerate(centroids):
                d=DTWDistance(point,centroid,int(w))
                if d<min_dis:
                    min_dis=d
                    closest_centroid=idx1
            clusters[closest_centroid].append(idx)
        return clusters

    """calculating new centroids from the available clusters
    input: clusters, dataset
    returns: new centroids calculated"""
    def calculate_new_centroids(self, clusters, X):
        centroids = np.zeros((self.K, self.num_features))
        for idx, cluster in enumerate(clusters):
            new_centroid = np.mean(X[cluster], axis=0)                #calculating new centroid for each cluster by averaging
            centroids[idx] = new_centroid
        return centroids

    """final prediction of class labels for each point
    input: clusters, dataset
    returns: predicted labels for each point"""
    def predict_cluster(self, clusters, X):
        y_pred = np.zeros(self.num_examples)
        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                y_pred[sample_idx] = cluster_idx+1                     #assigning class label to each point 
        return y_pred

    """basically this is the k-means function named as fit
    input: dataset
    returns: predicted labels for each point obtained from cluster_prediction function"""
    def fit(self, X):
        centroids = self.initialize_random_centroids(X)
        #print(centroids)
        for it in range(self.max_iterations):
            clusters = self.create_clusters(X, centroids)
            previous_centroids = centroids
            centroids = self.calculate_new_centroids(clusters, X)
            diff = centroids - previous_centroids
            if not diff.any():
                break               
        # Get label predictions
        y_pred = self.predict_cluster(clusters, X)
        return y_pred

In [8]:
print("K-means Clustering with DTW Distance:")
print("--------------------------------------------")
res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
kDARI,kDAMI,kDFMS,kDH,kDC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    np.random.seed(10)
    num_clusters=data[1]
    KMeansD=KMeansClusteringD(x,num_clusters)
    y_pred=KMeansD.fit(x)
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    kDARI.append(ARI)
    kDAMI.append(AMI)
    kDFMS.append(FMI)
    kDH.append(HOMOGENITY)
    kDC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

K-means Clustering with DTW Distance:
--------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.086487  0.110571  0.277633    0.144960      0.149555
1              ECG200  0.129249  0.060669  0.667804    0.057209      0.075027
2          Lightning2  0.060838  0.030511  0.640132    0.031279      0.048329
3  SharePriceIncrease  0.012738  0.003187  0.548499    0.007541      0.006681
4               Wafer -0.004946 -0.000914  0.686887    0.000241      0.000125


In [9]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(kDARI),2))
print("std:",round(statistics.stdev(kDARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(kDAMI),2))
print("std:",round(statistics.stdev(kDAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(kDFMS),2))
print("std:",round(statistics.stdev(kDFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(kDH),2))
print("std:",round(statistics.stdev(kDH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(kDC),2))
print("std:",round(statistics.stdev(kDC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.06
std: 0.05
AMI
------
avg: 0.04
std: 0.05
FMS
------
avg: 0.56
std: 0.17
HOMOGENITY
------
avg: 0.05
std: 0.06
COMPLETENESS
------
avg: 0.06
std: 0.06


### K means Shape based distance

In [10]:
import math
import numpy as np

from numpy.random import randint
from numpy.linalg import norm, eigh
from numpy.fft import fft, ifft


def zscore(a, axis=0, ddof=0):
    '''
       input: a is the sequence to be z normalized
       output: returns the z normalized form of a
    '''
    a = np.asanyarray(a)
    mns = a.mean(axis=axis)
    sstd = a.std(axis=axis, ddof=ddof)
    if axis and mns.ndim < a.ndim:
        res = ((a - np.expand_dims(mns, axis=axis)) / np.expand_dims(sstd, axis=axis))
    else:
        res = (a - mns) / sstd         # (actual - mean)/standard deviation
    return np.nan_to_num(res)


def roll_zeropad(a, shift, axis=None):
    '''
       input: a is the sequence that is to be aligned according to shift
       output: returns aligned sequence of a according to shift
    '''
    a = np.asanyarray(a)
    if shift == 0:
        return a
    
    if axis is None:
        n = a.size
        reshape = True
    else:
        n = a.shape[axis]
        reshape = False
        
    if np.abs(shift) > n:
        res = np.zeros_like(a)
    elif shift < 0:    # if shift <0 then y = [y(1 − shift : end), zeros(1,−shift)] 
        shift += n
        zeros = np.zeros_like(a.take(np.arange(n-shift), axis))
        res = np.concatenate((a.take(np.arange(n-shift, n), axis), zeros), axis)
    else:              # if shift ≥ 0 then y = [zeros(1, shift), y(1 : end − shift)]
        zeros = np.zeros_like(a.take(np.arange(n-shift, n), axis))
        res = np.concatenate((zeros, a.take(np.arange(n-shift), axis)), axis)
        
    if reshape:
        return res.reshape(a.shape)
    else:
        return res


def _ncc_c(x, y):
    '''
       input: two time series x and y both of 1 dimensional
       output: returns coefficient normalization of x and y
    '''
    den = np.array(norm(x) * norm(y))
    den[den == 0] = np.Inf

    x_len = len(x)
    fft_size = 1 << (2*x_len-1).bit_length()                 # length = 2nextpower2(2∗length(x)−1)

    cc = ifft(fft(x, fft_size) * np.conj(fft(y, fft_size)))  # CC = IFFT{FFT(x,length) ∗ FFT(y,length)}
    cc = np.concatenate((cc[-(x_len-1):], cc[:x_len]))       
    return np.real(cc) / den                                 #NCCc = CC/(||x|| ||y||)


def _ncc_c_3dim(x, y):
    '''
       input: two arrays x and y both of 2 dimensional
       output: returns coefficient normalization of x and y
    '''
    den = norm(x, axis=1)[:, None] * norm(y, axis=1)
    den[den == 0] = np.Inf
    x_len = x.shape[-1]
    fft_size = 1 << (2*x_len-1).bit_length()
    cc = ifft(fft(x, fft_size) * np.conj(fft(y, fft_size))[:, None])
    cc = np.concatenate((cc[:,:,-(x_len-1):], cc[:,:,:x_len]), axis=2)
    return np.real(cc) / den.T[:, :, None]


def _sbd(x, y):
    '''
       Input: Two z-normalized sequences x and y
       Output: Dissimilarity dist of x and y 
               Aligned sequence y1 of y towards x
    '''
    ncc = _ncc_c(x, y)     # coefficient normalization of x and y and it gives values between -1 and 1
    idx = ncc.argmax()     # position where ncc is maximized
    dist = 1 - ncc[idx]    # it is the sbd distance which may be between 0 and 2 indicating similarity
    yshift = roll_zeropad(y, (idx + 1) - max(len(x), len(y)))    # aligning the sequence y towards x

    return dist, yshift


def _extract_shape(idx, x, j, cur_center):
    '''
       Input: idx is an n-by-1 vector containing the assignment of n time series to k clusters.
              x is an n-by-m matrix with z-normalized time series.
              j is the label of cur_center.
              cur_center is a 1-by-m vector with the reference sequence against which time series of x are aligned.
       Output: centroid is a z-normalized 1-by-m vector with the centroid.
    ''' 
    _a = []
    for i in range(len(idx)):            # for each label in idx array
        if idx[i] == j:                  # if the label matches cur_center label
            if cur_center.sum() == 0:    # if the sum of values in cur_center is 0
                opt_x = x[i]             # use x[i] itself
            else:
                _, opt_x = _sbd(cur_center, x[i])   # else apply shape based distance for cur_center and x[i]
            _a.append(opt_x)
    a = np.array(_a)                     # a contains all the points aligned towards the cur_center

    if len(a) == 0:
        return np.zeros((1, x.shape[1]))
    columns = a.shape[1]
    y = zscore(a, axis=1, ddof=1)        # z-normalization of array a
    s = np.dot(y.transpose(), y)         # dot product of y transpose and y

    p = np.empty((columns, columns))
    p.fill(1.0/columns)
    p = np.eye(columns) - p              # p = I - (1/m)O ---> I is the identity matrix, m is no. of columns, O is matrix with all ones

    m = np.dot(np.dot(p, s), p)          # dot product of (p,s) and p
    _, vec = eigh(m)                     # eigen value and eigen vector of m is returned
    centroid = vec[:, -1]                # normalized eigen vector corresponding to the eigen value
    finddistance1 = math.sqrt(((a[0] - centroid) ** 2).sum())
    finddistance2 = math.sqrt(((a[0] + centroid) ** 2).sum())

    if finddistance1 >= finddistance2:
        centroid *= -1

    return zscore(centroid, ddof=1)

def kshape(x, k):
    '''
    Input: x is an n-by-m matrix containing n time series of length m that are initially z-normalized.
           k is the number of clusters to produce.
    Output: idx is an n-by-1 vector containing the assignment of n time series to k clusters (initialized randomly).
            centroids is a k-by-m matrix containing k centroids of length m (initialized as vectors with all zeros).
    '''
    n = x.shape[0] 
    idx = randint(0, k, size=n)
    centroids = np.zeros((k, x.shape[1]))
    distances = np.empty((n, k))
    for _ in range(100):              # loop until condition is satisfied
        #refinement step
        old_idx = idx
        for j in range(k):            # loop for each centroid
            centroids[j] = _extract_shape(idx, x, j, centroids[j])         # calculate each centroid using extract shape func. 
        
        #assignment step
        distances = (1 - _ncc_c_3dim(x, centroids).max(axis=2)).T          # dissimilarity distances of x and centroids
        idx = distances.argmin(1)     # label of the centroid with minimum distance for each point is stored in idx
        
        if np.array_equal(old_idx, idx):
            break

    for i in range(len(idx)):
        idx[i]+=1

    return idx

In [11]:
print("K-means Clustering with Shape Based Distance:")
print("--------------------------------------------")
res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
kSARI,kSAMI,kSFMS,kSH,kSC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    np.random.seed(10)
    num_clusters=data[1]
    y_pred=kshape(zscore(x,axis=1),num_clusters)
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    kSARI.append(ARI)
    kSAMI.append(AMI)
    kSFMS.append(FMI)
    kSH.append(HOMOGENITY)
    kSC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

K-means Clustering with Shape Based Distance:
--------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.024521  0.037213  0.201862    0.076475      0.072848
1              ECG200  0.205360  0.117391  0.651226    0.119180      0.122754
2          Lightning2  0.056227  0.075928  0.547992    0.081272      0.082190
3  SharePriceIncrease  0.000804 -0.000350  0.545688    0.000054      0.000050
4               Wafer  0.014819  0.000616  0.689780    0.002490      0.001274


In [12]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(kSARI),2))
print("std:",round(statistics.stdev(kSARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(kSAMI),2))
print("std:",round(statistics.stdev(kSAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(kSFMS),2))
print("std:",round(statistics.stdev(kSFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(kSH),2))
print("std:",round(statistics.stdev(kSH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(kSC),2))
print("std:",round(statistics.stdev(kSC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.06
std: 0.08
AMI
------
avg: 0.05
std: 0.05
FMS
------
avg: 0.53
std: 0.19
HOMOGENITY
------
avg: 0.06
std: 0.05
COMPLETENESS
------
avg: 0.06
std: 0.05


### K medoids Euclidean distance

In [13]:
def euclideanDistance(x, y):
    squared_d = 0
    for i in range(len(x)):
        squared_d += (x[i] - y[i])**2
    d = np.sqrt(squared_d)
    return d

class KMedoids:
    def __init__(self, k = 2, max_iter = 100, has_converged = False):
        ''' 
        Parameters
        ----------
        - k: number of clusters. 
        - max_iter: number of times centroids will move
        - has_converged: to check if the algorithm stop or not
        '''
        self.k = k
        self.max_iter = max_iter
        self.has_converged = has_converged
        self.medoids_cost = []
        
    def initMedoids(self, X):
        ''' 
        Parameters
        ----------
        X: input data. 
        '''
        self.medoids = []
        
        #Starting medoids will be random members from data set X
        indexes = np.random.randint(0, len(X)-1,self.k)
        self.medoids = X[indexes]
        
        for i in range(0,self.k):
            self.medoids_cost.append(0)
        
    def isConverged(self, new_medoids):
        '''
        Parameters
        ----------
        new_medoids: the recently calculated medoids to be compared with the current medoids stored in the class
        '''
        return set([tuple(x) for x in self.medoids]) == set([tuple(x) for x in new_medoids])
        
    def updateMedoids(self, X, labels):
        '''
        Parameters
        ----------
        labels: a list contains labels of data points
        '''
        self.has_converged = True
        #Store data points to the current cluster they belong to
        clusters = []
        for i in range(0,self.k):
            cluster = []
            for j in range(len(X)):
                if (labels[j] == i):
                    cluster.append(X[j])
            clusters.append(cluster)
        #Calculate the new medoids
        new_medoids = []
        for i in range(0, self.k):
            new_medoid = self.medoids[i]
            old_medoids_cost = self.medoids_cost[i]
            for j in range(len(clusters[i])):         # non-medoid being considered as medoid
                #Cost of the current data points to be compared with the current optimal cost
                cur_medoids_cost = 0
                for dpoint_index in range(len(clusters[i])):
                    cur_medoids_cost += euclideanDistance(clusters[i][j], clusters[i][dpoint_index])
                #If current cost is less than current optimal cost,
                #make the current data point new medoid of the cluster
                if cur_medoids_cost < old_medoids_cost:
                    new_medoid = clusters[i][j]
                    old_medoids_cost = cur_medoids_cost
            #Now we have the optimal medoid of the current cluster
            new_medoids.append(new_medoid)
        #If not converged yet, accept the new medoids
        if not self.isConverged(new_medoids):
            self.medoids = new_medoids
            self.has_converged = False
    
    def fit(self, X):
        '''
        FIT function, used to find clusters
        Parameters
        ----------
        X: input data. 
        '''
        self.initMedoids(X)
        for i in range(self.max_iter):
            #Labels for this iteration
            cur_labels = []
            for k in range(len(X)):
                #Distances from a data point to each of the medoids
                d_list = []                    
                for j in range(0,self.k):
                    d_list.append(euclideanDistance(self.medoids[j], X[k]))
                #Data points' label is the medoid which has minimal distance to it
                idx=d_list.index(min(d_list))
                cur_labels.append(idx)
                self.medoids_cost[idx]+=min(d_list)                    
            self.updateMedoids(X, cur_labels)
            if self.has_converged:
                break
        return np.array(self.medoids)

        
    def predict(self,data):
        ''' 
        Parameters
        ----------
        data: input data.
        
        Returns:
        ----------
        pred: list cluster indexes of input data 
        '''
        pred = []
        for i in range(len(data)):
            #Distances from a data point to each of the medoids
            d_list = []
            for j in range(len(self.medoids)):
                d_list.append(euclideanDistance(self.medoids[j],data[i]))
            pred.append(d_list.index(min(d_list))+1)
        return np.array(pred)
    

In [14]:
print("K-medoids Clustering with Euclidean Distance:")
print("--------------------------------------------")
res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
kMEARI,kMEAMI,kMEFMS,kMEH,kMEC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    np.random.seed(10)
    num_clusters=data[1]
    model=KMedoids(k=num_clusters)
    model.fit(x)
    y_pred = model.predict(x)
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    kMEARI.append(ARI)
    kMEAMI.append(AMI)
    kMEFMS.append(FMI)
    kMEH.append(HOMOGENITY)
    kMEC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

K-medoids Clustering with Euclidean Distance:
--------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.092428  0.108421  0.280708    0.143109  1.471142e-01
1              ECG200  0.117094  0.053091  0.670775    0.049630  6.824522e-02
2          Lightning2  0.007257  0.008986  0.511475    0.015291  1.487590e-02
3  SharePriceIncrease -0.000563 -0.000407  0.754090    0.000313  4.389560e-02
4               Wafer  0.000009 -0.001076  0.687585    0.000001  6.664530e-07


In [15]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(kMEARI),2))
print("std:",round(statistics.stdev(kMEARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(kMEAMI),2))
print("std:",round(statistics.stdev(kMEAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(kMEFMS),2))
print("std:",round(statistics.stdev(kMEFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(kMEH),2))
print("std:",round(statistics.stdev(kMEH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(kMEC),2))
print("std:",round(statistics.stdev(kMEC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.04
std: 0.06
AMI
------
avg: 0.03
std: 0.05
FMS
------
avg: 0.58
std: 0.19
HOMOGENITY
------
avg: 0.04
std: 0.06
COMPLETENESS
------
avg: 0.05
std: 0.06


### Fuzzy C Means Euclidean Distance

![title](Desktop\fuzzycmeans1.png)
Where, µ is fuzzy membership value of the data point, m is the fuzziness parameter (generally taken as 2), and xk is the data point

![title](Desktop\fuzzycmeans2.png)
for updation of membership values

In [16]:
import random
import operator
import math

class FuzzyCMeans:
    
    def __init__(self, df, num_clusters, max_iter=3):
        self.df=df
        self.num_attr=len(df.columns)-1  #no. of cols
        self.k=num_clusters              #no. of clusters
        self.MAX_ITER=max_iter           #Max. no. of iterations
        self.n=len(df)                   #No. of data points
        self.m=2.00                      #Fuzzy parameter
        
    #initializing membership matrix with random values
    def initialize_membershipMatrix(self):     
        membership_mat=list()
        for i in range(self.n):
            random_num_list=[random.random() for i in range(self.k)]
            summ=sum(random_num_list)
            temp=[x/summ for x in random_num_list]
            membership_mat.append(temp)
        return membership_mat
    #Each data point lies in all the clusters available with some membership value 
    #it will be random in the initial state
    
    #cluster center is calculated in every iteration
    def calculate_clusterCenter(self,membership_mat):     
        cluster_mem_value=list(zip(*membership_mat)) #zip returns an iterator of tuples with each tuple having elements from all the iterables
        cluster_centers=list()
        for j in range(self.k):
            x=list(cluster_mem_value[j])
            xraised=[e**self.m for e in x]
            denominator=sum(xraised)
            temp=list()
            for i in range(self.n):
                datapoint=list(self.df.iloc[i])
                prod=[xraised[i]*val for val in datapoint]
                temp.append(prod)
            numerator=map(sum,list(zip(*temp)))      
            center=[z/denominator for z in numerator]
            cluster_centers.append(center)
        return cluster_centers
    
    #updating membership values using the recent cluster centers available
    def update_membershipValue(self,membership_mat,cluster_centers):
        p=float(2/(self.m-1))
        for i in range(self.n):
            x=list(self.df.iloc[i])
            #finding out distance of each point from centroids (euclidean distance)
            distances=[np.linalg.norm(list(map(operator.sub,x,cluster_centers[j]))) for j in range(self.k)]  
            for j in range(self.k):
                # finding new membership value for point i
                den=sum([math.pow(float(distances[j]/distances[c]),p) for c in range(self.k)]) 
                membership_mat[i][j]=float(1/den)
        return membership_mat
    
    #returns the cluster labels for each point in the dataset
    def getClusters(self,membership_mat):
        cluster_labels=list()
        for i in range(self.n):
            max_value,idx=max((val,idx) for (idx,val) in enumerate(membership_mat[i]))
            cluster_labels.append(idx+1)
        return cluster_labels
    
    def fit(self):
        membership_mat=self.initialize_membershipMatrix()
        for i in range(self.MAX_ITER):
            cluster_centers=self.calculate_clusterCenter(membership_mat)
            membership_mat=self.update_membershipValue(membership_mat,cluster_centers)
            cluster_labels=self.getClusters(membership_mat)
        return cluster_labels,cluster_centers

In [17]:
print("Fuzzy C Means Clustering with Euclidean Distance:")
print("--------------------------------------------")
res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
cMARI,cMAMI,cMFMS,cMH,cMC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    np.random.seed(10)
    num_clusters=data[1]
    fcm=FuzzyCMeans(dataset,num_clusters)
    y_pred,centers=fcm.fit()
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    cMARI.append(ARI)
    cMAMI.append(AMI)
    cMFMS.append(FMI)
    cMH.append(HOMOGENITY)
    cMC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

Fuzzy C Means Clustering with Euclidean Distance:
--------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.111517  0.132553  0.325275    0.159920      0.183650
1              ECG200 -0.005263 -0.003760  0.527580    0.000081      0.000076
2          Lightning2  0.000096  0.002618  0.507950    0.008874      0.008633
3  SharePriceIncrease -0.000425 -0.000380  0.533276    0.000016      0.000014
4               Wafer  0.085454  0.050653  0.701145    0.078170      0.038559


In [18]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(cMARI),2))
print("std:",round(statistics.stdev(cMARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(cMAMI),2))
print("std:",round(statistics.stdev(cMAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(cMFMS),2))
print("std:",round(statistics.stdev(cMFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(cMH),2))
print("std:",round(statistics.stdev(cMH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(cMC),2))
print("std:",round(statistics.stdev(cMC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.04
std: 0.06
AMI
------
avg: 0.04
std: 0.06
FMS
------
avg: 0.52
std: 0.13
HOMOGENITY
------
avg: 0.05
std: 0.07
COMPLETENESS
------
avg: 0.05
std: 0.08


### Density Peaks with Euclidean Distance

In [19]:
class DensityPeakCluster(object):
    """
    Attributes:
        n_id: data row count
        distance: each id distance
        dc: threshold of density cut off
        rho: each id density
        nneigh: each id min upper density nearest neighbor
        delta: each id min upper density nearest neighbor distance
    """

    def __init__(self, x, k, distance_metric='euclidean'):
        """
        Init parameters for Density peak cluster.
        parameters
        x: data
        k: number of clusters
        distance_metric: distance calculate function euclidean
        """
        self.data = x
        self.k = k
        self.distance_metric = distance_metric
        

    def build_distance(self):
        """
        Calculates distance dictionary.
        return: distance dict, max distance, min distance
        """
        from scipy.spatial.distance import pdist, squareform
        
        # Pairwise distances between observations in n-dimensional space.
        distance_matrix = pdist(self.data, metric=self.distance_metric) 
        
        # Convert a vector-form distance vector to a square-form distance matrix
        distance_matrix = squareform(distance_matrix)
        
        # Return the indices for the upper-triangle of an (n, m) array.
        triangle_upper = np.triu_indices(self.data.shape[0], 1)
        triangle_upper = distance_matrix[triangle_upper]

        # distance dictionary
        distance = {}
        for i in range(self.n_id):
            for j in range(i + 1, self.n_id):
                distance[(i, j)] = distance_matrix[i, j]
                distance[(j, i)] = distance_matrix[i, j]

        max_dis, min_dis = np.max(triangle_upper), np.min(triangle_upper)
        return distance, max_dis, min_dis

    def select_dc(self):
        """
        selects the local density threshold that let average neighbor is 1-2 percent of all nodes.
        return: dc that local density threshold
        """
        max_dis, min_dis = self.max_dis, self.min_dis
        dc = (max_dis + min_dis) / 2
        
        while True:
            # calculating the nearest neighbors within the selected dc
            nneighs = sum([1 for v in self.distances.values() if v < dc]) / self.n_id ** 2
            if 0.01 <= nneighs <= 0.002:
                break
            # binary search
            if nneighs < 0.01:
                min_dis = dc
            else:
                max_dis = dc
            dc = (max_dis + min_dis) / 2
            if max_dis - min_dis < 0.0001:
                break
        return dc


    def local_density(self):
        """
        computes all points' local density.
        return: local density vector that index is the point index
        """
        cutoff_func = lambda dij, dc: 1 if dij < dc else 0
        
        rho = [0] * self.n_id
        for i in range(self.n_id):
            for j in range(i + 1, self.n_id):
                temp = cutoff_func(self.distances[(i, j)], self.dc)
                rho[i] += temp
                rho[j] += temp
        return np.array(rho, np.float32)

    
    def clustering(self):
        """
        Compute all points' min util to the higher local density point(which is the nearest neighbor).
        return: y_pred containing labels for all the points
        """
        # sort_rho_idx contains index of values in rho in descending order
        sort_rho_idx = np.argsort(-self.rho)
        
        # nneigh contains each id min upper density nearest neighbor
        # delta contains each id min upper density nearest neighbor distance
        delta, nneigh = [float(self.max_dis)] * (self.n_id), [0] * self.n_id
        delta[sort_rho_idx[0]] = -1
        for i in range(self.n_id):
            for j in range(0, i):
                old_i, old_j = sort_rho_idx[i], sort_rho_idx[j]
                if self.distances[(old_i, old_j)] < delta[old_i]:
                    delta[old_i] = self.distances[(old_i, old_j)]
                    nneigh[old_i] = old_j
        delta[sort_rho_idx[0]] = max(delta)
    
        # cluster center selection
        # vector multiplication of values in rho and delta
        res=np.multiply(self.rho,delta)
        
        # points contains index of values in res in descending order
        points=np.argsort(-res)
        
        # idxs contains indexes of top k points as cluster centers
        idxs=points[:self.k]
        centers=[]
        for i in range(self.k):
            centers.append(self.data[idxs[i]])
            
        # assignment of clusters to all the points 
        clusters={}
        #assignment of labels to the cluster centers
        for i in range(self.k):
            clusters[idxs[i]]=i
        #assignment of labels to all the points other than cluster centers based on the nearest neighbors
        for j in range(self.n_id):
            if sort_rho_idx[j] not in clusters:
                clusters[sort_rho_idx[j]]=clusters[nneigh[sort_rho_idx[j]]]

        # predicting the labels for all the points and placing them in y_pred
        y_pred=[]
        for i in range(self.n_id):
            y_pred.append(clusters[i]+1)
        return y_pred
    
    
    def fit(self,data):
        """
        Fits the model.
        param data: data for clustering
        return: predicted labels
        """
        if isinstance(data, np.ndarray): data = np.array(data)

        self.n_id = self.data.shape[0]

        # calculate distance
        self.distances, self.max_dis, self.min_dis = self.build_distance()

        # select distance cut off
        self.dc = self.select_dc()

        # calculate local density
        self.rho = self.local_density()
        
        # calculate nearest neighbor and delta and performs actual clustering
        y_pred = self.clustering()
        return y_pred


In [20]:
print("Density Peaks Clustering with Euclidean Distance:")
print("--------------------------------------------")
res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
dEARI,dEAMI,dEFMS,dEH,dEC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    num_clusters=data[1]
    dpca = DensityPeakCluster(x,num_clusters)
    y_pred=dpca.fit(x)
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    dEARI.append(ARI)
    dEAMI.append(AMI)
    dEFMS.append(FMI)
    dEH.append(HOMOGENITY)
    dEC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

Density Peaks Clustering with Euclidean Distance:
--------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.018208  0.051353  0.400210    0.069189      0.209473
1              ECG200  0.198014  0.108872  0.664993    0.106894      0.118946
2          Lightning2  0.002352 -0.014167  0.708191    0.000546      0.004355
3  SharePriceIncrease -0.010791  0.000494  0.689611    0.000780      0.001662
4               Wafer -0.002467 -0.001043  0.687234    0.000051      0.000027


In [21]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(dEARI),2))
print("std:",round(statistics.stdev(dEARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(dEAMI),2))
print("std:",round(statistics.stdev(dEAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(dEFMS),2))
print("std:",round(statistics.stdev(dEFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(dEH),2))
print("std:",round(statistics.stdev(dEH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(dEC),2))
print("std:",round(statistics.stdev(dEC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.04
std: 0.09
AMI
------
avg: 0.03
std: 0.05
FMS
------
avg: 0.63
std: 0.13
HOMOGENITY
------
avg: 0.04
std: 0.05
COMPLETENESS
------
avg: 0.07
std: 0.09


### Density Peaks with DTW

In [22]:
import math
import numpy as np

def DTW_Distance(s1,s2):
    DTW={}
    
    w = max(int(0.05*len(s1)), abs(len(s1)-len(s2)))
   
    for i in range(-1,len(s1)):
        for j in range(-1,len(s2)):
            DTW[(i, j)] = float('inf')
    DTW[(-1, -1)] = 0
  
    for i in range(len(s1)):
        for j in range(max(0, i-w), min(len(s2), i+w)):
            dist= (s1[i]-s1[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)],DTW[(i, j-1)], DTW[(i-1, j-1)])
            
    return np.sqrt(DTW[len(s1)-1, len(s2)-1])

class DensityPeakClusterD(object):
    """
    Attributes:
        n_id: data row count
        distance: each id distance
        dc: threshold of density cut off
        rho: each id density
        nneigh: each id min upper density nearest neighbor
        delta: each id min upper density nearest neighbor distance
    """

    def __init__(self, x, k, distance_metric='dtw'):
        """
        Init parameters for Density peak cluster.
        parameters
        x: data
        k: number of clusters
        distance_metric: distance calculate function euclidean
        """
        self.data = x
        self.k = k
        self.distance_metric = distance_metric
        

    def build_distance(self):
        """
        Calculates distance dicttionary.
        return: distance dict, max distance, min distance
        """
        from scipy.spatial.distance import pdist, squareform
        
        # Pairwise distances between observations in n-dimensional space.
        distance_matrix = pdist(self.data, DTW_Distance)
        
        # Convert a vector-form distance vector to a square-form distance matrix
        distance_matrix = squareform(distance_matrix)
        
        # Return the indices for the upper-triangle of an (n, m) array.
        triangle_upper = np.triu_indices(self.data.shape[0], 1)
        triangle_upper = distance_matrix[triangle_upper]

        # distance dictionary
        distance = {}
        for i in range(self.n_id):
            for j in range(i + 1, self.n_id):
                distance[(i, j)] = distance_matrix[i, j]
                distance[(j, i)] = distance_matrix[i, j]

        max_dis, min_dis = np.max(triangle_upper), np.min(triangle_upper)
        return distance, max_dis, min_dis

    def select_dc(self):
        """
        selects the local density threshold that let average neighbor is 1-2 percent of all nodes.
        return: dc that local density threshold
        """
        max_dis, min_dis = self.max_dis, self.min_dis
        dc = (max_dis + min_dis) / 2
        
        while True:
            # calculating the nearest neighbors within the selected dc
            nneighs = sum([1 for v in self.distances.values() if v < dc]) / self.n_id ** 2
            if 0.01 <= nneighs <= 0.002:
                break
            # binary search
            if nneighs < 0.01:
                min_dis = dc
            else:
                max_dis = dc
            dc = (max_dis + min_dis) / 2
            if max_dis - min_dis < 0.0001:
                break
        return dc


    def local_density(self):
        """
        computes all points' local density.
        return: local density vector that index is the point index
        """
        cutoff_func = lambda dij, dc: 1 if dij < dc else 0
        
        rho = [0] * self.n_id
        for i in range(self.n_id):
            for j in range(i + 1, self.n_id):
                temp = cutoff_func(self.distances[(i, j)], self.dc)
                rho[i] += temp
                rho[j] += temp
        return np.array(rho, np.float32)

    
    def clustering(self):
        """
        Compute all points' min util to the higher local density point(which is the nearest neighbor).
        return: y_pred containing labels for all the points
        """
        # sort_rho_idx contains index of values in rho in descending order
        sort_rho_idx = np.argsort(-self.rho)
        
        # nneigh contains each id min upper density nearest neighbor
        # delta contains each id min upper density nearest neighbor distance
        delta, nneigh = [float(self.max_dis)] * (self.n_id), [0] * self.n_id
        delta[sort_rho_idx[0]] = -1
        for i in range(self.n_id):
            for j in range(0, i):
                old_i, old_j = sort_rho_idx[i], sort_rho_idx[j]
                if self.distances[(old_i, old_j)] < delta[old_i]:
                    delta[old_i] = self.distances[(old_i, old_j)]
                    nneigh[old_i] = old_j
        delta[sort_rho_idx[0]] = max(delta)
    
        # cluster center selection
        # vector multiplication of values in rho and delta
        res=np.multiply(self.rho,delta)
        
        # points contains index of values in res in descending order
        points=np.argsort(-res)
        
        # idxs contains indexes of top k points as cluster centers
        idxs=points[:self.k]
        '''centers=[]
        for i in range(self.k):
            centers.append(self.data[idxs[i]])'''
            
        # assignment of clusters to all the points 
        clusters={}
        #assignment of labels to the cluster centers
        for i in range(self.k):
            clusters[idxs[i]]=i
        #assignment of labels to all the points other than cluster centers based on the nearest neighbors
        for j in range(self.n_id):
            if sort_rho_idx[j] not in clusters:
                clusters[sort_rho_idx[j]]=clusters[nneigh[sort_rho_idx[j]]]

        # predicting the labels for all the points and placing them in y_pred
        y_pred=[]
        for i in range(self.n_id):
            y_pred.append(clusters[i]+1)
        return y_pred
    
    
    def fit(self,data):
        """
        Fits the model.
        param data: data for clustering
        return: predicted labels
        """
        if isinstance(data, np.ndarray): data = np.array(data)

        self.n_id = self.data.shape[0]

        # calculate distance
        self.distances, self.max_dis, self.min_dis = self.build_distance()

        # select distance cut off
        self.dc = self.select_dc()

        # calculate local density
        self.rho = self.local_density()
        
        # calculate nearest neighbor and delta and performs actual clustering
        y_pred = self.clustering()
        return y_pred


In [23]:
print("Density Peaks Clustering with DTW Distance:")
print("--------------------------------------------")
res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
dDARI,dDAMI,dDFMS,dDH,dDC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    num_clusters=data[1]
    dpcaD = DensityPeakClusterD(x,num_clusters)
    y_pred=dpcaD.fit(x)
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    dDARI.append(ARI)
    dDAMI.append(AMI)
    dDFMS.append(FMI)
    dDH.append(HOMOGENITY)
    dDC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

Density Peaks Clustering with DTW Distance:
--------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.002376  0.002889  0.434005    0.026694      0.287088
1              ECG200 -0.004951 -0.003485  0.737881    0.003209      0.065001
2          Lightning2  0.008599  0.005953  0.716218    0.011455      0.160738
3  SharePriceIncrease  0.013844  0.009775  0.762866    0.010113      0.194859
4               Wafer -0.008716 -0.006431  0.917709    0.001499      0.013279


In [24]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(dDARI),2))
print("std:",round(statistics.stdev(dDARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(dDAMI),2))
print("std:",round(statistics.stdev(dDAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(dDFMS),2))
print("std:",round(statistics.stdev(dDFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(dDH),2))
print("std:",round(statistics.stdev(dDH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(dDC),2))
print("std:",round(statistics.stdev(dDC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.0
std: 0.01
AMI
------
avg: 0.0
std: 0.01
FMS
------
avg: 0.71
std: 0.18
HOMOGENITY
------
avg: 0.01
std: 0.01
COMPLETENESS
------
avg: 0.14
std: 0.11


### Agglomerative Clustering

In [25]:
from sklearn.cluster import AgglomerativeClustering
print("Agglomerative Clustering with Euclidean Distance:")
print("-------------------------------------------------")
res={"Dataset":[],"ARI":[],"AMI":[],"FMS":[],"Homogenity":[],"Completeness":[]}
aARI,aAMI,aFMS,aH,aC=[],[],[],[],[]
for i in trange(len(datasets)):
    data=datasets[i]
    dataset=pd.read_csv(data[0])
    res["Dataset"].append(data[0][:-4])
    cols=dataset.shape[1]
    x=dataset.iloc[:,:cols-1].values
    y=dataset.iloc[:,-1].values
    
    num_clusters=data[1]
    clustering = AgglomerativeClustering(n_clusters=num_clusters,affinity='euclidean',linkage='ward')
    y_pred=clustering.fit_predict(x)
    
    ARI=adjusted_rand_score(y,y_pred)
    res["ARI"].append(ARI)
    AMI=adjusted_mutual_info_score(y,y_pred,average_method='arithmetic')
    res["AMI"].append(AMI)
    FMI=fowlkes_mallows_score(y,y_pred,sparse=False)
    res["FMS"].append(FMI)
    HOMOGENITY=homogeneity_score(y,y_pred)
    res["Homogenity"].append(HOMOGENITY)
    COMPLETENESS=completeness_score(y,y_pred)
    res["Completeness"].append(COMPLETENESS)
    
    aARI.append(ARI)
    aAMI.append(AMI)
    aFMS.append(FMI)
    aH.append(HOMOGENITY)
    aC.append(COMPLETENESS)
    
df=pd.DataFrame(res)
print(df)

Agglomerative Clustering with Euclidean Distance:
-------------------------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


              Dataset       ARI       AMI       FMS  Homogenity  Completeness
0          Colposcopy  0.079482  0.092676  0.295376    0.125439      0.138992
1              ECG200  0.244086  0.145752  0.671168    0.146076      0.152485
2          Lightning2  0.145156  0.107958  0.661253    0.097720      0.138426
3  SharePriceIncrease  0.029147  0.010744  0.734392    0.007221      0.027060
4               Wafer  0.002483 -0.001016  0.687940    0.000088      0.000045


In [26]:
print("Average and Standard deviation of different metrics")
print("ARI")
print("------")
print("avg:",round(statistics.mean(aARI),2))
print("std:",round(statistics.stdev(aARI),2))
print("AMI")
print("------")
print("avg:",round(statistics.mean(aAMI),2))
print("std:",round(statistics.stdev(aAMI),2))
print("FMS")
print("------")
print("avg:",round(statistics.mean(aFMS),2))
print("std:",round(statistics.stdev(aFMS),2))
print("HOMOGENITY")
print("------")
print("avg:",round(statistics.mean(aH),2))
print("std:",round(statistics.stdev(aH),2))
print("COMPLETENESS")
print("------")
print("avg:",round(statistics.mean(aC),2))
print("std:",round(statistics.stdev(aC),2))

Average and Standard deviation of different metrics
ARI
------
avg: 0.1
std: 0.1
AMI
------
avg: 0.07
std: 0.06
FMS
------
avg: 0.61
std: 0.18
HOMOGENITY
------
avg: 0.08
std: 0.07
COMPLETENESS
------
avg: 0.09
std: 0.07


### Spread values

In [27]:
methods = [kEARI, kDARI, kSARI, kMEARI, cMARI, dEARI, dDARI, aARI]
method_names = ['kmeans-euc', 'kmeans-dtw', 'kmeans-shape', 'kmedoids-euc', 'cmeans-euc', 'dp-euc', 'dp-dtw', 'agglom-euc']
for i in range(8):
    method1 = methods[i]
    for j in range(i+1,8):
        method2 = methods[j]
        spread=0
        for k in range(5):
            spread += ((method1[k]-method2[k])**2)
        print(method_names[i],'-',method_names[j],'-',round(spread/5,4))

kmeans-euc - kmeans-dtw - 0.0005
kmeans-euc - kmeans-shape - 0.0028
kmeans-euc - kmedoids-euc - 0.0
kmeans-euc - cmeans-euc - 0.0047
kmeans-euc - dp-euc - 0.0023
kmeans-euc - dp-dtw - 0.0047
kmeans-euc - agglom-euc - 0.0069
kmeans-dtw - kmeans-shape - 0.002
kmeans-dtw - kmedoids-euc - 0.0007
kmeans-dtw - cmeans-euc - 0.0062
kmeans-dtw - dp-euc - 0.0027
kmeans-dtw - dp-dtw - 0.0056
kmeans-dtw - agglom-euc - 0.0041
kmeans-shape - kmedoids-euc - 0.003
kmeans-shape - cmeans-euc - 0.012
kmeans-shape - dp-euc - 0.0007
kmeans-shape - dp-dtw - 0.0095
kmeans-shape - agglom-euc - 0.0027
kmedoids-euc - cmeans-euc - 0.0045
kmedoids-euc - dp-euc - 0.0024
kmedoids-euc - dp-dtw - 0.0047
kmedoids-euc - agglom-euc - 0.0072
cmeans-euc - dp-euc - 0.0116
cmeans-euc - dp-dtw - 0.0042
cmeans-euc - agglom-euc - 0.0184
dp-euc - dp-dtw - 0.0084
dp-euc - agglom-euc - 0.0056
dp-dtw - agglom-euc - 0.0174
