In [348]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [349]:
df = pd.read_csv('dataset.txt', header=None)
#remove the last column
df = df.iloc[:, :-1]
df

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [350]:

def manhattan(A, B):
    return np.sum(np.abs(A - B))

In [351]:

def euclidean(A, B):
    return np.sqrt(np.sum((A - B) ** 2))

In [352]:

def minkowski(A, B,p=3):
    return np.sum(np.abs(A - B) ** p) ** (1 / p)

In [353]:

def cosine(A, B):
    nom = np.sum(A * B)
    denom = np.sqrt(np.sum(np.power(A,2) )) * np.sqrt(np.sum(np.power(B,2) ))
    return 1 - (nom / denom)

In [354]:
def hamming(A,B):
    #hamming distance is the number of positions at which the corresponding symbols are different
    return np.sum(A != B)

In [355]:

def sort(data,new_instance, distance):
    # return sorted(data, key=lambda x: distance(x[0], x[1]))
    d = []
    for i,x in enumerate(data):
        d.append([i, distance(x, new_instance)])
    d = sorted(d, key=lambda x: x[1])
    return d

In [356]:
class Kmeans:
    def __init__(self,k,dataset) -> None:
        self.k = k
        self.dataset = dataset
        self.dataset = self.dataset.sample(frac=1).reset_index(drop=True)
    def generate_cluster(self,distanceFn,centroids=None):
        if centroids is None:
            print('none')
            #pick k random points
            #shuffle dataset
            centroids = self.dataset.sample(n=self.k ,random_state=1)
            #converr the centroids to a list
            centroids = centroids.values.tolist()
            print(centroids)
            #calculate distance to each point for every centroid
        distances  = {}
        for c in range(self.k):
            distances[c] = []
        for i,val in self.dataset.iterrows():
            dists = []
            for c in range(self.k):
                dists.append([c,np.array(val),distanceFn(val,centroids[c])])
            #get the closest centroid
            dists = sorted(dists,key=lambda x: x[2])
            min_dist = dists[0]
            distances[min_dist[0]].append(min_dist[1:])
        return distances,centroids

    def cluster(self,distanceFn,max_iter=100):
        distances, centroids = self.generate_cluster(distanceFn)
        #calculate new centroids
        repeat = True
        nb_iter   =0
        while repeat:
            c_centroids = []
            for k in distances.keys():
                #get the values from the second column
                vals = [x[0] for x in distances[k]]
                vals = np.array(vals)
                c_centroids.append(np.mean(np.array(vals),axis=0))
            #calculate the distances for the new centroids
            new_distances,_ = self.generate_cluster(distanceFn,c_centroids)
            #check if the valus in the column 1 are the same
            for k in distances.keys():
                # print('k: ',k)
                vals = [x[0] for x in distances[k]]
                new_vals = [x[0] for x in new_distances[k]]
                # print('old ',vals)
                # print('new ',new_vals)
                if np.array_equal(np.array(vals,dtype=object),np.array(new_vals,dtype=object)):
                    repeat = False
            if repeat or nb_iter > max_iter:
                nb_iter += 1
                print('rep')
                distances = new_distances
                centroids = c_centroids
        return distances, centroids


In [357]:
k = 7
kmeans = Kmeans(k,df)
res_cos, centroids = kmeans.cluster(cosine)
kmeans = Kmeans(k,df)
res_euc, centroids = kmeans.cluster(euclidean)
kmeans = Kmeans(k,df)
res_ham, centroids = kmeans.cluster(hamming)
kmeans = Kmeans(k,df)
res_mink, centroids = kmeans.cluster(minkowski)
kmeans = Kmeans(k,df)
res_man, centroids = kmeans.cluster(manhattan)
print(centroids)

none
[[5.6, 3.0, 4.1, 1.3], [5.7, 3.8, 1.7, 0.3], [5.9, 3.0, 5.1, 1.8], [6.2, 2.2, 4.5, 1.5], [6.0, 2.2, 4.0, 1.0], [6.3, 2.7, 4.9, 1.8], [4.8, 3.0, 1.4, 0.1]]


rep
rep
rep
none
[[6.7, 3.1, 5.6, 2.4], [5.6, 2.9, 3.6, 1.3], [5.6, 2.5, 3.9, 1.1], [4.8, 3.1, 1.6, 0.2], [5.7, 2.8, 4.5, 1.3], [5.0, 3.4, 1.6, 0.4], [6.2, 2.2, 4.5, 1.5]]
rep
none
[[5.2, 3.4, 1.4, 0.2], [6.7, 3.3, 5.7, 2.5], [5.1, 3.5, 1.4, 0.3], [5.7, 2.6, 3.5, 1.0], [5.9, 3.0, 5.1, 1.8], [5.4, 3.7, 1.5, 0.2], [6.1, 3.0, 4.9, 1.8]]
rep


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


none
[[4.4, 2.9, 1.4, 0.2], [6.9, 3.2, 5.7, 2.3], [6.7, 3.1, 4.7, 1.5], [5.0, 3.5, 1.6, 0.6], [6.3, 3.3, 6.0, 2.5], [6.4, 3.1, 5.5, 1.8], [7.9, 3.8, 6.4, 2.0]]
rep
rep
none
[[6.5, 3.2, 5.1, 2.0], [5.7, 2.8, 4.1, 1.3], [6.8, 2.8, 4.8, 1.4], [6.8, 3.0, 5.5, 2.1], [5.0, 3.2, 1.2, 0.2], [6.3, 2.3, 4.4, 1.3], [5.6, 2.7, 4.2, 1.3]]
[[6.5, 3.2, 5.1, 2.0], [5.7, 2.8, 4.1, 1.3], [6.8, 2.8, 4.8, 1.4], [6.8, 3.0, 5.5, 2.1], [5.0, 3.2, 1.2, 0.2], [6.3, 2.3, 4.4, 1.3], [5.6, 2.7, 4.2, 1.3]]


In [358]:
def score(instance,res,centroids,distanceFn):
    for k in res.keys():
        #search the cluster for the instance
        for x in res[k]:
            if np.array_equal(x[0],instance):
                i = k
    moy = 0
    for x in res[i]:
        if not np.array_equal(x[0],instance):
            moy += distanceFn(instance,x[0])
    moy /= len(res[i])-1
    return moy

In [359]:
point = [5.4,3.9, 1.3,0.4]

print('score: ',score(point,res_cos,centroids,hamming))

score:  3.5714285714285716


In [360]:
def score_cluster(instance,res,centroids,distanceFn):
    for k in res.keys():
        #search the cluster for the instance
        for x in res[k]:
            if np.array_equal(x[0],instance):
                i = k
    moys = []
    for k in res.keys():
        if k != i:
            moy = 0
            for x in res[k]:
                moy += distanceFn(instance,x[0])
            moy /= len(res[k])
            moys.append([k,moy])
    min_moy = sorted(moys,key=lambda x: x[1])

    return min_moy[0][1]

In [361]:
point = [5.4,3.9, 1.3,0.4]

print('score: ',score_cluster(point,res_cos,centroids,hamming))

score:  3.761904761904762


In [362]:
def silhouette(instance,res,centroids,distanceFn):
    a = score(instance,res,centroids,distanceFn)
    b = score_cluster(instance,res,centroids,distanceFn)
    return (b-a)/max(a,b)

In [363]:

point = [5.4,3.9, 1.3,0.4]
print('silhouette: ',silhouette(point,res_cos,centroids,hamming))

silhouette:  0.050632911392405


In [364]:
def silhouette_global(res,centroids,distanceFn):
    sil = []
    for k in res.keys():
        for x in res[k]:
            sil.append(silhouette(x[0],res,centroids,distanceFn))
    return np.mean(sil)

In [365]:
print('silhouette global manhattan: ',silhouette_global(res_man,centroids,manhattan))
print('silhouette global minkowski: ',silhouette_global(res_mink,centroids,minkowski))
print('silhouette global cosine: ',silhouette_global(res_cos,centroids,cosine))
# print('silhouette global hamming: ',silhouette_global(res_ham,centroids,hamming))
print('silhouette global euclidean: ',silhouette_global(res_euc,centroids,euclidean))

silhouette global manhattan:  0.3720101054493736
silhouette global minkowski:  0.30094678778120887
silhouette global cosine:  0.4609781919227684
silhouette global euclidean:  0.28657978493630965


In [366]:
from sklearn.metrics import silhouette_score
