## Fuzzy c means clustering algorithm

In [13]:
import numpy as np, numpy.random
import pandas as pd
from scipy.spatial import distance

In [14]:
k = 2 #number of clusters
p = 2

In [15]:
#X = np.array([
#        [1,2],
#        [2,3],
#        [9,4],
#        [10,1]
#        ])

In [16]:
X = pd.DataFrame([
        [1,1,2,1], 
        [2,1,2,3], 
        [2,2,4,5], 
        [50,42,2,83],
        [51,43,1,82],
        [51,44,3,89],
        [53,40,8,80]])

In [17]:
#print the number of data and dimension
n = len(X)
d = len(X.columns)
addZeros = np.zeros((n, 1))
X = np.append(X, addZeros, axis=1)

In [18]:
print("The FCM algorithm:")
print("The training data :\n", X)
print("The total number of data :", n)
print("The total number of features :", d)
print("The total number of Clusters :", k)

The FCM algorithm:
The training data :
 [[ 1.  1.  2.  1.  0.]
 [ 2.  1.  2.  3.  0.]
 [ 2.  2.  4.  5.  0.]
 [50. 42.  2. 83.  0.]
 [51. 43.  1. 82.  0.]
 [51. 44.  3. 89.  0.]
 [53. 40.  8. 80.  0.]]
The total number of data : 7
The total number of features : 4
The total number of Clusters : 2


In [19]:
#Create an empty array of centers
C = np.zeros((k,d+1))

In [20]:
#Randomly initialize the weight matrix
weight = np.random.dirichlet(np.ones(k),size=n)
print("The initial weight: \n", np.round(weight,2))

The initial weight: 
 [[0.5  0.5 ]
 [0.93 0.07]
 [0.13 0.87]
 [0.35 0.65]
 [0.89 0.11]
 [0.52 0.48]
 [0.36 0.64]]


In [21]:
for it in range(3): #total number of iterations
    #compute centroid
    for j in range(k):
        denoSum = sum(np.power(weight[:,j],p))
        
        sumMM = 0
        for i in range(n):
            mm = np.multiply(np.power(weight[i,j],p),X[i,:])
            sumMM += mm
        cc = sumMM/denoSum
        C[j] = np.reshape(cc,d+1)
        
    print("Updating the fuzzy pseudo partition")
    for i in range(n):
        denoSumNext = 0
        for j in range(k):
            denoSumNext += np.power(1/distance.euclidean(C[j,0:d],X[i,0:d]),1/(p-1))
        for j in range(k):
            w = np.power((1/distance.euclidean(C[j,0:d],X[i,0:d])),1/(p-1))/denoSumNext
            weight[i,j] = w
            
print("The final weights: ", np.round(weight,2))

Updating the fuzzy pseudo partition
Updating the fuzzy pseudo partition
Updating the fuzzy pseudo partition
The final weights:  [[0.47 0.53]
 [0.47 0.53]
 [0.47 0.53]
 [0.54 0.46]
 [0.54 0.46]
 [0.53 0.47]
 [0.54 0.46]]


In [22]:
for i in range(n):    
    cNumber = np.where(weight[i] == np.amax(weight[i]))
    X[i,d] = cNumber[0]
print("\nThe data with cluster number: \n", X)


The data with cluster number: 
 [[ 1.  1.  2.  1.  1.]
 [ 2.  1.  2.  3.  1.]
 [ 2.  2.  4.  5.  1.]
 [50. 42.  2. 83.  0.]
 [51. 43.  1. 82.  0.]
 [51. 44.  3. 89.  0.]
 [53. 40.  8. 80.  0.]]


In [23]:
# Sum squared error calculation
SSE = 0
for j in range(k):
    for i in range(n):
        SSE += np.power(weight[i,j],p)*distance.euclidean(C[j,0:d], X[i,0:d])

print("\nSSE: ",np.round(SSE,4))


SSE:  176.2944


In [24]:
#ref.1 http://www.cleartheconcepts.com/fuzzy-c-means/
#ref.2 https://www.youtube.com/watch?v=FA-hJBu5Bkc
#ref.3 https://github.com/omadson/fuzzy-c-means