In [None]:
from google.colab import drive

# Mount Google Drive under /content/gdrive; the dataset CSV is read from this mount below.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd # reading all required header files
import numpy as np
import random
import operator
import math
import matplotlib.pyplot as plt 
from scipy.stats import multivariate_normal   

In [None]:
# Load the complete dataset from the mounted Google Drive into a DataFrame
df_full = pd.read_csv("/content/gdrive/MyDrive/completeDataset.csv")


In [None]:
# Remove the unnecessary index columns from the dataset.
# The recorded output of this cell still shows an "Unnamed: 0.1" column after
# the first positional column was dropped, so dropping by position leaves an
# index column behind that would later be treated as a feature.  Selecting the
# columns by name removes all of them and keeps the cell idempotent: running
# it again never removes a real feature column.
index_cols = [col for col in df_full.columns if str(col).startswith("Unnamed")]
dfi = df_full.drop(columns=index_cols)
print(dfi.head(), dfi.shape)

   Unnamed: 0.1         0         1  ...      2046      2047  targets
0             0  0.411524  0.166255  ...  0.357510  0.322855        0
1             1  0.268370  0.548399  ...  0.216509  0.187411        9
2             2  0.580135  0.704130  ...  0.464417  0.207319        0
3             3  0.332108  0.627917  ...  0.432334  0.425472        3
4             4  0.309318  0.450592  ...  0.590379  0.508042        3

[5 rows x 2050 columns] (75516, 2050)


In [None]:
# Separate the targets (last column) from the features that are fed into FCM
df_full = dfi
columns = df_full.columns.tolist()
features = columns[:-1]
class_labels = df_full[columns[-1]].tolist()
df = df_full[features]

In [None]:
# Preview the first rows of the feature frame to check that only feature columns remain
print(df.head())
# Optional check of the number of targets:
#print(len(class_labels))


          0         1         2  ...      2045      2046      2047
0  0.411524  0.166255  0.598072  ...  0.352187  0.357510  0.322855
1  0.268370  0.548399  0.464581  ...  0.369844  0.216509  0.187411
2  0.580135  0.704130  0.680772  ...  0.224066  0.464417  0.207319
3  0.332108  0.627917  0.625845  ...  0.051118  0.432334  0.425472
4  0.309318  0.450592  0.284435  ...  0.186153  0.590379  0.508042

[5 rows x 2048 columns]


In [None]:
# Hyperparameters
# Seed for Python's `random` module, which initializeMembershipMatrix draws
# from; fixing it makes a fresh run reproduce the same initial memberships.
SEED = 42
random.seed(SEED)
# Number of clusters
k = 11
# Maximum number of iterations
MAX_ITER = 10
# Number of data points
n = len(df)
# Fuzzy parameter (fuzzifier). Must be strictly greater than 1:
# updateMembershipValue uses the exponent 2/(m-1), which is undefined at m == 1.
# As m approaches 1 the memberships approach hard, k-means-style assignments.
m = 1.7

In [None]:
def initializeMembershipMatrix():
    """Build a random crisp (one-hot) membership matrix.

    For each of the ``n`` data points, ``k`` random numbers are drawn and
    normalised to sum to one; the largest entry is then set to 1 and all
    others to 0, so every point starts fully assigned to one random cluster.

    Returns:
        list[list[int]]: ``n`` rows of length ``k``, each holding a single 1.
    """
    rows = []
    for _ in range(n):
        draws = [random.random() for _ in range(k)]
        total = sum(draws)
        normalised = [value / total for value in draws]

        # Harden the random distribution: only the largest entry survives.
        winner = normalised.index(max(normalised))
        one_hot = [1 if column == winner else 0 for column in range(len(normalised))]

        rows.append(one_hot)
    return rows

In [None]:
# Build an initial membership matrix (used by the cluster-centre check below)
membership_mat = initializeMembershipMatrix()

In [None]:
def calculateClusterCenter(membership_mat):
    """Compute the cluster centres implied by a membership matrix.

    Centre ``j`` is the mean of all data points in ``df``, each weighted by
    its membership in cluster ``j`` raised to the fuzzifier ``m``.

    Args:
        membership_mat: ``n`` rows of ``k`` membership values (list of lists
            or array-like).

    Returns:
        list[list[float]]: ``k`` centres, each with one value per feature.

    Raises:
        ZeroDivisionError: if some cluster has zero total membership, so its
            centre is undefined.
    """
    # Memberships raised to the fuzzifier, one row per point, one column per cluster.
    weights = np.asarray(membership_mat, dtype=float)[:n, :k] ** m
    # Read the feature matrix once instead of converting every row k times.
    points = df.to_numpy(dtype=float)

    denominators = weights.sum(axis=0)
    if np.any(denominators == 0):
        raise ZeroDivisionError("a cluster has zero total membership; its center is undefined")

    # One matrix product yields every weighted sum: (k, n) @ (n, d) -> (k, d).
    centers = (weights.T @ points) / denominators[:, None]
    return centers.tolist()

In [None]:
# Calculate the cluster centres implied by the initial membership matrix.
# Wrapping the result in np.array displays a truncated preview instead of
# dumping every coordinate of every centre into the cell output.
np.array(calculateClusterCenter(membership_mat))

0
1
2
3
4
5
6
7
8
9
10


[[0.3729825378049581,
  0.5452031738294658,
  0.48066654607438286,
  0.4150096892489542,
  0.5760077015929088,
  0.5072458008624768,
  0.4832211358510865,
  0.5112255533920997,
  0.4438038457556578,
  0.42784579558339303,
  0.28323180493181443,
  0.37421848491105647,
  0.5460824368711257,
  0.5591334399609333,
  0.4464680340167217,
  0.49005155034986264,
  0.5470275167790098,
  0.258780364421003,
  0.40157965974138693,
  0.3998449551695243,
  0.4005444989351291,
  0.4735300645734474,
  0.446245461793569,
  0.4112741222462161,
  0.5365653252851371,
  0.5078763586982862,
  0.44863357212721655,
  0.412317341359956,
  0.457548388675941,
  0.3956277647586271,
  0.40746252509600583,
  0.4102423313544758,
  0.4220565015831067,
  0.4392523976452368,
  0.7153701325513901,
  0.4774498332446278,
  0.44905089811662074,
  0.44392679721824846,
  0.35772679436860305,
  0.5311236644539424,
  0.43625668784503496,
  0.5212179750076401,
  0.38222396609528586,
  0.38733990907553756,
  0.32784202693498704,

In [None]:
# Accuracy is calculated by dividing the labels into nearly equal chunks and taking the most frequent cluster label in each chunk.
# Each cluster can represent only one class, which is checked in the for loop before the accuracy is returned.
def accuracy(cluster_labels, class_labels, chunk_size=7000, num_classes=11):
    """Score a hard clustering against the true classes, as a percentage.

    The points are cut into ``num_classes`` consecutive chunks of
    ``chunk_size`` (the last chunk takes whatever remains).  The most frequent
    cluster id inside chunk ``c`` becomes the representative cluster of class
    ``c``.  A class whose representative was already produced by an earlier
    chunk is not scored at all.  A point counts as correct when its class has
    a representative and the point was assigned to that cluster.

    NOTE(review): the chunking assumes the rows are grouped by class in blocks
    of ``chunk_size``; the dataset preview printed earlier in this notebook
    shows targets 0, 9, 0, 3, 3 in the first rows, so confirm this holds.

    Args:
        cluster_labels: predicted cluster index of every point.
        class_labels: true class of every point, same order as ``cluster_labels``.
        chunk_size: number of consecutive points per chunk.
        num_classes: number of chunks / classes (class ids ``0..num_classes-1``).

    Returns:
        float: percentage of correctly assigned points; 0.0 for empty input.
    """
    # The representatives must come from the labels being scored, not from
    # whatever a global variable named `labels` currently holds.
    cluster_labels = list(cluster_labels)
    total = len(cluster_labels)
    if total == 0:
        return 0.0

    class_to_cluster = {}
    claimed = set()
    for class_id in range(num_classes):
        start = class_id * chunk_size
        stop = total if class_id == num_classes - 1 else start + chunk_size
        chunk = cluster_labels[start:stop]
        if not chunk:
            continue  # no points in this chunk, so the class cannot be scored
        representative = max(set(chunk), key=chunk.count)
        # Only the first class that produces a given cluster may use it.
        if representative not in claimed:
            class_to_cluster[class_id] = representative
        claimed.add(representative)

    correct_pred = 0
    for cluster_id, class_id in zip(cluster_labels, class_labels):
        if class_id in class_to_cluster and class_to_cluster[class_id] == cluster_id:
            correct_pred += 1

    return (correct_pred / total) * 100

In [None]:
def updateMembershipValue(membership_mat, cluster_centers):
    """Recompute every point's fuzzy memberships from the current centres.

    For point ``i`` and cluster ``j`` the membership is
    ``1 / sum_c (d_ij / d_ic) ** (2 / (m - 1))`` where ``d`` is the Euclidean
    distance between the point (a row of ``df``) and a centre.

    Args:
        membership_mat: ``n`` rows of ``k`` values; updated in place.
        cluster_centers: ``k`` centres, each with one value per feature.

    Returns:
        The same ``membership_mat`` object, with updated values.
    """
    exponent = 2.0 / (m - 1)
    centers = np.asarray(cluster_centers, dtype=float)[:k]
    # Read the feature matrix once instead of converting each row to a list.
    points = df.to_numpy(dtype=float)

    for i in range(n):
        distances = np.linalg.norm(centers - points[i], axis=1)
        coincident = distances == 0
        if coincident.any():
            # The point sits exactly on one or more centres: the ratio formula
            # would divide by zero, so share full membership among those centres.
            row = np.where(coincident, 1.0 / coincident.sum(), 0.0)
        else:
            # ratios[j, c] = (d_j / d_c) ** exponent; summing over c gives the denominator.
            ratios = (distances[:, None] / distances[None, :]) ** exponent
            row = 1.0 / ratios.sum(axis=1)
        for j in range(k):
            membership_mat[i][j] = float(row[j])
    return membership_mat

In [None]:
def getClusters(membership_mat):
    """Turn a membership matrix into hard cluster labels.

    Each point is assigned to the cluster with the largest membership value.
    On ties the highest cluster index wins, because ``(value, index)`` pairs
    are compared.

    Args:
        membership_mat: one row of membership values per data point.

    Returns:
        list[int]: the chosen cluster index for every row of ``membership_mat``.
    """
    # Iterate over the argument itself rather than a global point count, so
    # the function works for any matrix it is given.
    cluster_labels = []
    for row in membership_mat:
        _, idx = max((val, idx) for idx, val in enumerate(row))
        cluster_labels.append(idx)
    return cluster_labels

In [None]:
def fuzzyCMeansClustering():
    """Run MAX_ITER rounds of fuzzy c-means starting from a random membership matrix.

    Every round recomputes the cluster centres from the memberships, updates
    the memberships from those centres and records the resulting hard labels.
    The centres of the first round and the final partition matrix are printed.

    Returns:
        tuple: (labels of the last round, centres of the last round,
        list with the labels of every round).
    """
    partition = initializeMembershipMatrix()
    labels_per_iteration = []
    for iteration in range(MAX_ITER):
        centres = calculateClusterCenter(partition)
        partition = updateMembershipValue(partition, centres)
        current_labels = getClusters(partition)

        labels_per_iteration.append(current_labels)

        # Show the centres once, after the first update only.
        if iteration == 0:
            print("Cluster Centers:")
            print(np.array(centres))
    print("---------------------------")
    print("Partition matrix:")
    print(np.array(partition))
    return current_labels, centres, labels_per_iteration

In [None]:

# Run FCM: final hard labels, final centres, and the labels recorded after each iteration
labels, centers, acc = fuzzyCMeansClustering()


0
1
2
3
4
5
6
7
8
9
10
Cluster Centers:
[[0.37305611 0.5489732  0.48374849 ... 0.32927566 0.45735196 0.33375855]
 [0.37416712 0.54782052 0.48447287 ... 0.3287012  0.45701473 0.33734382]
 [0.37346268 0.54462125 0.4837643  ... 0.33087841 0.45763007 0.33466472]
 ...
 [0.37561288 0.54721655 0.48526165 ... 0.33006699 0.45839874 0.33428012]
 [0.37286818 0.5464754  0.48587332 ... 0.33196726 0.45478237 0.33559625]
 [0.37259119 0.54846957 0.48262804 ... 0.33108867 0.45431673 0.33543226]]
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
0
1
2
3
4
5
6
7
8
9
10
---------------------------
Partition matrix:
[[0.09090909 0.09090909 0.09090909 ... 0.09090909 0.09090909 0.09090909]
 [0.0909091  0.0909091  0.09090909 ... 0.09090909 0.09090909 0.09090909]
 [0.09090908 0.09090908 0.0909091  ... 0.09090909 0.09090909 0.09090909]
 ...
 [0.09090909 0.09090909 0.09090909 ... 

In [None]:
# Accuracy (in percent) of the final cluster labels against the true class labels
a = accuracy(labels, class_labels)

In [None]:
# Show the unrounded accuracy of the final labels
print(a)

10.234917103660152


In [None]:
# Accuracy of the labels recorded after each FCM iteration
acc_lis = [accuracy(iteration_labels, class_labels) for iteration_labels in acc]

In [None]:
# Mean and standard deviation of the per-iteration accuracies (one value per FCM iteration)
acc_lis = np.asarray(acc_lis)
print("mean=", acc_lis.mean())
print("Std dev=", acc_lis.std())

mean= 6.758038031675406
Std dev= 3.124115130502461


In [None]:
# Final accuracy, rounded to two decimals
print(f"Accuracy = {round(a, 2)}")

Accuracy = 10.23


In [None]:
# Show the cluster centres returned by the last FCM iteration
print("Cluster center vectors:") # final cluster centers
print(np.array(centers))

Cluster center vectors:
[[0.37402738 0.54771265 0.48577505 ... 0.32981812 0.45622955 0.33664481]
 [0.3740274  0.54771271 0.48577507 ... 0.32981814 0.45622953 0.33664483]
 [0.37402738 0.54771265 0.48577505 ... 0.32981812 0.45622955 0.3366448 ]
 ...
 [0.37402733 0.54771253 0.48577503 ... 0.32981809 0.45622958 0.33664476]
 [0.3740274  0.54771269 0.48577506 ... 0.32981813 0.45622953 0.33664482]
 [0.37402738 0.54771264 0.48577505 ... 0.32981812 0.45622955 0.3366448 ]]


In [None]:
# Reference to FCM
#https://www.kaggle.com/prateekk94/fuzzy-c-means-clustering-on-iris-dataset

In [None]:
# Plot how many data points were assigned to each cluster.
# Bin edges sit at -0.5, 0.5, ..., k - 0.5 so every bar is centred on one
# cluster index even when some clusters received no points.
plt.hist(labels, bins=np.arange(k + 1) - 0.5)
plt.xticks(range(k))
plt.xlabel("Cluster index")
plt.ylabel("Number of data points")
plt.title("Predicted Labels from FCM")
plt.show()