In [87]:
import math
import random
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

In [88]:
def isValidNumberString(s):
  if len(s) == 0:
    return False
  if  len(s) > 1 and s[0] == "-":
      s = s[1:]
  for c in s:
    if c not in "0123456789.":
      return False
  return True


In [89]:
def stringsToNumbers(myList):
    for i in range(len(myList)):
        if (isValidNumberString(myList[i])):
            myList[i] = float(myList[i])

In [90]:
def lineToTuple(line):
    # remove leading/trailing witespace and newlines
    cleanLine = line.strip()
    # get rid of quotes
    cleanLine = cleanLine.replace('"', '')
    # separate the fields
    lineList = cleanLine.split(",")
    # convert strings into numbers
    stringsToNumbers(lineList)
    lineTuple = tuple(lineList)
    return lineTuple

In [91]:
def loadCSV(fileName):
    fileHandler = open(fileName, "rt")
    lines = fileHandler.readlines()
    fileHandler.close()
    del lines[0] # remove the header
    dataset = []
    for line in lines:
        instance = lineToTuple(line)
        dataset.append(instance)
    return dataset

In [92]:
def printTable(instances):
    for instance in instances:
        if instance != None:
            line = instance[0] + "\t"
            for i in range(1, len(instance)):
                line += "%.2f " % instance[i]
            print(line)

In [93]:
def extractAttribute(instances, index):
    result = []
    for instance in instances:
        result.append(instance[index])
    return result


In [94]:
def euclidean_distance(instance1, instance2):
    if instance1 == None or instance2 == None:
        return float("inf")
    distance = 0
    for i in range(1, len(instance1)):
        distance += (instance1[i] - instance2[i])**2
    return math.sqrt(distance)

In [95]:
def cosine_distance(instance1, instance2):
  if instance1 == None or instance2 == None:
        return float("inf")
  A = np.array(instance1[1:],dtype=float)
  B = np.array(instance2[1:],dtype=float)
  distance = 1 - np.dot(A,B)/(np.linalg.norm(A)*np.linalg.norm(B))
  return distance

In [96]:
def jaccard(instance1, instance2):
  if instance1 == None or instance2 == None:
        return float("inf")
  A = np.array(instance1[1:],dtype=float)
  B = np.array(instance2[1:],dtype=float)
  return 1 - (np.sum(np.minimum(A,B), axis = 0)/np.sum(np.maximum(A, B), axis = 0)) 

In [97]:
def calculateSSE(instance1, instance2):
    if instance1 == None or instance2 == None:
        return float("inf")
    sumOfSquares = 0
    for i in range(1, len(instance1)):
        sumOfSquares += (instance1[i] - instance2[i])**2
    return sumOfSquares

In [98]:
def meanInstance(name, instanceList):
    numInstances = len(instanceList)
    if (numInstances == 0):
        return
    numAttributes = len(instanceList[0])
    means = [name] + [0] * (numAttributes-1)
    for instance in instanceList:
        for i in range(1, numAttributes):
            means[i] += instance[i]
    for i in range(1, numAttributes):
        means[i] /= float(numInstances)
    return tuple(means)

In [99]:
def cluster_count(cluster):
  cl = defaultdict(int)
  for point in cluster:
    cl[point[-1]] += 1
  return cl

In [100]:
def assign(instance, centroids, dist_func):
  if(dist_func == "Euclidean"):
    minDistance = euclidean_distance(instance, centroids[0])
  elif(dist_func == "Cosine"):
    minDistance = cosine_distance(instance, centroids[0])
  else:
    minDistance = jaccard(instance, centroids[0])

  minDistanceIndex = 0
  for i in range(1, len(centroids)):
      if(dist_func == "Euclidean"):
        d = euclidean_distance(instance, centroids[i])
      elif(dist_func == "Cosine"):
        d = cosine_distance(instance, centroids[i])
      else:
        d = jaccard(instance, centroids[i])
      if (d < minDistance):
          minDistance = d
          minDistanceIndex = i
  return minDistanceIndex

In [101]:
def createEmptyListOfLists(numSubLists):
    myList = []
    for i in range(numSubLists):
        myList.append([])
    return myList

In [102]:
label_accuracy = pd.read_csv("label.csv").to_numpy()
data = pd.read_csv("data.csv").to_numpy()

arr = []

for row in range(len(data)):
  temp = []
  for col in range(len(data[row])):
    temp.append(data[row][col])
  temp.append(label_accuracy[row][0])
  arr.append(temp)

arr=sorted(arr, key=lambda x: x[len(arr[0])-1], reverse=False)

In [103]:
def assignAll(instances, centroids, dist_func):
    clusters = createEmptyListOfLists(len(centroids))
    for instance in instances:
        clusterIndex = assign(instance, centroids, dist_func)
        clusters[clusterIndex].append(instance)
    return clusters

In [104]:
def computeCentroids(clusters):
    centroids = []
    for i in range(len(clusters)):
        name = "centroid" + str(i)
        centroid = meanInstance(name, clusters[i])
        centroids.append(centroid)
    return centroids

In [105]:

def kmeans(instances, k, dist_func,initCentroids=None):

    result = {}
    sse_list=[]
    if (initCentroids == None or len(initCentroids) < k):
        # randomly select k initial centroids
        random.seed(time.time())
        centroids = random.sample(instances, k)
    else:
        centroids = initCentroids
    prevCentroids = []

    iteration = 0
    start = time.time()
    while (centroids != prevCentroids):
        if(iteration > 150):
          break
        iteration += 1
        clusters = assignAll(instances, centroids,dist_func)

        prevCentroids = centroids
        centroids = computeCentroids(clusters)
        withinss = computeWithinss(clusters, centroids)

        sse_list.append(withinss)

    result["clusters"] = clusters
    result["centroids"] = centroids
    result["withinss"] = withinss
    result["SSE"] = sse_list
    result["iterations"] = iteration
   # print("Time taken:", time.time() - start)
    return result

In [106]:
def computeWithinss(clusters, centroids):
    result = 0
    for i in range(len(centroids)):
        centroid = centroids[i]
        cluster = clusters[i]
        for instance in cluster:
            result += calculateSSE(centroid, instance)
    return result


In [107]:
label_group_count = dict(cluster_count(label_accuracy))
print(f'label_group_count',label_group_count)
total = 0
for a in label_group_count:
   total += label_group_count[a]
print(total)

label_group_count {2: 1032, 1: 1135, 0: 980, 4: 982, 9: 1009, 5: 892, 6: 958, 7: 1027, 3: 1010, 8: 974}
9999


In [108]:
print(f'===== Euclidean =====')
euclidean_clustering = kmeans(arr, 10,"Euclidean")

===== Euclidean =====


In [109]:
initial_labels = {0: 0, 1: 0, 2: 0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0}
for key in range(len(euclidean_clustering["clusters"])):
      cluster_data = dict(cluster_count(euclidean_clustering["clusters"][key]))
      max_value = 0
      label = ""
      for k in cluster_data:
        if cluster_data[k] > max_value:
          max_value = cluster_data[k]
          label = k
      initial_labels[label] = max_value
print('initial_labels==>',initial_labels)

initial_labels==> {0: 434, 1: 515, 2: 0, 3: 347, 4: 198, 5: 0, 6: 638, 7: 642, 8: 256, 9: 301}


In [110]:
total_sum = 0
mispredicted = 0
for i in label_group_count:
    total_sum += label_group_count[i]
    mispredicted += abs(label_group_count[i] - initial_labels[i])
accuracy = (total_sum - mispredicted) / total_sum
print("Accuracy =",accuracy)

Accuracy = 0.3331333133313331


In [111]:
print(f'===== Cosine =====')
cosine_clustering = kmeans(arr, 10,"Cosine")

initial_labels = {0: 0, 1: 0, 2: 0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0}
for key in range(len(cosine_clustering["clusters"])):
      cluster_data = dict(cluster_count(cosine_clustering["clusters"][key]))
      max_value = 0
      label = ""
      for k in cluster_data:
        if cluster_data[k] > max_value:
          max_value = cluster_data[k]
          label = k
      initial_labels[label] = max_value
print('initial_labels==>',initial_labels)


total_sum = 0
mispredicted = 0
for i in label_group_count:
    total_sum += label_group_count[i]
    mispredicted += abs(label_group_count[i] - initial_labels[i])
accuracy = (total_sum - mispredicted) / total_sum
print("Accuracy =",accuracy)



===== Cosine =====
initial_labels==> {0: 669, 1: 803, 2: 734, 3: 798, 4: 559, 5: 183, 6: 311, 7: 562, 8: 0, 9: 286}
Accuracy = 0.49054905490549056


In [112]:
print(f'===== Jaccard =====')
jaccard_clustering = kmeans(arr, 10,"Jaccard")

initial_labels = {0: 0, 1: 0, 2: 0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0}
for key in range(len(jaccard_clustering["clusters"])):
      cluster_data = dict(cluster_count(jaccard_clustering["clusters"][key]))
      max_value = 0
      label = ""
      for k in cluster_data:
        if cluster_data[k] > max_value:
          max_value = cluster_data[k]
          label = k
      initial_labels[label] = max_value
print('initial_labels==>',initial_labels)


total_sum = 0
mispredicted = 0
for i in label_group_count:
    total_sum += label_group_count[i]
    mispredicted += abs(label_group_count[i] - initial_labels[i])
accuracy = (total_sum - mispredicted) / total_sum
print("Accuracy =",accuracy)


===== Jaccard =====
initial_labels==> {0: 832, 1: 1039, 2: 0, 3: 706, 4: 196, 5: 285, 6: 518, 7: 584, 8: 404, 9: 0}
Accuracy = 0.45644564456445647
