In [1]:
import math
import random
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def isValidNumberString(s):
    """Return True when ``s`` is a plain decimal number that ``float()`` can parse.

    Accepted form: an optional single leading "-", followed by ASCII digits
    with at most one "." (for example "12", "-3.5", ".5", "7.").
    Exponents, "+" signs, embedded whitespace and repeated dots are rejected.

    Strings such as ".", "-." or "1.2.3" are rejected as well: they consist
    only of allowed characters but make ``float()`` raise ValueError.
    """
    if len(s) == 0:
        return False
    # Strip one leading minus sign; a lone "-" is left alone and fails below.
    if len(s) > 1 and s[0] == "-":
        s = s[1:]
    digitCount = 0
    pointCount = 0
    for c in s:
        if c == ".":
            pointCount += 1
        elif c in "0123456789":
            digitCount += 1
        else:
            return False
    # At least one digit and no more than one decimal point.
    return digitCount > 0 and pointCount <= 1


In [3]:
def stringsToNumbers(myList):
    """Replace, in place, each numeric-looking entry of ``myList`` by its float.

    Entries for which ``isValidNumberString`` returns False are left as they
    are. The list is modified directly and nothing is returned.
    """
    for position, entry in enumerate(myList):
        if isValidNumberString(entry):
            myList[position] = float(entry)

In [4]:
def lineToTuple(line):
    """Turn one comma-separated text line into a tuple of fields.

    Surrounding whitespace (including the newline) is trimmed, every double
    quote character is removed, and the remainder is split on ",".
    Fields that look numeric are converted to floats by ``stringsToNumbers``;
    all other fields stay strings.
    """
    fields = line.strip().replace('"', '').split(",")
    # The helper converts numeric strings in place.
    stringsToNumbers(fields)
    return tuple(fields)

In [5]:
def loadCSV(fileName):
    """Read a CSV file and return its data rows as a list of tuples.

    The first line is treated as the header and discarded. Every remaining
    line is converted with ``lineToTuple``.

    Parameters:
        fileName: path of the text file to read.

    Returns:
        A list with one tuple per data line. An empty file or a file that
        contains only the header yields an empty list.
    """
    # The context manager closes the file even when reading fails.
    with open(fileName, "rt") as fileHandler:
        lines = fileHandler.readlines()
    # Slicing skips the header and, unlike ``del lines[0]``, is safe on an
    # empty file.
    return [lineToTuple(line) for line in lines[1:]]

In [6]:
def printTable(instances):
    """Print one line per instance: its label, a tab, then its values.

    Each value after the first element is formatted with two decimals and
    followed by a single space. Entries that are None are skipped.

    Parameters:
        instances: iterable of sequences whose first element is a string
            label and whose remaining elements are numbers.
    """
    for instance in instances:
        # Identity test: ``!= None`` compares element-wise on array-like rows
        # and then fails inside ``if``.
        if instance is None:
            continue
        values = "".join("%.2f " % value for value in instance[1:])
        print(instance[0] + "\t" + values)

In [7]:
def extractAttribute(instances, index):
    """Return a list holding element ``index`` of every instance, in order."""
    return [instance[index] for instance in instances]


In [8]:
def euclidean_distance(instance1, instance2):
    """Return the Euclidean distance between two labelled instances.

    Element 0 of each instance is skipped (it is the label); the remaining
    elements are compared position by position, over the length of
    ``instance1``.

    Returns:
        ``float("inf")`` when either argument is None, otherwise the square
        root of the summed squared differences.
    """
    # ``is None`` instead of ``== None``: the equality form compares
    # element-wise on array-like instances and then fails inside ``if``.
    if instance1 is None or instance2 is None:
        return float("inf")
    distance = 0
    for i in range(1, len(instance1)):
        distance += (instance1[i] - instance2[i])**2
    return math.sqrt(distance)

In [9]:
def cosine_distance(instance1, instance2):
    """Return the cosine distance (1 - cosine similarity) of two instances.

    Element 0 of each instance is skipped (it is the label); the remaining
    elements are converted to float vectors.

    Returns:
        ``float("inf")`` when either argument is None.
        ``1.0`` when either vector has zero length: a zero vector has no
        direction, so it is treated as having similarity 0 to everything.
        Without this guard the result would be NaN, and NaN never compares
        as smaller than another distance.
        Otherwise ``1 - (A . B) / (|A| * |B|)``.
    """
    if instance1 is None or instance2 is None:
        return float("inf")
    A = np.array(instance1[1:], dtype=float)
    B = np.array(instance2[1:], dtype=float)
    normProduct = np.linalg.norm(A) * np.linalg.norm(B)
    if normProduct == 0:
        return 1.0
    return 1 - np.dot(A, B) / normProduct

In [10]:
def jaccard(instance1, instance2):
    """Return the generalized Jaccard distance of two instances.

    Element 0 of each instance is skipped (it is the label); the remaining
    elements are converted to float vectors A and B and the result is
    ``1 - sum(min(A, B)) / sum(max(A, B))``.

    Returns:
        ``float("inf")`` when either argument is None.
        ``0.0`` when the denominator is zero (e.g. both vectors are all
        zeros), instead of the NaN a 0/0 division would give.
        NOTE(review): this treats a zero denominator as "identical", which
        presumes non-negative feature values - confirm against the data.
    """
    if instance1 is None or instance2 is None:
        return float("inf")
    A = np.array(instance1[1:], dtype=float)
    B = np.array(instance2[1:], dtype=float)
    maxTotal = np.sum(np.maximum(A, B), axis=0)
    if maxTotal == 0:
        return 0.0
    return 1 - (np.sum(np.minimum(A, B), axis=0) / maxTotal)

In [11]:
def calculateSSE(instance1, instance2):
    """Return the sum of squared differences between two labelled instances.

    Element 0 of each instance is skipped (it is the label); the remaining
    elements are compared position by position, over the length of
    ``instance1``.

    Returns:
        ``float("inf")`` when either argument is None, otherwise the summed
        squared differences (no square root is taken).
    """
    # ``is None`` instead of ``== None``: the equality form compares
    # element-wise on array-like instances and then fails inside ``if``.
    if instance1 is None or instance2 is None:
        return float("inf")
    sumOfSquares = 0
    for i in range(1, len(instance1)):
        sumOfSquares += (instance1[i] - instance2[i])**2
    return sumOfSquares

In [12]:
def meanInstance(name, instanceList):
    """Build the attribute-wise mean of a list of labelled instances.

    Parameters:
        name: label placed at position 0 of the returned tuple.
        instanceList: instances whose elements from index 1 on are numeric;
            the attribute count is taken from the first instance.

    Returns:
        A tuple ``(name, mean_1, mean_2, ...)``, or None when
        ``instanceList`` is empty.
    """
    count = len(instanceList)
    if count == 0:
        return None
    width = len(instanceList[0])
    # One running total per numeric column (column 0 is the label).
    totals = [0] * (width - 1)
    for row in instanceList:
        for column in range(1, width):
            totals[column - 1] += row[column]
    divisor = float(count)
    return (name,) + tuple(total / divisor for total in totals)

In [13]:
def assign(instance, centroids, dist_func):
    """Return the index of the centroid closest to ``instance``.

    ``dist_func`` selects the metric by name: "Euclidean" uses
    ``euclidean_distance``, "Cosine" uses ``cosine_distance`` and any other
    value uses ``jaccard``. When several centroids are equally close, the
    lowest index wins because only a strictly smaller distance replaces the
    current best.
    """
    if dist_func == "Euclidean":
        metric = euclidean_distance
    elif dist_func == "Cosine":
        metric = cosine_distance
    else:
        metric = jaccard

    bestIndex = 0
    bestDistance = metric(instance, centroids[0])
    for candidateIndex in range(1, len(centroids)):
        candidateDistance = metric(instance, centroids[candidateIndex])
        if candidateDistance < bestDistance:
            bestDistance = candidateDistance
            bestIndex = candidateIndex
    return bestIndex

In [14]:
def createEmptyListOfLists(numSubLists):
    """Return a list holding ``numSubLists`` separate empty lists."""
    return [[] for _ in range(numSubLists)]

In [15]:
def assignAll(instances, centroids, dist_func):
    """Group every instance under its nearest centroid.

    Returns a list with one sub-list per centroid; sub-list ``i`` holds the
    instances for which ``assign`` picked centroid ``i``, in input order.
    """
    buckets = createEmptyListOfLists(len(centroids))
    for item in instances:
        buckets[assign(item, centroids, dist_func)].append(item)
    return buckets

In [16]:
def computeCentroids(clusters):
    """Return one mean instance per cluster, labelled "centroid<i>".

    Each entry is whatever ``meanInstance`` returns for that cluster.
    """
    return [
        meanInstance("centroid" + str(position), clusters[position])
        for position in range(len(clusters))
    ]

In [17]:

def kmeans(instances, k, dist_func,centroid_poistion,sse_check,max_iterations,iteration_check,initCentroids=None,iteration_cap=150,seed=None):
    """Run k-means on ``instances`` and report the clustering and SSE history.

    Parameters:
        instances: list of labelled instances (label at index 0).
        k: number of clusters.
        dist_func: metric name handed to ``assignAll`` ("Euclidean",
            "Cosine", anything else selects Jaccard).
        centroid_poistion: when True, stop once the centroids equal those of
            the previous iteration. (Name kept as-is for callers.)
        sse_check: when True, stop once the SSE of an iteration is larger
            than the SSE of the previous one.
        max_iterations: iteration limit, used only when ``iteration_check``
            is True.
        iteration_check: when True, stop after ``max_iterations`` iterations.
        initCentroids: optional starting centroids; ignored (random sample
            instead) when None or shorter than ``k``.
        iteration_cap: unconditional upper bound on iterations (default 150,
            the value that used to be hard-coded).
        seed: optional seed for the random centroid choice. When None the
            generator is seeded from the current time, as before.

    Returns:
        dict with keys "clusters", "centroids", "withinss" (SSE of the last
        executed iteration), "SSE" (list of accepted per-iteration SSE
        values) and "iterations". When no iteration runs at all, "clusters"
        is an empty list and "withinss" is None.

    Side effects: reseeds the global ``random`` generator when centroids are
    sampled, and prints the elapsed time.
    """
    result = {}
    sse_list=[]
    if (initCentroids is None or len(initCentroids) < k):
        # randomly select k initial centroids
        random.seed(time.time() if seed is None else seed)
        centroids = random.sample(instances, k)
    else:
        centroids = initCentroids
    prevCentroids = []

    # Defined up front so the result can still be built when the loop body
    # never runs (e.g. iteration_check=True with max_iterations=0).
    clusters = []
    sse = None

    iteration = 0
    start = time.time()
    while (iteration < iteration_cap):

        if(iteration_check == True and iteration >= max_iterations):
          break
        if(centroid_poistion == True and centroids == prevCentroids):
          break
        iteration += 1
        clusters = assignAll(instances, centroids,dist_func)

        prevCentroids = centroids
        centroids = computeCentroids(clusters)
        sse = computeWithinss(clusters, centroids)

        # NOTE: on this exit the returned centroids/clusters/withinss belong
        # to the iteration whose SSE went up; that SSE is not appended.
        if(sse_check == True and iteration>1 and sse_list[-1]<sse):
          break

        sse_list.append(sse)

    result["clusters"] = clusters
    result["centroids"] = centroids
    result["withinss"] = sse
    result["SSE"] = sse_list
    result["iterations"] = iteration
    print("Time taken:", time.time() - start)
    return result

In [18]:
def computeWithinss(clusters, centroids):
    """Return the total within-cluster sum of squared errors.

    For every centroid index ``i``, ``calculateSSE`` is evaluated between
    ``centroids[i]`` and each instance in ``clusters[i]``; all values are
    added up.
    """
    total = 0
    for position, center in enumerate(centroids):
        for member in clusters[position]:
            total += calculateSSE(center, member)
    return total


In [19]:
# Load the data set once; the experiment cells reuse `dataset`.
dataset = loadCSV("data.csv")

print('===== when there is no change in centroid position =====')

# Only the centroid-equality stop is switched on; max_iterations is ignored
# because iteration_check is False.
print('===== Euclidean =====')
euclidean_clustering = kmeans(
    dataset, 10, "Euclidean",
    centroid_poistion=True, sse_check=False,
    max_iterations=0, iteration_check=False,
)
print('Total no of iterations: ', euclidean_clustering["iterations"])
print('List of SSE: ', euclidean_clustering["SSE"])

print('===== Cosine =====')
cosine_clustering = kmeans(
    dataset, 10, "Cosine",
    centroid_poistion=True, sse_check=False,
    max_iterations=0, iteration_check=False,
)
print('Total no of iterations: ', cosine_clustering["iterations"])
print('List of SSE: ', cosine_clustering["SSE"])

print('===== Jaccard =====')
jaccard_clustering = kmeans(
    dataset, 10, "Jaccard",
    centroid_poistion=True, sse_check=False,
    max_iterations=0, iteration_check=False,
)
print('Total no of iterations: ', jaccard_clustering["iterations"])
print('List of SSE: ', jaccard_clustering["SSE"])


===== when there is no change in centroid position =====
===== Euclidean =====
Time taken: 964.9060289859772
Total no of iterations:  67
List of SSE:  [27892919446.536312, 26614618605.66253, 26184671652.94257, 25862830753.71948, 25629963670.831253, 25527686059.575943, 25478949995.604366, 25439503832.494385, 25402403194.16848, 25375474962.70242, 25364233294.81886, 25358376809.030697, 25354998815.53845, 25352909911.077637, 25351605460.714413, 25350583299.138268, 25349740467.964207, 25349077796.13816, 25348363736.671185, 25347467151.138653, 25346888131.864235, 25346374626.241108, 25346062128.63552, 25345810463.644432, 25345525609.92714, 25345281022.352077, 25345009297.937447, 25344806838.09144, 25344677279.608223, 25344506619.910557, 25344372072.67427, 25344258123.502197, 25344188985.890694, 25344066928.056168, 25343819982.30337, 25342925047.576782, 25341631687.92038, 25339767822.315895, 25337844490.28351, 25335301077.715187, 25332362380.6131, 25328469142.291542, 25324951715.94926, 253224

In [20]:

print('===== when the SSE value increases in the next iteration =====')

# Only the SSE-increase stop is switched on; max_iterations is ignored
# because iteration_check is False.
print('===== Euclidean =====')
euclidean_clustering = kmeans(
    dataset, 10, "Euclidean",
    centroid_poistion=False, sse_check=True,
    max_iterations=0, iteration_check=False,
)
print('Total no of iterations: ', euclidean_clustering["iterations"])
print('List of SSE: ', euclidean_clustering["SSE"])


print('===== Cosine =====')
cosine_clustering = kmeans(
    dataset, 10, "Cosine",
    centroid_poistion=False, sse_check=True,
    max_iterations=0, iteration_check=False,
)
print('Total no of iterations: ', cosine_clustering["iterations"])
print('List of SSE: ', cosine_clustering["SSE"])

print('===== Jaccard =====')
jaccard_clustering = kmeans(
    dataset, 10, "Jaccard",
    centroid_poistion=False, sse_check=True,
    max_iterations=0, iteration_check=False,
)
print('Total no of iterations: ', jaccard_clustering["iterations"])
print('List of SSE: ', jaccard_clustering["SSE"])


===== when the SSE value increases in the next iteration =====
===== Euclidean =====
Time taken: 2164.0752630233765
Total no of iterations:  150
List of SSE:  [28561293397.083687, 27047442989.234898, 26418164942.19173, 26049276687.86183, 25851048575.106728, 25766746018.83552, 25720499605.394295, 25688320739.271164, 25659041229.629032, 25626402747.65079, 25587394840.586334, 25555384138.43186, 25530225432.784584, 25514719649.932724, 25505878116.9922, 25500037427.616024, 25496193183.523037, 25492940029.92607, 25489617459.371017, 25485091346.241776, 25480343244.67632, 25475887935.84032, 25471656049.11929, 25467361293.326023, 25462302868.74123, 25456679125.741367, 25450523949.76182, 25444334929.805885, 25437141940.250347, 25432291899.85764, 25429045356.669598, 25426525411.5226, 25423296217.400444, 25419977039.22829, 25417901342.898872, 25415552298.127274, 25413731879.990845, 25412208443.748863, 25410792012.399033, 25409734709.2152, 25408901677.694344, 25407886977.60137, 25407204136.954548, 

In [21]:

print('===== when the maximum preset value is 100 =====')

# Only the iteration limit is switched on: each run stops after
# max_iterations (100) iterations.
print('===== Euclidean =====')
euclidean_clustering = kmeans(
    dataset, 10, "Euclidean",
    centroid_poistion=False, sse_check=False,
    max_iterations=100, iteration_check=True,
)
print('Total no of iterations: ', euclidean_clustering["iterations"])
print('List of SSE: ', euclidean_clustering["SSE"])


print('===== Cosine =====')
cosine_clustering = kmeans(
    dataset, 10, "Cosine",
    centroid_poistion=False, sse_check=False,
    max_iterations=100, iteration_check=True,
)
print('Total no of iterations: ', cosine_clustering["iterations"])
print('List of SSE: ', cosine_clustering["SSE"])


print('===== Jaccard =====')
jaccard_clustering = kmeans(
    dataset, 10, "Jaccard",
    centroid_poistion=False, sse_check=False,
    max_iterations=100, iteration_check=True,
)
print('Total no of iterations: ', jaccard_clustering["iterations"])
print('List of SSE: ', jaccard_clustering["SSE"])


===== when the maximum preset value is 100 =====
===== Euclidean =====
Time taken: 1424.8652386665344
Total no of iterations:  100
List of SSE:  [28448945225.097084, 26597697587.747326, 25995542335.200573, 25673243713.453224, 25510584656.856045, 25432231949.238247, 25393088820.69145, 25371463513.124817, 25354591819.13281, 25343389387.072723, 25335639326.386143, 25332140194.738785, 25331090646.690796, 25330251937.131596, 25328762368.20927, 25327408465.915943, 25326069236.445236, 25325170287.73303, 25324742933.785206, 25324284909.086464, 25323974919.094856, 25323683444.50648, 25323466714.312508, 25323369784.13907, 25323274304.25482, 25323233374.66408, 25323146056.10532, 25323105998.999126, 25323097038.17852, 25323092128.863255, 25323076878.49686, 25323033945.369762, 25322969719.223305, 25322839847.324844, 25322765628.963665, 25322671519.07538, 25322569766.37631, 25322481256.945957, 25322402504.395775, 25322173347.089725, 25321880466.09193, 25321562863.207344, 25321369484.457355, 25320984