In [53]:
import pandas as pd

from kmean import (kmeans, euclidean, cosine_sim,
                    jaccard, sum_of_squares)

In [54]:
# function to get csv file
def loadCSV(fileName):
    """Read a CSV file and return its data rows as a list of tuples.

    The first line of the file is treated as a header and discarded. Every
    remaining line is converted with ``lineToTuple``.

    Args:
        fileName: Path of the CSV file to read.

    Returns:
        A list with one tuple per non-header line, in file order. An empty
        file (or a file holding only the header) yields an empty list.
    """
    # The context manager closes the file even if reading raises.
    with open(fileName, "rt") as fileHandler:
        lines = fileHandler.readlines()

    # Slicing (instead of ``del lines[0]``) keeps an empty file from raising.
    return [lineToTuple(line) for line in lines[1:]]


# converting function to tuples
def lineToTuple(line):
    """Turn one raw CSV line into a tuple of fields.

    Surrounding whitespace is stripped, every double-quote character is
    removed, and the line is split on commas. The resulting fields are then
    passed to ``stringsToNumbers``, which rewrites the list in place.
    """
    fields = line.strip().replace('"', '').split(",")
    stringsToNumbers(fields)  # mutates ``fields`` in place; returns nothing
    return tuple(fields)

# Convert string to numbers
def stringsToNumbers(myList):
    """Replace, in place, every numeric-looking string in ``myList`` with a float.

    An entry is converted when ``isValidNumberString`` accepts it; all other
    entries are left untouched. Nothing is returned.
    """
    for position, entry in enumerate(myList):
        if isValidNumberString(entry):
            myList[position] = float(entry)

# make sure string can be converted to number
def isValidNumberString(s):
    """Return True when ``s`` is a plain decimal number that ``float`` can parse.

    Accepted form: an optional leading "-", then ASCII digits with at most one
    decimal point, and at least one digit overall (e.g. "3", "-2.5", ".5",
    "5."). Exponents, "+" signs, whitespace and words such as "nan" are
    rejected, exactly as before.

    Strings like ".", "-." or "1.2.3" are now rejected as well; they used to
    pass this check and then make ``float()`` raise ValueError.

    Args:
        s: The string to test.

    Returns:
        True if the string can safely be handed to ``float``, else False.
    """
    # Drop a single leading minus sign; a lone "-" is left as is and fails below.
    if len(s) > 1 and s[0] == "-":
        s = s[1:]

    # More than one decimal point can never be a valid number.
    if s.count(".") > 1:
        return False

    digits = s.replace(".", "")
    # Require at least one digit, and nothing but ASCII digits.
    return len(digits) > 0 and all(c in "0123456789" for c in digits)

In [56]:
# Task 2
# Load the two input CSV files. loadCSV discards the first line of each file
# and returns one tuple per remaining line.
# getKMeans later looks up the label of an instance as
# target[int(instance[0])][1], so the first field of each data row is
# presumably the row's index into target - verify against the CSV files.
data = loadCSV('./input/irisData.csv')
target = loadCSV('./input/irisTarget.csv')

In [60]:
def _scoreCluster(cluster, target):
    """Return ``(majorityCount, size)`` for one cluster.

    ``majorityCount`` is the number of instances carrying the cluster's most
    common label, i.e. how many instances are classified correctly when the
    whole cluster is assigned that label. ``size`` is the number of instances
    in the cluster. An empty cluster yields ``(0, 0)``.
    """
    labelCounts = {}
    size = 0
    for instance in cluster:
        # The first field of an instance is used as the row index into
        # target; the label is the second field of that target row.
        label = target[int(instance[0])][1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
        size += 1
    return max(labelCounts.values(), default=0), size


def getKMeans(dataset, target, k, runTime, typeOfMetric, conditionToStop='centroids'):
    """Run k-means ``runTime`` times and print the averaged SSE, iterations and accuracy.

    Each run labels every cluster with its most common target label and
    counts an instance as correct when its own label matches the label of
    the cluster it was placed in.

    Args:
        dataset: Instances handed to ``kmeans``; the first field of each
            instance must be convertible to an int index into ``target``.
        target: Indexable rows whose second field is the label.
        k: Number of clusters, forwarded to ``kmeans``.
        runTime: Number of independent k-means runs to average over (>= 1).
        typeOfMetric: Forwarded to ``kmeans`` as ``metric``.
        conditionToStop: Forwarded to ``kmeans`` as ``stop_condition``.

    Returns:
        None. The three averages are printed.

    Raises:
        ValueError: If ``runTime`` is smaller than 1 (there would be nothing
            to average).
    """
    if runTime < 1:
        raise ValueError('runTime must be at least 1')

    sseArray = []
    iterationArray = []
    accuracyArray = []

    for _ in range(runTime):
        # NOTE(review): the meaning of the positional False is defined by
        # kmean.kmeans, which is not visible here - confirm.
        result = kmeans(dataset, k, False, metric=typeOfMetric, stop_condition=conditionToStop)

        # Save this run's SSE and iteration count.
        sseArray.append(result['withinss'])
        iterationArray.append(result['iterations'])

        # The number of correctly labelled instances in a cluster equals the
        # count of its majority label, so one pass per cluster is enough.
        correctAmount = 0
        instanceCount = 0
        for cluster in result['clusters']:
            majorityCount, size = _scoreCluster(cluster, target)
            correctAmount += majorityCount
            instanceCount += size

        # Guard against a clustering without any instances.
        accuracyArray.append(correctAmount / instanceCount if instanceCount else 0.0)

    sseAverage = sum(sseArray) / runTime
    iterationAverage = sum(iterationArray) / runTime
    accuracyAverage = sum(accuracyArray) / runTime

    print('   SSE: {0:.4}'.format(sseAverage))
    print('   Iterations: {0:.4}'.format(iterationAverage))
    print('   Accuracy: {0:.2}\n'.format(accuracyAverage))

In [61]:
# Baseline for the Euclidean metric: k=3, averaged over 200 runs, using
# getKMeans' default 'centroids' stop condition.
print('Eucledian results:')
getKMeans(data, target, 3, 200, euclidean)

Eucledian results:
   SSE: 91.48
   Iterations: 7.245
   Accuracy: 0.85



In [63]:
# Baseline for the cosine metric: k=3, averaged over 200 runs, using
# getKMeans' default 'centroids' stop condition.
print('Cosine results:')
getKMeans(data, target, 3, 200, cosine_sim)

Cosine results:
   SSE: 104.6
   Iterations: 5.34
   Accuracy: 0.91



In [64]:
# Baseline for the Jaccard metric: k=3, averaged over 200 runs, using
# getKMeans' default 'centroids' stop condition.
print('Jaccard results:')
getKMeans(data, target, 3, 200, jaccard)

Jaccard results:
   SSE: 96.45
   Iterations: 5.74
   Accuracy: 0.83



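### Q1, comparing the distance functions by SSE
Euclidean gives the lowest SSE (91.48), followed by Jaccard (96.45), while cosine gives the highest (104.6).
Because a lower SSE means more compact clusters, Euclidean is the best of the three by this measure and cosine the worst.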

### Q2, comparing the accuracy of each distance function
Cosine is clearly the most accurate at 91%, versus 85% for Euclidean
and 83% for Jaccard.

### Q3, which k-means required more iterations and time?
Euclidean required the most iterations, averaging 7.245 per run (versus 5.74 for Jaccard and 5.34 for cosine). Run time was not measured directly; the iteration count is used as a proxy.

### Q4 - Compare the SSE of each k-means variant under three stop conditions:
* when there is no change in centroid position
* when the SSE value increases in the next iteration
* when the maximum preset value (100) of iteration is complete

In [67]:
# Compare the three stop conditions for the Euclidean metric
# (k=3, averaged over 200 runs each).
print('Euclidean')
for heading, stopCondition in (
    ('centroid result:', 'centroids'),
    ('SSE result:', 'SSE'),
    ('Max Iteration result:', 'max_iteration'),
):
    print(heading)
    getKMeans(data, target, 3, 200, euclidean, stopCondition)

Euclidean
centroid result:
   SSE: 89.16
   Iterations: 7.4
   Accuracy: 0.85

SSE result:
   SSE: 174.7
   Iterations: 1.0
   Accuracy: 0.73

Max Iteration result:
   SSE: 90.09
   Iterations: 100.0
   Accuracy: 0.85



In [68]:
# Compare the three stop conditions for the cosine metric
# (k=3, averaged over 200 runs each).
print('Cosine')
for heading, stopCondition in (
    ('centroid result:', 'centroids'),
    ('SSE result:', 'SSE'),
    ('Max Iteration result:', 'max_iteration'),
):
    print(heading)
    getKMeans(data, target, 3, 200, cosine_sim, stopCondition)

Cosine
centroid result:
   SSE: 110.3
   Iterations: 5.405
   Accuracy: 0.88

SSE result:
   SSE: 190.8
   Iterations: 1.0
   Accuracy: 0.76

Max Iteration result:
   SSE: 104.8
   Iterations: 100.0
   Accuracy: 0.91



In [69]:
# Compare the three stop conditions for the Jaccard metric
# (k=3, averaged over 200 runs each).
print('Jaccard')
for heading, stopCondition in (
    ('centroid result:', 'centroids'),
    ('SSE result:', 'SSE'),
    ('Max Iteration result:', 'max_iteration'),
):
    print(heading)
    getKMeans(data, target, 3, 200, jaccard, stopCondition)

Jaccard
centroid result:
   SSE: 97.39
   Iterations: 5.64
   Accuracy: 0.83

SSE result:
   SSE: 174.1
   Iterations: 1.0
   Accuracy: 0.74

Max Iteration result:
   SSE: 94.07
   Iterations: 100.0
   Accuracy: 0.84



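#### Which method requires more time or more iterations?
With the "no change in centroid position" stop condition, Euclidean k-means requires the most iterations (7.4 on average, versus 5.64 for Jaccard and 5.405 for cosine). With the SSE stop condition every metric stops after 1 iteration, and with the maximum-iteration condition every metric runs the full 100 iterations.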