In [303]:
#utils for algorithms

import numpy as np

def euclidianDistance(point, candidate):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    point, candidate : sequence of numbers
        Coordinates of the two points; both must have the same length.

    Returns
    -------
    numpy.float64
        The straight-line distance between ``point`` and ``candidate``.
    """
    # Use every coordinate instead of only the first two, so the distance
    # agrees with centroids that are averaged over all columns.
    offset = np.asarray(candidate, dtype=float) - np.asarray(point, dtype=float)
    return np.sqrt(np.sum(offset ** 2))

def distances(point, points):
    """Measure how far ``point`` lies from each entry of ``points``.

    Returns a plain list holding one ``euclidianDistance`` result per
    candidate, in the same order as ``points``.
    """
    measured = []
    for other in points:
        measured.append(euclidianDistance(point, other))
    return measured

def closestCentroidLabel(point, centroids):
    """Return the index in ``centroids`` of the centroid nearest to ``point``.

    When several centroids are equally close, the lowest index is returned.
    """
    gaps = distances(point, centroids)
    # min() keeps the first of equally small keys, so ties resolve to the
    # earliest centroid.
    return min(range(len(gaps)), key=gaps.__getitem__)

def chooseNewCentroid(centroids, cluster, i):
    """Pick the updated position for centroid number ``i``.

    A non-empty ``cluster`` yields its coordinate-wise mean (``np.mean`` over
    axis 0). An empty ``cluster`` leaves the centroid where it was by
    returning ``centroids[i]`` unchanged.
    """
    has_members = len(cluster) != 0
    return np.mean(cluster, axis=0) if has_members else centroids[i]


In [None]:
def kMeansShell(dataset, metric, cluster_count, starting_centroids, iterations):
    """Run k-means (assign / re-average until stable) and save the best run.

    Parameters
    ----------
    dataset : array-like
        One data point per row; converted with ``np.asarray``.
    metric : callable
        Called as ``metric(clusters, centroids)``; lower values are better.
    cluster_count : int
        Number of clusters.
    starting_centroids : sequence of points, or zero-argument callable
        Initial centroids. A fixed sequence makes every restart begin from
        the same seeds (and therefore produce the same result); pass a
        callable returning fresh seeds to get genuinely different restarts.
    iterations : int
        Number of restarts; must be at least 1.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.

    Side effects
    ------------
    Writes the best centroids to ``centroids.txt`` and the matching per-point
    labels to ``cluster_labels.txt`` in the current working directory.
    """
    if iterations < 1:
        # Without at least one run there is nothing to save.
        raise ValueError("iterations must be at least 1")

    dataset = np.asarray(dataset)

    def iteration(centroids):
        # Assignment step: label every point with its nearest centroid.
        labels = [closestCentroidLabel(point, centroids) for point in dataset]

        # Group the rows per label with one boolean mask per cluster instead
        # of rescanning the whole label list in Python for every cluster.
        label_array = np.asarray(labels, dtype=int)
        clusters = [list(dataset[label_array == j]) for j in range(cluster_count)]

        # Update step: empty clusters keep their previous centroid.
        new_centroids = [chooseNewCentroid(centroids, cluster, i) for i, cluster in enumerate(clusters)]

        # Measure how far each centroid moved from ITS OWN previous position.
        # Comparing against the nearest old centroid could report "no change"
        # when a centroid lands on a different centroid's old location.
        change = np.sum([euclidianDistance(new, old) for new, old in zip(new_centroids, centroids)])

        return new_centroids, labels, clusters, change

    best_thus_far = np.inf
    result_centroids = None
    result_labels = None
    for _ in range(iterations):
        if callable(starting_centroids):
            current_centroids = starting_centroids()
        else:
            current_centroids = starting_centroids

        change = 1
        # Repeat until no centroid moves at all.
        while change != 0:
            current_centroids, labels, clusters, change = iteration(current_centroids)

        new_metric_result = metric(clusters, current_centroids)
        improved = new_metric_result <= best_thus_far
        if improved:
            best_thus_far = new_metric_result
        # Always keep the first run, so a NaN metric (which never compares as
        # an improvement) still leaves something to write out.
        if improved or result_centroids is None:
            result_centroids = current_centroids
            result_labels = labels

    np.savetxt('centroids.txt', np.array(result_centroids))
    np.savetxt('cluster_labels.txt', np.array(result_labels), fmt="%d")


In [305]:
def kMeans(dataset, metric, cluster_count):
    """Cluster ``dataset`` with k-means seeded from randomly chosen data rows.

    The seeds are the rows at the first ``cluster_count`` positions of a
    random permutation of the row indices. The clustering itself is handed
    to ``kMeansShell`` with 20 restarts.
    """
    # NOTE(review): the seeds are drawn once here, and kMeansShell restarts
    # from whatever seeds it is given, so the 20 restarts all begin from the
    # same centroids.
    shuffled_rows = np.random.permutation(len(dataset))
    seeds = dataset[shuffled_rows[:cluster_count]]

    kMeansShell(dataset, metric, cluster_count, seeds, 20)

def kMeansplusplus(dataset, metric, cluster_count):
    """Cluster ``dataset`` with k-means using spread-out initial centroids.

    The first seed is a uniformly random data row. Each further seed is the
    row whose distance to its nearest already-chosen seed is largest. The
    clustering itself is handed to ``kMeansShell`` with a single run.
    """
    # NOTE(review): every seed after the first is picked deterministically
    # with argmax (farthest point), not sampled with probability proportional
    # to squared distance as in the published k-means++ scheme. Confirm this
    # is intended.
    first_row = np.random.choice(range(len(dataset)))
    seeds = [dataset[first_row]]

    while len(seeds) < cluster_count:
        nearest_seed_gap = []
        for row in dataset:
            nearest_seed_gap.append(np.min(distances(row, seeds)))
        seeds.append(dataset[np.argmax(nearest_seed_gap)])

    kMeansShell(dataset, metric, cluster_count, seeds, 1)



In [306]:
def wcss(clusters, centroids):
    """Return the within-cluster sum of squares (lower is better).

    Parameters
    ----------
    clusters : sequence of sequences of points
        ``clusters[i]`` holds the points assigned to ``centroids[i]``.
    centroids : sequence of points
        One centroid per cluster.

    Returns
    -------
    numpy.float64
        Sum over all clusters of the squared Euclidean distance between each
        member and its centroid. Empty clusters contribute nothing.
    """
    per_cluster = []
    for i, centroid in enumerate(centroids):
        members = clusters[i]
        if len(members) == 0:
            # Nothing to add, and an empty list would not broadcast below.
            continue
        offsets = np.asarray(members, dtype=float) - np.asarray(centroid, dtype=float)
        # Sum of SQUARED distances: squaring the coordinate offsets directly
        # avoids a sqrt-then-square round trip.
        per_cluster.append(np.sum(offsets ** 2))
    return np.sum(per_cluster)

def daviesBouldinIndex(clusters, centroids):
    """Return the Davies-Bouldin index of a clustering (lower is better).

    Parameters
    ----------
    clusters : sequence of sequences of points
        ``clusters[i]`` holds the points assigned to ``centroids[i]``.
    centroids : sequence of points
        One centroid per cluster; at least two are required.

    Returns
    -------
    numpy.float64
        Mean over clusters of the worst ``(s_i + s_j) / m_ij`` ratio, where
        ``s`` is the mean member-to-centroid distance of a cluster and ``m``
        the distance between two centroids.

    Raises
    ------
    ValueError
        If fewer than two centroids are given.
    """
    size = len(centroids)
    if size < 2:
        raise ValueError("Davies-Bouldin index requires at least two clusters")

    centers = np.asarray(centroids, dtype=float)

    # Scatter: mean distance of the members to their centroid. An empty
    # cluster has no spread, so it keeps 0.0 instead of dividing by zero.
    scatter = np.zeros(size)
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            offsets = np.asarray(cluster, dtype=float) - centers[i]
            scatter[i] = np.mean(np.sqrt(np.sum(offsets ** 2, axis=1)))

    worst_ratios = []
    for i in range(size):
        ratios = []
        for j in range(size):
            if j == i:
                continue
            separation = np.sqrt(np.sum((centers[i] - centers[j]) ** 2))
            # Coincident centroids give a zero separation; numpy then yields
            # inf (or nan when both scatters are 0) with a RuntimeWarning.
            ratios.append((scatter[i] + scatter[j]) / separation)
        worst_ratios.append(np.max(ratios))

    return np.sum(worst_ratios) / size


In [307]:
# utils for main

def chooseAlgorithm(name):
    """Look up the clustering routine registered under ``name``.

    Known names are ``"kMeans"`` and ``"kMeans++"``; any other key raises
    ``KeyError``.
    """
    registry = dict([
        ("kMeans", kMeans),
        ("kMeans++", kMeansplusplus),
    ])
    return registry[name]

def chooseMetric(number):
    """Look up the quality metric registered under the string ``number``.

    ``"1"`` selects ``wcss`` and ``"2"`` selects ``daviesBouldinIndex``; any
    other key raises ``KeyError``.
    """
    by_number = dict([
        ("1", wcss),
        ("2", daviesBouldinIndex),
    ])
    return by_number[number]


In [308]:
import sys
#inp = input("<data_file> <algorithm_name> <metric> <cluster_count>")
inp = "unbalance.txt kMeans++ 1 8"
labels = inp.split(" ")
if len(labels) != 4:
    sys.exit("Not a valid input")

algorithm = chooseAlgorithm(labels[1])
metric = chooseMetric(labels[2])

labels[0] = "Datasets\\" + labels[0][:-4] + "\\" + labels[0]

algorithm(np.loadtxt(labels[0]) ,metric, int(labels[3]))

%run plot_clusters.py {labels[0]} centroids.txt cluster_labels.txt


NameError: name 'restart_count' is not defined