In [11]:
import numpy as np
import threading

In [12]:
#######################################################################################################
# simple n-dimensional point object
#######################################################################################################
class Point:
    '''
    Simple n-dimensional point.

    INPUT:
        c:           iterable of numeric coordinates (list, tuple, generator, ...)
    ----------------------------------------------------------------------------------------
    SELF:
        coordinates: tuple built from c
        n:           number of dimensions, i.e. the length of coordinates
    '''
    def __init__(self, c):
        self.coordinates = tuple(c)
        # Measure the stored tuple, not c, so one-shot iterables (generators) work too.
        self.n           = len(self.coordinates)
    
    def __repr__(self):
        # Joining the formatted values handles any dimensionality, including zero.
        return "(" + ", ".join("{:.2f}".format(value) for value in self.coordinates) + ")"
    
    def euclidean_distance(self, p2):
        '''
        Return the euclidean (L2) distance between this point and p2.

        p2 only needs a `coordinates` attribute that numpy can subtract from this
        point's coordinates.
        '''
        return np.linalg.norm(np.array(self.coordinates) - np.array(p2.coordinates))
#         return np.sqrt(np.sum([(self.coordinates[i] - p2.coordinates[i]) ** 2 for i in range(self.n)]))
    
    
#######################################################################################################

class Cluster:
    '''
    A group of Point objects plus summary statistics about them.

    INPUT:
        points:   optional sequence of Point objects used to initialise the cluster;
                  it is copied, so the caller's sequence is never modified
    ----------------------------------------------------------------------------------------
    SELF:
        points:   list of the Point objects currently in the cluster
        centroid: Point at the coordinate-wise mean of points, or the seed passed to
                  add_centroid while the cluster has no points; None if neither exists
        avg_dist: mean euclidean distance from centroid to every point in points
                  (0.0 for a single point or a bare seed); None for an empty cluster
        index:    label assigned through set_index; None until then
        n:        number of dimensions, taken from the first point or the seed;
                  None for an empty cluster
    '''
    def __init__(self, points=None):
        self.points     = []
        self.centroid   = None
        self.avg_dist   = None
        self.index      = None
        self.n          = None
        if points is not None and len(points) > 0:
            # Copy so later add_point/remove_point calls never touch the caller's list.
            self.points = list(points)
            self.n      = self.points[0].n
            self.compute_centroid()
            self.compute_avg_distance()
    
    def __repr__(self):
        # avg_dist is None for an empty cluster and cannot go through a float format spec.
        avg = "None" if self.avg_dist is None else "{:.2f}".format(self.avg_dist)
        return "Index: {}\nCentroid: {}\nAvg. Distance: {}\nPoints: {}\nNumber of Points: {}\n".format(self.index, self.centroid, avg, self.points, len(self.points))
    
    def get_points(self):
        '''Return the list of points held by the cluster (the live list, not a copy).'''
        return self.points
    
    def set_index(self, i):
        '''Store i as the label of this cluster.'''
        self.index = i
    
    def add_point(self, point):
        '''Append point and refresh centroid and avg_dist.'''
        self.points.append(point)
        if len(self.points) == 1:
            # The first point replaces any seed centroid and is its own centroid.
            self.centroid = point
            self.avg_dist = 0.0
            self.n        = point.n
        else:
            self.update_centroid()
            self.compute_avg_distance()
    
    def add_centroid(self, point):
        '''Seed an empty cluster with point as its centroid; ignored once points exist.'''
        if len(self.points) == 0: 
            self.centroid = point
            self.avg_dist = 0.0
            self.n        = point.n
        
    def compute_centroid(self):
        '''Recompute centroid from scratch as the per-dimension mean of all points.'''
        means = [np.mean([pt.coordinates[i] for pt in self.points]) for i in range(self.n)]
        self.centroid = Point(means)
    
    def update_centroid(self):
        '''
        Fold the most recently appended point into the centroid.

        Uses the running-mean update old + (new - old) / count, so it must be called
        right after the point was appended to points.
        '''
        count         = len(self.points)
        old           = self.centroid.coordinates
        new           = self.points[-1].coordinates
        update        = [(old[i] + ((new[i] - old[i]) / count)) for i in range(self.n)]
        self.centroid = Point(update)
    
    def compute_avg_distance(self):
        '''Recompute avg_dist as the mean distance from centroid to every point.'''
        self.avg_dist = np.mean([self.centroid.euclidean_distance(point) for point in self.points])
    
    def remove_point(self, point):
        '''
        Remove point and refresh the statistics; a point that is not a member is ignored.

        Removing the last point puts the cluster back into its empty state
        (centroid, avg_dist and n are None) instead of leaving NaN statistics behind.
        '''
        try:
            self.points.remove(point)
        except ValueError:
            # Not a member of this cluster: nothing to update.
            return
        if self.points:
            self.compute_centroid()
            self.compute_avg_distance()
        else:
            self.centroid = None
            self.avg_dist = None
            self.n        = None
        
class Hierarchical_Clustering:
    '''
    Bottom-up clustering that repeatedly merges the two clusters whose centroids are closest.

    INPUT:
        points:   sequence of Point objects; each one starts out in its own cluster
        k:        number of clusters to stop at
    ----------------------------------------------------------------------------------------
    SELF:
        points:   the sequence passed in (not copied)
        k:        target number of clusters
        clusters: list of the current Cluster objects
    '''
    def __init__(self, points, k):
        self.points     = points
        self.k          = k
        self.clusters   = []
        for pt in points:
            c = Cluster()
            c.add_point(pt)
            self.clusters.append(c)

    def get_points(self):
        '''Return the points this instance was built from.'''
        return self.points
        
    def Perform_Hierarchical(self, print_clusters=True):
        '''
        Merge clusters until only k remain.

        INPUT:
            print_clusters: when True, print every final cluster
        RETURNS:
            (cluster_indices, centroids): the array from get_cluster_colors and the
            list of final cluster centroids, in cluster order
        RAISES:
            ValueError: if k cannot be reached from the current number of clusters,
                        or if no pair of clusters can be selected for merging
        '''
        n_clusters = len(self.clusters)
        # Merging can only shrink the cluster count, and never below one.
        if n_clusters != self.k and not 1 <= self.k <= n_clusters:
            raise ValueError("k must be between 1 and the number of clusters ({}), got {}".format(n_clusters, self.k))
        
        while len(self.clusters) > self.k:
            size = len(self.clusters)
            d = np.full((size, size), np.inf)
            
            # Upper-triangular matrix of centroid-to-centroid distances; the diagonal
            # and lower triangle stay at inf so they are never picked as the minimum.
            for i in range(size):
                for j in range(i + 1, size):
                    d[i][j] = self.clusters[i].centroid.euclidean_distance(self.clusters[j].centroid)
            
            # First minimum in row-major order.
            ind_1, ind_2 = np.unravel_index(np.argmin(d), d.shape)
            if ind_1 == ind_2:
                # Only possible when every pairwise distance is inf; bail out rather than spin.
                raise ValueError("no pair of clusters with a usable centroid distance to merge")
            
            self.merge_clusters(self.clusters[ind_1], self.clusters[ind_2])
        
        # Label every cluster here as well, so labels exist even when no merge ran
        # (k equal to the number of points).
        for i, cluster in enumerate(self.clusters):
            cluster.set_index(i)
        
        if print_clusters:
            for cluster in self.clusters:
                print(cluster)
        
        cluster_indices = self.get_cluster_colors()
        centroids = [cluster.centroid for cluster in self.clusters]
        
        return cluster_indices, centroids
    
    def get_cluster_colors(self):
        '''
        Return a float array with one entry per input point holding the index of the
        cluster that contains it; entries of points found in no cluster stay 0.
        '''
        cluster_indices = np.zeros(len(self.points))
        for cluster in self.clusters:
            for point in cluster.get_points():
                cluster_indices[self.points.index(point)] = cluster.index
        return cluster_indices
                        
    def merge_clusters(self, c1, c2):
        '''
        Move every point of c2 into c1, drop c2 and renumber the remaining clusters.

        Does nothing when c1 and c2 are the same cluster or when either of them is not
        currently tracked, so a bad call can never lose a cluster.
        '''
        # Validate before mutating anything.
        if c1 is c2 or c1 not in self.clusters or c2 not in self.clusters:
            return
        self.clusters.remove(c2)
        for p in c2.get_points():
            c1.add_point(p)
        for i, cluster in enumerate(self.clusters):
            cluster.set_index(i)

class kMeans_Clustering:
    '''
    k-means clustering over a sequence of Point objects.

    INPUT:
        points:           sequence of Point objects to cluster
        k:                number of clusters
        init_points:      optional initial centroids; at most the first k are used
        random:           when True, every centroid not covered by init_points is a
                          distinct point drawn at random from points
        convergence_term: iteration stops once the summed centroid movement of one
                          pass is below this value
    ----------------------------------------------------------------------------------------
    SELF:
        points:           the sequence passed in (not copied)
        k:                number of clusters
        clusters:         list of Cluster objects, one per centroid
        convergence_term: the stopping threshold

    When random is False the missing centroids are chosen one at a time as the point
    with the largest summed distance to the centroids chosen so far; if there are no
    init_points the first centroid is a random point.
    '''
    def __init__(self, points, k, init_points=(), random=False, convergence_term=1e-10):
        self.points     = points
        self.k          = k
        self.clusters   = []
        self.convergence_term = convergence_term

        # Never accept more seeds than there are clusters.
        for pt in list(init_points)[:k]:
            self._seed_cluster(pt)

        missing = k - len(self.clusters)
        if missing <= 0:
            return

        if random:
            # Sampling indices without replacement keeps the random seeds distinct.
            for i in np.random.choice(len(self.points), missing, replace=False):
                self._seed_cluster(self.points[i])
            return

        if not self.clusters:
            # Farthest-point selection needs at least one reference centroid.
            self._seed_cluster(self.points[np.random.choice(len(self.points))])
            missing -= 1
        for _ in range(missing):
            self._seed_cluster(self._farthest_point())

    def _seed_cluster(self, point):
        '''Append a new, empty cluster whose centroid is point.'''
        c = Cluster()
        c.add_centroid(point)
        self.clusters.append(c)

    def _farthest_point(self):
        '''
        Return the point with the largest summed distance to all current centroids.

        Points that already serve as a centroid are skipped so the same point is not
        picked twice; ties go to the earliest point.
        '''
        centroids = [cluster.centroid for cluster in self.clusters]
        taken = {id(centroid) for centroid in centroids}
        scores = [
            -np.inf if id(pt) in taken
            else np.sum([pt.euclidean_distance(centroid) for centroid in centroids])
            for pt in self.points
        ]
        return self.points[scores.index(np.max(scores))]
                
    def get_points(self):
        '''Return the points this instance was built from.'''
        return self.points
        
    def Perform_kMeans(self, print_clusters=True):
        '''
        Run assignment/update passes until the centroids stop moving.

        Each pass starts from empty clusters seeded with the current centroids, assigns
        every point to its nearest centroid, and compares the resulting centroids with
        the ones the pass started from. At least one pass always runs (as long as there
        is at least one cluster), and calling the method again starts from the current
        centroids without duplicating points.

        INPUT:
            print_clusters: when True, print every final cluster
        RETURNS:
            (cluster_indices, centroids): the array from get_cluster_colors and the
            list of final cluster centroids, in cluster order
        '''
        centroids = self.get_cluster_centroids()
        # With no clusters there is nothing to assign points to.
        converged = not centroids
        
        while not converged:
            self.clusters = []
            for centroid in centroids:
                self._seed_cluster(centroid)
            
            for point in self.get_points():
                p_distance = [point.euclidean_distance(centroid) for centroid in centroids]
                self.clusters[p_distance.index(np.min(p_distance))].add_point(point)
            
            updated = self.get_cluster_centroids()
            c_distance = np.sum([centroids[i].euclidean_distance(updated[i]) for i in range(len(centroids))])
            
            # Written as a negated >= so a NaN distance ends the loop instead of spinning.
            converged = not (c_distance >= self.convergence_term)
            centroids = updated
                    
        self.print_clusters(print_clusters)
            
        cluster_indices = self.get_cluster_colors()
        centroids = [cluster.centroid for cluster in self.clusters]
        return cluster_indices, centroids
    
    def get_cluster_centroids(self):
        '''Return a fresh Point copy of every cluster centroid, in cluster order.'''
        return [Point([cluster.centroid.coordinates[i] for i in range(cluster.n)]) for cluster in self.clusters]
    
    def initialize_new_centroids(self):
        '''Return one all-zero Point per cluster, with that cluster's dimensionality.'''
        return [Point([0 for i in range(cluster.n)]) for cluster in self.clusters]
    
    def print_clusters(self, print_clusters):
        '''Number the clusters 0..k-1 and, when print_clusters is True, print each one.'''
        for i, cluster in enumerate(self.clusters):
            cluster.set_index(i)
            if print_clusters:
                print(cluster)
    
    def get_cluster_colors(self):
        '''
        Return a float array with one entry per input point holding the index of the
        cluster that contains it; entries of points found in no cluster stay 0.
        '''
        cluster_indices = np.zeros(len(self.points))
        for cluster in self.clusters:
            for point in cluster.get_points():
                cluster_indices[self.points.index(point)] = cluster.index
        return cluster_indices

In [13]:
# Grow a cluster one point at a time, then drop the first point again,
# printing the cluster state followed by a separator after every step.
p1, p2, p3 = Point((0, 0)), Point((4, 0)), Point((0, 3))
c = Cluster()

c.add_point(p1)
print(c, "-----", sep="\n")

c.add_point(p2)
print(c, "-----", sep="\n")

c.add_point(p3)
print(c, "-----", sep="\n")

c.remove_point(p1)
print(c, "-----", sep="\n")

Index: None
Centroid: (0.00, 0.00)
Avg. Distance: 0.00
Points: [(0.00, 0.00)]
Number of Points: 1

-----
Index: None
Centroid: (2.00, 0.00)
Avg. Distance: 2.00
Points: [(0.00, 0.00), (4.00, 0.00)]
Number of Points: 2

-----
Index: None
Centroid: (1.33, 1.00)
Avg. Distance: 2.31
Points: [(0.00, 0.00), (4.00, 0.00), (0.00, 3.00)]
Number of Points: 3

-----
Index: None
Centroid: (2.00, 1.50)
Avg. Distance: 2.50
Points: [(4.00, 0.00), (0.00, 3.00)]
Number of Points: 2

-----


In [14]:
# Add the four corners of a square one by one, printing the cluster state
# followed by a separator after every insertion.
p1, p2, p3, p4 = Point((0, 0)), Point((4, 0)), Point((0, 4)), Point((4, 4))
c = Cluster()

c.add_point(p1)
print(c, "-----", sep="\n")

c.add_point(p2)
print(c, "-----", sep="\n")

c.add_point(p3)
print(c, "-----", sep="\n")

c.add_point(p4)
print(c, "-----", sep="\n")

Index: None
Centroid: (0.00, 0.00)
Avg. Distance: 0.00
Points: [(0.00, 0.00)]
Number of Points: 1

-----
Index: None
Centroid: (2.00, 0.00)
Avg. Distance: 2.00
Points: [(0.00, 0.00), (4.00, 0.00)]
Number of Points: 2

-----
Index: None
Centroid: (1.33, 1.33)
Avg. Distance: 2.62
Points: [(0.00, 0.00), (4.00, 0.00), (0.00, 4.00)]
Number of Points: 3

-----
Index: None
Centroid: (2.00, 2.00)
Avg. Distance: 2.83
Points: [(0.00, 0.00), (4.00, 0.00), (0.00, 4.00), (4.00, 4.00)]
Number of Points: 4

-----


In [48]:
# Cross-check Point.euclidean_distance against numpy norms computed on raw coordinates.
pts = [p1, p2, p3, p4]
cs = [Point([0,2]), Point([2,0]), Point([3,3])]

# Plain 2-D arrays: numpy no longer recommends the np.matrix class, and arrays
# print the same way and yield the same norms here.
x = np.array([p.coordinates for p in pts])
y = np.array([p.coordinates for p in cs])
print(x)
print(y)

# One row per point in pts, one column per candidate centre in cs.
print([[np.linalg.norm(row - centre) for centre in y] for row in x])

# The first two rows above should match these.
print([p1.euclidean_distance(centre) for centre in cs])
print([p2.euclidean_distance(centre) for centre in cs])

[[0 0]
 [4 0]
 [0 4]
 [4 4]]
[[0 2]
 [2 0]
 [3 3]]
[[2.0, 2.0, 4.242640687119285], [4.47213595499958, 2.0, 3.1622776601683795], [2.0, 4.47213595499958, 3.1622776601683795], [4.47213595499958, 4.47213595499958, 1.4142135623730951]]
[2.0, 2.0, 4.242640687119285]
[4.47213595499958, 2.0, 3.1622776601683795]


In [5]:
# Weights applied position-by-position to each score vector.
weights = [
    0.6, 0.3,
    0.05, 0.03,
    0.01, 0.01,
]

# Two score vectors, one entry per weight.
a, b = [7, 7, 5, 1, 1, 10], [7, 7, 4, 2, 4, 10]

def compare(a,b,w):
    """Print the weighted sums of a and b, pairing each weight in w with the
    entry at the same position (a and b need at least len(w) entries)."""
    positions = range(len(w))
    totals = []
    for values in (a, b):
        totals.append(sum(values[i] * w[i] for i in positions))
    print("A: {}\nB: {}".format(*totals))

# Print the weighted totals of the two score vectors a and b under weights.
compare(a,b,weights)

A: 6.69
B: 6.7
