# Week 4 - Max distance clustering

## Дано

Возьмем множество S={S1, S2, ..., S10}, где S1 = (0, 0), S2 = (3, 8), S3 = (2, 2), S4 = (1, 1), S5 = (5, 3), S6 = (4, 8), S7 = (6, 3), S8 = (5, 4), S9 = (6, 4), S10 = (7, 5).
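Используя алгоритм максимального расстояния, определяем количество кластеров на основе выделения центров кластеров: очередным центром становится наблюдение, наиболее удалённое от уже выбранных центров. В реализации ниже число центров ограничено параметром `n_clusters`.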

## Инициализация

In [1]:
import numpy as np

## Создаем класс наблюдения (S1, S2, ..., SN)

In [18]:
class Observation:
    """A single data point that can be promoted to a cluster centroid.

    Attributes:
        vector: Coordinates of the point, stored exactly as passed in.
        label: Cluster label when the point is a centroid, otherwise None.
    """

    def __init__(self, vector=None):
        """Store the coordinates; the observation starts without a label.

        Args:
            vector: Sequence of coordinates. Defaults to a new empty list.
        """
        # A fresh list per instance: a literal `[]` default would be shared
        # by every Observation created without arguments.
        self.vector = [] if vector is None else vector
        self.label = None

    def is_centroid(self):
        """Return True when a centroid label has been assigned."""
        # Compare against None explicitly: 0 is a valid label and is falsy.
        return self.label is not None

    def set_centroid(self, label):
        """Mark this observation as a centroid with *label* and return self."""
        self.label = label
        return self

    def calc_distance(self, observation):
        """Return the Euclidean distance to another observation.

        Args:
            observation: Object exposing a ``vector`` attribute whose shape
                is compatible with this observation's vector.
        """
        difference = np.array(self.vector) - np.array(observation.vector)
        return np.linalg.norm(difference)

    def __repr__(self):
        return f'S({", ".join(map(str, self.vector))})'

In [19]:
# Two sample observations used by the following cells to try out
# the repr and the distance calculation.
s1 = Observation((0, 0))
s2 = Observation((3, 8))

In [20]:
# Echo the observation to show its __repr__ output.
s1

S(0, 0)

In [21]:
# Euclidean distance between the two sample observations.
s1.calc_distance(s2)

8.54400374531753

## Создаем класс кластеризатора

In [30]:
class MaxDistanceClustering:
    """Select cluster centroids with the max-min distance rule.

    The first centroid is the last observation of the training data; each
    further centroid is the remaining observation whose distance to its
    nearest already-chosen centroid is largest.

    Attributes:
        n_clusters: Upper bound on the number of centroids to select.
        data: Observations from the latest fit that were not chosen as
            centroids.
        centroids: Set of Observation objects chosen as centroids; each
            carries its integer cluster label in ``label``.
    """

    def __init__(self, n_clusters=2):
        self.data = []
        self.n_clusters = n_clusters
        # Always a set, so the attribute has one type before and after fit.
        self.centroids = set()

    def fit(self, X):
        """Choose centroids among the points in X.

        Args:
            X: Iterable of coordinate sequences. When it is empty, the model
                is left with no centroids.
        """
        # Rebuild state from scratch so repeated fits do not accumulate
        # observations or centroids from earlier calls.
        self.data = [Observation(x) for x in X]
        self.centroids = set()
        if not self.data:
            return

        # The last observation seeds the centroid set with label 0.
        self.centroids.add(self.data.pop().set_centroid(0))

        # Never ask for more centroids than there are observations left.
        extra = min(self.n_clusters - 1, len(self.data))
        for label in range(1, extra + 1):
            new_centroid = max(
                self.data,
                key=lambda o: min(o.calc_distance(c) for c in self.centroids),
            )
            self.data.remove(new_centroid)
            self.centroids.add(new_centroid.set_centroid(label))

    def predict(self, X):
        """Return, for each point in X, the label of its nearest centroid.

        Args:
            X: Iterable of coordinate sequences.

        Returns:
            List of centroid labels, one per point, in input order.

        Raises:
            ValueError: If there are points to label but no centroids.
        """
        predicted = []

        for x in X:
            if not self.centroids:
                raise ValueError(
                    "no centroids available: call fit() with non-empty data first"
                )
            # Build the observation once instead of once per centroid.
            point = Observation(x)
            nearest_centroid = min(
                self.centroids, key=lambda c: c.calc_distance(point)
            )
            predicted.append(nearest_centroid.label)

        return predicted

## Пример

In [31]:
# First example: ask for two clusters.
cluster_machine = MaxDistanceClustering(n_clusters=2)

In [32]:
# The ten 2-D points S1..S10 listed in the problem statement.
data = [
    (0, 0), (3, 8), (2, 2), (1, 1), (5, 3), 
    (4, 8), (6, 3), (5, 4), (6, 4), (7, 5)
]

In [33]:
# Select the centroids from the points.
cluster_machine.fit(data)

In [34]:
# Inspect the set of observations chosen as centroids.
cluster_machine.centroids

{S(0, 0), S(7, 5)}

In [36]:
# Label three new points; each gets the label of its nearest centroid.
cluster_machine.predict([
    (1, 1),
    (2, 3),
    (10, 10)
])

[1, 1, 0]

In [39]:
# Second example: a new model asking for four clusters.
cluster_machine = MaxDistanceClustering(n_clusters=4)

In [40]:
# Ten 3-D points: the same first two coordinates as the 2-D example,
# plus a third coordinate.
data = [
    (0, 0, 4), (3, 8, 3), (2, 2, 60), (1, 1, 0), (5, 3, 2), 
    (4, 8, 100), (6, 3, -22), (5, 4, 20), (6, 4, 10), (7, 5, 1)
]

In [41]:
# Select the centroids from the 3-D points.
cluster_machine.fit(data)

In [42]:
# Inspect the set of observations chosen as centroids.
cluster_machine.centroids

{S(2, 2, 60), S(4, 8, 100), S(6, 3, -22), S(7, 5, 1)}

In [43]:
# Label three new 3-D points; each gets the label of its nearest centroid.
cluster_machine.predict([
    (1, 1, 100),
    (2, 3, -200),
    (10, 10, 5)
])

[1, 3, 0]