In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
cancer_data = pd.read_csv("../datasets/cancer.csv")
print(cancer_data.shape)
cancer_labels = cancer_data['label'].values
cancer_features = cancer_data.drop(['label'], axis=1).values
cancer_data.head()

(569, 31)


Unnamed: 0,label,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
blobs_data = pd.read_csv("../datasets/blobs.csv")
print(blobs_data.shape)
blobs_features = blobs_data.values
blobs_data.head()

(400, 2)


Unnamed: 0,X,Y
0,-0.727696,1.403
1,-1.875566,0.589247
2,-0.891047,-1.399924
3,-0.224115,0.868289
4,-0.465409,0.306718


In [105]:
def purity(y_pred, y_true):
    clusters_label_count = {}
    score = 0
    count = 0
    for i, pred_label in enumerate(y_pred):
        if pred_label < 0:
            continue
        count += 1
        if pred_label not in clusters_label_count:
            clusters_label_count[pred_label] = {}
        if y_true[i] not in clusters_label_count[pred_label]:
            clusters_label_count[pred_label][y_true[i]] = 0
        clusters_label_count[pred_label][y_true[i]] += 1
    for cluster_label, true_labels in clusters_label_count.items():
        biggest_class_size = max(true_labels.values())
        score += biggest_class_size
    if count == 0:
        return 0.
    return score / count

In [38]:
class KMeans:
    def __init__(self, n_clusters, norm=np.linalg.norm, random_state=None):
        self.n_clusters = n_clusters
        self.norm = norm
        self.random_state = random_state
        
    def _rand_centers(self, X):
        return X[np.random.choice(np.arange(X.shape[0]), self.n_clusters)]
        
    def _new_centroids(self, X):
        sums = np.zeros_like(self.centers)
        counts = np.zeros(self.n_clusters)
        for x in X:
            nearest_label = 0
            dists_to_centers = np.array([self.norm(center - x) for center in self.centers])
            nearest_label = np.argmin(dists_to_centers)
            counts[nearest_label] += 1
            sums[nearest_label] += x
        for i in range(self.n_clusters):
            sums[i] /= counts[i]
        return sums

    def fit(self, X, max_iter=100):
        np.random.seed(self.random_state)
        self.centers = self._rand_centers(X)
        for iter_num in range(max_iter):
            self.centers = self._new_centroids(X)
        return self

    def predict(self, X):
        labels = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            dists_to_centers = np.array([self.norm(center - x) for center in self.centers])
            labels[i] = np.argmin(dists_to_centers)
        return labels

In [54]:
for n_clusters in range(2, 6):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(cancer_features, max_iter=10)
    pred = kmeans.predict(cancer_features)
    print(f"purity for KMeans on {n_clusters} clusters on cancer data = {purity(y_pred=pred, y_true=cancer_labels)}")

purity for 2 clusters on cancer data = 0.8541300527240774
purity for 3 clusters on cancer data = 0.8875219683655536
purity for 4 clusters on cancer data = 0.8400702987697716
purity for 5 clusters on cancer data = 0.8910369068541301


In [56]:
for n_clusters in range(2, 6):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(blobs_features, max_iter=10)
    pred = kmeans.predict(blobs_features)

In [81]:
class DBSCAN():
    def __init__(self, minN, eps, norm=np.linalg.norm):
        self.minN = minN
        self.eps = eps
        self.norm = norm
        
    def _get_nearest(self, p):
        nearest = []
        for i, x in enumerate(self.points):
            if self.norm(p - x) < self.eps:
                nearest.append(i)
        return nearest
        
    def fit_predict(self, X):
        self.points = X
        self.labels = np.ones(X.shape[0]) * -1
        self.cluster_num = 0
        for i, p in enumerate(self.points):
            nearest = self._get_nearest(p)
            if len(nearest) < self.minN:
                self.labels[i] = -2
                continue
            self.labels[i] = self.cluster_num
            cluster = set((j for j in nearest if j != i))
            while len(cluster) > 0:
                j = min(cluster)
                cluster.remove(j)
                if self.labels[j] == -2:
                    self.labels[j] = self.cluster_num
                if self.labels[j] != -1:
                    continue
                j_nearest = self._get_nearest(self.points[j])
                if len(j_nearest) > self.minN:
                    for k in j_nearest:
                        cluster.add(k)
            self.cluster_num += 1
        return self.labels
    
    def n_clusters(self):
        return self.cluster_num

In [106]:
minN = 5
eps = 15
dbscan = DBSCAN(minN=minN, eps=eps)
pred = dbscan.fit_predict(cancer_features)
print(np.mean(pred))
print(f"purity for DBSCAN on ({minN}, {eps}) on cancer data = {purity(y_pred=pred, y_true=cancer_labels)}")
print(f"num of clusters = {dbscan.n_clusters()}")

KeyboardInterrupt: 

In [None]:
class AgglomerativeClustering:
    def __init__(self):
        pass
    
    def fit_predict(self, X, n_clusters):
        self.n_clusters = n_clusters
        self.clusters = [set(x) for x in X]
        while len(self.clusters) > self.n_clusters:
            

In [57]:
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(cancer_features, max_iter=10)
    pred = kmeans.predict(cancer_features)
    print(f"purity for KMeans on {n_clusters} clusters on cancer data = {purity(y_pred=pred, y_true=cancer_labels)}")

purity for 2 clusters on cancer data = 0.8541300527240774
purity for 3 clusters on cancer data = 0.8840070298769771
purity for 4 clusters on cancer data = 0.8980667838312829
purity for 5 clusters on cancer data = 0.8769771528998243
purity for 6 clusters on cancer data = 0.8734622144112478
purity for 7 clusters on cancer data = 0.8963093145869947
purity for 8 clusters on cancer data = 0.8910369068541301
purity for 9 clusters on cancer data = 0.8892794376098418
purity for 10 clusters on cancer data = 0.9314586994727593


In [58]:
cancer_normed_features = MinMaxScaler().fit_transform(cancer_features)
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(cancer_normed_features, max_iter=10)
    pred = kmeans.predict(cancer_normed_features)
    print(f"purity for KMeans on {n_clusters} clusters on normed cancer data = {purity(y_pred=pred, y_true=cancer_labels)}")

purity for 2 clusters on normed cancer data = 0.9279437609841827
purity for 3 clusters on normed cancer data = 0.9086115992970123
purity for 4 clusters on normed cancer data = 0.8927943760984183
purity for 5 clusters on normed cancer data = 0.8822495606326889
purity for 6 clusters on normed cancer data = 0.9156414762741653




purity for 7 clusters on normed cancer data = 0.6274165202108963
purity for 8 clusters on normed cancer data = 0.8857644991212654
purity for 9 clusters on normed cancer data = 0.9560632688927944
purity for 10 clusters on normed cancer data = 0.9490333919156415
