In [12]:
import numpy as np
from scipy.spatial.distance import cdist


In [7]:
np.random.choice(1000,3,replace=False)

array([569, 159, 512])

In [26]:
test = np.random.choice(10,(20,3))
print(test)
centroids = test[np.random.choice(20,5)]
print("\n", centroids)

[[6 6 6]
 [6 3 8]
 [1 8 9]
 [5 3 6]
 [5 9 8]
 [4 6 8]
 [3 7 1]
 [5 4 9]
 [7 5 6]
 [5 7 6]
 [0 7 6]
 [4 1 3]
 [7 6 0]
 [8 7 4]
 [9 4 4]
 [3 2 6]
 [7 4 6]
 [5 6 7]
 [6 5 8]
 [9 9 1]]

 [[6 3 8]
 [7 4 6]
 [9 4 4]
 [4 6 8]
 [6 6 6]]


In [27]:
cdist(test, centroids)

array([[ 3.60555128,  2.23606798,  4.12310563,  2.82842712,  0.        ],
       [ 0.        ,  2.44948974,  5.09901951,  3.60555128,  3.60555128],
       [ 7.14142843,  7.81024968, 10.24695077,  3.74165739,  6.164414  ],
       [ 2.23606798,  2.23606798,  4.58257569,  3.74165739,  3.16227766],
       [ 6.08276253,  5.74456265,  7.54983444,  3.16227766,  3.74165739],
       [ 3.60555128,  4.12310563,  6.70820393,  0.        ,  2.82842712],
       [ 8.60232527,  7.07106781,  7.34846923,  7.14142843,  5.91607978],
       [ 1.73205081,  3.60555128,  6.40312424,  2.44948974,  3.74165739],
       [ 3.        ,  1.        ,  3.        ,  3.74165739,  1.41421356],
       [ 4.58257569,  3.60555128,  5.38516481,  2.44948974,  1.41421356],
       [ 7.48331477,  7.61577311,  9.69535971,  4.58257569,  6.08276253],
       [ 5.74456265,  5.19615242,  5.91607978,  7.07106781,  6.164414  ],
       [ 8.60232527,  6.32455532,  4.89897949,  8.54400375,  6.08276253],
       [ 6.        ,  3.74165739,  3.1

In [36]:
np.argmin(cdist(test, centroids, metric="euclidean"), axis=1)

array([4, 0, 3, 0, 3, 3, 4, 0, 1, 4, 3, 1, 2, 4, 2, 0, 1, 3, 0, 2])

In [37]:
np.inf

inf

In [40]:
np.zeros((3,3))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [44]:
np.sum(np.square(test - centroids[0]))

580

In [122]:
import numpy as np
from scipy.spatial.distance import cdist


class KMeans:
    def __init__(self, k: int, tol: float = 1e-6, max_iter: int = 100):
        """
        In this method you should initialize whatever attributes will be required for the class.

        You can also do some basic error handling.

        What should happen if the user provides the wrong input or wrong type of input for the
        argument k?

        inputs:
            k: int
                the number of centroids to use in cluster fitting
            tol: float
                the minimum error tolerance from previous error during optimization to quit the model fit
            max_iter: int
                the maximum number of iterations before quitting model fit
        """

        if k <= 0: raise Exception("k must be a positive integer") #TypeError

        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self, mat: np.ndarray):
        """
        Fits the kmeans algorithm onto a provided 2D matrix.
        As a bit of background, this method should not return anything.
        The intent here is to have this method find the k cluster centers from the data
        with the tolerance, then you will use .predict() to identify the
        clusters that best match some data that is provided.

        In sklearn there is also a fit_predict() method that combines these
        functions, but for now we will have you implement them both separately.

        inputs:
            mat: np.ndarray
                A 2D matrix where the rows are observations and columns are features
        """

        self.mat = mat
        self.observations, self.features = self.mat.shape

        if self.observations < self.k: raise Exception("Cannot assign " + str(self.observations) + " observations to " + str(self.k) + " clusters")

        self.centroids = mat[np.random.choice(self.observations, self.k, replace=False)]
        self.pred_labels = np.zeros((self.observations, 1))
     
        i = 0
        error = np.inf

        while i < self.max_iter and error > self.tol:
            self.pred_labels = self.predict(self.mat)
            print("pred labels: ", self.pred_labels)
            self.centroids = self.get_centroids()
            print("centroids: ", self.centroids)
            if i == 1: error = self.get_error()
            else: error -= self.get_error()
            i += 1


    def predict(self, mat: np.ndarray) -> np.ndarray:
        """
        Predicts the cluster labels for a provided matrix of data points--
            question: what sorts of data inputs here would prevent the code from running?
            How would you catch these sorts of end-user related errors?
            What if, for example, the matrix is of a different number of features than
            the data that the clusters were fit on?

        inputs:
            mat: np.ndarray
                A 2D matrix where the rows are observations and columns are features

        outputs:
            np.ndarray
                a 1D array with the cluster label for each of the observations in `mat`
        """
        
        return np.argmin(cdist(self.mat, self.centroids, metric="euclidean"), axis=1)


    def get_error(self) -> float:
        """
        Returns the final squared-mean error of the fit model. You can either do this by storing the
        original dataset or recording it following the end of model fitting.

        outputs:
            float
                the squared-mean error of the fit model
        """
        mse = np.zeros(self.k)

        for cluster in range(self.k):
            mse[cluster] = np.sum(np.square(self.mat[cluster == self.pred_labels] - self.centroids[cluster]))
        return np.sum(mse)

    def get_centroids(self) -> np.ndarray:
        """
        Returns the centroid locations of the fit model.

        outputs:
            np.ndarray
                a `k x m` 2D matrix representing the cluster centroids of the fit model
        """

        fit_centroids = np.zeros((self.k, self.features))

        for cluster in range(self.k):
            #print("centroids fit: ", self.mat[cluster == self.pred_labels, :])
            #break
            fit_centroids[cluster, :] = np.mean(self.mat[cluster == self.pred_labels, :], axis = 0)

        return fit_centroids




In [123]:
new = KMeans(3)
new

<__main__.KMeans at 0x7fde189ead90>

In [124]:
new.fit(test)

pred labels:  [2 0 1 0 1 1 2 1 2 1 1 0 2 2 0 0 0 1 1 2]
centroids:  [[5.66666667 2.83333333 5.5       ]
 [3.875      6.5        7.625     ]
 [6.66666667 6.66666667 3.        ]]
pred labels:  [1 0 1 0 1 1 2 1 0 1 1 0 2 2 2 0 0 1 1 2]
centroids:  [[5.33333333 3.         5.83333333]
 [4.11111111 6.44444444 7.44444444]
 [7.2        6.6        2.        ]]
pred labels:  [1 0 1 0 1 1 2 1 0 1 1 0 2 2 2 0 0 1 1 2]
centroids:  [[5.33333333 3.         5.83333333]
 [4.11111111 6.44444444 7.44444444]
 [7.2        6.6        2.        ]]


In [121]:
import numpy as np
from cluster import (
        KMeans, 
        Silhouette, 
        make_clusters,
        plot_clusters,
        plot_multipanel)


def main():

    # create tight clusters
    clusters, labels = make_clusters(scale=0.3)
    plot_clusters(clusters, labels, filename="figures/tight_clusters.png")

    # create loose clusters
    clusters, labels = make_clusters(scale=2)
    plot_clusters(clusters, labels, filename="figures/loose_clusters.png")

    """
    uncomment this section once you are ready to visualize your kmeans + silhouette implementation
    """
    clusters, labels = make_clusters(k=4, scale=1)
    km = KMeans(k=4)
    km.fit(clusters)
    pred = km.predict(clusters)
    #scores = Silhouette().score(clusters, pred)
    plot_multipanel(clusters, labels, pred, scores)
    

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'cluster'

In [179]:
a = [2,5,4]
cdist(a,test,metric="euclidean")

ValueError: XA must be a 2-dimensional array.

In [177]:
#print(test)
dists = cdist(test, test, metric="euclidean")
np.sum(dists, axis=1)/(dists.shape[0] - 1)
#== np.mean(dists, axis=1)
#dists
#print(dists, "\n\n\n\n\n")
#dists.ravel()[::dists.shape[1]+1] = dists.max()+1
#print(dists)
#print(test[dists.argmin(1)])
#np.triu(dists)

array([3.74426622, 4.73603335, 6.73372451, 4.28782175, 5.47567758,
       4.34892232, 6.47463949, 4.84355049, 3.95488106, 4.00596311,
       6.45355598, 6.33059406, 6.7427907 , 5.0094995 , 5.49479789,
       5.36490982, 4.14291871, 3.81448922, 4.16456103, 7.56206616])

In [305]:
import numpy as np
from scipy.spatial.distance import cdist


class Silhouette:
    def __init__(self):
        """
        inputs:
            none
        """

    def score(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        calculates the silhouette score for each of the observations

        inputs:
            X: np.ndarray
                A 2D matrix where the rows are observations and columns are features.

            y: np.ndarray
                a 1D array representing the cluster labels for each of the observations in `X`

        outputs:
            np.ndarray
                a 1D array with the silhouette scores for each of the observations in `X`
        """

        nclusters = np.amax(y) + 1
        nobs, nlabels = X.shape
        labels = np.arange(0, np.amax(y))
        a = np.zeros(nobs)

        for cluster in labels:
            
            rows, cols = np.where(X[cluster == y])

            cluster_type = X[cluster == y]
            print("cluster type: ", cluster_type)
            print("inds: ", rows)
            dists = cdist(cluster_type, cluster_type, metric="euclidean")
            
            #print("shape a: ", a[inds].shape)
            #print("shape mean: ", (np.sum(dists, axis=1) / (dists.shape[0] - 1)).shape)
            #print("inds: ", inds)
            #a[inds] = np.sum(dists, axis=1) / (dists.shape[0] - 1)

        b = np.zeros(nobs)

        for cluster in labels:
            #cluster_type = X[cluster != y]
            other_labels = np.delete(labels, cluster)
            min_dists = np.full(X[cluster == y].shape[0], np.inf)
            inds = np.where(X[cluster == y])
            
            for other_cluster in other_labels:
                cluster_type = X[other_cluster == y]

                dists = cdist(cluster_type, cluster_type, metric="euclidean")
                avg_dists_per_cluster = np.mean(dists, axis=1)
                min_dists = np.minimum(avg_dists_per_cluster, min_dists)

            b[inds] = min_dists

        return (b - a) / (np.maximum(a, b))


            #dists.ravel()[::dists.shape[1]+1] = dists.max()+1
            #test[dists.argmin(1)]

            #for obs in range(obs_in_cluster):
                # do not include pt itself
                #cdist(cluster_type[obs], cluster_type, metric="euclidean")



In [306]:
s = Silhouette()

labels=np.array([2, 0, 1, 0, 1, 1, 2, 1, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 2])
mean = np.mean(test, axis = 1)
print("mean before: ", mean)
ind = np.where(labels<1)
print("ind: ",ind)
mean[ind] = np.arange(400,406)
print("mean after: ", mean)

#s.score(test, labels)


mean before:  [6.         8.         5.66666667 3.66666667 1.66666667 2.33333333
 3.66666667 3.33333333 6.33333333 6.         3.         4.
 4.         5.         7.33333333 4.33333333 6.         2.66666667
 4.         3.66666667]
ind:  (array([ 1,  3, 11, 14, 15, 16]),)
mean after:  [  6.         400.           5.66666667 401.           1.66666667
   2.33333333   3.66666667   3.33333333   6.33333333   6.
   3.         402.           4.           5.         403.
 404.         405.           2.66666667   4.           3.66666667]


In [307]:
s = Silhouette()
test = np.random.choice(10,(20,3))
print("test: ", test)
labels=np.array([2, 0, 1, 0, 1, 1, 2, 1, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 2])
s.score(test,labels)

test:  [[1 5 0]
 [3 6 9]
 [7 5 4]
 [2 9 8]
 [6 1 7]
 [4 3 9]
 [1 3 1]
 [0 1 4]
 [6 8 3]
 [3 3 3]
 [4 1 5]
 [6 7 8]
 [5 6 5]
 [9 8 1]
 [8 7 7]
 [6 5 3]
 [4 2 6]
 [8 0 6]
 [1 8 0]
 [1 9 5]]


  rows, cols = np.where(X[cluster] == y)


ValueError: not enough values to unpack (expected 2, got 1)

In [190]:
c=np.arange(0,14)
c

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])

In [193]:
min_dists = np.full(test.shape[0], np.inf)
min_dists

array([inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf])

In [194]:
x=[0,1,2,3,4]
np.amax(x)

4

In [214]:
x>2

TypeError: '>' not supported between instances of 'list' and 'int'