# KMeans Clustering

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# deprecated the old names, so pick whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
import numpy as np
import sklearn
import matplotlib
import pandas as pd
import sys
# (display name, imported module) pairs whose versions are reported below.
libraries = (
    ('Matplotlib', matplotlib),
    ('Numpy', np),
    ('Pandas', pd),
)

print("Python Version:", sys.version, '\n')
for label, module in libraries:
    print('{0} Version: {1}'.format(label, module.__version__))

In [None]:
import numpy as np
import pandas as pd

class kmeans:
    """K-means clustering with random restarts.

    The model is fitted ``n_init`` times from different random starting
    centroids; the run with the lowest inertia (sum of squared distances from
    every sample to its cluster centroid) is kept.

    After ``fit`` the instance exposes:

    * ``clusters`` - list of ``kmeans.cluster`` objects of the best run, each
      carrying its centroid (``mean``) and assigned samples (``members``).
    * ``inertia`` - inertia of the best run (``None`` if ``n_init`` is 0).
    """

    def __init__(self, k = 5, random_seed=None, iters=1000, n_init=10):
        """
        Input:
            k (int): number of clusters.
            random_seed (int or None): if given, seeds NumPy's global random
                generator so that runs are reproducible.
            iters (int): maximum number of update rounds per restart.
            n_init (int): number of random restarts.
        """
        self._k = k
        self._iters = iters
        self._n_init = n_init
        # Compare against None so that a seed of 0 is honoured as well.
        if random_seed is not None:
            np.random.seed(random_seed)

    def computeDistance(self, clst, pt):
        """Return the Euclidean distance between a cluster's centroid and `pt`."""
        return np.sqrt(np.sum((np.asarray(clst.mean) - pt) ** 2))

    def classify(self, pt):
        """Add `pt` to the members of the cluster with the nearest centroid."""
        cluster_num = self.get_clust_id(pt)
        self.clusters[cluster_num].add_member(pt)

    def get_clust_id(self, pt):
        """Return the index of the cluster whose centroid is closest to `pt`."""
        return min(
            range(self._k),
            key=lambda i: self.computeDistance(self.clusters[i], pt),
        )

    def init_clusters(self, X):
        """Start a fresh run: pick `k` random samples as centroids and assign
        every sample of `X` to its nearest one.

        Raises:
            ValueError: if `k` is smaller than 1 or larger than the number of
                samples (there would not be one starting sample per cluster).
        """
        if not 1 <= self._k <= len(X):
            raise ValueError(
                "k must be between 1 and the number of samples "
                "(k={0}, samples={1})".format(self._k, len(X))
            )
        self.clusters = [self.cluster() for _ in range(self._k)]
        shuffled = np.copy(X)
        np.random.shuffle(shuffled)
        # Each cluster takes one row from the end of the shuffled copy, so the
        # starting centroids are distinct rows of X.
        for c, row in zip(self.clusters, shuffled[::-1]):
            c.mean = np.array(row, dtype=float)
        for p in X:
            self.classify(p)

    def fit_predict(self, X):
        """Fit the model on `X` and return the cluster index of every sample."""
        self.fit(X)
        return self.predict(X)

    def fit(self, X):
        """Cluster `X`, keeping the best of `n_init` random restarts.

        Input: X (array, dataframe, or series); a 1-D input is treated as
            samples with a single feature.

        Sets `self.clusters` and `self.inertia`.
        """
        X = self._as_samples(self.pandas_to_numpy(X))

        best_inertia = None
        best_clusters = []
        for _ in range(self._n_init):
            self.init_clusters(X)
            changed = True
            i = 0
            while changed and i < self._iters:
                # Move centroids to the mean of their current members, then
                # remember the assignment and start a new, empty one.
                for c in self.clusters:
                    c.get_mean()
                    c.set_prev_members()

                for p in X:
                    self.classify(p)

                # Converged once no cluster's membership moved.
                changed = any(c.is_changed() for c in self.clusters)
                i += 1

            # Centroids must reflect the final assignment before scoring.
            for c in self.clusters:
                c.get_mean()
            current_inertia = sum(
                (c.get_total_square_distance() for c in self.clusters), 0.
            )

            # `is None` rather than truthiness: a perfect inertia of 0.0 is a
            # valid best score and must not be overwritten by a worse run.
            if best_inertia is None or current_inertia < best_inertia:
                best_clusters = self.clusters
                best_inertia = current_inertia

        self.clusters = best_clusters
        self.inertia = best_inertia

    def predict(self, X):
        """Return an (n_samples, 1) array with the nearest-cluster index of
        every sample in `X`."""
        samples = self._as_samples(self.pandas_to_numpy(X))
        return np.array([[self.get_clust_id(pt)] for pt in samples])

    def pandas_to_numpy(self, x):
        """
        Checks if the input is a Dataframe or series, converts to numpy matrix for
        calculation purposes.
        ---
        Input: X (array, dataframe, or series)

        Output: X (array)
        """
        if isinstance(x, (pd.DataFrame, pd.Series)):
            # `as_matrix()` no longer exists in current pandas.
            return x.to_numpy()
        # Plain ndarrays pass through untouched; anything else is converted.
        return np.asarray(x)

    @staticmethod
    def _as_samples(X):
        """View 1-D input as a single-feature column so each sample is a vector."""
        return X.reshape(-1, 1) if X.ndim == 1 else X

    def score(self):
        """Return the inertia of the fitted model."""
        return self.inertia

    class cluster:
        """One cluster: its centroid plus current and previous member samples."""

        def __init__(self):
            self.mean = None
            self.members = []
            self.prev_members = []

        def set_prev_members(self):
            """Archive the current members and start an empty assignment."""
            self.prev_members = self.members
            self.members = []

        def add_member(self, pt):
            """Assign sample `pt` to this cluster."""
            self.members.append(pt)

        def is_changed(self):
            """Return True if the members differ from the previous round."""
            return not np.array_equal(self.members, self.prev_members)

        def get_mean(self):
            """Move the centroid to the mean of the members (any dimension).

            A cluster without members keeps its previous centroid so that it
            can still attract samples in a later round.
            """
            if not self.members:
                return
            self.mean = np.mean(self.members, axis=0)

        def get_total_square_distance(self):
            """Return the sum of squared distances from members to the centroid."""
            if not self.members:
                return 0.
            offsets = np.asarray(self.members) - self.mean
            return float(np.sum(offsets ** 2))

In [None]:
def get_data(n_clust = 3):
    """Sample 2-D Gaussian blobs of 50 points each, all with unit spread.

    The first blob is centred on (-5, -5). Each of the remaining
    `n_clust - 1` blobs gets integer centre coordinates drawn from
    `np.random.randint(-10, 10)`.

    Returns an array of shape (50 * number of blobs, 2).
    """
    def column(center):
        # 50 draws around `center`, shaped as a single column.
        return np.random.normal(center, 1, 50).reshape(-1, 1)

    x_parts = [column(-5)]
    y_parts = [column(-5)]
    for _ in range(n_clust - 1):
        # Centre is drawn first, then the samples: x before y for each blob.
        x_parts.append(column(np.random.randint(-10, 10)))
        y_parts.append(column(np.random.randint(-10, 10)))
    return np.hstack((np.vstack(x_parts), np.vstack(y_parts)))

# Sample five blobs of 50 points each and plot the raw points.
X = get_data(n_clust=5)
plt.scatter(X[:,0],X[:,1])

In [None]:
# Five-cluster model; seed, iteration cap and restart count stay at their defaults.
km = kmeans(k=5)

In [None]:
# Cluster the sampled points; fit() keeps the restart with the lowest inertia.
km.fit(X)

In [None]:
# Print the centroid of every fitted cluster.
for c in km.clusters:
    print(c.mean)

In [None]:
# Draw every cluster's members in its own colour and mark its centroid.
from itertools import cycle
colors = cycle('byrcmk')
preds = km.predict(X)
for cl in km.clusters:
    # Advance the palette for every cluster, drawn or not, so a cluster's
    # colour depends only on its position in km.clusters.
    col = next(colors)
    if not cl.members:
        # zip(*[]) yields nothing to unpack, and an empty cluster has no
        # points to show, so skip it instead of raising.
        continue
    xplot, yplot = zip(*cl.members)
    plt.scatter(xplot,yplot, c=col);
    plt.scatter(cl.mean[0],cl.mean[1],marker='x',s=300,c='magenta')

In [None]:
# Inertia of the best restart kept by fit().
km.inertia

In [None]:
# Refit with k = 1..20 under a fixed seed and record each model's score,
# to see how the score changes with the number of clusters.
inertia = []
for k in range(1,21):
    km = kmeans(k=k, random_seed=42)
    km.fit(X)
    inertia.append(km.score())

In [None]:
# Plot the recorded score against the number of clusters (1..20),
# with an x tick on every other value.
plt.plot(range(1,21),inertia);
plt.xlabel("Num Clusters")
plt.ylabel("Inertia")
plt.title("kMeans Score Report");
plt.xticks(np.arange(1,21,2));