In [1]:
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Read the training CSV into a DataFrame and keep a plain ndarray view of it
# for the positional slicing done in later cells.
ds = pd.read_csv('./train.csv')
data = ds.values

# Sanity check: array dimensions first, then numpy's abbreviated dump.
print(data.shape, data, sep='\n')

(42000, 785)
[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [7 0 0 ... 0 0 0]
 [6 0 0 ... 0 0 0]
 [9 0 0 ... 0 0 0]]


In [4]:
# Row index separating the first 80% of rows (train) from the rest (test).
split = int(0.8 * data.shape[0])

# Column 0 is used as the target; every remaining column is a feature.
X_train, y_train = data[:split, 1:], data[:split, 0]
X_test, y_test = data[split:, 1:], data[split:, 0]

In [5]:
def dist(p1, p2):
    """Return the Euclidean distance between two arrays.

    Computes the square root of the sum of squared element-wise differences
    over *all* elements of ``p1 - p2``.
    """
    delta = p1 - p2
    return np.sqrt(np.square(delta).sum())

def KMeansClustering(X_data, y_data, k=10, stop_iter=10, def_clusters=None):
    """Run a fixed number of K-means iterations and label each cluster.

    Parameters
    ----------
    X_data : 2-D array, one sample per row.
    y_data : 1-D array of labels aligned with the rows of ``X_data``.
    k : number of cluster ids to consider (ids are ``0 .. k-1``).
    stop_iter : number of assign/update iterations to perform.
    def_clusters : optional dict ``{id: {'center', 'points', 'answer', 'final'}}``
        to start from instead of a random initialisation. It is updated
        in place and returned.

    Returns
    -------
    dict mapping each surviving cluster id to a dict with:
      * ``'center'`` - mean of the samples assigned in the last iteration,
      * ``'final'``  - most frequent label among those samples,
      * ``'points'`` / ``'answer'`` - always empty lists on return (kept for
        compatibility with the original dict layout).
    Clusters that receive no samples in an iteration are removed.

    Raises
    ------
    ValueError
        If a cluster centre cannot be broadcast against the rows of
        ``X_data`` (previously this was silently treated as an infinitely
        distant cluster).
    """
    X_data = np.asarray(X_data)
    y_data = np.asarray(y_data)
    n_points, dim = X_data.shape

    if def_clusters is None:
        # Random initial centres drawn uniformly between the global min and
        # max of the data, independently for every coordinate.
        centers = np.random.uniform(low=X_data.min(), high=X_data.max(), size=(k, dim))
        clusters = {
            kx: {'center': centers[kx, :], 'points': [], 'answer': [], 'final': 0}
            for kx in range(k)
        }
    else:
        clusters = def_clusters

    curr_iter = 0
    while curr_iter < stop_iter:
        # --- Assignment step -------------------------------------------
        # Distance from every sample to every live centre. Removed (or
        # never-present) cluster ids keep an infinite distance so they can
        # never win the argmin while a live cluster exists.
        distances = np.full((n_points, k), np.inf)
        for kx in range(k):
            if kx in clusters:
                diff = X_data - clusters[kx]['center']
                distances[:, kx] = np.sqrt((diff ** 2).sum(axis=1))
        nearest = distances.argmin(axis=1)

        # --- Update step -----------------------------------------------
        for kx in range(k):
            if kx not in clusters:
                continue
            members = nearest == kx
            if not members.any():
                # No sample chose this cluster: drop it for good.
                del clusters[kx]
                continue
            cluster = clusters[kx]
            cluster['center'] = X_data[members].mean(axis=0)
            # Majority vote over the true labels of the assigned samples;
            # ties resolve to the smallest label (np.unique sorts, argmax
            # returns the first maximum).
            labels, counts = np.unique(y_data[members], return_counts=True)
            cluster['final'] = labels[counts.argmax()]
            cluster['points'] = []
            cluster['answer'] = []

        curr_iter += 1

    return clusters

In [8]:
# Seed NumPy's global RNG so the random centre initialisation (and therefore
# every number reported below) is reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)

main_centers = KMeansClustering(X_train, y_train, k=10, stop_iter=12)
# Number of clusters that survived (empty clusters are removed during fitting).
print(len(main_centers))

# Distinct majority labels across the surviving clusters, in first-seen order.
valid_values = []
for cluster in main_centers.values():
    if cluster['final'] not in valid_values:
        valid_values.append(cluster['final'])

print('Clusters formed for digits: ' + str(valid_values))

9
Clusters formed for digits: [1, 7, 8, 0, 6, 3, 4]


In [9]:
def hellinger_dist(x1, x2):
    """Return sqrt(0.5 * sum((sqrt(x1) - sqrt(x2))**2)).

    This is the Hellinger-distance formula applied element-wise to the two
    arrays as given; no normalisation is performed here.
    NOTE(review): inputs are presumably non-negative, since the element-wise
    square roots are taken directly.
    """
    root_gap = np.sqrt(x1) - np.sqrt(x2)
    half_sq_sum = 0.5 * (root_gap ** 2).sum()
    return np.sqrt(half_sq_sum)

def findCluster(x):
    """Return the 'final' label of the cluster in ``main_centers`` whose
    centre has the smallest ``hellinger_dist`` to ``x``.

    Ties go to the cluster encountered first when iterating ``main_centers``
    (the sort is stable and keyed on distance only).
    """
    ranked = sorted(
        (
            (hellinger_dist(x, cluster['center']), cluster['final'])
            for cluster in main_centers.values()
        ),
        key=lambda pair: pair[0],
    )
    return ranked[0][1]

In [11]:
# Classify every held-out sample by its nearest cluster centre and compare
# the cluster's majority label with the true label.
correct = 0
incorrect = 0
start = datetime.datetime.now()

for sample, label in zip(X_test, y_test):
    if findCluster(sample) == label:
        correct += 1
    else:
        incorrect += 1

end = datetime.datetime.now()

total = correct + incorrect
# An empty test set has no defined accuracy; report NaN instead of raising
# ZeroDivisionError.
accuracy = (float(correct) / total) * 100 if total else float('nan')
# The model is K-means with nearest-centroid lookup, not K-nearest neighbours.
print('Accuracy for K-Means nearest-centroid classifier: ', accuracy)
# total_seconds() covers the whole elapsed interval (including fractions of a
# second and any day component), unlike the .seconds attribute.
print('Time Taken: ', (end - start).total_seconds(), 'seconds')

Accuracy for K-Nearest Neighbours:  44.285714285714285
Time Taken:  1 seconds
