In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as km
from sklearn.cluster import AgglomerativeClustering as ag
from auxiliary import *
import csv
import scipy.stats
import pandas as pd

In [2]:
mnist_X = np.load('mnist-sample-X.npy')
mnist_Y = np.load('mnist-sample-y.npy')

In [3]:
def majority_voting(cluster_ids, true_labels, nclasses=10):
    cluster_preds = np.zeros(len(cluster_ids))
    
    for c in range(nclasses):
        #so c goes from 0 to 9
        pred_class = cluster_ids == c
        true_labels_c = true_labels[pred_class]
        maj_vote = scipy.stats.mode(true_labels_c)[0].item()
        
        cluster_preds[pred_class] = maj_vote
    
    pred_diff = cluster_preds - mnist_Y
    n_ = len(cluster_ids)
    training_error = sum( pred_diff != 0 )/n_ * 100 ###counts number of wrong predictions
    return cluster_preds, training_error

In [4]:
#linkage options: {“ward”, “complete”, “average”, “single”},
#affinity: 'euclidean', 'manhattan' (ward only works with euclidean)

model_euc_ward=ag(n_clusters=10,affinity='euclidean',linkage='ward')
clusters_euc_ward=model_euc_ward.fit_predict(mnist_X)
euc_ward_preds, euc_ward_training = majority_voting(clusters_euc_ward,mnist_Y)
print('Percent incorrect with Euclidean/Ward combo: ',euc_ward_training)

Percent incorrect with Euclidean/Ward combo:  35.52


In [5]:
model_euc_com=ag(n_clusters=10,affinity='euclidean',linkage='complete')
clusters_euc_com=model_euc_com.fit_predict(mnist_X)
euc_com_preds, euc_com_training = majority_voting(clusters_euc_com,mnist_Y)
print('Percent incorrect with Euclidean/Complete combo: ',euc_com_training)

Percent incorrect with Euclidean/Complete combo:  62.160000000000004


In [6]:
model_euc_avg=ag(n_clusters=10,affinity='euclidean',linkage='average')
clusters_euc_avg=model_euc_avg.fit_predict(mnist_X)
euc_avg_preds, euc_avg_training = majority_voting(clusters_euc_avg,mnist_Y)
print('Percent incorrect with Euclidean/Average combo: ',euc_com_training)

Percent incorrect with Euclidean/Average combo:  62.160000000000004


In [7]:
model_euc_sin=ag(n_clusters=10,affinity='euclidean',linkage='single')
clusters_euc_sin=model_euc_sin.fit_predict(mnist_X)
euc_sin_preds, euc_sin_training = majority_voting(clusters_euc_sin,mnist_Y)
print('Percent incorrect with Euclidean/Single combo: ',euc_sin_training)

Percent incorrect with Euclidean/Single combo:  89.16


In [8]:
model_man_avg=ag(n_clusters=10,affinity='manhattan',linkage='average')
clusters_man_avg=model_man_avg.fit_predict(mnist_X)
man_avg_preds, man_avg_training = majority_voting(clusters_man_avg,mnist_Y)
print('Percent incorrect with Manhattan/Average combo: ',man_avg_training)

Percent incorrect with Manhattan/Average combo:  77.58


In [9]:
model_km =km(n_clusters=10,init='random',algorithm='full',n_init=1)
km_clusters = model_km.fit_predict(mnist_X)
km_preds, km_training = majority_voting(km_clusters,mnist_Y)
print('Percent incorrect with Kmeans: ',km_training)

Percent incorrect with Kmeans:  41.78


In [10]:
model_kmpp =km(n_clusters=10,init='k-means++',algorithm='full',n_init=1)
kmpp_clusters = model_km.fit_predict(mnist_X)
kmpp_preds, km_training = majority_voting(km_clusters,mnist_Y)
print('Percent incorrect with Kmeans: ',km_training)

Percent incorrect with Kmeans:  41.78
