In [1]:
#@title Don't forget to upload usps.h5

import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred): # from https://stackoverflow.com/a/51672699/7947996; in [0,1]; 0-bad,1-good
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

from sklearn.metrics.cluster import adjusted_rand_score # in [0,1]; 0-bad,1-good
from sklearn.metrics.cluster import normalized_mutual_info_score # in [0,1]; 0-bad,1-good

!pip install coclust
from coclust.evaluation.external import accuracy # in [0,1]; 0-bad,1-good

def get_data_20news():
  import tensorflow as tf
  from sklearn.datasets import fetch_20newsgroups
  from sklearn.feature_extraction.text import TfidfVectorizer

  _20news = fetch_20newsgroups(subset="all")
  data = _20news.data
  target = _20news.target

  vectorizer = TfidfVectorizer(max_features=2000)
  data = vectorizer.fit_transform(data)
  data = data.toarray()

  return data, target


def get_data_mnist():
  import tensorflow as tf
  mnist = tf.keras.datasets.mnist
  (x_train, y_train),(x_test, y_test) = mnist.load_data()

  x_train = np.concatenate((x_train,x_test))
  y_train = np.concatenate((y_train,y_test))

  real_labels = y_train

  # # indices = np.isin(y_train,range(number_of_dist))
  # x_train = x_train[indices]
  # y_train = y_train[indices]

  samples = (x_train.reshape((x_train.shape[0],-1))/255.).astype(np.float32)
  
  return samples, real_labels


def get_data_fmnist():
  import tensorflow as tf
  mnist = tf.keras.datasets.fashion_mnist
  (x_train, y_train),(x_test, y_test) = mnist.load_data()

  x_train = np.concatenate((x_train,x_test))
  y_train = np.concatenate((y_train,y_test))

  real_labels = y_train

  # # indices = np.isin(y_train,range(number_of_dist))
  # x_train = x_train[indices]
  # y_train = y_train[indices]

  samples = (x_train.reshape((x_train.shape[0],-1))/255.).astype(np.float32)
  
  return samples, real_labels

def get_data_usps():
  import h5py
  path = "./usps.h5"
  with h5py.File(path, 'r') as hf:
    train = hf.get('train')
    X_tr = train.get('data')[:]
    y_tr = train.get('target')[:]
    test = hf.get('test')
    X_te = test.get('data')[:]
    y_te = test.get('target')[:]

  samples = np.concatenate((X_tr,X_te))
  real_labels = np.concatenate((y_tr,y_te))
  return samples, real_labels

original_data_name = "fmnist" # @param ["mnist", "fmnist", "20news", "usps"]

if original_data_name == "mnist":
    samples, real_labels = get_data_mnist()
elif original_data_name == "fmnist":
    samples, real_labels = get_data_fmnist()
elif original_data_name == "20news":
    samples, real_labels = get_data_20news()
elif original_data_name == "usps":
    samples, real_labels = get_data_usps()
  
k = len(np.unique(real_labels))
n_init = 10





### Random

In [2]:
predicted_random = np.random.randint(k,size=len(real_labels))

print(purity_score(real_labels,predicted_random))
print(adjusted_rand_score(real_labels,predicted_random))
print(normalized_mutual_info_score(real_labels,predicted_random))
print(accuracy(real_labels,predicted_random))

0.1065
2.381189318612532e-05
0.0002966918446209515
0.10611428571428572




### k-means

In [3]:
from sklearn.cluster import KMeans
import numpy as np
X = samples
kmeans = KMeans(n_clusters=k,n_init=n_init).fit(X)
predicted_km = kmeans.predict(X)

print(purity_score(real_labels,predicted_km))
print(adjusted_rand_score(real_labels,predicted_km))
print(normalized_mutual_info_score(real_labels,predicted_km))
print(accuracy(real_labels,predicted_km))

0.5546285714285715
0.34812705737755
0.5119649719039909
0.4740714285714286




In [4]:
import sklearn.metrics
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_km)
print(matrix)
print(matrix/matrix.sum(axis=1, keepdims=True))

[[1845  202    0    0  229  662   33   27    3 3999]
 [ 158   72    0    0 6307  172    0    3    0  288]
 [2135 4094    1    0   13  559   30   30    1  137]
 [ 649   59    0    0 3714  599   10    7    0 1962]
 [1232 4229    0    0  182  288   34   19    0 1016]
 [  36    0  306  547    1 4439   13    4 1652    2]
 [2436 2272    1    0   72  870   18   76    6 1249]
 [   0    0   28  943    0  582    0    1 5446    0]
 [ 281  336    7   72   32  565 2855 2552  274   26]
 [  40    1 3117 3444    3  200    2    4  187    2]]
[[2.63571429e-01 2.88571429e-02 0.00000000e+00 0.00000000e+00
  3.27142857e-02 9.45714286e-02 4.71428571e-03 3.85714286e-03
  4.28571429e-04 5.71285714e-01]
 [2.25714286e-02 1.02857143e-02 0.00000000e+00 0.00000000e+00
  9.01000000e-01 2.45714286e-02 0.00000000e+00 4.28571429e-04
  0.00000000e+00 4.11428571e-02]
 [3.05000000e-01 5.84857143e-01 1.42857143e-04 0.00000000e+00
  1.85714286e-03 7.98571429e-02 4.28571429e-03 4.28571429e-03
  1.42857143e-04 1.95714286e-02

### GMM

In [5]:
import numpy as np
from sklearn.mixture import GaussianMixture
X = samples
gm = GaussianMixture(n_components=k,n_init=n_init).fit(X)
predicted_gmm = gm.predict(X)

print(purity_score(real_labels,predicted_gmm))
print(adjusted_rand_score(real_labels,predicted_gmm))
print(normalized_mutual_info_score(real_labels,predicted_gmm))
print(accuracy(real_labels,predicted_gmm))

0.5587428571428571
0.3728084346212849
0.5365879871727295
0.5396142857142857




In [6]:
import sklearn.metrics
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_km)
print(matrix)
print(matrix/matrix.sum(axis=1, keepdims=True))

[[1845  202    0    0  229  662   33   27    3 3999]
 [ 158   72    0    0 6307  172    0    3    0  288]
 [2135 4094    1    0   13  559   30   30    1  137]
 [ 649   59    0    0 3714  599   10    7    0 1962]
 [1232 4229    0    0  182  288   34   19    0 1016]
 [  36    0  306  547    1 4439   13    4 1652    2]
 [2436 2272    1    0   72  870   18   76    6 1249]
 [   0    0   28  943    0  582    0    1 5446    0]
 [ 281  336    7   72   32  565 2855 2552  274   26]
 [  40    1 3117 3444    3  200    2    4  187    2]]
[[2.63571429e-01 2.88571429e-02 0.00000000e+00 0.00000000e+00
  3.27142857e-02 9.45714286e-02 4.71428571e-03 3.85714286e-03
  4.28571429e-04 5.71285714e-01]
 [2.25714286e-02 1.02857143e-02 0.00000000e+00 0.00000000e+00
  9.01000000e-01 2.45714286e-02 0.00000000e+00 4.28571429e-04
  0.00000000e+00 4.11428571e-02]
 [3.05000000e-01 5.84857143e-01 1.42857143e-04 0.00000000e+00
  1.85714286e-03 7.98571429e-02 4.28571429e-03 4.28571429e-03
  1.42857143e-04 1.95714286e-02