In [1]:
#@title Don't forget to upload usps.h5

import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred): # from https://stackoverflow.com/a/51672699/7947996; in [0,1]; 0-bad,1-good
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

from sklearn.metrics.cluster import adjusted_rand_score # in [0,1]; 0-bad,1-good
from sklearn.metrics.cluster import normalized_mutual_info_score # in [0,1]; 0-bad,1-good

!pip install coclust
from coclust.evaluation.external import accuracy # in [0,1]; 0-bad,1-good

def get_data_20news():
  import tensorflow as tf
  from sklearn.datasets import fetch_20newsgroups
  from sklearn.feature_extraction.text import TfidfVectorizer

  _20news = fetch_20newsgroups(subset="all")
  data = _20news.data
  target = _20news.target

  vectorizer = TfidfVectorizer(max_features=2000)
  data = vectorizer.fit_transform(data)
  data = data.toarray()

  return data, target


def get_data_mnist():
  import tensorflow as tf
  mnist = tf.keras.datasets.mnist
  (x_train, y_train),(x_test, y_test) = mnist.load_data()

  x_train = np.concatenate((x_train,x_test))
  y_train = np.concatenate((y_train,y_test))

  real_labels = y_train

  # # indices = np.isin(y_train,range(number_of_dist))
  # x_train = x_train[indices]
  # y_train = y_train[indices]

  samples = (x_train.reshape((x_train.shape[0],-1))/255.).astype(np.float32)
  
  return samples, real_labels

def get_data_cifar10():
  import tensorflow as tf
  mnist = tf.keras.datasets.cifar10
  (x_train, y_train),(x_test, y_test) = mnist.load_data()

  x_train = np.concatenate((x_train,x_test))
  y_train = np.concatenate((y_train,y_test))
  y_train = y_train.squeeze()

  real_labels = y_train

  # # indices = np.isin(y_train,range(number_of_dist))
  # x_train = x_train[indices]
  # y_train = y_train[indices]

  samples = (x_train.reshape((x_train.shape[0],-1))/255.).astype(np.float32)
  
  return samples, real_labels

def get_data_mnist5():
  import tensorflow as tf
  mnist = tf.keras.datasets.mnist
  (x_train, y_train),(x_test, y_test) = mnist.load_data()

  x_train = np.concatenate((x_train,x_test))
  y_train = np.concatenate((y_train,y_test))

  indices = y_train < 5
  x_train = x_train[indices]
  y_train = y_train[indices]

  real_labels = y_train

  samples = (x_train.reshape((x_train.shape[0],-1))/255.).astype(np.float32)
  
  return samples, real_labels

def get_data_fmnist():
  import tensorflow as tf
  mnist = tf.keras.datasets.fashion_mnist
  (x_train, y_train),(x_test, y_test) = mnist.load_data()

  x_train = np.concatenate((x_train,x_test))
  y_train = np.concatenate((y_train,y_test))

  real_labels = y_train

  # # indices = np.isin(y_train,range(number_of_dist))
  # x_train = x_train[indices]
  # y_train = y_train[indices]

  samples = (x_train.reshape((x_train.shape[0],-1))/255.).astype(np.float32)
  
  return samples, real_labels

def get_data_usps():
  import h5py
  path = "./usps.h5"
  with h5py.File(path, 'r') as hf:
    train = hf.get('train')
    X_tr = train.get('data')[:]
    y_tr = train.get('target')[:]
    test = hf.get('test')
    X_te = test.get('data')[:]
    y_te = test.get('target')[:]

  samples = np.concatenate((X_tr,X_te))
  real_labels = np.concatenate((y_tr,y_te))
  return samples, real_labels

original_data_name = "cifar10" # @param ["mnist", "mnist5", "cifar10", "fmnist", "20news", "usps"]

if original_data_name == "mnist":
    samples, real_labels = get_data_mnist()
elif original_data_name == "mnist5":
    samples, real_labels = get_data_mnist5()
elif original_data_name == "cifar10":
    samples, real_labels = get_data_cifar10()
elif original_data_name == "fmnist":
    samples, real_labels = get_data_fmnist()
elif original_data_name == "20news":
    samples, real_labels = get_data_20news()
elif original_data_name == "usps":
    samples, real_labels = get_data_usps()

  
k = len(np.unique(real_labels))
n_init = 10





### Random

In [2]:
predicted_random = np.random.randint(k,size=len(real_labels))

print(purity_score(real_labels,predicted_random))
print(adjusted_rand_score(real_labels,predicted_random))
print(normalized_mutual_info_score(real_labels,predicted_random))
print(accuracy(real_labels,predicted_random))

0.10635
2.997195984511677e-05
0.00035189400777027184
0.10553333333333334




### k-means

In [3]:
from sklearn.cluster import KMeans
import numpy as np
X = samples
kmeans = KMeans(n_clusters=k,n_init=n_init).fit(X)
predicted_km = kmeans.predict(X)

print(purity_score(real_labels,predicted_km))
print(adjusted_rand_score(real_labels,predicted_km))
print(normalized_mutual_info_score(real_labels,predicted_km))
print(accuracy(real_labels,predicted_km))

0.22121666666666667
0.0417808638470051
0.07932388205690752
0.20623333333333332




In [4]:
import sklearn.metrics
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_km)
print(matrix)
print(matrix/matrix.sum(axis=1, keepdims=True))

[[ 642  912  592  161  485 1188  272  347 1154  247]
 [1028  588  996  455  490  184  892  518  258  591]
 [ 393  213  324  410  356  650 1087 1404  446  717]
 [ 580  143  393  900  460  550  856  979  309  830]
 [ 308  205  335 1115  212  393 1243 1325  140  724]
 [1037  112  248 1013  339  733  676  982  198  662]
 [ 450   53  174  553  459  170 1424 1226  207 1284]
 [ 477  253  857  884  683  366  764 1202  140  374]
 [1011 1889 1093  253  210  635  193  209  283  224]
 [ 493  907 2056  192  722  167  637  466  200  160]]
[[0.107      0.152      0.09866667 0.02683333 0.08083333 0.198
  0.04533333 0.05783333 0.19233333 0.04116667]
 [0.17133333 0.098      0.166      0.07583333 0.08166667 0.03066667
  0.14866667 0.08633333 0.043      0.0985    ]
 [0.0655     0.0355     0.054      0.06833333 0.05933333 0.10833333
  0.18116667 0.234      0.07433333 0.1195    ]
 [0.09666667 0.02383333 0.0655     0.15       0.07666667 0.09166667
  0.14266667 0.16316667 0.0515     0.13833333]
 [0.05133333 0

### GMM

In [5]:
import numpy as np
from sklearn.mixture import GaussianMixture
X = samples
gm = GaussianMixture(n_components=k,n_init=n_init).fit(X)
predicted_gmm = gm.predict(X)

print(purity_score(real_labels,predicted_gmm))
print(adjusted_rand_score(real_labels,predicted_gmm))
print(normalized_mutual_info_score(real_labels,predicted_gmm))
print(accuracy(real_labels,predicted_gmm))

0.20651666666666665
0.0226805969131825
0.05698943167015307
0.19153333333333333




In [6]:
import sklearn.metrics
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_km)
print(matrix)
print(matrix/matrix.sum(axis=1, keepdims=True))

[[ 642  912  592  161  485 1188  272  347 1154  247]
 [1028  588  996  455  490  184  892  518  258  591]
 [ 393  213  324  410  356  650 1087 1404  446  717]
 [ 580  143  393  900  460  550  856  979  309  830]
 [ 308  205  335 1115  212  393 1243 1325  140  724]
 [1037  112  248 1013  339  733  676  982  198  662]
 [ 450   53  174  553  459  170 1424 1226  207 1284]
 [ 477  253  857  884  683  366  764 1202  140  374]
 [1011 1889 1093  253  210  635  193  209  283  224]
 [ 493  907 2056  192  722  167  637  466  200  160]]
[[0.107      0.152      0.09866667 0.02683333 0.08083333 0.198
  0.04533333 0.05783333 0.19233333 0.04116667]
 [0.17133333 0.098      0.166      0.07583333 0.08166667 0.03066667
  0.14866667 0.08633333 0.043      0.0985    ]
 [0.0655     0.0355     0.054      0.06833333 0.05933333 0.10833333
  0.18116667 0.234      0.07433333 0.1195    ]
 [0.09666667 0.02383333 0.0655     0.15       0.07666667 0.09166667
  0.14266667 0.16316667 0.0515     0.13833333]
 [0.05133333 0