In [1]:
#@title Don't forget to upload usps.h5

import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred):
    """Return the clustering purity of ``y_pred`` measured against ``y_true``.

    Adapted from https://stackoverflow.com/a/51672699/7947996.
    The score lies in [0, 1]; higher is better.

    Args:
        y_true: ground-truth class label per sample.
        y_pred: predicted cluster label per sample.

    Returns:
        Sum over clusters of the size of the largest true class inside
        each cluster, divided by the total number of samples.
    """
    # Contingency (confusion) table: rows are true classes, columns are clusters.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # For every cluster (column) keep the count of its dominant true class.
    dominant_per_cluster = table.max(axis=0)
    return dominant_per_cluster.sum() / table.sum()

from sklearn.metrics.cluster import adjusted_rand_score # in [-0.5,1]; ~0 for random labelings (can be negative), 1-perfect match
from sklearn.metrics.cluster import normalized_mutual_info_score # in [0,1]; 0-bad,1-good

!pip install coclust
from coclust.evaluation.external import accuracy # in [0,1]; 0-bad,1-good

def get_data_20news():
  """Load the full 20 Newsgroups corpus as dense TF-IDF features.

  Returns:
    (data, target): ``data`` is the dense array obtained from a
    ``TfidfVectorizer(max_features=2000)`` fitted on every document of the
    ``subset="all"`` split; ``target`` is the label array shipped with the
    dataset.
  """
  # Only scikit-learn is needed here; TensorFlow is deliberately not imported.
  from sklearn.datasets import fetch_20newsgroups
  from sklearn.feature_extraction.text import TfidfVectorizer

  _20news = fetch_20newsgroups(subset="all")

  # Cap the vocabulary at 2000 terms so the densified matrix stays manageable.
  vectorizer = TfidfVectorizer(max_features=2000)
  # fit_transform yields a sparse matrix; downstream code expects a dense array.
  data = vectorizer.fit_transform(_20news.data).toarray()

  return data, _20news.target


def get_data_mnist():
  """Load MNIST (train and test splits merged) as flattened float32 rows.

  Returns:
    (samples, real_labels): ``samples`` holds one flattened image per row,
    scaled by 1/255 and cast to float32; ``real_labels`` is the matching
    concatenated label array.
  """
  import tensorflow as tf

  (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

  # Clustering is unsupervised, so both splits are pooled into one set.
  images = np.concatenate((train_images, test_images))
  real_labels = np.concatenate((train_labels, test_labels))

  # Flatten every image to a single row, then rescale and cast.
  flat = images.reshape((images.shape[0], -1))
  samples = (flat / 255.).astype(np.float32)

  return samples, real_labels

def get_data_cifar10():
  """Load CIFAR-10 (train and test splits merged) as flattened float32 rows.

  Returns:
    (samples, real_labels): ``samples`` holds one flattened image per row,
    scaled by 1/255 and cast to float32; ``real_labels`` is the concatenated
    label array with singleton dimensions squeezed out.
  """
  import tensorflow as tf

  cifar10 = tf.keras.datasets.cifar10
  (train_images, train_labels), (test_images, test_labels) = cifar10.load_data()

  # Pool both splits; clustering does not use a train/test separation.
  images = np.concatenate((train_images, test_images))
  # squeeze() removes singleton axes from the label array
  # (presumably labels arrive as a column vector -- confirm against Keras docs).
  real_labels = np.concatenate((train_labels, test_labels)).squeeze()

  # Flatten every image to a single row, then rescale and cast.
  flat = images.reshape((images.shape[0], -1))
  samples = (flat / 255.).astype(np.float32)

  return samples, real_labels

def get_data_mnist5():
  """Load the MNIST samples whose label is below 5, as flattened float32 rows.

  Train and test splits are merged before filtering.

  Returns:
    (samples, real_labels): ``samples`` holds one flattened image per row,
    scaled by 1/255 and cast to float32; ``real_labels`` holds the labels of
    the retained samples (all strictly less than 5).
  """
  import tensorflow as tf

  (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

  all_images = np.concatenate((train_images, test_images))
  all_labels = np.concatenate((train_labels, test_labels))

  # Boolean mask selecting only the samples labelled 0-4.
  keep = all_labels < 5
  images = all_images[keep]
  real_labels = all_labels[keep]

  # Flatten every image to a single row, then rescale and cast.
  flat = images.reshape((images.shape[0], -1))
  samples = (flat / 255.).astype(np.float32)

  return samples, real_labels

def get_data_fmnist():
  """Load Fashion-MNIST (train and test splits merged) as flattened float32 rows.

  Returns:
    (samples, real_labels): ``samples`` holds one flattened image per row,
    scaled by 1/255 and cast to float32; ``real_labels`` is the matching
    concatenated label array.
  """
  import tensorflow as tf

  fashion_mnist = tf.keras.datasets.fashion_mnist
  (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

  # Pool both splits; clustering does not use a train/test separation.
  images = np.concatenate((train_images, test_images))
  real_labels = np.concatenate((train_labels, test_labels))

  # Flatten every image to a single row, then rescale and cast.
  flat = images.reshape((images.shape[0], -1))
  samples = (flat / 255.).astype(np.float32)

  return samples, real_labels

def get_data_usps():
  """Load USPS from the local ``./usps.h5`` file, merging train and test.

  The file must be present in the working directory (upload it first).
  It is read through its ``train`` and ``test`` groups, each providing a
  ``data`` and a ``target`` dataset.

  Returns:
    (samples, real_labels): train data followed by test data, and the
    corresponding concatenated targets.
  """
  import h5py

  with h5py.File("./usps.h5", 'r') as h5_file:
    train_group = h5_file.get('train')
    test_group = h5_file.get('test')
    # `[:]` copies each dataset into memory so it remains usable after the file closes.
    train_x, train_y = train_group.get('data')[:], train_group.get('target')[:]
    test_x, test_y = test_group.get('data')[:], test_group.get('target')[:]

  return np.concatenate((train_x, test_x)), np.concatenate((train_y, test_y))

original_data_name = "cifar10" # @param ["mnist", "mnist5", "cifar10", "fmnist", "20news", "usps"]

# Map every selectable dataset name to its loader; each loader returns (samples, real_labels).
_data_loaders = {
    "mnist": get_data_mnist,
    "mnist5": get_data_mnist5,
    "cifar10": get_data_cifar10,
    "fmnist": get_data_fmnist,
    "20news": get_data_20news,
    "usps": get_data_usps,
}

if original_data_name not in _data_loaders:
    # Fail loudly: otherwise an unknown name either raises a confusing NameError
    # below (fresh kernel) or silently reuses the previously loaded dataset.
    raise ValueError(
        "Unknown dataset %r; expected one of %s"
        % (original_data_name, sorted(_data_loaders))
    )

samples, real_labels = _data_loaders[original_data_name]()

# Number of clusters = number of distinct ground-truth labels.
k = len(np.unique(real_labels))
# Number of restarts handed to the clustering models in the cells below.
n_init = 10
# Target dimensionality for the PCA projection; set to None to skip PCA.
dim_pca = 100

if dim_pca is not None:
    from sklearn.decomposition import PCA
    X = samples
    pca = PCA(n_components=dim_pca)
    # From here on `samples` refers to the PCA-projected data.
    samples = pca.fit_transform(X)

Collecting coclust
  Downloading https://files.pythonhosted.org/packages/5d/44/ad5a69c7187c2b7bcf2c45596e9052811a3be52f4fcaa6709937c5146ee2/coclust-0.2.1.tar.gz
Building wheels for collected packages: coclust
  Building wheel for coclust (setup.py) ... [?25l[?25hdone
  Created wheel for coclust: filename=coclust-0.2.1-cp37-none-any.whl size=29871 sha256=ee54dd41657e88a4238afb42b14985759c1b2d3357acda37816b4c07587e2883
  Stored in directory: /root/.cache/pip/wheels/cd/d7/68/df601d0b5f8b934cf890dc626c2271df381fb0c3e910b0a34e
Successfully built coclust
Installing collected packages: coclust
Successfully installed coclust-0.2.1




Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


### Random

In [2]:
# Random baseline: assign every sample to one of the k clusters uniformly at random.
# A seeded generator keeps the baseline scores identical across re-runs.
rng = np.random.default_rng(0)
predicted_random = rng.integers(k, size=len(real_labels))

print(purity_score(real_labels,predicted_random))
print(adjusted_rand_score(real_labels,predicted_random))
print(normalized_mutual_info_score(real_labels,predicted_random))
print(accuracy(real_labels,predicted_random))

0.10578333333333333
2.0707135223154302e-05
0.0003345272053564096
0.1055




### k-means

In [3]:
from sklearn.cluster import KMeans
import numpy as np
X = samples
# k-means initialisation is random; pin random_state so the reported scores
# are reproducible under Restart & Run All.
kmeans = KMeans(n_clusters=k, n_init=n_init, random_state=0).fit(X)
predicted_km = kmeans.predict(X)

print(purity_score(real_labels,predicted_km))
print(adjusted_rand_score(real_labels,predicted_km))
print(normalized_mutual_info_score(real_labels,predicted_km))
print(accuracy(real_labels,predicted_km))

0.22128333333333333
0.041738892707942295
0.07927056034155963
0.20603333333333335




In [4]:
import sklearn.metrics
# Contingency table of true labels (first argument) against k-means clusters (second argument).
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_km)
row_totals = matrix.sum(axis=1, keepdims=True)
print(matrix)
# Row-normalised view: each row is divided by its own total, so rows sum to 1.
print(matrix / row_totals)

[[ 158  913 1177  492  274  651  250  587 1147  351]
 [ 471  587  180  489  889 1025  592  989  256  522]
 [ 417  217  631  367 1092  403  713  322  444 1394]
 [ 910  143  554  459  858  568  829  393  307  979]
 [1123  206  391  209 1242  312  721  333  139 1324]
 [1040  113  737  341  678 1012  665  245  196  973]
 [ 569   53  174  457 1440  431 1282  171  206 1217]
 [ 882  250  363  675  772  475  374  857  139 1213]
 [ 255 1894  616  210  191 1048  225 1077  281  203]
 [ 192  892  159  716  638  512  162 2056  200  473]]
[[0.02633333 0.15216667 0.19616667 0.082      0.04566667 0.1085
  0.04166667 0.09783333 0.19116667 0.0585    ]
 [0.0785     0.09783333 0.03       0.0815     0.14816667 0.17083333
  0.09866667 0.16483333 0.04266667 0.087     ]
 [0.0695     0.03616667 0.10516667 0.06116667 0.182      0.06716667
  0.11883333 0.05366667 0.074      0.23233333]
 [0.15166667 0.02383333 0.09233333 0.0765     0.143      0.09466667
  0.13816667 0.0655     0.05116667 0.16316667]
 [0.18716667 

### GMM

In [5]:
import numpy as np
from sklearn.mixture import GaussianMixture
X = samples
# GMM initialisation is random; pin random_state so the reported scores
# are reproducible under Restart & Run All.
gm = GaussianMixture(n_components=k, n_init=n_init, random_state=0).fit(X)
predicted_gmm = gm.predict(X)

print(purity_score(real_labels,predicted_gmm))
print(adjusted_rand_score(real_labels,predicted_gmm))
print(normalized_mutual_info_score(real_labels,predicted_gmm))
print(accuracy(real_labels,predicted_gmm))

0.298
0.09402174449013917
0.1619164030939813
0.28731666666666666




In [6]:
import sklearn.metrics
# Contingency table of true labels (first argument) against GMM components (second argument).
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_gmm)
row_totals = matrix.sum(axis=1, keepdims=True)
print(matrix)
# Row-normalised view: each row is divided by its own total, so rows sum to 1.
print(matrix / row_totals)

[[ 228  835 1144  285  200 1040 1012  387  417  452]
 [ 397  299   39   82  105  978  459    6 2431 1204]
 [ 323  299  132 1314 1045  328  301 1409  355  494]
 [ 387   91    9 1855  921  349  169  312  521 1386]
 [ 592  323   86  607 1369  395  585 1351  364  328]
 [ 537   65    6 2385  659  230  100  385  369 1264]
 [ 148  149   27  542 2726  301   96  806  581  624]
 [2627   86    8  597  344  432  353  114  477  962]
 [ 138 1596  800  137  212 1077  873  107  725  335]
 [ 478  169   44  149  110  950  377    5 2245 1473]]
[[0.038      0.13916667 0.19066667 0.0475     0.03333333 0.17333333
  0.16866667 0.0645     0.0695     0.07533333]
 [0.06616667 0.04983333 0.0065     0.01366667 0.0175     0.163
  0.0765     0.001      0.40516667 0.20066667]
 [0.05383333 0.04983333 0.022      0.219      0.17416667 0.05466667
  0.05016667 0.23483333 0.05916667 0.08233333]
 [0.0645     0.01516667 0.0015     0.30916667 0.1535     0.05816667
  0.02816667 0.052      0.08683333 0.231     ]
 [0.09866667 0