In [1]:
#@title Don't forget to upload usps.h5

import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred):
    """Return the clustering purity of ``y_pred`` measured against ``y_true``.

    Adapted from https://stackoverflow.com/a/51672699/7947996.

    Each predicted cluster is credited with the count of its most frequent
    true label; the credited counts are summed and divided by the total
    number of samples. Higher is better, with 1 as the best value.

    Args:
        y_true: ground-truth labels, one per sample.
        y_pred: predicted cluster ids, one per sample.

    Returns:
        The purity as a float.
    """
    # Contingency table: true labels along axis 0, predicted clusters along axis 1.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Largest entry of every column = size of the dominant true label in that cluster.
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

from sklearn.metrics.cluster import adjusted_rand_score # in [-0.5,1]; ~0-random (can be negative),1-good
from sklearn.metrics.cluster import normalized_mutual_info_score # in [0,1]; 0-bad,1-good

!pip install coclust
from coclust.evaluation.external import accuracy # in [0,1]; 0-bad,1-good

def get_data_20news():
  """Load the 20 Newsgroups corpus as dense TF-IDF features.

  Fetches the "all" subset, vectorizes the raw documents with a
  ``TfidfVectorizer`` capped at 2000 features, and densifies the result.

  Returns:
    (data, target): the dense TF-IDF matrix and the newsgroup labels
    as provided by ``fetch_20newsgroups``.
  """
  # NOTE: the previous version imported tensorflow here without using it;
  # the import was dropped so this text-only loader does not require it.
  from sklearn.datasets import fetch_20newsgroups
  from sklearn.feature_extraction.text import TfidfVectorizer

  _20news = fetch_20newsgroups(subset="all")

  vectorizer = TfidfVectorizer(max_features=2000)
  # fit_transform returns a sparse matrix; downstream code expects a dense array.
  data = vectorizer.fit_transform(_20news.data).toarray()

  return data, _20news.target


def _load_keras_image_dataset(dataset):
  """Load a Keras image dataset and return flattened float32 samples plus labels.

  Args:
    dataset: an object exposing ``load_data()`` that returns
      ``(x_train, y_train), (x_test, y_test)`` (e.g. ``tf.keras.datasets.mnist``).

  Returns:
    (samples, real_labels): train and test splits concatenated (train first).
    ``samples`` has one row per image, divided by 255 and cast to float32;
    ``real_labels`` are the concatenated labels, unchanged.
  """
  (x_train, y_train), (x_test, y_test) = dataset.load_data()

  # Clustering is unsupervised here, so the train/test split is merged.
  images = np.concatenate((x_train, x_test))
  real_labels = np.concatenate((y_train, y_test))

  # One flat vector per image; dividing by 255 presumably maps 8-bit pixels into [0, 1].
  samples = (images.reshape((images.shape[0], -1)) / 255.).astype(np.float32)

  return samples, real_labels


def get_data_mnist():
  """Return (samples, real_labels) for MNIST, train and test sets combined."""
  import tensorflow as tf
  return _load_keras_image_dataset(tf.keras.datasets.mnist)


def get_data_fmnist():
  """Return (samples, real_labels) for Fashion-MNIST, train and test sets combined."""
  import tensorflow as tf
  return _load_keras_image_dataset(tf.keras.datasets.fashion_mnist)

def get_data_usps():
  """Read USPS from ``./usps.h5`` and return (samples, real_labels).

  The file must already be present in the working directory. The ``data``
  and ``target`` datasets of the ``train`` and ``test`` groups are read
  fully into memory and concatenated, train first.
  """
  import h5py
  path = "./usps.h5"

  images, labels = [], []
  with h5py.File(path, 'r') as hf:
    for split in ('train', 'test'):
      group = hf.get(split)
      # [:] copies the dataset into memory so it outlives the closed file.
      images.append(group.get('data')[:])
      labels.append(group.get('target')[:])

  samples = np.concatenate(images)
  real_labels = np.concatenate(labels)
  return samples, real_labels

original_data_name = "fmnist" # @param ["mnist", "fmnist", "20news", "usps"]

# Dispatch table from dataset name to its loader. An unknown name now fails
# immediately instead of leaving `samples` undefined and raising NameError later.
_loaders = {
    "mnist": get_data_mnist,
    "fmnist": get_data_fmnist,
    "20news": get_data_20news,
    "usps": get_data_usps,
}
if original_data_name not in _loaders:
    raise ValueError(
        f"Unknown dataset {original_data_name!r}; expected one of {sorted(_loaders)}"
    )
samples, real_labels = _loaders[original_data_name]()

k = len(np.unique(real_labels))  # number of clusters = number of distinct true labels
n_init = 10                      # restarts passed to the clustering models below
dim_pca = 100                    # PCA output dimension; set to None to skip PCA

if dim_pca is not None:
    from sklearn.decomposition import PCA
    X = samples
    # Fixed random_state: PCA may pick a randomized solver, which would
    # otherwise make the projected features differ from run to run.
    pca = PCA(n_components=dim_pca, random_state=0)
    samples = pca.fit_transform(X)

Collecting coclust
  Downloading https://files.pythonhosted.org/packages/5d/44/ad5a69c7187c2b7bcf2c45596e9052811a3be52f4fcaa6709937c5146ee2/coclust-0.2.1.tar.gz
Building wheels for collected packages: coclust
  Building wheel for coclust (setup.py) ... done
  Created wheel for coclust: filename=coclust-0.2.1-cp37-none-any.whl size=29871 sha256=97037498ed9a16ef4c9974f337a9f4dae33fab4391569c4eddae04efaba91126
  Stored in directory: /root/.cache/pip/wheels/cd/d7/68/df601d0b5f8b934cf890dc626c2271df381fb0c3e910b0a34e
Successfully built coclust
Installing collected packages: coclust
Successfully installed coclust-0.2.1




Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


### Random

In [2]:
# Chance-level baseline: give every sample one of the k cluster ids uniformly
# at random. The generator is seeded so the baseline scores are reproducible.
predicted_random = np.random.default_rng(0).integers(k, size=len(real_labels))

print(purity_score(real_labels,predicted_random))
print(adjusted_rand_score(real_labels,predicted_random))
print(normalized_mutual_info_score(real_labels,predicted_random))
print(accuracy(real_labels,predicted_random))

0.10544285714285714
7.021638035521908e-06
0.0002652432490291231
0.10452857142857143




### k-means

In [3]:
from sklearn.cluster import KMeans
import numpy as np
X = samples
# random_state fixes the centroid initialisations so the reported scores
# can be reproduced; n_init restarts are still performed.
kmeans = KMeans(n_clusters=k, n_init=n_init, random_state=0).fit(X)
predicted_km = kmeans.predict(X)

# Purity, ARI, NMI and accuracy of the k-means assignment, in that order.
print(purity_score(real_labels,predicted_km))
print(adjusted_rand_score(real_labels,predicted_km))
print(normalized_mutual_info_score(real_labels,predicted_km))
print(accuracy(real_labels,predicted_km))

0.5757571428571429
0.3741965909771554
0.5124030132172156
0.5399714285714285




In [4]:
import sklearn.metrics
# Cross-tabulate the ground-truth labels against the k-means assignments.
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_km)
print(matrix)
# Same table with every row divided by its own total, i.e. per-row fractions.
row_totals = matrix.sum(axis=1, keepdims=True)
print(matrix / row_totals)

[[ 198   23   34   47 1984    0  499    3  208 4004]
 [6296    2    0   55  186    0  145    0   30  286]
 [  12   24   27 2162 2019    0  469    1 2188   98]
 [3634    5   11   66  798    1  477    0   20 1988]
 [ 172   15   30 3546 1072    0  228    0 1122  815]
 [   1   13   12    0   59  562 4695 1655    0    3]
 [  61   56   19 1392 2426    2  726    6 1089 1223]
 [   0    7    0    0    1  338  733 5921    0    0]
 [  24 2510 2836   43  385    8  491  332  347   24]
 [   3    7    5   12   69 5881  236  778    2    7]]
[[2.82857143e-02 3.28571429e-03 4.85714286e-03 6.71428571e-03
  2.83428571e-01 0.00000000e+00 7.12857143e-02 4.28571429e-04
  2.97142857e-02 5.72000000e-01]
 [8.99428571e-01 2.85714286e-04 0.00000000e+00 7.85714286e-03
  2.65714286e-02 0.00000000e+00 2.07142857e-02 0.00000000e+00
  4.28571429e-03 4.08571429e-02]
 [1.71428571e-03 3.42857143e-03 3.85714286e-03 3.08857143e-01
  2.88428571e-01 0.00000000e+00 6.70000000e-02 1.42857143e-04
  3.12571429e-01 1.40000000e-02

### GMM

In [5]:
import numpy as np
from sklearn.mixture import GaussianMixture
X = samples
# random_state fixes the parameter initialisations so the reported scores
# can be reproduced; n_init restarts are still performed.
gm = GaussianMixture(n_components=k, n_init=n_init, random_state=0).fit(X)
predicted_gmm = gm.predict(X)

# Purity, ARI, NMI and accuracy of the GMM assignment, in that order.
print(purity_score(real_labels,predicted_gmm))
print(adjusted_rand_score(real_labels,predicted_gmm))
print(normalized_mutual_info_score(real_labels,predicted_gmm))
print(accuracy(real_labels,predicted_gmm))

0.5529571428571428
0.37112165905915
0.5726910419313281
0.5027




In [6]:
import sklearn.metrics
# Cross-tabulate the ground-truth labels against the GMM assignments.
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_gmm)
print(matrix)
# Same table with every row divided by its own total, i.e. per-row fractions.
row_totals = matrix.sum(axis=1, keepdims=True)
print(matrix / row_totals)

[[   3   86   36    0    0  610 2014    0 4232   19]
 [   0    1 6117    0    0  650  231    0    1    0]
 [   0 4539    1    0    0   86 2343    0   23    8]
 [   0  119 1651    0    0 4149  861    0  219    1]
 [   0 4565    2    0    0  928 1493    2    6    4]
 [1843    0    0 3085   31    6   25    0    0 2010]
 [   0 3016   11    0    0  361 2529    0 1064   19]
 [  46    0    0 4941 1382    0    0    0    0  631]
 [   2    3    3    7    0  253 1225 3020    0 2487]
 [3041    0    0   72 3626    0   14    0    0  247]]
[[4.28571429e-04 1.22857143e-02 5.14285714e-03 0.00000000e+00
  0.00000000e+00 8.71428571e-02 2.87714286e-01 0.00000000e+00
  6.04571429e-01 2.71428571e-03]
 [0.00000000e+00 1.42857143e-04 8.73857143e-01 0.00000000e+00
  0.00000000e+00 9.28571429e-02 3.30000000e-02 0.00000000e+00
  1.42857143e-04 0.00000000e+00]
 [0.00000000e+00 6.48428571e-01 1.42857143e-04 0.00000000e+00
  0.00000000e+00 1.22857143e-02 3.34714286e-01 0.00000000e+00
  3.28571429e-03 1.14285714e-03