In [4]:
#@title Don't forget to upload usps.h5

import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred): # from https://stackoverflow.com/a/51672699/7947996; in [0,1]; 0-bad,1-good
    """Return the clustering purity of ``y_pred`` measured against ``y_true``.

    Each predicted cluster is credited with the count of its most frequent
    true class; purity is the sum of those counts divided by the total
    number of samples.

    Args:
        y_true: ground-truth class labels, one per sample.
        y_pred: predicted cluster labels, one per sample.

    Returns:
        The purity as a float.
    """
    # Entry [i, j] counts the samples of true class i placed in cluster j.
    counts = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Column-wise maximum = size of the dominant true class in every cluster.
    dominant_per_cluster = counts.max(axis=0)
    return dominant_per_cluster.sum() / counts.sum()

from sklearn.metrics.cluster import adjusted_rand_score # at most 1; ~0 for random labelings (can be slightly negative); 1-good
from sklearn.metrics.cluster import normalized_mutual_info_score # in [0,1]; 0-bad,1-good

!pip install coclust
from coclust.evaluation.external import accuracy # in [0,1]; 0-bad,1-good

def get_data_20news():
  """Load the full 20 Newsgroups corpus as dense TF-IDF features.

  The documents of both the train and test subsets (``subset="all"``) are
  vectorised with a ``TfidfVectorizer`` limited to 2000 features, and the
  resulting sparse matrix is converted to a dense array.

  Returns:
      (data, target): the dense TF-IDF matrix (one row per document) and
      the newsgroup label of every document.
  """
  # Imports are kept local so the other loaders do not require these modules.
  # (The previously imported tensorflow was never used here and was dropped.)
  from sklearn.datasets import fetch_20newsgroups
  from sklearn.feature_extraction.text import TfidfVectorizer

  newsgroups = fetch_20newsgroups(subset="all")

  vectorizer = TfidfVectorizer(max_features=2000)
  # fit_transform yields a sparse matrix; downstream code expects a dense one.
  data = vectorizer.fit_transform(newsgroups.data).toarray()

  return data, newsgroups.target


def get_data_mnist():
  """Load MNIST (train and test splits merged) as flat float32 vectors.

  Returns:
      (samples, real_labels): ``samples`` has one row per image, with the
      image flattened and its pixel values divided by 255; ``real_labels``
      holds the matching labels.
  """
  import tensorflow as tf
  (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

  # Clustering is unsupervised, so both splits are pooled together.
  images = np.concatenate((train_images, test_images))
  real_labels = np.concatenate((train_labels, test_labels))

  # Flatten every image into a single row, then rescale.
  flat_images = images.reshape((images.shape[0], -1))
  samples = (flat_images / 255.).astype(np.float32)

  return samples, real_labels

def get_data_mnist5():
  """Load the MNIST samples whose label is below 5, as flat float32 vectors.

  Train and test splits are merged before filtering.

  Returns:
      (samples, real_labels): ``samples`` has one row per retained image,
      flattened and divided by 255; ``real_labels`` holds the matching
      labels (all < 5).
  """
  import tensorflow as tf
  (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

  images = np.concatenate((train_images, test_images))
  labels = np.concatenate((train_labels, test_labels))

  # Boolean mask selecting the samples labelled 0..4.
  keep = labels < 5
  images = images[keep]
  real_labels = labels[keep]

  # Flatten every image into a single row, then rescale.
  flat_images = images.reshape((images.shape[0], -1))
  samples = (flat_images / 255.).astype(np.float32)

  return samples, real_labels

def get_data_fmnist():
  """Load Fashion-MNIST (train and test splits merged) as flat float32 vectors.

  Returns:
      (samples, real_labels): ``samples`` has one row per image, with the
      image flattened and its pixel values divided by 255; ``real_labels``
      holds the matching labels.
  """
  import tensorflow as tf
  (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

  # Clustering is unsupervised, so both splits are pooled together.
  images = np.concatenate((train_images, test_images))
  real_labels = np.concatenate((train_labels, test_labels))

  # Flatten every image into a single row, then rescale.
  flat_images = images.reshape((images.shape[0], -1))
  samples = (flat_images / 255.).astype(np.float32)

  return samples, real_labels

def get_data_usps(path="./usps.h5"):
  """Load the USPS dataset (train and test groups merged) from an HDF5 file.

  The file is expected to contain ``train`` and ``test`` groups, each with a
  ``data`` and a ``target`` dataset.

  Args:
      path: location of the HDF5 file; defaults to ``./usps.h5`` (the file
          has to be uploaded next to the notebook beforehand).

  Returns:
      (samples, real_labels): the concatenated ``data`` arrays and the
      concatenated ``target`` arrays, train first, then test.

  Raises:
      KeyError: if one of the expected groups/datasets is missing.
  """
  import h5py
  with h5py.File(path, 'r') as hf:
    # Index instead of .get(): .get() returns None for a missing key, which
    # would only surface later as an obscure AttributeError/TypeError.
    train = hf['train']
    test = hf['test']
    # [:] copies the dataset into memory, so it outlives the closed file.
    X_tr = train['data'][:]
    y_tr = train['target'][:]
    X_te = test['data'][:]
    y_te = test['target'][:]

  samples = np.concatenate((X_tr, X_te))
  real_labels = np.concatenate((y_tr, y_te))
  return samples, real_labels

original_data_name = "mnist5" # @param ["mnist", "mnist5", "fmnist", "20news", "usps"]

# Map every selectable dataset name to the loader that produces it.
_DATA_LOADERS = {
    "mnist": get_data_mnist,
    "mnist5": get_data_mnist5,
    "fmnist": get_data_fmnist,
    "20news": get_data_20news,
    "usps": get_data_usps,
}

# Fail loudly on an unknown name: silently skipping the load would leave
# `samples` / `real_labels` undefined, or worse, stale from an earlier run.
if original_data_name not in _DATA_LOADERS:
    raise ValueError(
        "Unknown dataset %r; expected one of %s"
        % (original_data_name, sorted(_DATA_LOADERS))
    )
samples, real_labels = _DATA_LOADERS[original_data_name]()

k = len(np.unique(real_labels))  # number of clusters = number of distinct labels
n_init = 10                      # restarts passed to the clustering models below
dim_pca = 100                    # PCA output dimension; None disables the reduction

if dim_pca is not None:
    from sklearn.decomposition import PCA
    X = samples
    pca = PCA(n_components=dim_pca)
    # From here on `samples` holds the PCA-reduced representation.
    samples = pca.fit_transform(X)



### Random

In [5]:
# Baseline: give every sample a cluster id drawn at random from 0..k-1.
predicted_random = np.random.randint(k,size=len(real_labels))

# Report purity, ARI, NMI and clustering accuracy, in that order.
for score_fn in (purity_score, adjusted_rand_score, normalized_mutual_info_score, accuracy):
    print(score_fn(real_labels, predicted_random))

0.22042815167203023
-2.5435655715452638e-05
0.00010758729404844848
0.20481320833916328




### k-means

In [6]:
from sklearn.cluster import KMeans
import numpy as np
# Cluster the prepared samples with k-means, using n_init restarts.
X = samples
kmeans = KMeans(n_clusters=k, n_init=n_init)
kmeans.fit(X)
predicted_km = kmeans.predict(X)

# Report purity, ARI, NMI and clustering accuracy, in that order.
for score_fn in (purity_score, adjusted_rand_score, normalized_mutual_info_score, accuracy):
    print(score_fn(real_labels, predicted_km))

0.8810969637610186
0.7325175963854366
0.7099972651849052
0.8810969637610186




In [7]:
import sklearn.metrics
# Contingency table of true labels (rows) against k-means clusters (columns).
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_km)
print(matrix)
# Row-normalised view: the share of each true class that lands in each cluster.
row_totals = matrix.sum(axis=1, keepdims=True)
print(matrix / row_totals)

[[ 248   21 6131  115  388]
 [ 119 7702    0   17   39]
 [5134  787   94  379  596]
 [ 495  401   45  176 6024]
 [  41  268   15 6495    5]]
[[3.59264088e-02 3.04215558e-03 8.88164566e-01 1.66594234e-02
  5.62074460e-02]
 [1.51072743e-02 9.77783420e-01 0.00000000e+00 2.15818205e-03
  4.95112352e-03]
 [7.34477825e-01 1.12589413e-01 1.34477825e-02 5.42203147e-02
  8.52646638e-02]
 [6.93180227e-02 5.61546002e-02 6.30163843e-03 2.46464081e-02
  8.43579331e-01]
 [6.00820633e-03 3.92731536e-02 2.19812427e-03 9.51787808e-01
  7.32708089e-04]]


### GMM

In [8]:
import numpy as np
from sklearn.mixture import GaussianMixture
# Cluster the prepared samples with a Gaussian mixture, using n_init restarts.
X = samples
gm = GaussianMixture(n_components=k, n_init=n_init)
gm.fit(X)
predicted_gmm = gm.predict(X)

# Report purity, ARI, NMI and clustering accuracy, in that order.
for score_fn in (purity_score, adjusted_rand_score, normalized_mutual_info_score, accuracy):
    print(score_fn(real_labels, predicted_gmm))

0.8650622638869456
0.6773498866819344
0.7357071788402717
0.8650622638869456




In [9]:
import sklearn.metrics
# Contingency table of true labels (rows) against GMM components (columns).
matrix = sklearn.metrics.cluster.contingency_matrix(real_labels, predicted_gmm)
print(matrix)
# Row-normalised view: the share of each true class that lands in each component.
row_totals = matrix.sum(axis=1, keepdims=True)
print(matrix / row_totals)

[[ 788 6106    0    4    5]
 [ 797    0 6188    3  889]
 [6959    3    0    9   19]
 [1136    9    1 5952   43]
 [1115    0    1    0 5708]]
[[1.14153267e-01 8.84542952e-01 0.00000000e+00 5.79458207e-04
  7.24322758e-04]
 [1.01180653e-01 0.00000000e+00 7.85578266e-01 3.80855656e-04
  1.12860226e-01]
 [9.95565093e-01 4.29184549e-04 0.00000000e+00 1.28755365e-03
  2.71816881e-03]
 [1.59081361e-01 1.26032769e-03 1.40036409e-04 8.33496709e-01
  6.02156561e-03]
 [1.63393904e-01 0.00000000e+00 1.46541618e-04 0.00000000e+00
  8.36459555e-01]]
