In [1]:
!pip3 install mnist
!pip3 install minisom

# imports
import matplotlib.pyplot as plt
import numpy as np
import mnist
import scipy.misc
import math

from sklearn import datasets,metrics
from sklearn.datasets import fetch_20newsgroups_vectorized, fetch_rcv1

from sklearn.decomposition import PCA, TruncatedSVD, KernelPCA
from sklearn.manifold import TSNE
import umap
from minisom import MiniSom  


from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, SpectralClustering

from sklearn.model_selection import GridSearchCV
import dask_ml.model_selection as dcv

import warnings
warnings.filterwarnings('ignore')


!pip install ipython-autotime
%load_ext autotime



  warn("Tensorflow not installed; ParametricUMAP will be unavailable")


time: 553 µs (started: 2021-01-13 18:56:08 -05:00)


In [2]:
# Load Digits dataset
digits = datasets.load_digits()
digits_n_samples = len(digits.images)
# Flatten every image into a single feature row.
data = digits.images.reshape((digits_n_samples, -1))
# Seeded so that the split is identical after a kernel restart.
digits_X_train, digits_X_test, digits_y_train, digits_y_test = train_test_split(
    data, digits.target, test_size=0.3, shuffle=True, random_state=0)


# Load full MNIST dataset (images flattened to one row per sample)
MNIST_X_train = mnist.train_images()
MNIST_X_train = MNIST_X_train.reshape((len(MNIST_X_train), -1))
MNIST_y_train = mnist.train_labels()

MNIST_X_test = mnist.test_images()
MNIST_X_test = MNIST_X_test.reshape((len(MNIST_X_test), -1))
MNIST_y_test = mnist.test_labels()

total_samples = len(MNIST_X_train) + len(MNIST_X_test)
# Test fraction that reproduces MNIST's own train/test proportions.
blob_test_size = len(MNIST_X_test)/total_samples

# Load generated blobs dataset: same sample count and feature count as MNIST
blobs_X, blobs_y = datasets.make_blobs(n_samples=total_samples, centers=10, n_features=MNIST_X_train.shape[1], random_state=0)
blobs_X_train, blobs_X_test, blobs_y_train, blobs_y_test = train_test_split(
    blobs_X, blobs_y, test_size=blob_test_size, shuffle=True, random_state=0)

# Keep number of training samples consistent.
# The message is a string, not a print() call: print() returns None, which
# would leave the AssertionError without any message.
assert len(blobs_X_train) == len(MNIST_X_train), \
    "blobs/MNIST train size mismatch: %d != %d" % (len(blobs_X_train), len(MNIST_X_train))
assert len(blobs_X_test) == len(MNIST_X_test), \
    "blobs/MNIST test size mismatch: %d != %d" % (len(blobs_X_test), len(MNIST_X_test))



time: 3.7 s (started: 2021-01-13 18:56:08 -05:00)


In [3]:
# k-Means clustering
def kmeans(X_train, X_test, n_clusters):
  """Run k-means on the train set and, separately, on the test set.

  One ``KMeans(n_clusters=n_clusters, random_state=0)`` estimator is used for
  both calls. ``fit_predict`` is invoked again on ``X_test``, so the test
  labels and centers come from a second fit rather than from predicting with
  the train-fitted model.

  Returns:
    Tuple ``(y_pred_train, y_pred_test, train_centers, test_centers)``.
  """
  model = KMeans(n_clusters=n_clusters, random_state=0)

  def _fit(samples):
    # Read the centers immediately after the fit they belong to.
    labels = model.fit_predict(samples)
    return labels, model.cluster_centers_

  train_labels, train_centers = _fit(X_train)
  test_labels, test_centers = _fit(X_test)

  return train_labels, test_labels, train_centers, test_centers
  

time: 545 µs (started: 2021-01-13 18:56:11 -05:00)


In [4]:
# Spectral clustering 
def spectral(X_train, X_test, n_clusters):
  """Run spectral clustering on the train set and, separately, on the test set.

  A single ``SpectralClustering`` estimator (nearest-neighbors affinity,
  ``random_state=0``) is fitted first on ``X_train`` and then on ``X_test``.

  Returns:
    Tuple ``(y_pred_train, y_pred_test)``.
  """
  clusterer = SpectralClustering(
      n_clusters=n_clusters,
      affinity='nearest_neighbors',
      random_state=0,
  )
  # Train is fitted before test, matching the order of the returned labels.
  return tuple(clusterer.fit_predict(samples) for samples in (X_train, X_test))


time: 1.13 ms (started: 2021-01-13 18:56:11 -05:00)


In [5]:
# Evaluation function
def evaluate(X, y, y_pred):
    """Print clustering scores comparing predicted labels with reference labels.

    Args:
        X: Samples that were clustered. Not used by the active code; only the
            commented-out silhouette line refers to it.
        y: Reference labels.
        y_pred: Cluster labels to score against ``y``.

    Prints one line per metric followed by an empty line; returns nothing.
    """
    report = (
        ("Adjusted Mutual Information Score: %0.3f", metrics.adjusted_mutual_info_score),
        ("Adjusted Rand Index Score: %0.3f", metrics.adjusted_rand_score),
        ("Normalized Mutual Information Score: %0.3f", metrics.normalized_mutual_info_score),
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
    )
    for template, score_fn in report:
        print(template % score_fn(y, y_pred))

    #print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, y_pred, sample_size=1000))
    print()



time: 725 µs (started: 2021-01-13 18:56:11 -05:00)


In [None]:
# dimension reduction

def cv_score(estimator, X, y):
    """Score a dimensionality reducer by how well k-means recovers ``y`` afterwards.

    Has the ``(estimator, X, y)`` signature of a scoring callable.

    Args:
        estimator: Reducer exposing ``fit_transform``.
        X: Samples to embed and cluster.
        y: Reference labels; the number of distinct values sets the cluster count.

    Returns:
        Adjusted mutual information between ``y`` and the k-means labels
        computed on the reduced representation of ``X``.
    """
    # NOTE(review): fit_transform refits the estimator on the data handed to
    # the scorer; confirm whether transform() on the fitted estimator was meant.
    X_reduced = estimator.fit_transform(X)

    clusterer = KMeans(n_clusters=len(np.unique(y)), random_state=0)
    # Cluster the embedding, not the raw features: clustering X would give
    # every candidate the same score regardless of the reducer's parameters.
    y_pred = clusterer.fit_predict(X_reduced)
    return metrics.adjusted_mutual_info_score(y, y_pred)
    
    

def dimension_reduce(dimensions, X_train, X_test, y_train, y_test, n_clusters):
    """Tune UMAP for each target dimension, then cluster and report the scores.

    For every entry of ``dimensions`` a ``umap.UMAP(n_components=dim)`` is grid
    searched with ``dcv.GridSearchCV`` using ``cv_score`` as the scorer. The
    best estimator is then fit-transformed on the train set and, separately,
    on the test set; both embeddings are clustered through ``kmeans`` and
    scored with ``evaluate``.

    Args:
        dimensions: Iterable of target component counts.
        X_train, X_test: Samples to reduce.
        y_train, y_test: Reference labels used for the search and the report.
        n_clusters: Cluster count forwarded to ``kmeans``.

    Prints the best parameters and the metrics; returns nothing.
    """
    search_space = [{
        "n_neighbors": [2, 5, 10, 20, 50, 100],
        "metric": ['euclidean'],
        "min_dist": np.linspace(0.1, 1, 10),
    }]
    #cv = [(slice(None), slice(None))]

    for dim in dimensions:
        reducer = umap.UMAP(n_components=dim)
        search = dcv.GridSearchCV(
            reducer,
            search_space,
            scoring=cv_score,
            scheduler='multiprocessing',
        )
        search.fit(X_train, y_train)

        # The same best estimator is fitted on train and then again on test.
        best = search.best_estimator_
        X_train_reduce = best.fit_transform(X_train)
        X_test_reduce = best.fit_transform(X_test)

        y_pred_train, y_pred_test, _, _ = kmeans(X_train_reduce, X_test_reduce, n_clusters)

        print(search.best_params_)
        sections = (
            ("KMeans train dimension reduce: ", X_train_reduce, y_train, y_pred_train),
            ("KMeans test dimension reduce: ", X_test_reduce, y_test, y_pred_test),
        )
        for header, X_reduced, y_true, y_hat in sections:
            print(header + str(dim))
            evaluate(X_reduced, y_true, y_hat)
        print()
    


#dimension_reduce([8], digits_X_train, digits_X_test, digits_y_train, digits_y_test, 10)
# UMAP grid search on MNIST for 700 and 600 target components, 10 clusters.
dimension_reduce(
    [700, 600],
    MNIST_X_train,
    MNIST_X_test,
    MNIST_y_train,
    MNIST_y_test,
    10,
)

In [None]:
# Visualization general

def visualize(X_train, X_test, y_train, y_test, reduce, title, n_clusters):
  """Plot a 2x3 grid comparing true labels with k-means and spectral clusters.

  The data is first reduced to two components, then clustered. The top row
  shows the train set and the bottom row the test set; the columns show the
  original labels, the k-means clusters (with their centers in black) and the
  spectral clusters.

  Args:
    X_train, X_test: Samples to reduce and plot.
    y_train, y_test: Reference labels used to color the first column.
    reduce: Callable invoked as ``reduce(X_train, X_test, 2)`` that returns the
      reduced train and test arrays; their first two columns are plotted.
    title: Prefix of the figure heading (`` Reduction`` is appended).
    n_clusters: Cluster count passed to ``kmeans`` and ``spectral``.

  Shows the figure; returns nothing.
  """
  X_train_r, X_test_r = reduce(X_train, X_test, 2)

  kmy_pred_train, kmy_pred_test, train_centers, test_centers = kmeans(X_train_r, X_test_r, n_clusters)
  spy_pred_train, spy_pred_test = spectral(X_train_r, X_test_r, n_clusters)

  # One entry per figure row: (split name, points, true labels, k-means
  # labels, k-means centers, spectral labels).
  rows = (
      ('Train', X_train_r, y_train, kmy_pred_train, train_centers, spy_pred_train),
      ('Test', X_test_r, y_test, kmy_pred_test, test_centers, spy_pred_test),
  )

  fig, axes = plt.subplots(2, 3, figsize=(14, 10))
  for row_axes, (split, points, y_true, km_labels, centers, sp_labels) in zip(axes, rows):
    panels = (
        ('Original Labels', y_true),
        ('KMeans Clusters', km_labels),
        ('Spectral Clusters', sp_labels),
    )
    for ax, (what, colors) in zip(row_axes, panels):
      ax.set_title(split + ' Data With ' + what)
      ax.scatter(points[:, 0], points[:, 1], c=colors, s=50, cmap='Set3')

    # Overlay the cluster centers on the k-means panel only.
    row_axes[1].scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.8)

  fig.suptitle(title + " Reduction", fontsize=16)

  fig.tight_layout()
  # Leave room above the panels for the figure heading.
  fig.subplots_adjust(top=0.93)
  plt.show()



# Plot the digits split after a 2-component reduction, with 10 clusters.
# NOTE(review): `kernel_pcapoly_reduce` is not defined in the cells visible
# here -- confirm it exists before running on a fresh kernel.
# The empty title makes the figure heading read " Reduction".
visualize(digits_X_train, digits_X_test, digits_y_train, digits_y_test, kernel_pcapoly_reduce, "", 10)


In [None]:
# Digits reduction
# NOTE(review): this call passes 7 positional arguments, while the
# `dimension_reduce` defined in the visible notebook accepts 6 (no reducer
# argument); `kernel_pcarbf_reduce` is also not defined in the visible cells.
# Looks like a leftover from an earlier signature -- confirm before running.
dimension_reduce([48, 32, 16, 8, 4, 2], digits_X_train, digits_X_test, digits_y_train, digits_y_test, kernel_pcarbf_reduce, 10)
#visualize(digits_X_train, digits_X_test, digits_y_train, digits_y_test, som_reduce, "SOM", 10)

#visualize(digits_X_train, digits_X_test, digits_y_train, digits_y_test, tsne_reduce, "TSNE", 10)

In [None]:
# MNIST reduction

#dimension_reduce([700, 600, 500, 400, 300, 200, 100, 50], MNIST_X_train, MNIST_X_test, MNIST_y_train, MNIST_y_test,umap_reduce, 10)
#dimension_reduce([700, 600, 500, 400, 300, 200, 100, 50], MNIST_X_train, MNIST_X_test, MNIST_y_train, MNIST_y_test,kernel_pcarbf_reduce, 10)


#visualize(MNIST_X_train, MNIST_X_test, MNIST_y_train, MNIST_y_test, umap_reduce, "UMAP Reduce", 10)
#visualize(MNIST_X_train, MNIST_X_test, MNIST_y_train, MNIST_y_test, kernel_pcapoly_reduce, "Kernel PCA (Poly) Reduce", 10)
#visualize(MNIST_X_train, MNIST_X_test, MNIST_y_train, MNIST_y_test, kernel_pcasig_reduce, "Kernel PCA (Sigmoid) Reduce", 10)
#visualize(MNIST_X_train, MNIST_X_test, MNIST_y_train, MNIST_y_test, kernel_pcacos_reduce, "Kernel PCA (Cosine) Reduce", 10)
# NOTE(review): this call passes 7 positional arguments, while the
# `dimension_reduce` defined in the visible notebook accepts 6 (no reducer
# argument); `kernel_pcarbf_reduce` is also not defined in the visible cells.
# Confirm the intended signature before running.
dimension_reduce([700, 600, 500, 400, 300, 200, 100, 50], MNIST_X_train, MNIST_X_test, MNIST_y_train, MNIST_y_test, kernel_pcarbf_reduce, 10)



In [None]:
# Blobs reduction: same pipeline on the generated blobs data, 10 clusters.
dimension_reduce(
    [700, 600, 500, 400, 300, 200, 100, 50, 4],
    blobs_X_train,
    blobs_X_test,
    blobs_y_train,
    blobs_y_test,
    10,
)
#visualize_pca(blobs_X_train, blobs_X_test, blobs_y_train, blobs_y_test, 10)
#visualize_tsne(blobs_X_train, blobs_X_test, blobs_y_train, blobs_y_test, 10)

In [None]:
# 20 newsgroups reduction
# NOTE(review): `visualize_pca`, `visualize_tsne` and the `newsgroups_*`
# arrays are not defined in the cells visible here -- confirm they are created
# elsewhere, otherwise this cell fails on a fresh kernel.
#dimension_reduce([ 1000, 500, 300, 200, 100, 50, 20], newsgroups_X_train , newsgroups_X_test , newsgroups_y_train , newsgroups_y_test, 20)
visualize_pca(newsgroups_X_train , newsgroups_X_test , newsgroups_y_train , newsgroups_y_test, 20)
visualize_tsne(newsgroups_X_train , newsgroups_X_test , newsgroups_y_train , newsgroups_y_test, 20, True)

In [None]:
# RCV1 reduction (103 clusters)
# NOTE(review): the `rcv1_*` arrays, `visualize_pca` and `visualize_tsne` are
# not defined in the cells visible here -- confirm they are created elsewhere,
# otherwise this cell fails on a fresh kernel.
dimension_reduce([ 1000, 500, 300, 200, 100, 50, 20], rcv1_X_train , rcv1_X_test , rcv1_y_train , rcv1_y_test, 103)
visualize_pca(rcv1_X_train , rcv1_X_test , rcv1_y_train , rcv1_y_test, 103)
visualize_tsne(rcv1_X_train , rcv1_X_test , rcv1_y_train , rcv1_y_test, 103, True)