In [None]:
#%matplotlib notebook
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot, iplot, init_notebook_mode
from plotly.subplots import make_subplots
from sklearn.neighbors import KNeighborsClassifier

from keras.datasets import mnist
import seaborn as sns
import pandas as pd
from collections import Counter, OrderedDict

from sklearn.mixture import GaussianMixture
from filterpy.kalman import unscented_transform, MerweScaledSigmaPoints
import scipy.stats as stats

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import random
from sklearn.cluster import KMeans
from itertools import chain

from filterpy.kalman import unscented_transform, MerweScaledSigmaPoints
import scipy.stats as stats

In [None]:
# Load MNIST, flatten every image into a 784-long float32 vector and scale the
# pixel values by 1/255. Labels are kept exactly as returned by the loader
# (the one-hot conversion via to_categorical is deliberately not applied).
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
X_train, X_test = (
    images.reshape((-1, 784)).astype('float32') / 255.0
    for images in (X_train, X_test)
)
for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, targets.shape)

# Train and test images stacked into one array (train rows first).
mnist_digits = np.concatenate([X_train, X_test], axis=0)
print(mnist_digits.shape)

In [None]:
# Pre-computed latent representation of the training set, loaded from disk;
# its shape is printed next to the label array's shape for a quick sanity check.
latent_space_train = np.load('./dataset/MNIST_train_latent_space_z7.npy')
print(*(arr.shape for arr in (latent_space_train, Y_train)))

In [None]:
# Pre-computed latent representation of the test set, loaded from disk;
# its shape is printed next to the label array's shape for a quick sanity check.
latent_space_test = np.load('./dataset/MNIST_test_latent_space_z7.npy')
print(*(arr.shape for arr in (latent_space_test, Y_test)))

In [None]:
# Hyper-parameter grid for the SVM: the same C values are tried with an RBF and
# a linear kernel. (Polynomial and sigmoid kernels with a gamma sweep were
# considered earlier but are not part of the search.)
tuned_parameters = [
    {'kernel': [kernel_name], 'C': [0.1, 1, 10, 100]}
    for kernel_name in ('rbf', 'linear')
]
score = 'accuracy'

# refit=True retrains the best configuration on the full training data passed
# to fit(), so `clf.predict` can be used directly afterwards.
clf = GridSearchCV(
    estimator=SVC(),
    param_grid=tuned_parameters,
    scoring=score,
    n_jobs=-1,
    refit=True,
    verbose=2,
)

# z=7

In [None]:
# Pre-computed accuracy curves for label budgets up to 500. Each baseline is
# stored as a pair of files: a "mean_accuracy_*" curve and an "sd_accuracy_*"
# curve that is later drawn as a band around the mean.
# NOTE(review): both latent-space SD files are named "MNIST5_z7" whereas the
# matching mean files are named "MNIST_z7" — confirm these pairs belong together.

# Uniform sampling, input space.
uniform_is, uniform_is_sd = map(np.load, (
    './result/mean_accuracy_rs_MNIST_input_space.npy',
    './result/sd_accuracy_rs_MNIST_input_space.npy',
))

# Uniform sampling, latent space.
uniform_ls, uniform_ls_sd = map(np.load, (
    './result/mean_accuracy_rs_MNIST_z7.npy',
    './result/sd_accuracy_rs_MNIST5_z7.npy',
))

# Random coreset (K=50), input space.
random_coreset_is50, random_coreset_is50_sd = map(np.load, (
    './result/mean_accuracy_cs_MNIST_input_space_K50.npy',
    './result/sd_accuracy_cs_MNIST_input_space_K50.npy',
))

# Random coreset (K=50), latent space.
random_coreset_ls50, random_coreset_ls50_sd = map(np.load, (
    './result/mean_accuracy_cs_MNIST_z7_K50.npy',
    './result/sd_accuracy_cs_MNIST5_z7_K50.npy',
))

# Sensitivity-sampling coreset (K=50), latent space.
sensitivity_ls50, sensitivity_ls50_sd = map(np.load, (
    './result/mean_accuracy_sensitivity_cs_MNIST_z7_K50.npy',
    './result/sd_accuracy_sensitivity_cs_MNIST_z7_K50.npy',
))

In [None]:
# Baseline comparison: mean accuracy versus number of labelled points, with a
# +/- 1 SD band around every curve.
number_of_labeled_data = range(50,501,50)
plt.figure(figsize=(20,10))

# (mean curve, SD curve, marker, legend entry) in drawing order.
baseline_curves = [
    (uniform_is, uniform_is_sd, 'o', 'input space (uniform)'),
    (uniform_ls, uniform_ls_sd, '*', 'latent space (uniform)'),
    (random_coreset_is50, random_coreset_is50_sd, 'X', 'input space (Random Coreset,K=50)'),
    (random_coreset_ls50, random_coreset_ls50_sd, 'D', 'latent space (Random Coreset, K=50)'),
    (sensitivity_ls50, sensitivity_ls50_sd, 's', 'latent space (Sensitivity Sampling)'),
]

# All lines are drawn before any band: the legend below is given labels only,
# so it relies on the lines being the first artists added to the axes.
for mean_curve, sd_curve, marker_shape, caption in baseline_curves:
    plt.plot(number_of_labeled_data, mean_curve, marker=marker_shape, markersize=10)

for mean_curve, sd_curve, marker_shape, caption in baseline_curves:
    plt.fill_between(number_of_labeled_data, (mean_curve - sd_curve), (mean_curve + sd_curve), alpha=.1)

plt.legend([entry[3] for entry in baseline_curves], loc = 4, fontsize=25)
plt.grid()
plt.xticks(number_of_labeled_data)
plt.xlabel("Number of labelled points")
plt.ylabel("Accuracy")
plt.title("Accuracy comparison for 500 labeled points")

# Unscented Transform (UT)

# Z=7

In [None]:
# Model-selection sweep: fit one full-covariance GMM per candidate component
# count (1..80) on the training latent vectors (z=7) and plot BIC and AIC
# against the number of components.
n_components = np.arange(1, 81)
models = [
    GaussianMixture(n_components=k, covariance_type='full', random_state=0).fit(latent_space_train)
    for k in n_components
]

bic_scores = [fitted.bic(latent_space_train) for fitted in models]
aic_scores = [fitted.aic(latent_space_train) for fitted in models]

plt.figure(figsize=(20,10))
plt.grid()
plt.plot(n_components, bic_scores, label='BIC')
plt.plot(n_components, aic_scores, label='AIC')
plt.legend(loc='best')

plt.xlabel('n_components');

In [None]:
# np.argmin returns a 0-based position in `models`, but the sweep starts at one
# component, so map the position back through `n_components` to report the
# BIC-optimal number of components rather than its list index.
n_components[np.argmin([m.bic(latent_space_train) for m in models])]

In [None]:
# np.argmin returns a 0-based position in `models`, but the sweep starts at one
# component, so map the position back through `n_components` to report the
# AIC-optimal number of components rather than its list index.
n_components[np.argmin([m.aic(latent_space_train) for m in models])]

# GMM + UT

In [None]:
from filterpy.kalman import unscented_transform, MerweScaledSigmaPoints
import scipy.stats as stats

In [None]:
# Fit the 100-component GMM whose means/covariances seed the sigma points.
# A fixed random_state makes the fit (and every result derived from it)
# repeatable across kernel restarts, consistent with the seeded
# GaussianMixture sweep and KMeans fit in this notebook.
# (An earlier experiment used n_components=20.)
gmm = GaussianMixture(n_components=100, random_state=0).fit(latent_space_train)

# Hard component assignment of every training latent vector.
labels = gmm.predict(latent_space_train)

In [None]:
# Number of training latent vectors assigned to each label value.
label_counts = Counter(labels)
print(label_counts)

In [None]:
# Shape of the fitted covariance array (one entry per mixture component).
np.shape(gmm.covariances_)

In [None]:
# Shape of the fitted mean array (one row per mixture component).
np.shape(gmm.means_)

In [None]:
# Generate Merwe scaled sigma points for every GMM component.
#
# The generator depends only on the dimensionality and the scaling parameters,
# so it is built once instead of once per component. The dimensionality is read
# from the fitted means (7 for the z=7 latent space) and the number of
# components from the GMM itself, so this cell keeps working when either
# changes. kappa follows the original choice of 3 - n.
n_dims = gmm.means_.shape[1]
points = MerweScaledSigmaPoints(n=n_dims, alpha=0.1, beta=2., kappa=(3 - n_dims))

# One block of sigma points per component, in component order.
all_sigmas = [
    points.sigma_points(component_mean, component_cov)
    for component_mean, component_cov in zip(gmm.means_, gmm.covariances_)
]

In [None]:
# Display the MerweScaledSigmaPoints object created in the sigma-point cell.
points

In [None]:
# Stack the per-component sigma-point blocks into a single float32 array and
# show its shape.
all_sigmas = np.stack(all_sigmas).astype(np.float32)
np.shape(all_sigmas)

In [None]:
# Flatten the per-component blocks into one (n_points, n_dims) matrix.
# The sizes are taken from the array itself rather than hard-coded as
# 100*15 x 7, so the cell stays valid if the number of GMM components or the
# latent dimensionality changes.
sigma_points = all_sigmas.reshape(-1, all_sigmas.shape[-1])
sigma_points.shape

In [None]:
# Drop repeated sigma points: keep each distinct row once (np.unique also
# returns the rows in sorted order), then show the resulting shape.
sigma_points = np.unique(ar=sigma_points, axis=0)
np.shape(sigma_points)

In [None]:
# fig = go.Figure()

# fig.add_trace(go.Scatter3d(x=sigma_points[:,0], y=sigma_points[:,1], \
#                                    z=sigma_points[:,2], mode='markers',\
#                                   marker=dict(
#         size=2,
#         color='black',                # set color to an array/list of desired values
#        # colorscale='Viridis',   # choose a colorscale
#         opacity=1.0
#     )))

# fig.add_trace(go.Scatter3d(x=latent_space_train[:,0], y=latent_space_train[:,1], z=latent_space_train[:,2], mode='markers',\
#                                   marker=dict(
#         size=1,
#         color='yellow',#kmeans.labels_,                # set color to an array/list of desired values
#        # colorscale='Viridis',   # choose a colorscale
#         opacity=0.5
#     )))


# fig.update_layout(
#     autosize=False,
#     width=1000,
#     height=1000)

# fig.show()

In [None]:
# def closest_node(node, nodes):
#     dist_2 = np.sum((nodes - node)**2, axis=1)
#     return np.argmin(dist_2)

## Find class of sigma point
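To assign a class to each sigma point, we use a 1-nearest-neighbour (KNN with k = 1) classifier fitted on the labelled training latent vectors. With scikit-learn's default metric this is the Euclidean distance, which selects the same nearest neighbour as the squared Euclidean distance.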

In [None]:
# 1-nearest-neighbour classifier over the labelled training latent vectors;
# it is used to assign a class to each sigma point.
neigh = KNeighborsClassifier(n_neighbors=1, weights='distance').fit(latent_space_train, Y_train)
neigh

In [None]:
# Class assigned to every sigma point by the nearest-neighbour classifier.
prediction = neigh.predict(X=sigma_points)

In [None]:
# Number of sigma points assigned to each class.
prediction_counts = Counter(prediction)
print(prediction_counts)

In [None]:
# Partition the sigma points into 50 k-means clusters (seeded).
kmeans = KMeans(n_clusters=50, random_state=0)
kmeans.fit(sigma_points)

# NOTE: from here on `labels` holds the k-means cluster index of every sigma
# point; it no longer holds the GMM component assignments computed earlier.
labels = kmeans.labels_

In [None]:
# Number of sigma points per label value.
label_counts = Counter(labels)
print(label_counts)

# Train an SVM on the labelled sigma points and evaluate it on the test set

In [None]:
# Evaluate SVMs trained on KNN-labelled sigma points.
#
# For every label budget B in 50, 100, ..., 500 the experiment is repeated
# `n_repeats` times. Each repeat draws m = B / K sigma points without
# replacement from each of the K k-means clusters, runs the grid-searched SVM
# (`clf`) on that sample, and scores it on the test latent vectors. The mean
# and standard deviation of the test accuracy per budget end up in
# `mean_random` / `sd_random`.
#
# NOTE(review): the `random` module is not seeded in this cell, so the sampled
# subsets (and therefore the reported numbers) differ from run to run.
mean_random = []
sd_random = []

n_clusters = kmeans.n_clusters  # K; read from the fitted model instead of hard-coding 50
n_repeats = 500                 # simulations averaged per budget

# Cluster membership does not change between draws, so compute it once.
cluster_members = [
    np.where(labels == cluster_index)[0].tolist()
    for cluster_index in range(n_clusters)
]

for coreset_size in range(50,501,50):  # budgets of 50 .. 500 labelled points
    print("*********************** Training on {} points ***********************".format(coreset_size))

    accuracy = []
    m = coreset_size // n_clusters  # m = B/K, number of points drawn from each cluster

    # Fail early with a clear message; random.sample would raise the same
    # exception type but without saying which constraint was violated.
    smallest_cluster = min(len(members) for members in cluster_members)
    if smallest_cluster < m:
        raise ValueError(
            "Cannot draw {} points per cluster: the smallest cluster has only {} sigma points"
            .format(m, smallest_cluster)
        )

    for _repeat in range(n_repeats):
        print("Choosing {} points from each cluster".format(m))
        # Sample cluster by cluster (in cluster order) and flatten into one index list.
        indices_to_pick = list(chain.from_iterable(
            random.sample(members, m) for members in cluster_members
        ))

        assert len(indices_to_pick)==coreset_size, "Sample size mismatch!!!!"

        # Fancy indexing yields the same arrays as appending row by row.
        train_x = sigma_points[indices_to_pick]
        train_y = prediction[indices_to_pick]

        print()
        print("Distribution of data in the training points")
        print(Counter(train_y))

        clf.fit(train_x, train_y)
        print("Best parameters set found on {} data points:".format(coreset_size))
        print(clf.best_params_)
        print()
        y_true, y_pred = Y_test, clf.predict(latent_space_test)
        accuracy.append(accuracy_score(y_true, y_pred))

    accuracy = np.asarray(accuracy)
    mean_accuracy = accuracy.mean()
    sd_accuracy = accuracy.std()

    mean_random.append(mean_accuracy)
    sd_random.append(sd_accuracy)

mean_random = np.array(mean_random)
sd_random = np.array(sd_random)

In [None]:
# One mean accuracy per label budget is expected here.
np.shape(mean_random)

In [None]:
# Persist the accuracy curves. The file tag is derived from the sigma-point
# alpha and the GMM size that were actually used in this run, so the stored
# results can never be mislabelled (the previously hard-coded tag,
# "alpha_0_7_N_200", did not match the alpha=0.1 / 100-component setup of this
# notebook).
alpha_tag = '{:g}'.format(points.alpha).replace('.', '_')  # e.g. 0.1 -> '0_1'
run_tag = 'MNIST10_sigma_KNN_alpha_{}_N_{}'.format(alpha_tag, gmm.n_components)

np.save('./result/mean_accuracy_{}.npy'.format(run_tag), mean_random)
np.save('./result/sd_accuracy_{}.npy'.format(run_tag), sd_random)

In [None]:
# Compare the sigma-point/KNN pipeline across five sigma-point alpha values.
# Each setting is stored as a (mean, SD) pair of accuracy curves.
sigma_knn_mean1, sigma_knn_sd1 = map(np.load, (
    './result/mean_accuracy_MNIST10_sigma_KNN_alpha_0_1_N_100.npy',
    './result/sd_accuracy_MNIST10_sigma_KNN_alpha_0_1_N_100.npy',
))
sigma_knn_mean3, sigma_knn_sd3 = map(np.load, (
    './result/mean_accuracy_MNIST10_sigma_KNN_alpha_0_3_N_100.npy',
    './result/sd_accuracy_MNIST10_sigma_KNN_alpha_0_3_N_100.npy',
))
sigma_knn_mean5, sigma_knn_sd5 = map(np.load, (
    './result/mean_accuracy_MNIST10_sigma_KNN_alpha_0_5_N_100.npy',
    './result/sd_accuracy_MNIST10_sigma_KNN_alpha_0_5_N_100.npy',
))
sigma_knn_mean7, sigma_knn_sd7 = map(np.load, (
    './result/mean_accuracy_MNIST10_sigma_KNN_alpha_0_7_N_100.npy',
    './result/sd_accuracy_MNIST10_sigma_KNN_alpha_0_7_N_100.npy',
))
sigma_knn_mean9, sigma_knn_sd9 = map(np.load, (
    './result/mean_accuracy_MNIST10_sigma_KNN_alpha_0_9_N_100.npy',
    './result/sd_accuracy_MNIST10_sigma_KNN_alpha_0_9_N_100.npy',
))

number_of_labeled_data = range(50,501,50)
plt.figure(figsize=(8,8))

# (mean curve, matplotlib format string, legend entry) in drawing order.
alpha_curves = (
    (sigma_knn_mean1, 'go--', r'$\alpha = 0.1$'),
    (sigma_knn_mean3, 'r*-.', r'$\alpha = 0.3$'),
    (sigma_knn_mean5, 'ms-', r'$\alpha = 0.5$'),
    (sigma_knn_mean7, 'b^:', r'$\alpha = 0.7$'),
    (sigma_knn_mean9, 'cx:', r'$\alpha = 0.9$'),
)
for mean_curve, line_format, caption in alpha_curves:
    plt.plot(number_of_labeled_data, mean_curve, line_format, linewidth=1, markersize=10, mfc='none')

# The SD curves (sigma_knn_sd*) are loaded above but no +/- SD bands are drawn
# in this figure.

plt.legend([entry[2] for entry in alpha_curves], loc = 4, fontsize=35)

plt.grid()
plt.xticks(number_of_labeled_data, fontsize = 25, rotation = 45)
plt.yticks(fontsize = 25)
plt.xlabel("Number of labelled points", fontsize = 25)
plt.ylabel("Accuracy", fontsize = 25)
plt.savefig('./analysis/MNIST10_trained-on-sigma-with-kmeans_comparison_alpha_N_100_test.pdf', bbox_inches = 'tight')

In [None]:
# Final comparison in latent space: the sigma-point coreset (alpha = 0.8,
# N = 100 result files) against the uniform, random-coreset and
# sensitivity-coreset baselines, each drawn with a +/- 1 SD band.
sigma_knn_mean = np.load('./result/mean_accuracy_MNIST10_sigma_KNN_alpha_0_8_N_100.npy')
sigma_knn_sd = np.load('./result/sd_accuracy_MNIST10_sigma_KNN_alpha_0_8_N_100.npy')


number_of_labeled_data = range(50,501,50)
plt.figure(figsize=(8,8))

# (mean curve, SD curve, line format, band colour, legend entry).
# The band colour always equals the colour of its line; the sensitivity band
# was previously filled in red ('r') although its line is magenta ('ms-'),
# which made it indistinguishable from the random-coreset band.
comparison_curves = (
    (uniform_ls, uniform_ls_sd, 'go--', 'g', 'Uniform coreset sampling'),
    (random_coreset_ls50, random_coreset_ls50_sd, 'r*-.', 'r', 'Random coreset sampling'),
    (sensitivity_ls50, sensitivity_ls50_sd, 'ms-', 'm', 'Sensitivity coreset sampling'),
    (sigma_knn_mean, sigma_knn_sd, 'b^:', 'b', 'Sigma point coreset sampling'),
)

# All lines are drawn before any band: the legend below is given labels only,
# so it relies on the lines being the first artists added to the axes.
for mean_curve, sd_curve, line_format, band_color, caption in comparison_curves:
    plt.plot(number_of_labeled_data, mean_curve, line_format, linewidth=1, markersize=10, mfc='none')

for mean_curve, sd_curve, line_format, band_color, caption in comparison_curves:
    plt.fill_between(number_of_labeled_data, (mean_curve - sd_curve), (mean_curve + sd_curve), alpha=.1, color = band_color)

plt.legend([entry[4] for entry in comparison_curves], loc = 4, fontsize=20)

plt.grid()
plt.xticks(number_of_labeled_data, fontsize = 25, rotation = 45)
plt.yticks(fontsize = 25)
plt.xlabel("Number of labelled points", fontsize = 25)
plt.ylabel("Accuracy", fontsize = 25)
plt.savefig('./analysis/MNIST10_0_8_N_100_trained-on-sigma-with-kmeans_comparison.pdf', bbox_inches = 'tight')