In [None]:
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.cluster import KMeans
from tensorflow import keras

# The project helpers are imported from the local `lib` directory, so its
# absolute path has to be on sys.path before the `lib.*` imports below run.
ROOT_PATH = os.path.abspath('')  # '' resolves to the current working directory
LIB_PATH = os.path.join(ROOT_PATH, 'lib')
# Guarded append: re-running this cell must not keep adding duplicate entries.
if LIB_PATH not in sys.path:
    sys.path.append(LIB_PATH)

from lib.CustomLayer_lib import Custom_Layer
from lib.utils import *
from lib.Kmeans_lib import cluster_to_label

# Implementazione V1: Creo Kmeans con labeled + unlabeled data - (implementazione attuale nel progetto)
Per ogni batch unisco labeled e unlabeled data, quindi ogni volta creo un nuovo clustering.

In [None]:
# Number of new samples requested from create_dataset in a later cell
n_samples = 100

# Number of features; it also selects which 'Models/<n_feat>/' folder is read
n_feat = 10
MODEL_PATH = 'Models/{}/'.format(n_feat)

# Saved features and their labels, stored as plain-text arrays
features_file = MODEL_PATH + 'll_features.txt'
labels_file = MODEL_PATH + 'll_labels_features.txt'
features_saved = np.loadtxt(features_file)
labels_saved = np.loadtxt(labels_file).astype(int)

# Original Keras model stored in the same folder
model_file = MODEL_PATH + 'original_mnist_cnn.h5'
keras_model = keras.models.load_model(model_file)

In [None]:
# Select the saved features that belong to the initial set of labels.

# `model` is otherwise only created in a later cell; build it here so that
# reading `model.std_label` works on a fresh kernel (Restart & Run All)
# instead of raising a NameError.
model = Custom_Layer(keras_model)

# Initial set of labels. Alternatives that were tried by hand:
#   labels_init_list = list([1, 9, 5, 0])
#   labels_init_list = list(range(0,9))
labels_init_list = model.std_label
n_cluster = len(labels_init_list)  # one cluster per initial label

# Keep only the saved samples whose label is in labels_init_list
# (the row order of the saved arrays is preserved).
selected_rows = [row for row in range(len(features_saved))
                 if labels_saved[row] in labels_init_list]
features_saved_init = [features_saved[row, :] for row in selected_rows]
labels_saved_init = [labels_saved[row] for row in selected_rows]

In [None]:
# Generate the new batch and extract its features with model.ML_frozen
digits_run, labels_run, _, _ = create_dataset(n_samples, 0)
model = Custom_Layer(keras_model)
batch_images = digits_run.reshape((n_samples,28,28,1))
features_run = model.ML_frozen.predict(batch_images, verbose = False)

# Saved (labeled) features and labels as arrays
saved_features_arr = np.array(features_saved_init).astype('float32')
saved_labels_arr = np.array(labels_saved_init)

# Stack both sets: saved rows first, rows of the new batch after them
features = np.concatenate((saved_features_arr, features_run))
labels_features = np.append(saved_labels_arr, labels_run).astype(int)

In [None]:
# K-Means over the saved and the new features together
k = KMeans(n_clusters=n_cluster, n_init=100)
k.fit(features)

# The first len(labels_saved_init) rows of `features` are the saved samples.
# Their cluster assignments are handed to cluster_to_label together with
# their known labels to obtain the cluster <-> label maps.
n_saved = len(labels_saved_init)
clusters_features_saved = list(k.labels_[:n_saved])
cluster_list = list(range(n_cluster))
debug_output = (model.settings.verbosity == 'DEBUG')
map_clu2lbl, map_lbl2clu = cluster_to_label(clusters_features_saved, list(labels_saved_init), cluster_list, labels_init_list, verbose = debug_output)

clusters_features = k.labels_

# Pseudolabel of every clustered sample (saved and new)
pseudolabels = [map_clu2lbl[cluster_id] for cluster_id in clusters_features]

# The new batch occupies the last len(labels_run) positions
first_run_idx = len(clusters_features) - len(labels_run)
pseudolabels_run = pseudolabels[first_run_idx:]

# Number of new samples whose pseudolabel differs from the true label
err = sum(1 for guess, truth in zip(pseudolabels_run, labels_run) if guess != truth)

# Implementazione V2: Comparo due Kmeans - (funzionante)
Eseguo un K-Means sulle saved features e uno sulle nuove. Confronto le distanze tra i centroidi per determinare la corrispondenza tra i cluster.

In [None]:
# Options
n_feat = 50      # number of features; also selects the 'Models/<n_feat>/' folder
n_samples = 100  # number of new samples requested from create_dataset later on

# Everything below is read from the folder of the selected feature size
MODEL_PATH = 'Models/{}/'.format(n_feat)

saved_features_file = MODEL_PATH + 'll_features.txt'
saved_labels_file = MODEL_PATH + 'll_labels_features.txt'
features_saved = np.loadtxt(saved_features_file)
labels_saved = np.loadtxt(saved_labels_file).astype(int)

# Original model
keras_model = keras.models.load_model(MODEL_PATH + 'original_mnist_cnn.h5')

In [None]:
from sklearn.cluster import KMeans

# Build the layer wrapper from the model loaded in this section, so that
# `model.std_label` below does not depend on a `model` object left over
# from a different cell (or missing altogether on a fresh kernel).
model = Custom_Layer(keras_model)

# K-Means on the saved features
n_cluster = 10
k1 = KMeans(n_cluster, n_init=100)
k1.fit(features_saved)

k1.cluster_centers_.shape

# Map the k1 clusters to labels, using the known labels of the saved features
map_clu2lbl, map_lbl2clu = cluster_to_label(k1.labels_, labels_saved, list(range(0,n_cluster)), model.std_label)

# Possible improvement: since the labels of the saved features are known,
# the clusters could be built directly from them, which would avoid
# clustering errors in this step.
map_clu2lbl

In [None]:
# New batch: generate it, extract its features, then cluster them
digits_run, labels_run, _, _ = create_dataset(n_samples, 0)
model = Custom_Layer(keras_model)
run_images = digits_run.reshape((n_samples,28,28,1))
features_run = model.ML_frozen.predict(run_images, verbose = False)

# Second K-Means, fitted on the new features only
n_clusters = 10
k2 = KMeans(n_clusters=n_clusters, n_init=100)
k2.fit(features_run)

In [None]:
# Distance matrix between the two sets of centroids:
#   matrix[i, j] = distance from centre i of k1 to centre j of k2
# KMeans.transform returns, for each input row, its distance to every fitted
# centre, so a single call on all k1 centres fills the whole matrix. The shape
# follows the two models instead of being hard-coded to 10x10.
matrix = np.asarray(k2.transform(k1.cluster_centers_), dtype=float)

# Rows = distances of center k1_i to the centers of k_2
np.around(matrix, 1)

In [None]:
# Cluster-to-cluster matching by smallest centroid distance.
# In `matrix`, row i holds the distances from centre i of k1 to the centres of k2.
argmin_axis1 = matrix.argmin(axis = 1) # per row: column index of the smallest distance
argmin_axis0 = matrix.argmin(axis = 0) # per column: row index of the smallest distance

print(argmin_axis0, argmin_axis1)
print(set(argmin_axis0), set(argmin_axis1))

# The per-column result is the one used from here on.
# NOTE(review): the previous comment here read "Using argmin axis = 1 seems
# better", which does not match the assignment below - confirm which is intended.
map_idx = argmin_axis0

In [None]:
# Dictionaries linking the clusters of the two K-Means models.
# map_idx comes from an argmin over the columns of `matrix`, so position j of
# map_idx is a k2 cluster index and map_idx[j] is the k1 cluster with the
# closest centre. The k2 side is therefore keyed by the position itself
# (a cluster index) rather than by a class label taken from model.std_label.
map_k1_2_k2 = {}
map_k2_2_k1 = {}

for k2_cluster, k1_cluster in enumerate(map_idx):
    k1_cluster = int(k1_cluster)  # plain int keys/values instead of NumPy scalars
    # Several k2 clusters can share the same nearest k1 cluster; in that case
    # the k1 -> k2 entry keeps the last k2 cluster seen.
    map_k1_2_k2[k1_cluster] = k2_cluster
    map_k2_2_k1[k2_cluster] = k1_cluster

print(np.around(matrix, 1))
print("Argmin:", map_idx)
print("Cluster K1 (saved) to K2 (new) map: ", map_k1_2_k2)


In [None]:
# Display the cluster index that k2 assigned to each sample it was fitted on
k2.labels_

In [None]:
# Pseudolabel of each new sample: k2 cluster -> matched k1 cluster -> label
pseudolabels = np.zeros(n_samples)
errs = 0
for i in range(n_samples):
    k2_cluster = k2.labels_[i]
    k1_cluster = map_k2_2_k1[k2_cluster]
    pseudolabels[i] = map_clu2lbl[k1_cluster]
    if pseudolabels[i] == labels_run[i]:
        continue
    # Mismatch with the true label: count it and print the details
    errs += 1
    print("True label:", labels_run[i],"Pseudolabel:", pseudolabels[i],"K2 cluster:", k2_cluster, "K1-mapped cluster:", k1_cluster)

# print(pseudolabels)
print("error:", errs)

# Implementazione V3: Modifica di V1 che però aggiorna i cluster anziché ricrearli

In [None]:
# Number of new samples to generate
n_samples = 10000

# Number of features; also selects the 'Models/<n_feat>/' folder
n_feat = 10
MODEL_PATH = 'Models/{}/'.format(n_feat)

# Saved features with their labels, and the original model
features_saved = np.loadtxt(MODEL_PATH + 'll_features.txt')
labels_saved_float = np.loadtxt(MODEL_PATH + 'll_labels_features.txt')
labels_saved = labels_saved_float.astype(int)
keras_model = keras.models.load_model(MODEL_PATH + 'original_mnist_cnn.h5')

# New batch and its features, extracted with model.ML_frozen
digits_run, labels_run, _, _ = create_dataset(n_samples, 0)
model = Custom_Layer(keras_model)
images_run = digits_run.reshape((n_samples,28,28,1))
features_run = model.ML_frozen.predict(images_run, verbose = False)

In [None]:
# Label set: one cluster per label
labels_init_list = model.std_label
n_cluster = len(labels_init_list)

# Walk over the saved samples and keep those whose label is in labels_init_list
features_saved_init = []
labels_saved_init = []
for row_idx, feature_row in enumerate(features_saved):
    saved_label = labels_saved[row_idx]
    if saved_label not in labels_init_list:
        continue
    features_saved_init.append(feature_row)
    labels_saved_init.append(saved_label)

# Array versions of the two lists
features = np.array(features_saved_init)
labels_features = np.array(labels_saved_init)

In [None]:
# Dictionary linking each label to the saved features that carry it.
# Every key gets its own fresh list (lists are mutable, so they must not be shared).
features_saved_dict = {}
for key in labels_init_list:
    features_saved_dict[key] = []

# Distribute the saved feature vectors over the per-label lists
for lbl, feature_vec in zip(labels_saved_init, features_saved_init):
    features_saved_dict[lbl].append(feature_vec)

# print(features_saved_dict[2][1][230])

In [None]:
# Initial centroids: for each label, the mean of the saved features carrying it.
cluster_mean = []
for key in labels_init_list:
    # Convert the list of feature vectors into a 2D array (one row per sample)
    features_saved_dict[key] = np.array(features_saved_dict[key])
    cluster_mean.append(np.mean(features_saved_dict[key], axis=0))

cluster_mean = np.array(cluster_mean)

# Create KMeans; it is fitted on the per-label means only
kmeans = KMeans(n_cluster)
kmeans.fit(cluster_mean)
map_clu2lbl, map_lbl2clu = cluster_to_label(kmeans.labels_, labels_init_list, list(range(0,n_cluster)), model.std_label)

print("Map cluster to label:", map_clu2lbl)

# Weight given to a new sample when the centre of its cluster is updated
# (constant, so it is set once instead of on every iteration)
l_rate = 0.02

# Feed the new samples one at a time: find the cluster, derive the
# pseudolabel, then move the centre of that cluster towards the sample.
errs = 0
cluster_label = np.zeros(n_samples, dtype=int)
for i in range(0, n_samples):
    labels_new = labels_run[i]
    features_new = np.array(features_run[i,:], dtype = type(features_saved[0,0]))

    # predict() returns a one-element array for a single row. Take the element
    # explicitly: storing an ndim > 0 array into a scalar slot is deprecated
    # since NumPy 1.25.
    cluster_label[i] = kmeans.predict(features_new.reshape(1, -1))[0]
    pseudolabel = map_clu2lbl[cluster_label[i]]

    if labels_new != pseudolabel:
        errs += 1

    # Update the cluster center in place (weighted average of the old centre
    # and the new sample); later predict() calls are meant to see the moved centre.
    kmeans.cluster_centers_[cluster_label[i],:] = (kmeans.cluster_centers_[cluster_label[i],:] + features_new * l_rate)/(1 + l_rate)

print("Errors:", errs, "Accuracy: {:.1%}".format(1- errs/n_samples))

In [None]:
# Next step: work out when a new centroid has to be added, by looking at internal clustering metrics

#   kmeans.score()
#   kmeans.transform() 
#   sklearn metrics -> e.g. silhouette
# 
# Idea: run the clustering with a varying number of clusters and try to minimise the distance..