In [69]:
import os
import argparse
import numpy as np
import random
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import torch
from models.model_linear import Linearnet
from models.model_mlp import Mlp
from models.model_cnn import Cnn
from models.model_resnet import Resnet
from utils.utils_data import generate_real_dataloader
from utils.utils_data import prepare_cv_datasets

In [70]:
dset = 'cifar10'
B = 100

if dset in ['mnist', 'kmnist', 'fashion', 'cifar10']:
    (full_train_loader, train_loader, test_loader, ordinary_train_dataset, test_dataset, K) = prepare_cv_datasets(dataname=dset, batch_size=B)

for i, (data, labels) in enumerate(full_train_loader):
    K = torch.max(
        labels
    ) + 1  # K is number of classes, full_train_loader is full batch
    N,c,row,col = data.shape

flattened_data = data.reshape((N, c*row*col))
flattened_data_plus_label = torch.cat((flattened_data.reshape((c*row*col, N)), labels.unsqueeze(0))).reshape(N, c*row*col+1)

Files already downloaded and verified


In [71]:
print("Number of classes: ", K.item())
num_clusters = 1*K.item()
print("Number of clusters: ", num_clusters)
X = flattened_data.numpy()
print(X.shape)
kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=1).fit(X)
print(kmeans.labels_)
print(labels.numpy())

# confusion_labels = {}
# confusion_labels.update([(cluster,set()) for cluster in range(num_clusters)])
# for i,cluster in enumerate(kmeans.labels_):
#     true_label_i = labels[i].item()
#     confusion_labels[cluster].add(true_label_i)

# for cluster in confusion_labels.keys():
#     print(f"Cluster {cluster} Candidate Labels {confusion_labels[cluster]}")

sample_size = int(N*0.01) # 1% 
sample = random.sample(list(range(N)), sample_size)	
confusion_labels = np.eye(K)
for i,cluster_i in enumerate(kmeans.labels_[sample]):
    for j,cluster_j in enumerate(kmeans.labels_):
        if cluster_i==cluster_j:
            true_label_i = labels[i].item()
            true_label_j = labels[j].item()
            if true_label_i!=true_label_j:
                confusion_labels[true_label_i, true_label_j] += 1
                confusion_labels[true_label_j, true_label_i] += 1

# normalize to get probs
confusion_labels = normalize(confusion_labels, axis=1, norm='l1')
np.fill_diagonal(confusion_labels, 1.0)
print(np.around(confusion_labels, 2))
print("Ambiguity degree: ", confusion_labels[confusion_labels<1.0].max())

Number of classes:  10
Number of clusters:  10
(50000, 3072)
[6 3 0 ... 1 3 1]
[1 6 6 ... 3 0 1]
[[1.   0.11 0.11 0.1  0.11 0.12 0.13 0.11 0.09 0.11]
 [0.1  1.   0.12 0.11 0.12 0.13 0.13 0.11 0.08 0.1 ]
 [0.1  0.12 1.   0.1  0.11 0.13 0.12 0.11 0.09 0.11]
 [0.1  0.12 0.11 1.   0.12 0.12 0.11 0.11 0.09 0.12]
 [0.1  0.12 0.11 0.1  1.   0.12 0.12 0.12 0.09 0.12]
 [0.1  0.12 0.12 0.1  0.12 1.   0.12 0.11 0.09 0.12]
 [0.12 0.12 0.12 0.1  0.11 0.12 1.   0.11 0.1  0.12]
 [0.11 0.12 0.12 0.1  0.12 0.12 0.12 1.   0.09 0.11]
 [0.1  0.1  0.11 0.11 0.11 0.12 0.13 0.11 1.   0.11]
 [0.1  0.1  0.12 0.11 0.12 0.13 0.13 0.11 0.09 1.  ]]
Ambiguity degree:  0.13492047614284286
