In [51]:
import os
import argparse
import numpy as np
import random
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances as dist, euclidean_distances as dist2
import torch
from models.model_linear import Linearnet
from models.model_mlp import Mlp
from models.model_cnn import Cnn
from models.model_resnet import Resnet
from utils.utils_data import generate_real_dataloader
from utils.utils_data import prepare_cv_datasets

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
%matplotlib inline

from scipy.io import savemat

### Synthetic Partial Label Generation: based on the “Madelon” dataset.

In [56]:
from synthetic_classification_generator import make_classification

datadir = "./data/realworld/"
num_classes = 10
num_samples = 1000
feature_dim = 150
distractor_ratios = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9]

for distractor_ratio in distractor_ratios:
    num_distractors = int(distractor_ratio*num_classes)

    # n_features = informative + redundant + repeated + random/useless

    X, y_true, centroids = make_classification(n_samples=num_samples,
                                            n_features=feature_dim, 
                                            n_informative=feature_dim, # all features are informative
                                            n_redundant=0,
                                            n_repeated=0,
                                            n_classes=num_classes,
                                            n_clusters_per_class=1, # each class is associated with a single cluster
                                            flip_y=0.01,
                                            class_sep=1.0,          # default 1.0
                                            hypercube=True,
                                            shift=0.0,
                                            scale=1.0,
                                            shuffle=False,
                                            random_state=None,
                                            return_centroids=True)

    # print('features', X.shape, X.dtype)
    # print('logitlabels', y_true.shape, y_true.dtype)

    y = partial_y = np.zeros((num_samples, num_classes))
    y[np.arange(y_true.size), y_true] = 1
    partial_y[np.arange(y_true.size), y_true] = 1

    ## Generate Partial Label
    # print('centroids', centroids.shape)
    sample_centroid_distances = dist(X, Y=centroids)
    for x in range(num_samples):
        distractor_weights = sample_centroid_distances[x]
        distractor_weights_norm = sample_centroid_distances[x] / sample_centroid_distances[x].sum()
        # print(f'sample {x} - norm centroid distances {np.around(distractor_weights,4)} - true label {y_true[x]} - argsort {np.argsort(distractor_weights)}')
        num_partial_labels = random.randint(1, num_distractors+1)
        partial_y[x, np.argsort(distractor_weights)[0:num_partial_labels]] = 1
        # print(partial_y[x])
        
    dt = dict()
    dt['features'] = X
    dt['p_labels'] = partial_y
    dt['logitlabels'] = y

    datapath = os.path.join(datadir, "synthetic-{}.mat".format(distractor_ratio))
    savemat(datapath, dt)

### GMM-based Synthetic Partial Label Generation

In [8]:

def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw an ellipse with a given position and covariance"""
    ax = ax or plt.gca()
    
    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)
    
    # Draw the Ellipse
    for nsig in range(1, 4):
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle, **kwargs))
        
def plot_gmm(gmm, X, label=True, ax=None):
    """
    Draws the ellipses for each of the predicted cluster
    """
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')
    
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        draw_ellipse(pos, covar, alpha=w * w_factor)

In [17]:
from sklearn.mixture import GaussianMixture as GMM
from sklearn.datasets import make_blobs

C = 10
std = 0.9
N = 10
d = 10

## Making a synthetic dataset with 4 clusters, and number of samples is 400.
X_base, y_base = make_blobs(n_samples=100*C, n_features=d, centers=C, cluster_std=std, random_state=0)
X = X_base[:, ::-1]

## Fit (and plot) a GMM on the synthetic dataset
gmm = GMM(n_components=C, random_state=42)
gmm.fit(X_base)
# plot_gmm(gmm, X_base)

## Sample from the fitted GMM
X, y = gmm.sample(N)
print(X.shape)
print(y.shape)

(10, 10)
(10,)


### Clustering-based Partial Label Generation for Real Data

In [70]:
dset = 'cifar10'
B = 100

if dset in ['mnist', 'kmnist', 'fashion', 'cifar10']:
    (full_train_loader, train_loader, test_loader, ordinary_train_dataset, test_dataset, K) = prepare_cv_datasets(dataname=dset, batch_size=B)

for i, (data, labels) in enumerate(full_train_loader):
    K = torch.max(
        labels
    ) + 1  # K is number of classes, full_train_loader is full batch
    N,c,row,col = data.shape

flattened_data = data.reshape((N, c*row*col))
flattened_data_plus_label = torch.cat((flattened_data.reshape((c*row*col, N)), labels.unsqueeze(0))).reshape(N, c*row*col+1)

Files already downloaded and verified


In [71]:
print("Number of classes: ", K.item())
num_clusters = 1*K.item()
print("Number of clusters: ", num_clusters)
X = flattened_data.numpy()
print(X.shape)
kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=1).fit(X)
print(kmeans.labels_)
print(labels.numpy())

# confusion_labels = {}
# confusion_labels.update([(cluster,set()) for cluster in range(num_clusters)])
# for i,cluster in enumerate(kmeans.labels_):
#     true_label_i = labels[i].item()
#     confusion_labels[cluster].add(true_label_i)

# for cluster in confusion_labels.keys():
#     print(f"Cluster {cluster} Candidate Labels {confusion_labels[cluster]}")

sample_size = int(N*0.01) # 1% 
sample = random.sample(list(range(N)), sample_size)	
confusion_labels = np.eye(K)
for i,cluster_i in enumerate(kmeans.labels_[sample]):
    for j,cluster_j in enumerate(kmeans.labels_):
        if cluster_i==cluster_j:
            true_label_i = labels[i].item()
            true_label_j = labels[j].item()
            if true_label_i!=true_label_j:
                confusion_labels[true_label_i, true_label_j] += 1
                confusion_labels[true_label_j, true_label_i] += 1

# normalize to get probs
confusion_labels = normalize(confusion_labels, axis=1, norm='l1')
np.fill_diagonal(confusion_labels, 1.0)
print(np.around(confusion_labels, 2))
print("Ambiguity degree: ", confusion_labels[confusion_labels<1.0].max())

Number of classes:  10
Number of clusters:  10
(50000, 3072)
[6 3 0 ... 1 3 1]
[1 6 6 ... 3 0 1]
[[1.   0.11 0.11 0.1  0.11 0.12 0.13 0.11 0.09 0.11]
 [0.1  1.   0.12 0.11 0.12 0.13 0.13 0.11 0.08 0.1 ]
 [0.1  0.12 1.   0.1  0.11 0.13 0.12 0.11 0.09 0.11]
 [0.1  0.12 0.11 1.   0.12 0.12 0.11 0.11 0.09 0.12]
 [0.1  0.12 0.11 0.1  1.   0.12 0.12 0.12 0.09 0.12]
 [0.1  0.12 0.12 0.1  0.12 1.   0.12 0.11 0.09 0.12]
 [0.12 0.12 0.12 0.1  0.11 0.12 1.   0.11 0.1  0.12]
 [0.11 0.12 0.12 0.1  0.12 0.12 0.12 1.   0.09 0.11]
 [0.1  0.1  0.11 0.11 0.11 0.12 0.13 0.11 1.   0.11]
 [0.1  0.1  0.12 0.11 0.12 0.13 0.13 0.11 0.09 1.  ]]
Ambiguity degree:  0.13492047614284286
