In [1]:
import os
import os.path as osp
import sys
import numpy as np
import itertools
import random
import pickle
import json
import torch
import torch.nn as nn
import csv

sys.path.append('utils')

import matplotlib as mpl
%matplotlib inline
mpl.use('Agg')
import matplotlib.pyplot as plt

import plotly.express as px

from sklearn.decomposition import PCA

In [3]:
# Colour palette for the figures; the plotting cells pick entries by series index.
plot_colors = [
    '#377eb8',  # blue
    '#ff7f00',  # orange
    '#4daf4a',  # green
    '#984ea3',  # purple
    '#EDB120',  # mustard
    '#00FFFF',  # cyan
    '#A52A2A',  # brown
]

# Matplotlib marker codes, also picked by series index in the plotting cells.
markers = ['o', 'v', '>', '<', '^', 's', 'p']

## Artificially Corrupted Datasets

In [3]:
# Learning curves for one artificially corrupted dataset: reads one CSV log per
# loss ('lws', 'prp') from `source_dir` and plots train/test accuracy per epoch.
dataset = 'cifar10' # cifar10 mnist
gen = '01' # 01 04
lw = 0 # 1 2 0
plot_train = True
plot_test = True


files = [('lws', f'Res-sgd_{dataset}_{gen}_mlp_lws_1_{lw}.0_0.1_0.001_0.5_50_250_256_0.csv'),
         ('prp', f'Res-sgd_{dataset}_{gen}_mlp_prp_0.1_0.001_0.5_50_250_256_0.csv')]
source_dir = 'results_cv_best'

fig = plt.figure()
ax = fig.add_axes([0.13, 0.13, 0.8, 0.8])

for i,(loss,f_name) in enumerate(files):
    # load data
    train_acc = []
    test_acc = []
    epochs = []
    skipped_rows = 0
    with open(os.path.join(source_dir, f_name), newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Parse the whole row before appending anything: a row that fails
            # half-way must not leave the lists with different lengths, which
            # would make ax.plot raise a shape-mismatch error.
            try:
                row_train = round(100*float(row['train_acc']),2)
                row_test = round(100*float(row['test_acc']),2)
                row_epoch = int(row['epoch'])
            except (KeyError, TypeError, ValueError):
                # Best-effort: skip rows with a missing column (KeyError), a
                # missing field (None -> TypeError) or non-numeric text.
                skipped_rows += 1
                continue
            train_acc.append(row_train)
            test_acc.append(row_test)
            epochs.append(row_epoch)
    if skipped_rows:
        print(f"warning: skipped {skipped_rows} unparsable row(s) in {f_name}")
    # plot lines
    if plot_train:
        ax.plot(epochs, train_acc, label = loss + "-train", color=plot_colors[i], linestyle ='dashed', linewidth = 0.8,
                marker=markers[i], markersize=0)
    if plot_test:
        ax.plot(epochs, test_acc, label = loss + "-test", color=plot_colors[i], linestyle ='solid', linewidth = 0.8,
                marker=markers[i], markersize=0)

ax.set_xlabel("training epochs")
ax.set_ylabel("accuracy")
ax.legend()
title = f"Data {dataset} - PLL {gen}"
ax.set_title(title)
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figs", exist_ok=True)
fig.savefig(f"./figs/learning_curves_dataset_{dataset}_PLL_generation_method_{gen}_beta_{lw}.png")

## Benchmark Datasets

In [6]:
# Learning curves for one benchmark dataset: reads one CSV log per loss
# ('lws', 'prp') from `source_dir` and plots train/test accuracy and,
# optionally, the logged 'train_pos_prob' column per epoch.
dataset = 'birdac' # birdac lost MSRCv2
model = 'mlp' # mlp linear
lw = 1 # 1 2 0
plot_train = True
plot_test = True
plot_probs = True


files = [('lws', f'Res-sgd_{dataset}_{model}_lws_1_{lw}.0_0.5_0.001_0.5_50_1000_256_0.csv'),
         ('prp', f'Res-sgd_{dataset}_{model}_prp_0.5_0.001_0.5_50_1000_256_0.csv')]
source_dir = 'results_cv_best'

fig = plt.figure()
ax = fig.add_axes([0.13, 0.13, 0.8, 0.8])

for i,(loss,f_name) in enumerate(files):
    # load data
    train_acc = []
    test_acc = []
    train_pos_probs = []
    epochs = []
    skipped_rows = 0
    with open(os.path.join(source_dir, f_name), newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Parse the whole row before appending anything: a row that fails
            # half-way must not leave the lists with different lengths, which
            # would make ax.plot raise a shape-mismatch error.
            try:
                row_train = round(100*float(row['train_acc']),2)
                row_test = round(100*float(row['test_acc']),2)
                row_prob = round(100*float(row['train_pos_prob']),2)
                row_epoch = int(row['epoch'])
            except (KeyError, TypeError, ValueError):
                # Best-effort: skip rows with a missing column (KeyError), a
                # missing field (None -> TypeError) or non-numeric text.
                skipped_rows += 1
                continue
            train_acc.append(row_train)
            test_acc.append(row_test)
            train_pos_probs.append(row_prob)
            epochs.append(row_epoch)
    if skipped_rows:
        print(f"warning: skipped {skipped_rows} unparsable row(s) in {f_name}")
    # plot lines
    if plot_train:
        ax.plot(epochs, train_acc, label = loss + "-train", color=plot_colors[i], linestyle ='dashed', linewidth = 0.8,
                marker=markers[i], markersize=0)
    if plot_test:
        ax.plot(epochs, test_acc, label = loss + "-test", color=plot_colors[i], linestyle ='solid', linewidth = 0.8,
                marker=markers[i], markersize=0)
    if plot_probs:
        # offset by the number of losses so the probability curves get their own colours
        ax.plot(epochs, train_pos_probs, label = loss + "-prob", color=plot_colors[i+2], linestyle ='dotted', linewidth = 0.8,
                marker=markers[i], markersize=1)

ax.set_xlabel("training epochs")
ax.set_ylabel("accuracy")
ax.legend()
title = f"Data {dataset} - Model {model}"
ax.set_title(title)
fig_name = f"learning_curves_dataset_{dataset}_PLL_model_{model}_beta_{lw}"
if plot_probs:
    fig_name += '_w_probs'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figs", exist_ok=True)
fig.savefig(f"./figs/{fig_name}.png")

In [11]:
# Learning curves for the 'hprp' result files of one benchmark dataset, one
# CSV log per value in `alphas`, read from `source_dir`.
dataset = 'birdac' # birdac lost MSRCv2
model = 'mlp' # mlp linear
alphas = ['0.5', '1.0', '1.5']
plot_train = True
plot_test = True
plot_probs = False

source_dir = 'results_cv_best'

fig = plt.figure()
ax = fig.add_axes([0.13, 0.13, 0.8, 0.8])


for i, alpha in enumerate(alphas):
    # load data
    train_acc = []
    test_acc = []
    train_pos_probs = []
    epochs = []
    skipped_rows = 0
    f_name = f'Res-sgd_{dataset}_{model}_hprp_0.5_0.001_0.5_50_1000_256_0_{alpha}.csv'
    with open(os.path.join(source_dir, f_name), newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Parse the whole row before appending anything: a row that fails
            # half-way must not leave the lists with different lengths, which
            # would make ax.plot raise a shape-mismatch error.
            try:
                row_train = round(100*float(row['train_acc']),2)
                row_test = round(100*float(row['test_acc']),2)
                row_prob = round(100*float(row['train_pos_prob']),2)
                row_epoch = int(row['epoch'])
            except (KeyError, TypeError, ValueError):
                # Best-effort: skip rows with a missing column (KeyError), a
                # missing field (None -> TypeError) or non-numeric text.
                skipped_rows += 1
                continue
            train_acc.append(row_train)
            test_acc.append(row_test)
            train_pos_probs.append(row_prob)
            epochs.append(row_epoch)
    if skipped_rows:
        print(f"warning: skipped {skipped_rows} unparsable row(s) in {f_name}")
    # plot lines
    if plot_train:
        ax.plot(epochs, train_acc, label = f"hprp-{alpha}-train", color=plot_colors[i], linestyle ='dashed', linewidth = 0.8,
                marker=markers[i], markersize=0)
    if plot_test:
        ax.plot(epochs, test_acc, label = f"hprp-{alpha}-test", color=plot_colors[i], linestyle ='solid', linewidth = 0.8,
                marker=markers[i], markersize=0)
    if plot_probs:
        # 'loosely dotted' is not a linestyle name matplotlib accepts; the
        # equivalent dash tuple (offset, (on, off)) from its linestyle
        # reference is (0, (1, 10)).
        ax.plot(epochs, train_pos_probs, label = f"hprp-{alpha}-prob", color=plot_colors[i+len(alphas)], linestyle=(0, (1, 10)), linewidth = 0.8,
                marker=markers[i], markersize=1)

ax.set_xlabel("training epochs")
ax.set_ylabel("accuracy")
ax.legend()
title = f"Data {dataset} - Model {model}"
ax.set_title(title)
fig_name = f"learning_curves_dataset_{dataset}_PLL_model_{model}_hprp"
if plot_probs:
    fig_name += '_w_probs'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figs", exist_ok=True)
fig.savefig(f"./figs/{fig_name}.png")

## Benchmark Dataset Investigation 

In [16]:
from utils.utils_data import generate_real_dataloader
from scipy.io import loadmat

# Summary statistics for one real-world dataset stored as a MATLAB .mat file,
# plus a histogram of the per-sample candidate-label counts.
datadir = './data/realworld/'
dataset = 'birdac' # lost birdac
batch_size = 10

# Alternative loading path through the project dataloader helper (kept for
# reference, not executed):
# (partial_matrix_train_loader, train_loader, 
# eval_loader, test_loader, partialY, dim, K) = generate_real_dataloader(dataset, './data/realworld/', batch_size, 42)

# for i, (images, labels, true_labels, index) in enumerate(partial_matrix_train_loader):
#     pass

datapath = os.path.join(datadir, "{}.mat".format(dataset))
dt = loadmat(datapath)

X = dt['features']
partial_y = dt['p_labels']
y = dt['logitlabels']

X = np.float32(X)
partial_y = np.float32(partial_y)
# collapse each 'logitlabels' row to the index of its largest entry
y = np.float32(np.argmax(y, axis=1))

s,f = X.shape          # samples x feature dimension
_,c = partial_y.shape  # second axis is reported as the number of classes

print("#Samples", s)
print("#Classes", c)
print("Feature dim", f)

# Row sums of the partial-label matrix; presumably each row is a 0/1
# candidate indicator, so the sum is the candidate-set size -- TODO confirm.
num_candidate_labels = partial_y.sum(1)

print("Avg #candidates", np.mean(num_candidate_labels))

# Creating histogram
fig, axs = plt.subplots(1, 1,
                        figsize =(10, 7),
                        tight_layout = True)

# One bin per integer count, centred on the integer. A fixed number of
# equal-width bins can merge neighbouring counts once the range of counts
# differs between datasets.
min_count = int(np.floor(num_candidate_labels.min()))
max_count = int(np.ceil(num_candidate_labels.max()))
bin_edges = np.arange(min_count - 0.5, max_count + 1.5, 1.0)
axs.hist(num_candidate_labels, bins = bin_edges)
axs.set_xticks(range(min_count, max_count + 1))
axs.set_xlabel("number of candidate labels")
axs.set_ylabel("number of samples")
axs.set_title(f"Data {dataset} - candidate labels per sample")

# Save plot; savefig does not create missing directories.
os.makedirs("./figs", exist_ok=True)
fig.savefig(f"./figs/{dataset}_candidate_classes.png")


#Samples 4998
#Classes 13
Feature dim 38
Avg #candidates 2.17547


## Benchmark + Clustering Partial Label Generation

In [4]:
# Learning curves for the clustering-based partial-label generation runs:
# reads one CSV log per loss ('lws', 'prp') from `source_dir` and plots the
# selected accuracy / 'train_pos_prob' curves per epoch.
dataset = 'mnist' # mnist cifar10
model = 'mlp' # mlp cnn
# Epoch count that appears in the result file names. Kept under its own name
# because `epochs` is rebound to the per-file list of epoch indices below.
num_epochs = 1000
plot_train = False
plot_test = True
plot_probs = True


files = [('lws', f'Res-sgd_{dataset}_{model}_lws_1_1.0_0.05_0.001_0.5_50_{num_epochs}_256_0.csv'),
         ('prp', f'Res-sgd_{dataset}_{model}_prp_0.05_0.001_0.5_50_{num_epochs}_256_0.csv')]
source_dir = 'results_cv_best'

fig = plt.figure()
ax = fig.add_axes([0.13, 0.13, 0.8, 0.8])


for i,(loss,f_name) in enumerate(files):
    # load data
    train_acc = []
    test_acc = []
    train_pos_probs = []
    epochs = []
    skipped_rows = 0
    with open(os.path.join(source_dir, f_name), newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Parse the whole row before appending anything: a row that fails
            # half-way must not leave the lists with different lengths, which
            # would make ax.plot raise a shape-mismatch error.
            try:
                row_train = round(100*float(row['train_acc']),2)
                row_test = round(100*float(row['test_acc']),2)
                row_prob = round(100*float(row['train_pos_prob']),2)
                row_epoch = int(row['epoch'])
            except (KeyError, TypeError, ValueError):
                # Best-effort: skip rows with a missing column (KeyError), a
                # missing field (None -> TypeError) or non-numeric text.
                skipped_rows += 1
                continue
            train_acc.append(row_train)
            test_acc.append(row_test)
            train_pos_probs.append(row_prob)
            epochs.append(row_epoch)
    if skipped_rows:
        print(f"warning: skipped {skipped_rows} unparsable row(s) in {f_name}")
    # plot lines
    if plot_train:
        ax.plot(epochs, train_acc, label = loss + "-train", color=plot_colors[i], linestyle ='dashed', linewidth = 0.8,
                marker=markers[i], markersize=0)
    if plot_test:
        ax.plot(epochs, test_acc, label = loss + "-test", color=plot_colors[i], linestyle ='solid', linewidth = 0.8,
                marker=markers[i], markersize=0)
    if plot_probs:
        # offset by the number of losses so the probability curves get their own colours
        ax.plot(epochs, train_pos_probs, label = loss + "-prob", color=plot_colors[i+2], linestyle ='dotted', linewidth = 0.8,
                marker=markers[i], markersize=1)

ax.set_xlabel("training epochs")
ax.set_ylabel("accuracy")
ax.legend()
title = f"Data {dataset} - Model {model}"
ax.set_title(title)
fig_name = f"learning_curves_dataset_{dataset}_PLL_model_{model}_clustering_based_pl_generation"
if plot_probs:
    fig_name += '_w_probs'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figs", exist_ok=True)
fig.savefig(f"./figs/{fig_name}.png")