In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so project code can be changed without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [13]:
import numpy as np
import cvxpy as cp
import plotly.graph_objects as go
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import SpectralClustering
from data.simulation.simulator import Simulator
from learner import SVMLearner
# Plotly layout template name applied to the figures drawn by vis_scatter.
template = "plotly_white"

In [22]:
# Initialize the data simulator, the learner, and the shared settings
# NOTE(review): "moon" / noise are interpreted by the project's Simulator class;
# presumably a noisy two-moons dataset — confirm in data.simulation.simulator.
simulator = Simulator("moon", noise=0.2)
learner = SVMLearner()
# Seeded RNG. NOTE(review): not referenced by any cell visible in this notebook.
npr = np.random.RandomState(123)
# sigma is the upper end of the score range produced by cal_diversity
# (scores lie between 1 and sigma) and is also passed to learner.cal_uncertainty.
sigma = 5 # sigma > 1

In [30]:
# Dataset
N = 500             # Number of samples drawn for each of the train and test sets
input_dim = 2       # Feature dimension
train_X, train_y = simulator.simulate(N, input_dim)

# Boolean mask over the training set: True marks a sample whose label is used
# for fitting. The first `labeled_ratio` fraction of the samples starts labeled.
# The builtin `bool` dtype is used because the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
labeled_ratio = 0.1
labeled_mask = np.zeros(N, dtype=bool)
labeled_mask[:int(labeled_ratio * N)] = True

# Independent draw of the same size, used only for evaluation.
test_X, test_y = simulator.simulate(N, input_dim)

In [31]:
def vis_scatter(X, color):
    """Display a scatter plot of the first two columns of ``X``.

    Args:
        X: Array indexed as ``X[:, 0]`` (x-axis) and ``X[:, 1]`` (y-axis).
        color: Value forwarded to the marker ``color`` setting.

    Returns:
        None. The figure is shown via ``fig.show()``.
    """
    points = go.Scatter(
        x=X[:, 0],
        y=X[:, 1],
        mode='markers',
        marker=dict(color=color),
        marker_line_width=2,
    )
    # Both axes share the same boxed-frame styling.
    axis_frame = dict(showline=True, linewidth=1.5, linecolor='Black', mirror=True)

    fig = go.Figure(data=points)
    fig.update_layout(template=template)
    fig.update_xaxes(**axis_frame)
    fig.update_yaxes(**axis_frame)
    fig.update_layout(width=800, height=600)
    fig.show()

# Visualize the training data, colored by class label
vis_scatter(train_X, train_y)

In [32]:
def cal_diversity(X, labeled_mask, sigma):
    """Score each unlabeled sample by its dissimilarity to the labeled set.

    RBF-kernel similarities between unlabeled and labeled rows are turned into
    dissimilarities by subtracting them from the largest similarity found. For
    each unlabeled row the smallest dissimilarity to any labeled row is taken
    and rescaled linearly: a row with zero dissimilarity gets ``sigma``, and the
    row with the largest such dissimilarity gets ``1``.

    Args:
        X: Sample array; rows are selected with ``labeled_mask``.
        labeled_mask: Boolean mask over the rows of ``X`` (True = labeled).
        sigma: Upper end of the returned score range.

    Returns:
        One score per unlabeled row of ``X``.
    """
    unlabeled_rows = X[~labeled_mask]
    labeled_rows = X[labeled_mask]

    similarity = rbf_kernel(unlabeled_rows, labeled_rows)
    dissimilarity = similarity.max() - similarity

    # Dissimilarity of each unlabeled row to its closest labeled row.
    nearest = dissimilarity.min(1)
    return sigma - (sigma - 1) * nearest / nearest.max()
    

def cal_dissimilarity_over_unlabeled_set(X, labeled_mask):
    """Return the pairwise dissimilarity matrix among the unlabeled samples.

    Dissimilarity is the largest RBF-kernel similarity in the matrix minus each
    pairwise similarity.

    Args:
        X: Sample array; rows are selected with ``labeled_mask``.
        labeled_mask: Boolean mask over the rows of ``X`` (True = labeled).

    Returns:
        Square matrix with one row and one column per unlabeled sample.
    """
    unlabeled_rows = X[~labeled_mask]
    similarity = rbf_kernel(unlabeled_rows)
    return similarity.max() - similarity

    
def cal_uncertainty(X, sigma, K):
    """Compute the learner's uncertainty for the samples in ``X``.

    Feeds the module-level ``learner``'s predicted probabilities for ``X`` into
    ``learner.cal_uncertainty`` together with ``sigma`` and ``K``.

    Args:
        X: Samples passed to ``learner.predict_proba``.
        sigma: Forwarded unchanged to ``learner.cal_uncertainty``.
        K: Forwarded unchanged to ``learner.cal_uncertainty``.

    Returns:
        Whatever ``learner.cal_uncertainty`` returns.
    """
    return learner.cal_uncertainty(learner.predict_proba(X), sigma, K)
    
    
def eval_acc(X, y, vis=False):
    """Print and return the module-level ``learner``'s accuracy on ``(X, y)``.

    The predicted class of each sample is the index of its largest predicted
    probability.

    Args:
        X: Samples to classify.
        y: Reference labels compared element-wise against the predictions.
        vis: When true, also plot ``X`` colored by the predicted class.

    Returns:
        Fraction of samples whose prediction equals ``y``.
    """
    predicted = learner.predict_proba(X).argmax(1)
    accuracy = (predicted == y).mean()
    print("accuracy: {:.2f}".format(accuracy))

    if not vis:
        return accuracy

    vis_scatter(X, predicted)
    return accuracy

In [33]:
# Baseline: fit the learner on the initially labeled training samples only,
# then report accuracy on the test set and plot its predictions.
learner.fit(train_X[labeled_mask], train_y[labeled_mask])
acc = eval_acc(test_X, test_y, vis=True)

accuracy: 0.91


## Construct and solve the convex optimization problem

In [None]:
# Solve convex optimization problem
n_unlabeled = int((~labeled_mask).sum())

# Per-sample selection cost: element-wise minimum of the uncertainty and
# diversity scores, placed on the diagonal of C so that C @ Z scales row i of Z.
uncertainty = cal_uncertainty(train_X[~labeled_mask], sigma, K=2)
diversity = cal_diversity(train_X, labeled_mask, sigma)
C = np.zeros([n_unlabeled, n_unlabeled])
np.fill_diagonal(C, np.minimum(uncertainty, diversity))

# Pairwise dissimilarities among the unlabeled samples.
D = cal_dissimilarity_over_unlabeled_set(train_X, labeled_mask)
ones = np.ones(n_unlabeled)

# Z[i, j] is the non-negative weight of row i for column j; each column sums to 1.
Z = cp.Variable([n_unlabeled, n_unlabeled])

q = 2         # q-norm
alpha = 1     # to balance two costs

# Use cvxpy's public atoms (cp.trace / cp.norm / cp.sum) instead of private
# module paths and the Python builtin `sum`, which builds the expression
# element by element.
encoding_cost = cp.trace(D.T @ Z)                    # sum_ij D[i, j] * Z[i, j]
selection_cost = cp.sum(cp.norm(C @ Z, q, axis=1))   # sum of row-wise q-norms
objective_fn = encoding_cost + alpha * selection_cost
constraints = [Z >= 0, ones.T @ Z == ones.T]
prob = cp.Problem(cp.Minimize(objective_fn), constraints)

optimal_value = prob.solve()

# Fail here with a clear message rather than later on `Z.value` being None.
if prob.status not in ("optimal", "optimal_inaccurate"):
    raise RuntimeError(
        "Selection problem was not solved (status: {})".format(prob.status)
    )

optimal_value

## Select the data to label next

In [None]:
# For every column of the solved Z take the row with the largest weight; the
# distinct rows chosen this way are the representatives.
representatives = np.unique(Z.value.argmax(axis=0))
print("Number of representative: {}".format(representatives.size))
# Representatives index into the unlabeled subset; translate them back to
# positions in the full training set.
data_to_label = np.flatnonzero(~labeled_mask)[representatives]

In [None]:
# Work on a copy so the original labeled_mask stays untouched, mark the
# selected samples as labeled, then refit and re-evaluate on the test set.
cur_labeled_mask = labeled_mask.copy()
cur_labeled_mask[data_to_label] = True
learner.fit(train_X[cur_labeled_mask], train_y[cur_labeled_mask])
acc = eval_acc(X=test_X, y=test_y, vis=True)