In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import itertools
from collections import OrderedDict
from functools import partial

from sklearn.datasets import load_svmlight_file, make_circles


from links import LinksClassifier
from logit import LogisticRegressionPairwise, LogisticRegression
from start_sensitivity import split_dataset_stable


from sklearn.model_selection import ParameterGrid, StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from tqdm import tqdm_notebook as tqdm

from new_experiment_runner.cacher import CSVCacher

In [8]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
# Results table read from CSV; its columns (hyper-parameters such as beta/delta
# and the cv_score / test_score values) are listed by `params.columns` below.
params = pd.read_csv('data/params_big_22.03.csv')

In [7]:
%matplotlib tk

In [4]:
def plot_scores(ax, scores, vmin, vmax, range_x, range_y, cmap='hot'):
    """Draw a 2-D grid of scores as a heat map on ``ax`` and label its ticks.

    Parameters
    ----------
    ax : matplotlib Axes (any object exposing ``imshow`` and the tick setters)
        Panel to draw on.
    scores : 2-D array-like
        Grid handed to ``ax.imshow``; rows run along the y axis and, because
        ``origin='lower'`` is used, the first row is drawn at the bottom.
    vmin, vmax : float
        Limits of the color scale.
    range_x, range_y : sequence
        Tick labels for the columns and rows of ``scores``; one tick is placed
        per entry at positions 0, 1, 2, ...
    cmap : str or Colormap, optional
        Colormap passed to ``imshow``.  Defaults to ``'hot'``, the colormap
        that used to be hard-coded here.

    Returns
    -------
    The object returned by ``ax.imshow`` (suitable for ``fig.colorbar``).
    """
    image = ax.imshow(scores,
                      interpolation='nearest',
                      cmap=cmap,
                      vmin=vmin,
                      vmax=vmax,
                      origin='lower')

    # One tick per grid cell, labelled with the corresponding parameter value.
    ax.set_xticks(np.arange(len(range_x)))
    ax.set_xticklabels(range_x)
    ax.set_yticks(np.arange(len(range_y)))
    ax.set_yticklabels(range_y)
    return image

In [5]:
params.columns

Index([u'cv_split', u'gs_test_size', u'cv_splits', u'cv_test_size',
       u'percent_unlabeled', u'percent_links', u'dataset', u'beta', u'delta',
       u'cv_random_state', u'gs_splits', u'percent_labels', u'alpha',
       u'cv_score', u'gamma', u'test_score'],
      dtype='object')

In [8]:
# Define the name here instead of relying on a value left over from a later
# cell of a previous session (a fresh "Run All" would raise NameError).
datasets = params.dataset.unique()
datasets

array(['diabetes_scale', 'breast-cancer_scale', 'australian_scale',
       'circles', 'moons'], dtype=object)

In [19]:
# Mean test score for every (beta, delta) pair, one heat-map panel per dataset.
datasets = params.dataset.unique()

# Two rows of panels.  Integer division keeps the column count an int on
# Python 3 as well, and rounding up avoids allocating a whole empty column.
fig, ax = plt.subplots(ncols=(len(datasets) + 1) // 2, nrows=2)
ax = ax.flatten()

for i, ds in enumerate(datasets):
    ds_params = params.loc[params['dataset'] == ds]
    # Rows are beta values, columns are delta values, cells are mean test_score.
    ds_score_grouped = ds_params.groupby(['beta', 'delta'])['test_score'].mean()
    ds_score_grouped = ds_score_grouped.unstack(level=-1)

    # Tick labels come from the pivoted grid itself so they always line up
    # with the rows/columns that are actually drawn.
    r = plot_scores(ax[i], ds_score_grouped.values, vmin=0.5, vmax=1,
                    range_y=ds_score_grouped.index.values,
                    range_x=ds_score_grouped.columns.values)

    ax[i].set_xlabel('delta')
    ax[i].set_ylabel('beta')
    ax[i].set_title(ds)
    labels = ax[i].get_xticklabels()
    plt.setp(labels, rotation=45, fontsize=10)

# Blank out panels that have no dataset assigned (odd number of datasets).
for unused_ax in ax[len(datasets):]:
    unused_ax.axis('off')

# Shared colorbar in a dedicated axes on the right; every panel uses the same
# vmin/vmax, so the last image handle describes all of them.
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(r, cax=cbar_ax)
fig.subplots_adjust(right=0.8, hspace=0.5)

In [10]:
# Results table read from CSV; the plotting cell below uses its `dataset`,
# `percent_labels`, `percent_links` and `test_score` columns.
tradeoff = pd.read_csv('data/tradeoff_22.03.csv')

In [20]:
# Mean test score for every (% labels, % links) pair, one panel per dataset.
datasets = tradeoff.dataset.unique()

# Two rows of panels.  Integer division keeps the column count an int on
# Python 3 as well, and rounding up avoids allocating a whole empty column.
fig, ax = plt.subplots(ncols=(len(datasets) + 1) // 2, nrows=2)
ax = ax.flatten()

# A single color scale for all panels: the figure has one shared colorbar, so
# per-panel vmin values would make it correct for the last panel only.
score_floor = tradeoff.test_score.min()

for i, ds in enumerate(datasets):
    print(ds)
    ds_params = tradeoff.loc[tradeoff['dataset'] == ds]
    # Rows are percent_labels, columns are percent_links, cells are mean test_score.
    ds_score_grouped = ds_params.groupby(['percent_labels', 'percent_links'])['test_score'].mean()
    ds_score_grouped = ds_score_grouped.unstack(level=-1)

    # Tick labels come from the pivoted grid itself so they always line up
    # with the rows/columns that are actually drawn.
    r = plot_scores(ax[i], ds_score_grouped.values, vmin=score_floor, vmax=1,
                    range_y=np.round(ds_score_grouped.index.values, 2),
                    range_x=np.round(ds_score_grouped.columns.values, 2))

    ax[i].set_xlabel('% links')
    ax[i].set_ylabel('% labels')
    ax[i].set_title(ds)
    labels = ax[i].get_xticklabels()
    plt.setp(labels, rotation=45, fontsize=10)

# Blank out panels that have no dataset assigned (odd number of datasets).
for unused_ax in ax[len(datasets):]:
    unused_ax.axis('off')

# Shared colorbar in a dedicated axes on the right.
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(r, cax=cbar_ax)
fig.subplots_adjust(right=0.8, hspace=0.5)

diabetes_scale
breast-cancer_scale
australian_scale
ionosphere_scale


In [40]:
def split_dataset(X, y, percent_labels, percent_links, percent_unlabeled, random_state=42):
    """Split a dataset into labeled, pairwise-linked and unlabeled samples.

    Parameters
    ----------
    X : array, indexable by integer index arrays along the first axis
    y : array of class labels, same length as ``X``
    percent_labels : float in [0, 1)
        Fraction of ``len(y)`` to use as labeled samples.  They are drawn
        (stratified) only from samples that take part in no link.
    percent_links : float
        ``train_size`` of each of the two stratified draws that pick the two
        sides of the links; ``0`` disables links.
    percent_unlabeled : float
        Fraction of ``len(y)`` to draw as unlabeled samples, taken from the
        samples that are neither labeled nor linked.
    random_state : int or None
        Seed for NumPy's global random generator.  ``None`` leaves the global
        state untouched; any integer, including ``0``, is applied.

    Returns
    -------
    labels_index : bool array of shape (len(y),)
        Mask of the labeled samples.
    choice1, choice2 : bool arrays of shape (len(y),)
        Masks of the two sides of the links.  Link targets can be derived by
        the caller as ``y[choice1] == y[choice2]``.
    unsup_choice : int array
        Indices of the unlabeled samples.

    Raises
    ------
    ValueError
        If ``percent_labels >= 1`` (not supported), or if fewer samples are
        left for the unlabeled pool than ``percent_unlabeled`` asks for.
    """
    # `is not None` so that a seed of 0 is honoured as well.
    if random_state is not None:
        np.random.seed(random_state)

    n_samples = len(y)
    all_indices = np.arange(n_samples)

    if percent_links > 0:
        # Two independent stratified draws; each one becomes a boolean mask.
        choice1 = next(StratifiedShuffleSplit(n_splits=1, train_size=percent_links).split(X, y))[0]
        choice1 = np.in1d(all_indices, choice1)

        choice2 = next(StratifiedShuffleSplit(n_splits=1, train_size=percent_links).split(X, y))[0]
        choice2 = np.in1d(all_indices, choice2)
    else:
        choice1 = np.zeros(n_samples, dtype=bool)
        choice2 = np.zeros(n_samples, dtype=bool)

    links_index = choice1 | choice2

    if percent_labels >= 1:
        raise ValueError('percent_labels must be < 1, got %r' % (percent_labels,))

    # Labeled samples are drawn only from samples that are in no link.
    not_links_where = np.where(~links_index)[0]
    if percent_labels > 0:
        labels_choice = next(StratifiedShuffleSplit(n_splits=1,
                                                    train_size=int(percent_labels * n_samples)).split(
            X[not_links_where], y[not_links_where]))[0]
    else:
        # Empty *index* array: a full-length boolean mask cannot index the
        # (shorter) `not_links_where` array.
        labels_choice = np.array([], dtype=int)
    labels_index = np.in1d(all_indices, not_links_where[labels_choice])

    # Unlabeled pool = everything that is neither labeled nor linked.  (`&`
    # here would always be empty, making the pool the whole dataset.)
    unsup_index = ~(labels_index | links_index)
    unsup_where = np.where(unsup_index)[0]
    n_unlabeled = int(percent_unlabeled * n_samples)
    if n_unlabeled > len(unsup_where):
        raise ValueError('requested %d unlabeled samples but only %d samples are '
                         'neither labeled nor linked' % (n_unlabeled, len(unsup_where)))
    unsup_choice = np.random.choice(unsup_where, size=n_unlabeled, replace=False)

    # Sanity check: the three groups together cover every sample.
    assert (labels_index | links_index | unsup_index).sum() == n_samples

    return labels_index, choice1, choice2, unsup_choice

In [1]:
from logit import LogisticRegressionPairwise

In [112]:
from sklearn.datasets import make_moons
from matplotlib.colors import ListedColormap
import matplotlib.lines as mlines
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.base import clone
import matplotlib.cm
# Fixed seed so that the generated points (and every figure built from them)
# are the same on each run.
X, y = make_moons(400, noise=0.2, random_state=42)
#y[y==0] = -1

In [10]:
%matplotlib tk

In [19]:
# Three-panel comparison on the moons data:
#   ax[0] - logistic regression fitted on every sample,
#   ax[1] - logistic regression fitted on the labeled subset only,
#   ax[2] - LinksClassifier fitted on labels + pairwise links + unlabeled samples.
X = StandardScaler().fit_transform(X)
alpha = 0.1
gamma = 2.5
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

fig, ax = plt.subplots(ncols=3)
plt.tight_layout()

labels, links1, links2, z, unsup = split_dataset_stable(X, y,
                                                     labels_and_links_separation_degree=2,
                                                     percent_labels=0.1,
                                                     percent_links=0.2,
                                                     percent_unlabeled=0.3,
                                                     return_index=True)
# z = (y[links1] == y[links2]).astype(int)

# Boolean mask of the labeled samples.  It is built explicitly because
# `~labels` is a complement only when `labels` is a boolean mask; on an integer
# index array it is a bitwise NOT that produces negative indices.
labeled_mask = np.zeros(len(y), dtype=bool)
labeled_mask[labels] = True
positive_mask = y == 1

# Grid on which the predicted probabilities are evaluated.
h = 0.1
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]
proba_levels = np.linspace(0, 1, 20)


def _scatter_by_class(axis, mask, with_legend=True, **scatter_kwargs):
    """Scatter the samples selected by `mask`: blue where y == 1, red elsewhere."""
    for class_mask, color, name in ((positive_mask, '#0000FF', 'positive samples'),
                                    (~positive_mask, '#FF0000', 'negative samples')):
        selected = X[mask & class_mask]
        axis.scatter(selected[:, 0], selected[:, 1], c=color,
                     label=name if with_legend else None, **scatter_kwargs)


def _frame_panel(axis, title):
    """Set the title, clamp the view to the probability grid and hide the ticks."""
    axis.set_title(title)
    axis.set_xlim(xx.min(), xx.max())
    axis.set_ylim(yy.min(), yy.max())
    axis.set_xticks(())
    axis.set_yticks(())


def _positive_proba_on_grid(estimator):
    """Probability of the positive class at every grid point, shaped like `xx`."""
    return estimator.predict_proba(grid_points)[:, 1].reshape(xx.shape)


# --- Panel 0: every sample is labeled --------------------------------------
_scatter_by_class(ax[0], np.ones(len(y), dtype=bool))
ax[0].legend(loc=3, prop={'size': 9})
_frame_panel(ax[0], 'Log. reg. on full dataset')

estimator0 = LogisticRegression(alpha=alpha, kernel='rbf', gamma=gamma)
estimator0.fit(X, y)
Z0 = _positive_proba_on_grid(estimator0)
ax[0].contourf(xx, yy, Z0, cmap=cm, alpha=.35, levels=proba_levels)

# --- Panel 1: only the labeled subset is used for fitting -------------------
_scatter_by_class(ax[1], labeled_mask)
# Remaining samples are shown faded and without legend entries.
_scatter_by_class(ax[1], ~labeled_mask, with_legend=False, alpha=0.3)
ax[1].legend(loc=3, prop={'size': 9})
_frame_panel(ax[1], 'Log. reg. on labeled part')

estimator1 = LogisticRegression(alpha=alpha, kernel='rbf', gamma=gamma)
estimator1.fit(X[labels], y[labels])
Z1 = _positive_proba_on_grid(estimator1)
ax[1].contourf(xx, yy, Z1, cmap=cm, alpha=.35, levels=proba_levels)

# --- Panel 2: labels + links + unlabeled samples ----------------------------
# Empty proxy lines that only provide legend entries; their styles match the
# link lines drawn in the loop below.
ax[2].plot([], [], 'g-', label='must-link')
ax[2].plot([], [], 'r--', label='cannot-link')

# Labeled training points
_scatter_by_class(ax[2], labeled_mask)

# Linked samples, unlabeled samples, and all non-labeled points faded by class
linked_a = X[links1]
linked_b = X[links2]
ax[2].scatter(linked_a[:, 0], linked_a[:, 1], c='green', alpha=1, marker='o', label='linked samples')
ax[2].scatter(linked_b[:, 0], linked_b[:, 1], c='green', alpha=1, marker='o')
ax[2].scatter(X[unsup][:, 0], X[unsup][:, 1], c='black', marker='x', label='unlabeled samples')
ax[2].scatter(X[~labeled_mask][:, 0], X[~labeled_mask][:, 1], c=y[~labeled_mask], cmap=cm, alpha=0.2)

# One segment per link: solid green when z[i] is truthy, dashed red otherwise.
for i in range(len(z)):
    ax[2].plot([linked_a[i, 0], linked_b[i, 0]], [linked_a[i, 1], linked_b[i, 1]],
               'g-' if z[i] else 'r--', alpha=0.5)

l_handles, l_labels = ax[2].get_legend_handles_labels()
ax[2].legend(handles=l_handles, labels=l_labels, loc=3, prop={'size': 9})
_frame_panel(ax[2], 'Log. reg. with links and unlabeled')

# NOTE(review): alpha=1 here differs from the `alpha` used for the two
# baselines above - confirm this is intended.
estimator2 = LinksClassifier(alpha=1, kernel='rbf', gamma=gamma, solver='tnc', beta=1000, delta=110)
estimator2.fit(X[labels], y[labels], X1=X[links1], X2=X[links2], z=z, Xu=X[unsup])

# Color every grid point by the predicted probability of the positive class.
Z = _positive_proba_on_grid(estimator2)
ax[2].contourf(xx, yy, Z, cmap=cm, alpha=.35, levels=proba_levels)

<matplotlib.contour.QuadContourSet at 0xe01b780>

In [142]:
# Moons geometry, but with the class decided by a horizontal threshold on the
# second (unscaled) feature instead of by moon membership.
X, y = make_moons(800, noise=0.1, random_state=42)
decision_boundary = 0.25
y = X[:, 1] > decision_boundary
X = StandardScaler().fit_transform(X)

cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

# 2 x 2 grid of panels, flattened so panels can be addressed as ax[0..3].
fig, ax = plt.subplots(nrows=2, ncols=2)
ax = ax.ravel()
plt.tight_layout()

# Labeled and unlabeled samples only: no pairwise links are requested.
labels, links1, links2, z, unsup = split_dataset_stable(
    X, y,
    labels_and_links_separation_degree=0,
    percent_labels=0.5,
    percent_links=0.0,
    percent_unlabeled=0.5,
    return_index=True)


def plot_with_estimator(ax, estimator):
    """Fit ``estimator`` on the current split and draw its probability map on ``ax``.

    Reads the notebook-level ``X``, ``y``, ``labels``, ``unsup`` and ``cm``.
    The labeled samples are scattered by class, the unlabeled ones in black,
    and a dashed red horizontal line is drawn at height 0 (presumably the class
    threshold after scaling - TODO confirm).  The estimator is then fitted via
    ``fit(X[labels], y[labels], Xu=X[unsup])`` and the second column of its
    ``predict_proba`` output is shown as filled contours.  The panel title
    reports ``estimator.delta``.  Returns nothing.
    """
    step = 0.1
    first, second = X[:, 0], X[:, 1]
    xx, yy = np.meshgrid(np.arange(first.min() - .5, first.max() + .5, step),
                         np.arange(second.min() - .5, second.max() + .5, step))

    is_positive = y == 1
    is_labeled = np.in1d(np.arange(len(y)), labels)

    # (points, color, legend label) for each group of samples to scatter.
    groups = (
        (X[is_positive & is_labeled], '#0000FF', 'positive samples'),
        (X[~is_positive & is_labeled], '#FF0000', 'negative samples'),
        (X[unsup], '#000000', 'unlabeled'),
    )
    for points, color, name in groups:
        ax.scatter(points[:, 0], points[:, 1], c=color, label=name)

    ax.legend(loc=3, prop={'size': 9})
    ax.set_title('delta=%d' % estimator.delta)

    left, right = xx.min(), xx.max()
    ax.set_xlim(left, right)
    ax.set_ylim(yy.min(), yy.max())
    ax.plot([left, right], [0, 0], 'r--')

    estimator.fit(X[labels], y[labels], Xu=X[unsup])

    # Probability of the positive class over the whole grid.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    proba = estimator.predict_proba(grid_points)[:, 1].reshape(xx.shape)
    ax.contourf(xx, yy, proba, cmap=cm, alpha=.35, levels=np.linspace(0, 1, 20))
    
# One panel per `delta` value; each panel gets a fresh, unfitted clone of the
# estimator configured for that value.
deltas = [0, 20, 25, 60]
for i, delta in enumerate(deltas):
    estimator0 = LinksClassifier(alpha=10,
                                 kernel='rbf',
                                 gamma=2,
                                 delta=delta,
                                 verbose=False,
                                 sampling='predefined')
    panel = ax[i]
    plot_with_estimator(panel, clone(estimator0))

In [134]:
X.shape

(800L, 2L)