In [1]:
from __future__ import division, print_function

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import matplotlib.cm

In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [4]:
%matplotlib tk

In [5]:
#sorted(benchmarks.groupby(['percent_labels', 'percent_links', 'percent_unlabeled']).groups.keys())

In [6]:
def plot_benchmarks_subset(benchmarks_subset):
    """Draw a grid of score plots: one row per dataset, one column per ``percent_links``.

    Parameters
    ----------
    benchmarks_subset : pandas.DataFrame
        Benchmark results.  The columns read here are ``dataset``,
        ``estimator``, ``percent_links``, ``percent_labels``,
        ``percent_unlabeled`` and ``test_score``.

    Returns
    -------
    The figure object returned by ``plt.subplots``.

    Notes
    -----
    * Estimators are recognised by name prefix.  ``Links*`` rows are drawn as
      mean ``test_score`` versus ``percent_unlabeled`` (one line per variant).
      ``SVM*`` rows (one line per ``percent_labels`` value, averaged over the
      whole dataset) and ``NPKLR*`` rows (averaged within the current
      ``percent_links`` group) are drawn as horizontal reference lines
      spanning x = 0 .. 0.5.
    * Rows whose ``estimator`` is missing are ignored by the prefix filters.
    * The legend is attached to the bottom-right axes only.
    """
    n_rows = len(benchmarks_subset.dataset.unique())
    n_cols = len(benchmarks_subset.percent_links.unique())
    # squeeze=False keeps ``axs`` two-dimensional even when there is a single
    # dataset or a single percent_links value, so axs[row, col] always works.
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, squeeze=False)

    links_styles = ['ro-', 'co-']
    svm_styles = ['go-', 'go--']

    for i_ds, (ds_name, ds_df) in enumerate(benchmarks_subset.groupby('dataset')):
        # SVM baselines are taken from the whole dataset, not per percent_links.
        svm_scores = ds_df.loc[ds_df.estimator.str.startswith('SVM', na=False)]

        for i_percent, (p_links, percent_subset) in enumerate(ds_df.groupby('percent_links')):
            ax = axs[i_ds, i_percent]

            ax.set_xlim(ds_df.percent_unlabeled.min(), ds_df.percent_unlabeled.max())
            if i_percent == 0:
                ax.set_title(ds_name)
            else:
                # round() before int(): 0.29 * 100 == 28.999... would truncate to 28.
                ax.set_title('+%d%% links' % int(round(p_links * 100)))

            # Shared y-range for every column of this dataset row.
            mean_test_score = ds_df.groupby(
                ['estimator', 'percent_links', 'percent_unlabeled']).agg({'test_score': 'mean'})
            ax.set_ylim(mean_test_score.values.min() - 0.02,
                        mean_test_score.values.max() + 0.03)

            # Tick labels only on the bottom row and the left-most column.
            if i_ds != n_rows - 1:
                ax.set_xticklabels([])
            if i_percent != 0:
                ax.set_yticklabels([])

            npklr_scores = percent_subset.loc[
                percent_subset.estimator.str.startswith('NPKLR', na=False)]
            links_scores = percent_subset.loc[
                percent_subset.estimator.str.startswith('Links', na=False)]

            # Cycling the styles avoids an IndexError with more than two variants.
            for estimator, style in zip(links_scores.estimator.unique(),
                                        itertools.cycle(links_styles)):
                links_score_estimator = links_scores.loc[links_scores.estimator == estimator]
                links_score_by_pun = links_score_estimator['test_score'].groupby(
                    links_score_estimator['percent_unlabeled']).mean()
                ax.plot(links_score_by_pun.index.values, links_score_by_pun.values,
                        style, label=estimator)

            # Explicit True: a bare grid() call toggles the current state.
            ax.grid(True)

            for (percent_labels, svm_score_p_labels), style in zip(
                    svm_scores.groupby('percent_labels'), itertools.cycle(svm_styles)):
                if percent_labels == -1:
                    label = 'SVM'
                else:
                    label = 'SVM%d%%' % int(round(percent_labels * 100))
                ax.plot([0, 0.5], [svm_score_p_labels.test_score.mean()] * 2,
                        style, label=label)
            ax.plot([0, 0.5], [npklr_scores.test_score.mean()] * 2, 'yo-', label='NPKLR')

            if i_ds == n_rows - 1 and i_percent == n_cols - 1:
                ax.legend(loc=(0.75, -0.75))

    fig.subplots_adjust(left=0.05, bottom=0.08, right=0.90, top=0.90,
                        wspace=0.07, hspace=0.15)
    return fig

In [50]:
# Two-labeled-examples run: load the results, draw the dataset x links grid
# and export it as a 16x9 inch PNG.
benchmarks_2_labels = pd.read_csv(
    'data/benchmarks_linear_2_labels_2.csv')
fig = plot_benchmarks_subset(benchmarks_2_labels)
fig.set_size_inches(16, 9)
fig.suptitle('training with 2 labeled examples, score - accuracy')
fig.savefig('data/plots/2_labels.png', dpi=125)

In [37]:
# No-labels run combined with the SVM baseline, plotted on the
# dataset x percent_links grid and exported as PNG.
benchmarks_no_labels = pd.read_csv('data/benchmarks_linear_no_labels_2.csv')
benchmarks_svm = pd.read_csv('data/benchmarks_svm_linear_adj_rand.csv')
# Give every SVM row percent_links = 0.1.  Bracket assignment is used on
# purpose: attribute assignment (``df.percent_links = ...``) only sets a plain
# Python attribute, not a column, when the column does not already exist.
benchmarks_svm['percent_links'] = 0.1
benchmarks = pd.concat([benchmarks_no_labels, benchmarks_svm])
fig = plot_benchmarks_subset(benchmarks)
fig.suptitle('training with no labeled examples, score - adj. rand index')
fig.set_size_inches(16, 9)
fig.savefig('data/plots/no_labels_with_svm.png', dpi=200)

In [39]:
def plot_benchmarks_subset_2(benchmarks_subset): # version for no labels and same unlabeled subset
    """Draw one axes per dataset with mean ``test_score`` versus ``percent_links``.

    Parameters
    ----------
    benchmarks_subset : pandas.DataFrame
        Benchmark results.  The columns read here are ``dataset``,
        ``estimator``, ``percent_links``, ``percent_labels``,
        ``percent_unlabeled`` and ``test_score``.

    Returns
    -------
    The figure object returned by ``plt.subplots``.

    Notes
    -----
    * The axes are laid out in two rows of ``n_datasets // 2 + 1`` columns, so
      at least one axes is always left unused; the legend offset below is
      expressed relative to the last dataset's axes.
    * Estimators are recognised by name prefix.  ``Links*`` (one line per
      variant), ``NPKLR*`` and ``GMM*`` rows are drawn as mean ``test_score``
      per ``percent_links`` value; ``SVM*`` rows are drawn as horizontal lines
      spanning x = 0 .. 0.5, one per ``percent_labels`` value.
    * Rows whose ``estimator`` is missing are ignored by the prefix filters.
    """
    n_datasets = len(benchmarks_subset.dataset.unique())
    # Floor division gives the same column count with or without
    # ``from __future__ import division``.
    fig, axs = plt.subplots(ncols=n_datasets // 2 + 1, nrows=2)
    axs = axs.flatten()

    links_styles = ['ro-', 'co-']
    svm_styles = ['go-', 'go--']

    grouper_by_dataset = benchmarks_subset.groupby('dataset')
    n_groups = len(grouper_by_dataset)
    for i_ds, (ds_name, ds_df) in enumerate(grouper_by_dataset):
        ax = axs[i_ds]
        ax.set_xlim(ds_df.percent_links.min(), ds_df.percent_links.max())
        ax.set_title(ds_name)

        mean_test_score = ds_df.groupby(
            ['estimator', 'percent_links', 'percent_unlabeled']).agg({'test_score': 'mean'})
        ax.set_ylim(mean_test_score.values.min() - 0.02,
                    mean_test_score.values.max() + 0.03)

        svm_scores = ds_df.loc[ds_df.estimator.str.startswith('SVM', na=False)]
        npklr_scores = ds_df.loc[ds_df.estimator.str.startswith('NPKLR', na=False)]
        links_scores = ds_df.loc[ds_df.estimator.str.startswith('Links', na=False)]
        gmm_scores = ds_df.loc[ds_df.estimator.str.startswith('GMM', na=False)]

        # Cycling the styles avoids an IndexError with more than two variants.
        for estimator, style in zip(links_scores.estimator.unique(),
                                    itertools.cycle(links_styles)):
            links_score_estimator = links_scores.loc[links_scores.estimator == estimator]
            links_score_by_plinks = links_score_estimator['test_score'].groupby(
                links_score_estimator['percent_links']).mean()
            ax.plot(links_score_by_plinks.index.values, links_score_by_plinks.values,
                    style, label=estimator)

        # Called once, outside the loop, and with an explicit True: a bare
        # grid() toggles, so calling it once per Links variant switched the
        # grid back off whenever there were two variants.
        ax.grid(True)

        for (percent_labels, svm_score_p_labels), style in zip(
                svm_scores.groupby('percent_labels'), itertools.cycle(svm_styles)):
            if percent_labels == -1:
                label = 'SVM'
            else:
                # round() before int() so e.g. 0.29 is labelled 29, not 28.
                label = 'SVM%d%%' % int(round(percent_labels * 100))
            ax.plot([0, 0.5], [svm_score_p_labels.test_score.mean()] * 2,
                    style, label=label)

        npklr_score_by_plinks = npklr_scores['test_score'].groupby(
            npklr_scores['percent_links']).mean()
        ax.plot(npklr_score_by_plinks.index.values, npklr_score_by_plinks.values,
                'yo-', label='NPKLR')

        gmm_score_by_plinks = gmm_scores['test_score'].groupby(
            gmm_scores['percent_links']).mean()
        ax.plot(gmm_score_by_plinks.index.values, gmm_score_by_plinks.values,
                'bo-', label='GMM')

        # Legend only once, anchored to the last dataset's axes.
        if i_ds == n_groups - 1:
            ax.legend(loc=(2.1, -0.15))

    fig.subplots_adjust(left=0.05, bottom=0.08, right=0.90, top=0.90,
                        wspace=0.07, hspace=0.15)
    return fig

In [34]:
# Linear kernel, no labels: read the main results and the SVM baseline,
# stack them, plot one axes per dataset and export the figure.
benchmarks_no_labels, benchmarks_svm = (
    pd.read_csv('data/benchmarks_linear_no_labels_3.csv'),
    pd.read_csv('data/benchmarks_svm_linear_adj_rand_2.csv'),
)
#benchmarks_svm.percent_links = 0.1
benchmarks_no_labels = pd.concat([benchmarks_no_labels, benchmarks_svm])
fig = plot_benchmarks_subset_2(benchmarks_no_labels)
fig.set_size_inches(16, 9)
fig.suptitle('training with no labeled examples, score - adj. rand index')
fig.savefig('data/plots/no_labels_linear_svm.png', dpi=200)

legend


In [35]:
# RBF kernel, no labels: read the main results and the SVM baseline,
# stack them, plot one axes per dataset and export the figure.
benchmarks_no_labels, benchmarks_svm = (
    pd.read_csv('data/benchmarks_rbf_no_labels_3.csv'),
    pd.read_csv('data/benchmarks_svm_rbf_adj_rand_2.csv'),
)
# benchmarks_svm.percent_links = 0.1
benchmarks_no_labels = pd.concat([benchmarks_no_labels, benchmarks_svm])
fig = plot_benchmarks_subset_2(benchmarks_no_labels)
fig.set_size_inches(16, 9)
fig.suptitle('training with no labeled examples, score - adj. rand index')
fig.savefig('data/plots/no_labels_rbf_svm.png', dpi=200)

legend


In [42]:
# Linear kernel, no labels: main results plus the SVM and GMM baselines,
# stacked into one frame, plotted per dataset and exported as PNG.
benchmarks_no_labels, benchmarks_svm, benchmarks_gmm = (
    pd.read_csv('data/benchmarks_linear_no_labels_3.csv'),
    pd.read_csv('data/benchmarks_svm_linear_adj_rand_2.csv'),
    pd.read_csv('data/benchmarks_gmm_linear_adj_rand.csv'),
)
benchmarks_no_labels = pd.concat(
    [benchmarks_no_labels, benchmarks_svm, benchmarks_gmm])
fig = plot_benchmarks_subset_2(benchmarks_no_labels)
fig.set_size_inches(16, 9)
fig.suptitle('training with no labeled examples, score - adj. rand index')
fig.savefig('data/plots/no_labels_linear_svm_gmm.png', dpi=200)

In [38]:
#sorted(benchmarks_gmm.groupby(['percent_labels', 'percent_links', 'percent_unlabeled']).groups.keys())

[(0.2, 0.0, 0.4),
 (0.2, 0.1, 0.4),
 (0.2, 0.2, 0.4),
 (0.2, 0.3, 0.4),
 (0.2, 0.4, 0.4),
 (0.2, 0.5, 0.4)]

In [49]:
# Rebinds benchmarks_no_labels to the contents of
# data/benchmarks_linear_no_labels.csv and plots it on the
# dataset x percent_links grid.  The returned figure is the cell's output.
benchmarks_no_labels = pd.read_csv('data/benchmarks_linear_no_labels.csv')

#benchmarks_subindex = benchmarks_linear.dataset.isin(benchmarks_linear.dataset.unique())
plot_benchmarks_subset(benchmarks_no_labels)

In [55]:
# Writes benchmarks_no_labels to CSV without the index column.
# NOTE(review): this is the same path the preceding cell reads from, so running
# it overwrites that input file with whatever benchmarks_no_labels currently
# holds. Confirm this is intended before re-running the notebook.
benchmarks_no_labels.to_csv('data/benchmarks_linear_no_labels.csv', index=False)

In [30]:
# Binds benchmarks_linear, which the next plotting cell reads.
benchmarks_linear = pd.read_csv('data/benchmarks_uci_linear_few_labels.csv')


In [15]:

# Row mask over benchmarks_linear.  As written, isin(...unique()) matches every
# dataset value present in the frame; presumably a placeholder for restricting
# the plot to a hand-picked list of datasets (TODO confirm).
benchmarks_subindex = benchmarks_linear.dataset.isin(benchmarks_linear.dataset.unique())
# .loc replaces the removed DataFrame.ix indexer; a boolean mask selects the same rows.
plot_benchmarks_subset(benchmarks_linear.loc[benchmarks_subindex])

In [56]:
# Rebinds benchmarks_linear to a different results file; the plotting cells
# below that filter benchmarks_linear use this frame.
benchmarks_linear = pd.read_csv('data/benchmarks_linear_trzmiel_2.csv')


In [57]:
# Keep only rows whose estimator name contains 'linear'.  The isin(...unique())
# term matches every dataset value present in the frame.
benchmarks_subindex = (
    benchmarks_linear.dataset.isin(benchmarks_linear.dataset.unique())
    & benchmarks_linear.estimator.str.contains('linear')
)
# .loc replaces the removed DataFrame.ix indexer; a boolean mask selects the same rows.
plot_benchmarks_subset(benchmarks_linear.loc[benchmarks_subindex])

In [58]:
# Binds benchmarks_rbf, which the next plotting cell reads.
benchmarks_rbf = pd.read_csv('data/benchmarks_uci_rbf_few_labels.csv')

In [60]:

# Keep only rows whose estimator name contains 'rbf'.  The isin(...unique())
# term matches every dataset value present in the frame.
benchmarks_subindex = (
    benchmarks_rbf.dataset.isin(benchmarks_rbf.dataset.unique())
    & benchmarks_rbf.estimator.str.contains('rbf')
)
# .loc replaces the removed DataFrame.ix indexer; a boolean mask selects the same rows.
plot_benchmarks_subset(benchmarks_rbf.loc[benchmarks_subindex])

In [61]:

# Keep only rows of benchmarks_linear whose estimator name contains 'rbf'.
# NOTE(review): this filters benchmarks_linear, not benchmarks_rbf. Presumably
# the frame bound to benchmarks_linear holds results for both kernels; confirm.
benchmarks_subindex = (
    benchmarks_linear.dataset.isin(benchmarks_linear.dataset.unique())
    & benchmarks_linear.estimator.str.contains('rbf')
)
# .loc replaces the removed DataFrame.ix indexer; a boolean mask selects the same rows.
plot_benchmarks_subset(benchmarks_linear.loc[benchmarks_subindex])

In [54]:
# Distinct dataset names in `benchmarks` (the concatenated frame bound by the
# earlier "no labels + SVM" cell).
benchmarks.dataset.unique()

array(['diabetes_scale', 'breast-cancer_scale', 'australian_scale',
       'ionosphere_scale', 'german.numer_scale', 'heart_scale',
       'liver-disorders_scale'], dtype=object)

In [32]:
lapsvm = pd.read_csv('data/remote/lapsvm_trzmiel.csv')
# Rows with no estimator name are labelled 'Links'.  .loc replaces the removed
# DataFrame.ix indexer; the mask/column assignment is otherwise unchanged.
lapsvm.loc[lapsvm.estimator.isnull(), 'estimator'] = 'Links'

In [33]:
# Preview the first rows of the lapsvm frame.
lapsvm.head()

Unnamed: 0,dataset,percent_labels,percent_links,percent_unlabeled,cv_split,rs_iters,rs_splits,rs_test_size,cv_score,test_score,beta,delta,alpha,gamma,estimator,gamma_I,gamma_A
0,diabetes_scale,0.1,0.1,0.1,4.0,100.0,5.0,0.2,0.876923,0.694805,627.036243,62.386288,241.334168,10.0,Links,,
1,diabetes_scale,0.1,0.1,0.1,6.0,100.0,5.0,0.2,0.876923,0.616883,576.10566,14.394807,16.79907,10.0,Links,,
2,diabetes_scale,0.1,0.1,0.1,3.0,100.0,5.0,0.2,0.876923,0.74026,627.036243,62.386288,241.334168,10.0,Links,,
3,diabetes_scale,0.1,0.1,0.1,5.0,100.0,5.0,0.2,0.830769,0.636364,48.904148,4.068718,138.640532,100.0,Links,,
4,diabetes_scale,0.1,0.1,0.1,0.0,100.0,5.0,0.2,0.830769,0.766234,473.12696,0.355701,817.840352,10.0,Links,,


In [36]:
def plot_lapsvm_subset(lapsvm_subset):
    """Draw a grid comparing 'LapSVM' and 'Links': datasets as rows, ``percent_labels`` as columns.

    Parameters
    ----------
    lapsvm_subset : pandas.DataFrame
        Results frame.  The columns read here are ``dataset``, ``estimator``,
        ``percent_labels``, ``percent_links`` and ``test_score``.

    Returns
    -------
    The figure object returned by ``plt.subplots``, for consistency with the
    other plotting helpers in this notebook.

    Notes
    -----
    On each axes the 'LapSVM' rows are drawn as a horizontal dashed line at
    their mean ``test_score`` (x = 0 .. 0.5), and the 'Links' rows as mean
    ``test_score`` per ``percent_links`` value.  The y-range is the min/max
    ``test_score`` of that cell's rows.
    """
    n_rows = len(lapsvm_subset.dataset.unique())
    n_cols = len(lapsvm_subset.percent_labels.unique())
    # squeeze=False keeps ``axs`` two-dimensional even for a single dataset or
    # a single percent_labels value, so axs[row, col] always works.
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, squeeze=False)

    for i_ds, (ds_name, ds_df) in enumerate(lapsvm_subset.groupby('dataset')):
        for i_percent, (_p_labels, percent_subset) in enumerate(ds_df.groupby('percent_labels')):
            ax = axs[i_ds, i_percent]
            # Only the first column carries the dataset name.
            if i_percent == 0:
                ax.set_title(ds_name)
            ax.set_xlim(0, 0.5)
            ax.set_ylim(percent_subset.test_score.min(), percent_subset.test_score.max())

            lapsvm_scores = percent_subset.loc[percent_subset.estimator == 'LapSVM']
            links_scores = percent_subset.loc[percent_subset.estimator == 'Links']
            links_score_by_plinks = links_scores['test_score'].groupby(
                links_scores['percent_links']).mean()

            ax.plot([0, 0.5], [lapsvm_scores.test_score.mean()] * 2, 'g--', label='LapSVM')
            ax.plot(links_score_by_plinks.index.values, links_score_by_plinks.values,
                    'r', label='Links')

    fig.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.95,
                        wspace=0.07, hspace=0.05)
    return fig

In [37]:
# Draw the LapSVM-vs-Links grid for the full lapsvm frame.
plot_lapsvm_subset(lapsvm)

In [67]:
# Distinct percent_links values present in the lapsvm frame.
lapsvm.percent_links.unique()

array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0. ])