In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import six

# Global plot styling: seaborn 'ticks' style at 'talk' scale, Arial as the sans-serif font,
# and pdf.fonttype 42 (TrueType) for PDF output.
sns.set(style='ticks', context='talk', rc={'font.sans-serif':'Arial', 'pdf.fonttype': 42})


import bonvoyage

# Automatically reload imported modules before each cell executes
%load_ext autoreload
%autoreload 2

# Figures in the notebook
%matplotlib inline

# Set random seed
# The seed is the sum of the character codes of 'bonvoyage', so it is the same on every run.
np.random.seed(sum(map(ord, 'bonvoyage')))


# Define folder to save figures
folder = 'figures/anchor/sfig_10'
# Shell escape: IPython substitutes the Python variable `folder` for $folder;
# `-p` creates parent directories and does not fail if the folder already exists.
!mkdir -p $folder


## Make "Maybe Everything" test dataset

In [10]:
# Number of samples (rows) in every simulated feature
size = 100

# Constant features: one (size x 1) column each, filled with 1, 0 or 0.5
perfectly1 = np.ones((size, 1))
perfectly0 = np.zeros((size, 1))
perfectly_middle = 0.5 * np.ones((size, 1))

# Two-level features, one column per split point i = 1 .. size-1:
# the first i samples take the "low" value and the remaining size-i the "high" value.
# Stacking the per-i vectors gives (size-1, size); the transpose puts samples on rows.
maybe_middles_0 = np.vstack([np.concatenate([np.zeros(i), np.ones(size-i)*0.5]) for i in range(1, size)]).T
maybe_middles_1 = np.vstack([np.concatenate([np.ones(i), np.ones(size-i)*0.5]) for i in range(1, size)]).T
maybe_bimodals = np.vstack([np.concatenate([np.zeros(i), np.ones(size-i)]) for i in range(1, size)]).T

# Column labels, in the same order as the blocks are stacked into `data` below.
# 'perfect_middle' is a single column, so it needs no numeric suffix (the previous
# `.format(...)` call on it had no placeholder and was a no-op).
columns = (
    ['perfect_middle']
    + ['middle0_{:02d}'.format(i) for i in range(maybe_middles_0.shape[1])]
    + ['middle1_{:02d}'.format(i) for i in range(maybe_middles_1.shape[1])]
    + ['bimodal_{:02d}'.format(i) for i in range(maybe_bimodals.shape[1])]
    + ['perfect_included', 'perfect_excluded']
)

data = np.hstack([perfectly_middle, maybe_middles_0, maybe_middles_1, maybe_bimodals, perfectly1, perfectly0])
maybe_everything = pd.DataFrame(data, columns=columns)
six.print_(maybe_everything.shape)
maybe_everything.head()

(100, 300)


Unnamed: 0,perfect_middle,middle0_00,middle0_01,middle0_02,middle0_03,middle0_04,middle0_05,middle0_06,middle0_07,middle0_08,...,bimodal_91,bimodal_92,bimodal_93,bimodal_94,bimodal_95,bimodal_96,bimodal_97,bimodal_98,perfect_included,perfect_excluded
0,0.5,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0.5,0.5,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0.5,0.5,0.5,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0.5,0.5,0.5,0.5,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0.5,0.5,0.5,0.5,0.5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
from anchor.simulate import add_noise

In [14]:
# Create noisy copies of every feature at each noise percentage.
# The resulting columns carry '_noise<pct>_iter<k>' suffixes on the original names.
maybe_everything_noisy = add_noise(
    maybe_everything,
    iteration_per_noise=10,
    noise_percentages=np.arange(0, 101, 5),  # 0, 5, ..., 100
    plot=False,
)
six.print_(maybe_everything_noisy.shape)
maybe_everything_noisy.head()

(100, 60300)


Unnamed: 0,perfect_middle_noise0_iter0,middle0_00_noise0_iter0,middle0_01_noise0_iter0,middle0_02_noise0_iter0,middle0_03_noise0_iter0,middle0_04_noise0_iter0,middle0_05_noise0_iter0,middle0_06_noise0_iter0,middle0_07_noise0_iter0,middle0_08_noise0_iter0,...,bimodal_91_noise100_iter9,bimodal_92_noise100_iter9,bimodal_93_noise100_iter9,bimodal_94_noise100_iter9,bimodal_95_noise100_iter9,bimodal_96_noise100_iter9,bimodal_97_noise100_iter9,bimodal_98_noise100_iter9,perfect_included_noise100_iter9,perfect_excluded_noise100_iter9
0,0.5,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0.940849,0.956541,0.505262,0.10679,0.297677,0.676394,0.821691,0.423295,0.55477,0.825465
1,0.5,0.5,0.0,0.0,0.0,0,0,0,0,0,...,0.279867,0.71427,0.264256,0.693967,0.978037,0.227072,0.404614,0.314873,0.683356,0.42062
2,0.5,0.5,0.5,0.0,0.0,0,0,0,0,0,...,0.47096,0.390565,0.694338,0.228507,0.847353,0.891389,0.981346,0.172036,0.025466,0.106788
3,0.5,0.5,0.5,0.5,0.0,0,0,0,0,0,...,0.871887,0.063509,0.614852,0.492358,0.520635,0.873891,0.337709,0.876683,0.553144,0.80843
4,0.5,0.5,0.5,0.5,0.5,0,0,0,0,0,...,0.359571,0.715005,0.760587,0.141558,0.70799,0.439283,0.030219,0.958145,0.593338,0.320228


### Save the data

In [None]:
# Write the full noisy matrix to 'data.csv' (relative path, i.e. the current working directory)
maybe_everything_noisy.to_csv('data.csv')

### Plot a subset of the data with violinplots

In [15]:
# Reshape from wide (samples x features) to long form: one row per (feature, sample) value.
tidy = maybe_everything_noisy.unstack().reset_index()
# Raw string for the Psi label: '\P' is not a valid escape sequence in a normal string
# literal (it triggers a DeprecationWarning/SyntaxWarning); the resulting text is unchanged.
tidy = tidy.rename(columns={'level_0': 'Feature ID', 'level_1': 'Sample ID', 0: r'$\Psi$'})
six.print_(tidy.shape)
tidy.head()

(6030000, 3)


Unnamed: 0,Feature ID,Sample ID,$\Psi$
0,perfect_middle_noise0_iter0,0,0.5
1,perfect_middle_noise0_iter0,1,0.5
2,perfect_middle_noise0_iter0,2,0.5
3,perfect_middle_noise0_iter0,3,0.5
4,perfect_middle_noise0_iter0,4,0.5


In [16]:
# Parse the iteration number and noise percentage out of the feature ID suffix
# ('..._noise<pct>_iter<k>').
# - Raw strings: '\d' is not a valid escape sequence in a normal string literal.
# - expand=False: always get a Series back for the single capture group, independent of
#   the pandas version's default for `expand`.
tidy['Iteration'] = tidy['Feature ID'].str.extract(r'iter(\d+)', expand=False).astype(int)
tidy['% Noise'] = tidy['Feature ID'].str.extract(r'noise(\d+)', expand=False).astype(int)
tidy.head()

Unnamed: 0,Feature ID,Sample ID,$\Psi$,Iteration,% Noise
0,perfect_middle_noise0_iter0,0,0.5,0,0
1,perfect_middle_noise0_iter0,1,0.5,0,0
2,perfect_middle_noise0_iter0,2,0.5,0,0
3,perfect_middle_noise0_iter0,3,0.5,0,0
4,perfect_middle_noise0_iter0,4,0.5,0,0


In [None]:
%%time
split_id = tidy['Feature ID'].str.split('_').apply(pd.Series)
tidy = pd.concat([tidy, split], axis=1)
tidy.head()

In [None]:
# Row mask: keep only a handful of noise levels
noise_levels = tidy['% Noise'].isin([0, 25, 50, 75])

# Row masks per feature family; for the two-level families keep only three split points,
# identified by the second token of the feature ID (column '1').
feature_id = tidy['Feature ID']
chosen_split_point = tidy['1'].isin(['25', '50', '75'])

perfects = feature_id.str.contains('perfect')
middles = feature_id.str.startswith('middle') & chosen_split_point
bimodals = feature_id.str.startswith('bimodal') & chosen_split_point

row_subsets = perfects, bimodals, middles

# One frame per family (restricted to the chosen noise levels), stacked in that order
dfs = [tidy.loc[rows & noise_levels] for rows in row_subsets]
tidy_subset = pd.concat(dfs, ignore_index=True)
six.print_(tidy_subset.shape)
tidy_subset.head()

In [None]:
# One violin per feature, pooling all selected noise levels.
fig, ax = plt.subplots()
# - ax=ax: draw explicitly on the axes created above rather than relying on pyplot's
#   "current axes" state.
# - Raw string for the column name: '\P' is not a valid escape sequence in a normal literal.
# - Trailing ';' suppresses the Axes repr in the cell output.
sns.violinplot(x='Feature ID', y=r'$\Psi$', data=tidy_subset, bw=0.2, inner=None, ax=ax);

In [None]:
# One row of violins per noise level, saved as a PDF into the figure folder.
# Raw string for the column name: '\P' is not a valid escape sequence in a normal literal.
# NOTE(review): `factorplot` and its `size` argument are the older seaborn spelling;
# newer seaborn releases use `catplot` and `height` — switch if the environment is upgraded.
g = sns.factorplot(x='Feature ID', y=r'$\Psi$', row='% Noise', data=tidy_subset, aspect=3, size=3,
                   kind='violin', bw=0.2, inner=None, scale='width')
# Same y-range and ticks on every row
g.set(ylim=(0, 1), yticks=(0, 0.5, 1))
g.savefig('{}/data.pdf'.format(folder))

## Transform to waypoints

In [None]:
# Initialize the waypoints transformer
ws = bonvoyage.Waypoints()

# Fit the transformer on the noisy dataset and transform it in a single call.
# NOTE(review): presumably yields one row of waypoint coordinates per feature — confirm
# against the bonvoyage documentation.
waypoints = ws.fit_transform(maybe_everything_noisy)
six.print_(waypoints.shape)
waypoints.head()