# Supplementary Figure 7: Comparison to other methods

This notebook must be run with Python 2.7 because of the `diptest` package.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import six


sns.set(style='ticks', context='talk', rc={'font.sans-serif':'Arial', 'pdf.fonttype': 42})


import anchor


from anchor import MODALITY_ORDER, MODALITY_PALETTE, MODALITY_TO_COLOR, MODALITY_TO_CMAP

%load_ext autoreload
%autoreload 2

# Figures in the notebook
%matplotlib inline

# Set random seed
np.random.seed(sum(map(ord, 'anchor')))



# Define folder to save figures
folder = 'figures/anchor/sfig_7'
!mkdir -p $folder


## Get noise-added datasets from SFig5 and SFig6

In [2]:
# Noise-added dataset stored under the SFig5 figure folder; the first CSV column is the row index
perfect_modalities = pd.read_csv(
    'figures/anchor/sfig_5/data.csv',
    index_col=0,
)
six.print_(perfect_modalities.shape)
perfect_modalities.head()


(100, 8004)


Unnamed: 0,excluded_noise0_iter0,bimodal_noise0_iter0,included_noise0_iter0,middle_noise0_iter0,excluded_noise5_iter0,bimodal_noise5_iter0,included_noise5_iter0,middle_noise5_iter0,excluded_noise5_iter1,bimodal_noise5_iter1,...,included_noise100_iter97,middle_noise100_iter97,excluded_noise100_iter98,bimodal_noise100_iter98,included_noise100_iter98,middle_noise100_iter98,excluded_noise100_iter99,bimodal_noise100_iter99,included_noise100_iter99,middle_noise100_iter99
0,0,0,1,0.5,0,0,1,0.5,0.0,0.0,...,0.104306,0.255304,0.607051,0.318675,0.278459,0.348157,0.092185,0.410834,0.72061,0.282821
1,0,0,1,0.5,0,0,1,0.5,0.0,0.0,...,0.024601,0.938658,0.113968,0.402693,0.76933,0.009054,0.769117,0.26101,0.299588,0.203287
2,0,0,1,0.5,0,0,1,0.5,0.0,0.0,...,0.132754,0.401305,0.920149,0.006544,0.684529,0.862542,0.949463,0.478771,0.903987,0.284627
3,0,0,1,0.5,0,0,1,0.5,0.0,0.0,...,0.006801,0.207151,0.380513,0.801769,0.305975,0.187343,0.49855,0.729321,0.892719,0.373746
4,0,0,1,0.5,0,0,1,0.5,0.129001,0.390094,...,0.845931,0.724473,0.600698,0.913603,0.556213,0.317132,0.763205,0.907491,0.130769,0.559195


In [3]:
# Noise-added dataset stored under the SFig6 figure folder; the first CSV column is the row index
maybe_bimodals = pd.read_csv(
    'figures/anchor/sfig_6/data.csv',
    index_col=0,
)
six.print_(maybe_bimodals.shape)
maybe_bimodals.head()

(100, 198099)


Unnamed: 0,bimodal_01_noise0_iter0,bimodal_02_noise0_iter0,bimodal_03_noise0_iter0,bimodal_04_noise0_iter0,bimodal_05_noise0_iter0,bimodal_06_noise0_iter0,bimodal_07_noise0_iter0,bimodal_08_noise0_iter0,bimodal_09_noise0_iter0,bimodal_10_noise0_iter0,...,bimodal_90_noise100_iter99,bimodal_91_noise100_iter99,bimodal_92_noise100_iter99,bimodal_93_noise100_iter99,bimodal_94_noise100_iter99,bimodal_95_noise100_iter99,bimodal_96_noise100_iter99,bimodal_97_noise100_iter99,bimodal_98_noise100_iter99,bimodal_99_noise100_iter99
0,0,0,0,0,0,0,0,0,0,0,...,0.600152,0.155205,0.376607,0.042812,0.448701,0.202316,0.969361,0.622581,0.357064,0.029683
1,0,0,0,0,0,0,0,0,0,0,...,0.584326,0.373509,0.321302,0.844513,0.64859,0.885582,0.98733,0.923216,0.656507,0.821695
2,0,0,0,0,0,0,0,0,0,0,...,0.636276,0.479932,0.124,0.464486,0.854911,0.18294,0.758169,0.833992,0.438727,0.697401
3,0,0,0,0,0,0,0,0,0,0,...,0.713643,0.629295,0.692804,0.76847,0.816107,0.583497,0.417128,0.245534,0.990129,0.637606
4,0,0,0,0,0,0,0,0,0,0,...,0.472486,0.357911,0.281413,0.037658,0.925184,0.503677,0.359777,0.191874,0.048664,0.477254


In [4]:
datasets = {'Maybe Bimodals': maybe_bimodals, "Perfect Modalities": perfect_modalities}
for name in datasets:
    dataset_folder = '{}/{}'.format(folder, name.lower().replace(' ', '_'))
    ! mkdir $dataset_folder

mkdir: figures/anchor/sfig_7/maybe_bimodals: File exists
mkdir: figures/anchor/sfig_7/perfect_modalities: File exists


## Simple binning

In [None]:
# from evaluate_anchor import evaluate_estimators
from anchor.simulate import evaluate_estimator
import pandas as pd

binning = anchor.BinnedModalities()

for name, data in datasets.items():    
    dataset_folder = '{}/{}/binned'.format(folder, name.lower().replace(' ', '_'))
    ! mkdir -p $dataset_folder
    figure_prefix = '{}/evaluated'.format(dataset_folder)

    result = evaluate_estimator(binning, data, waypoints=None, figure_prefix=figure_prefix)
    
    if name == 'Maybe Bimodals':
        bimodals_predicted = result.predicted.reset_index()
        bimodals_predicted = bimodals_predicted.rename(columns={'index': "Feature ID"})
        bimodals_predicted['Number of Ones'] = bimodals_predicted['Feature ID'].str.split('_').str[1].astype(int) + 1
        bimodals_predicted['Original Modality'] = bimodals_predicted['Feature ID'].str.split('_').str[0]
        bimodals_predicted['Noise'] = bimodals_predicted['Feature ID'].str.extract('noise(\d+)').astype(int)
        bimodals_predicted['Predicted Bimodal'] = bimodals_predicted['Predicted Modality'] == bimodals_predicted['Original Modality']

        g = sns.factorplot(x='Number of Ones', y='Predicted Bimodal', 
                       data=bimodals_predicted, scale=0.5, color='#262626',
                       aspect=1.5, ci=None)
        for ax in g.axes.flat:
            ymin, ymax = ax.get_ylim()
            ax.vlines([10, 90], ymin, ymax, linestyle='--')
        g.set(xticks=(0, 20, 40, 60, 80, 100), xticklabels=(0, 20, 40, 60, 80, 100), ylim=(ymin, ymax))
        g.savefig('{}_bimodals_percent_predicted_bimodal.pdf'.format(figure_prefix))

        g = sns.factorplot(x='Number of Ones', y='Predicted Bimodal', 
                       data=bimodals_predicted, scale=0.5, dodge=False,
                       aspect=1.5, ci=None, hue='Noise', palette='GnBu_r', hue_order=np.arange(0, 101, 5)[::-1])
        g.set(xticks=(0, 20, 40, 60, 80, 100), xticklabels=(0, 20, 40, 60, 80, 100))
        g.savefig('{}_bimodals_percent_predicted_bimodal_with_noise.pdf'.format(figure_prefix))

        g = sns.factorplot(x='Noise', y='Predicted Bimodal', 
                       data=bimodals_predicted, scale=0.5, dodge=False, legend=False,
                       aspect=1.5, ci=None, hue='Number of Ones', palette='RdBu', hue_order=np.arange(1, 100)[::-1])
        # g.set(xticks=(0, 20, 40, 60, 80, 100), xticklabels=(0, 20, 40, 60, 80, 100))
        g.savefig('{}_bimodals_percent_predicted_bimodal_with_noise.pdf'.format(figure_prefix))
    elif name == 'Perfect Modalities':
        # Use noisy simple modalities to get correctness
        predicted_df = result.predicted.reset_index()
        predicted_df = predicted_df.rename(columns={'index': 'Feature ID', 0: "Predicted Modality"})
        predicted_df['Original Modality'] = predicted_df['Feature ID'].str.split('_').str[0]
        predicted_df['% Noise'] = predicted_df['Feature ID'].str.extract('noise(\d+)').astype(int)
        predicted_df['Predicted Original Modality'] = predicted_df['Predicted Modality'] == predicted_df['Original Modality']
        predicted_df.head()
        g = sns.factorplot(x='% Noise', y='Predicted Original Modality', data=predicted_df, hue="Original Modality",
                       hue_order=MODALITY_ORDER[:-1], palette=MODALITY_PALETTE[:-1], aspect=1.5, dodge=True, size=3, scale=0.5)
        g.set(ylim=(0, 1.05))
        g.savefig('{}_percent_predicted_original_modality_from_noise.pdf'.format(figure_prefix))


        predicted_counts = predicted_df.groupby(['Original Modality', 'Predicted Modality', "% Noise"]).size()
        predicted_counts = predicted_counts.reset_index()
        predicted_counts = predicted_counts.rename(columns={0: 'Features'})
        predicted_counts['Percent of Features'] = predicted_counts.groupby(
            ['Original Modality', '% Noise'], as_index=False, group_keys=False).apply(
            lambda x: 100*x['Features']/x['Features'].sum())

        g = sns.factorplot(x='% Noise', hue='Predicted Modality', y='Percent of Features', 
                       col='Original Modality', col_order=MODALITY_ORDER[:-1],
                       data=predicted_counts, kind='point', #col_wrap=2, 
                       hue_order=MODALITY_ORDER, size=3, scale=0.5, dodge=True,
                       palette=MODALITY_PALETTE, aspect=1.25, legend=False)
        g.savefig('{}_percent_predicted_modality_with_noise.pdf'.format(figure_prefix))

## Hartigan and Hartigan's Diptest

In [7]:
import six

In [8]:
# NOTE(review): IPython help lookup (`obj?` shows the docstring); nothing is computed here.
six.moves.range?

In [9]:
from diptest import diptest

# MODALITY_ORDER and MODALITY_PALETTE are used as parallel lists elsewhere in
# this notebook; pair them up so a color can be looked up by modality name.
modality_colors = dict(zip(MODALITY_ORDER, MODALITY_PALETTE))

# Run the dip test on every feature of every dataset and save summary figures
# under <folder>/<dataset>/.
for name, data in datasets.items():
    dataset_folder = '{}/{}'.format(folder, name.lower().replace(' ', '_'))

    # One (dip statistic, p-value) pair per feature (column), expanded into a
    # features x 2 table
    diptest_results = data.apply(lambda x: diptest(x.values))
    diptest_results = diptest_results.apply(lambda x: pd.Series(x, index=['Dip Statistic', '$p$-value']))
    diptest_results['log10_p_value'] = np.log10(diptest_results['$p$-value'])

    g = sns.jointplot(x='Dip Statistic', y='$p$-value', data=diptest_results, stat_func=None,
                  size=4, ylim=(0, 1), xlim=(0, 0.25))
    g.savefig('{}/diptest_statistic_vs_p_value.pdf'.format(dataset_folder))

    diptest_results['Predicted Bimodal'] = diptest_results['$p$-value'] < 0.05

    # Recover per-feature metadata from the feature IDs of *this* dataset
    # (IDs look like '<modality>_..._noise<P>_iter<K>'), so the plot below does
    # not depend on variables left behind by another cell.
    diptest_with_predicted = diptest_results.reset_index()
    diptest_with_predicted = diptest_with_predicted.rename(columns={'index': 'Feature ID'})
    diptest_with_predicted['Original Modality'] = diptest_with_predicted['Feature ID'].str.split('_').str[0]
    diptest_with_predicted['Noise'] = diptest_with_predicted['Feature ID'].str.extract(r'noise(\d+)').astype(int)

    # Only the modalities present in this dataset, in the canonical order.
    # NOTE(review): a modality name missing from MODALITY_ORDER would be left
    # out of the plot -- confirm the names match.
    observed_modalities = set(diptest_with_predicted['Original Modality'])
    hue_order = [modality for modality in MODALITY_ORDER if modality in observed_modalities]
    palette = [modality_colors[modality] for modality in hue_order]

    g = sns.factorplot(x='Noise', y='Dip Statistic', data=diptest_with_predicted,
                       scale=0.5, size=3, aspect=1.5, hue='Original Modality', palette=palette, hue_order=hue_order)
    g.set(title='Diptest')
    # Save per dataset; writing to `folder` made each dataset overwrite the other's figure
    g.savefig('{}/dip_statistic_in_all_modalities.pdf'.format(dataset_folder))

ImportError: No module named diptest

In [18]:
import diptest

In [19]:
from diptest import diptest

In [20]:
# NOTE(review): IPython help lookup (`obj?` shows the docstring); nothing is computed here.
diptest?

## Bimodality Index

In [None]:
from sklearn.mixture import GMM

# Bimodality index table for each dataset, keyed by dataset name, so one
# dataset's result is not lost when the next one is processed.
bimodality_indices = {}

for name, data in datasets.items():

    # Two-component Gaussian mixture fit on the samples x features matrix.
    # NOTE(review): this is one joint fit across all features (shared mixing
    # weights), not an independent 1-D fit per feature -- confirm that is intended.
    gmm = GMM(n_components=2)
    gmm.fit(data)
    means = pd.DataFrame(gmm.means_, columns=data.columns)
    covars = pd.DataFrame(gmm.covars_, columns=data.columns)
    weights = pd.Series(gmm.weights_)

    # Mixing proportion of one component. The two weights sum to 1, so their
    # mean is always 0.5 and carries no information; pi * (1 - pi) is the same
    # whichever component is chosen.
    proportions = weights.iloc[0]

    # Distance between component means in units of standard deviation:
    # covars holds variances, so take the square root of the mean variance.
    standardized_distance = np.abs(means.loc[0] - means.loc[1]) / np.sqrt(covars.mean())

    bimodality_index = np.sqrt(proportions * (1-proportions)) * standardized_distance
    bimodality_index.name = "Bimodality Index"

    # Attach metadata parsed from this dataset's own feature IDs instead of
    # joining onto `predicted_df`, which is created in another cell and only
    # covers the "Perfect Modalities" features.
    bimodality_index = bimodality_index.reset_index()
    bimodality_index = bimodality_index.rename(columns={'index': 'Feature ID'})
    bimodality_index['Original Modality'] = bimodality_index['Feature ID'].str.split('_').str[0]
    bimodality_index['Noise'] = bimodality_index['Feature ID'].str.extract(r'noise(\d+)').astype(int)

    bimodality_indices[name] = bimodality_index

bimodality_index.head()