In [None]:
!pip install edn_format

In [None]:
from collections import OrderedDict
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import Orange
import pandas as pd
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
import edn_format
import os
from itertools import chain, groupby
from operator import itemgetter
from pprint import pprint

# Set matplotlib's default figure resolution to 300 DPI.
mpl.rcParams['figure.dpi'] = 300

## Algorithms Comparison

In [None]:
# Names of the dataset sub-directories under `workspace/` that are loaded.
datasets = [
    'arem',
    'breast-cancer',
    'census',
    'electricity',
    'htru2',
    'nyc-taxi',
    'power-usage',
    'pregnancies',
    'prosper-loans',
    'rbf-f',
    'sea-a',
    'wfr',
]

# Epsilon values for which a 'Performance; e=<epsilon>' attribute is read.
epsilons = [
    0.1,
    0.2,
]

def load_edn_rows(filepath):
    """Read the file at `filepath` and return its content parsed as EDN.

    The file is decoded explicitly as UTF-8 (the encoding the EDN format
    prescribes) instead of the platform's locale default, so the same file
    parses identically on every machine.

    Args:
        filepath: Path of the EDN file to read.

    Returns:
        Whatever `edn_format.loads` produces for the file's text.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()
    # Parse after the file handle has been released.
    return edn_format.loads(text)

def load_all_rows(comparison_filename):
    """Collect the rows of `comparison_filename` from every dataset directory.

    For each name in the module-level `datasets` list, the file
    `workspace/<name>/<comparison_filename>` is loaded with `load_edn_rows`.

    Returns:
        A tuple holding the rows of all files, in `datasets` order.
    """
    paths = (
        os.path.join('workspace', dataset, comparison_filename)
        for dataset in datasets
    )
    return tuple(row for path in paths for row in load_edn_rows(path))

def get_first_of(d, attrs):
    """Return the value of the first key in `attrs` that is present in `d`.

    Keys are tried in the order given; `None` is returned when none of them
    is present in `d`.
    """
    return next((d[name] for name in attrs if name in d), None)

def rows_to_dict(rows, key_attr, value_attrs):
    """Group `rows` by `row[key_attr]` and collect one value per row.

    Rows are sorted by their key before grouping, so the returned dict lists
    keys in sorted order; within a group the rows keep their input order.
    The value taken from each row is `get_first_of(row, value_attrs)`.

    Returns:
        dict mapping each distinct key to the list of values of its rows.
    """
    key_of = itemgetter(key_attr)
    grouped = {}
    # groupby only merges adjacent items, hence the sort on the same key.
    for group_key, members in groupby(sorted(rows, key=key_of), key_of):
        grouped[group_key] = [get_first_of(member, value_attrs) for member in members]
    return grouped

# All rows of every dataset's 'mask-comparison.edn' file.
mask_rows = load_all_rows('mask-comparison.edn')

# One table per epsilon: {value of 'Mask': [value of 'Performance; e=<epsilon>' per row]}.
mask_comparison = {
    epsilon: rows_to_dict(
        mask_rows,
        'Mask',
        ['Performance; e={}'.format(epsilon)],
    )
    for epsilon in epsilons
}
# Long mask label -> short label.
mask_names = {
    'RP Only': 'RP',
    'Cumulative Noise - Level 1': 'RPCN-1',
    'Cumulative Noise - Level 2': 'RPCN-2',
    'Cumulative Noise - Level 3': 'RPCN-3',
    'Independent Noise - Level 1': 'RPIN-1',
    'Independent Noise - Level 2': 'RPIN-2',
    'Independent Noise - Level 3': 'RPIN-3',
}
# Sensitive-Drift labels: one entry per (window, SD) combination.
mask_names.update({
    'Sensitive-Drift (window = {}) - SD = {}'.format(window, deviation): 'SD-{}-{}'.format(window, deviation)
    for window in (10, 30, 50, 100)
    for deviation in (0.03, 0.05, 0.1, 0.5, 1.0)
})
# Swap every long mask label for its short one, then keep only the entries
# whose label is one of the short labels defined in `mask_names`.
short_labels = set(mask_names.values())
for eps in epsilons:
    per_mask = mask_comparison[eps]
    for old_name, new_name in mask_names.items():
        if old_name in per_mask:
            per_mask[new_name] = per_mask.pop(old_name)
    mask_comparison[eps] = {
        label: values
        for label, values in per_mask.items()
        if label in short_labels
    }

print('Datasets: ', len(datasets))
print("Mask Comparison")
pprint(mask_comparison)

In [None]:
def friedman_test(comparison, *, reverse=False):
    """Print a Friedman test for `comparison` and draw its critical-difference chart.

    Args:
        comparison: Mapping from a name to a sequence of measurements. The
            sequences are passed to `friedmanchisquare` as the samples, and
            the length of the first one is handed to `compute_CD` as the
            dataset count.
        reverse: Forwarded unchanged to `Orange.evaluation.graph_ranks`.

    Prints the Friedman test result and the critical value, then shows the
    chart with `plt.show()`. Returns nothing.
    """
    scores = pd.DataFrame.from_dict(comparison, orient='index')
    samples = list(comparison.values())

    # Friedman test (statistic and p-value).
    print(friedmanchisquare(*samples))

    # Optional Nemenyi post-hoc table (disabled):
    #nemenyi = sp.posthoc_nemenyi_friedman(scores.transpose().values.tolist()).style.applymap(
    #    lambda x: 'color: {}'.format('red' if x < 0.05 else 'black'))
    #display(nemenyi)

    # Rank the names within each column (largest value gets rank 1), then
    # average every name's ranks across the columns.
    mean_ranks = scores.rank(ascending=False).mean(axis=1)
    avg_ranks = mean_ranks.tolist()
    names = mean_ranks.index.tolist()

    # Critical-difference chart.
    dataset_count = len(samples[0])
    cd = Orange.evaluation.compute_CD(avg_ranks, dataset_count, alpha='0.05')
    print('Critical value:', cd)
    Orange.evaluation.graph_ranks(avg_ranks, names, cd=cd, width=6, textspace=1.5, reverse=reverse)
    plt.show()
    #plt.savefig('cd.svg')

## Mask Comparison - Epsilon = 0.1

In [None]:
# Friedman test and critical-difference chart for the epsilon = 0.1 table.
friedman_test(mask_comparison[0.1])

## Mask Comparison - Epsilon = 0.2

In [None]:
# Friedman test and critical-difference chart for the epsilon = 0.2 table.
friedman_test(mask_comparison[0.2])