In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('precision', 4)
sns.set_style('whitegrid')
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = [20, 5]
matplotlib.rcParams['font.size'] = 15
matplotlib.rcParams['xtick.labelsize'] = 15
matplotlib.rcParams['ytick.labelsize'] = 15
matplotlib.rcParams['axes.labelsize'] = 15
matplotlib.rcParams['legend.fontsize'] = 15
matplotlib.rcParams['axes.titlesize'] = 'x-large'

In [2]:
# prefix = "../log"
prefix = "../output/streamer/scheduler/combinations"
filenames = ["{}/sched-sim-permute-all-{}.csv".format(prefix, i) for i in range(1, 10+1)]
dfs = [pd.read_csv(f, header=None) for f in filenames if os.path.isfile(f) and os.path.getsize(f) > 0]
df_all = pd.concat(dfs)

In [3]:
df_all['comb'] = df_all[0].apply(str).str.split('_')
df_all['No of applications'] = df_all['comb'].apply(len)
df_all['Avg FNR'] = df_all[1]
df_all['Avg Rel Acc Loss'] = df_all[2]
df_all['FNRs'] = df_all[3].str.split('_')
df_all['Rel Acc Losss'] = df_all[4].str.split('_')

a = ['mean={}'.format(i) for i in range(10)]
baselines = {i: df_all[df_all[0] == 'mean={}'.format(i)] for i in range(6)}
baselines_fnr = {k: float(v['Avg FNR']) for k, v in baselines.items()}

for metric in ['Rel Acc Loss', 'FNR']:
    df_all[metric + 's'] = df_all[metric + 's'].fillna(df_all['Avg ' + metric]).apply(aa)

# Normalise against baseline FNR.
df_all['Curves'] = df_all[0].apply(lambda x: x.replace("mean=", "").split("_"))
def norm(x):
    return [fnr / baselines_fnr[int(curve)]
            for curve, fnr in zip(x['Curves'], x['FNRs'])]
def c_loss(x):
    return [fnr - baselines_fnr[int(curve)]
            for curve, fnr in zip(x['Curves'], x['FNRs'])]

df_all['Normed FNRs'] = df_all.apply(norm, axis=1)
df_all['Avg Normed FNR'] = df_all['Normed FNRs'].apply(np.mean)
df_all['FNR Losss'] = df_all.apply(c_loss, axis=1)
df_all['Avg FNR Loss'] = df_all['FNR Losss'].apply(np.mean)

def aa(x):
    if not isinstance(x, list):
        x = [x]
    return map(float, x)
for metric in ['Rel Acc Loss', 'FNR', 'Normed FNR', 'FNR Loss']:
    df_all[metric + 's'] = df_all[metric + 's'].fillna(df_all['Avg ' + metric]).apply(aa)
    df_all['Min ' + metric] = df_all[metric + 's'].apply(min)
    df_all['Max ' + metric] = df_all[metric + 's'].apply(max)
    df_all['Max-Min ' + metric] = df_all[metric + 's'].apply(lambda x: max(x) - min(x))
    df_all['(Max-Min)/Max ' + metric] = df_all[metric + 's'].apply(lambda x: (max(x) - min(x))/(max(x)+.01))

NameError: name 'aa' is not defined

In [None]:
plot_kwargs = {'x_jitter': .15, 'fit_reg': False}
metrics = ['Rel Acc Loss', 'FNR', 'FNR Loss', 'Normed FNR']

In [None]:
def plot_multiple(prefix='Avg', x_var='No of applications', fn=sns.barplot, plot_kwargs={}):
    f, ax = plt.subplots(1, len(metrics))
    for i, y_var in enumerate([prefix + ' ' + m for m in metrics]):
        fn(df_all[x_var], df_all[y_var], ax=ax[i], **plot_kwargs)
    plt.suptitle(prefix)

In [None]:
plot_multiple('Max-Min')

- **This graph: gap b/w best and worst in each combination.**

#### General notes
- Left to right:
    - Rel Acc Loss
    - FNR
    - **FNR Loss (`FNR - baseline`)**
    - Normed FNR (`FNR / baseline`)
- Lower = Better.
- Error bars are the std dev (rather than percentiles.)
- Normed is a multiple, whereas the range for the rest is $[0, 1]$. Y-Axis is not synchronized b/w plots.
- Combinations, not permutations

In [None]:
plot_multiple('Avg')

**This graph: avg performance in each combination.**

In [None]:
plot_multiple('Max')

This graph: max (**worst performing**) in each combination.

In [None]:
plot_multiple('Min')

- This graph: min (**best performing**) in each combination.
- The right-most graph (Normed FNR) appears to converge to ~1.5x.

In [None]:
sns.regplot(df_all['No of applications'], df_all['Max-Min FNR Loss'], **plot_kwargs);

In [None]:
sns.regplot(df_all['No of applications'], df_all['Max-Min Normed FNR'], **plot_kwargs);

Spread for normed seems to be bucketed. Possibly due to some kind of 'modulo' effect (since there are only 6 values to divide by)?

In [None]:
sns.regplot(df_all['No of applications'], df_all['Max-Min FNR'], **plot_kwargs);

In [None]:
plot_multiple(prefix='Max-Min', fn=sns.regplot, plot_kwargs=plot_kwargs)

In [None]:
plot_multiple(prefix='Max-Min', fn=sns.boxplot)

In [None]:
df_all[['No of applications', 'comb', 'Avg FNR', 'Avg Rel Acc Loss', 'FNRs', 'Rel Acc Losss']]