In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from collections import namedtuple

In [None]:
# Load the lzf benchmark output. The file is pipe-delimited and read without a
# header row, so the column labels are assigned explicitly afterwards.
cpu_lzf = pd.read_csv(
    '../benchmarks/lzf.out',
    header=None,
    sep='|',
)
cpu_lzf.columns = [
    'alg',
    'comp_size',
    'compress_us',
    'decompress_us',
    'file',
    'block',
]

In [None]:
# Per-file summary (mean / std / min / max) of the lzf measurements, computed
# from cpu_lzf as it is bound when this cell runs.
lzf_file = (
    cpu_lzf
    .groupby('file')
    .aggregate(['mean', 'std', 'min', 'max'])
)

In [None]:
# Discard the first measurement because it includes setup latency.
# Caution: this rebinds cpu_lzf, so every re-run of this cell drops one more row.
cpu_lzf = cpu_lzf.iloc[1:]

In [None]:
# Group the lzf measurements by the 'file' column; later cells reduce per group.
file_groups = cpu_lzf.groupby(by='file')

In [None]:
# Per-file standard deviation of each benchmark column.
# The reduction is requested by name ('std') instead of passing np.std: pandas
# currently maps that callable to the groupby std anyway, but warns that a
# future version will call it directly, which would change the result.
# NOTE(review): pandas >= 2.0 no longer silently drops non-numeric columns in
# groupby reductions; if 'alg' holds strings a numeric column selection may be
# needed here - confirm against the installed pandas.
stds = file_groups.aggregate('std')
# Groups without a defined std come back as NaN; treat them as zero spread.
# (np.NaN, used previously, no longer exists in NumPy 2.0.)
stds = stds.fillna(0)

In [None]:
# Per-file mean of each benchmark column.
# The reduction is requested by name ('mean') rather than by passing np.mean,
# which newer pandas versions deprecate for groupby aggregation.
# NOTE(review): pandas >= 2.0 no longer silently drops non-numeric columns in
# groupby reductions; if 'alg' holds strings a numeric column selection may be
# needed here - confirm against the installed pandas.
means = file_groups.aggregate('mean')

In [None]:
# Sample bounds per file: the mean plus / minus three standard deviations.
upper = means + 3 * stds
lower = means - 3 * stds

In [None]:
# Inspect a single file (E.coli): drop timing outliers and plot the mean
# compress / decompress time of the remaining records. A record is kept only
# when BOTH its compress and its decompress time lie within 3 standard
# deviations of the respective mean.
ecoli = file_groups.get_group('canterbury-large/E.coli')
# Take the length only after ecoli is bound, so the cell runs on a fresh kernel.
orig_len = len(ecoli)
# Each mask stays wrapped in a one-element list because a later cell indexes
# compress_decomp_idx[0].
compress_idx = [
    np.abs(ecoli.compress_us - ecoli.compress_us.mean()) <= 3 * ecoli.compress_us.std()]
decompress_idx = [
    np.abs(ecoli.decompress_us - ecoli.decompress_us.mean()) <= 3 * ecoli.decompress_us.std()]
# Element-wise AND of the two boolean masks. `and` between the two lists would
# simply return the second list and ignore the compress-time filter.
compress_decomp_idx = [compress_idx[0] & decompress_idx[0]]
ecoli_in_std = ecoli[compress_decomp_idx[0]]
in_std_len = len(ecoli_in_std)
print('Removed {} of {} records, outside of 3 std dvs'.format(orig_len - in_std_len, orig_len))
# Left axis: mean compress time; right axis: mean decompress time.
f, axarr = plt.subplots(1, 2, figsize=(12, 8))
axarr[0].bar(1, ecoli_in_std['compress_us'].mean())
axarr[1].bar(1, ecoli_in_std['decompress_us'].mean())

In [None]:
# Show the algorithm label stored in the first E.coli record.
ecoli['alg'].iloc[0]

In [None]:
# Count the truthy entries of the combined mask, i.e. the records that were kept.
sum(1 for flag in compress_decomp_idx[0] if flag)

In [None]:
# Group keys of file_groups, i.e. the distinct values of the 'file' column.
files = [*file_groups.groups.keys()]


In [None]:
# Lightweight record describing one benchmark output: where the file lives
# ('path') and a short label for it ('type').
CompressOut = namedtuple(typename='CompressOut', field_names='path type')

In [None]:
# One entry per benchmark output file, in the order the bars are drawn later.
compress_outs = [
    CompressOut(path='../benchmarks/bzip.out', type='bzip'),
    CompressOut(path='../benchmarks/gzip.out', type='gzip'),
    CompressOut(path='../benchmarks/lz4.out', type='lz4'),
    CompressOut(path='../benchmarks/lzf.out', type='lzf'),
    CompressOut(path='../benchmarks/lzma.out', type='lzma'),
    CompressOut(path='../benchmarks/lzo.out', type='lzo'),
    CompressOut(path='../benchmarks/snappy.out', type='snappy'),
]

In [None]:
# Read every benchmark output (pipe-delimited, no header row) into its own
# DataFrame, keeping the order of compress_outs.
compressed_dfs = [
    pd.read_csv(out.path, header=None, sep='|')
    for out in compress_outs
]

In [None]:
# Draw a grid of bar charts: one row per benchmarked file, one column per
# metric, and one bar per compression algorithm. Each bar is the mean of the
# metric over the records whose compress AND decompress times both lie within
# 3 standard deviations of the file's mean. The figure is saved as a PNG.
metrics = ['comp_size', 'compress_us', 'decompress_us']
algs = []
file_names = None
fig = axarr = None
# i is the x position of the algorithm's bar; position 0 is left empty so the
# tick labels further down can be offset by one blank entry.
for i, c_df in enumerate(compressed_dfs, start=1):
    c_df.columns = ['alg', 'comp_size', 'compress_us', 'decompress_us', 'file', 'block']
    algs.append(c_df.alg.iloc[0])
    c_df = c_df.groupby('file')
    files = list(c_df.groups.keys())
    # Compare file lists with doubled slashes collapsed.
    normalised = [name.replace('//', '/') for name in files]
    if file_names is None:
        # The first frame defines the reference file list and the grid size
        # (one row per file) instead of relying on a stale `files` variable
        # and a hard-coded row count.
        file_names = normalised
        fig, axarr = plt.subplots(len(files), len(metrics), figsize=(24, 64), squeeze=False)
    assert normalised == file_names, 'benchmark outputs do not cover the same files'

    for f in range(len(files)):
        file_df = c_df.get_group(files[f])
        file_df_len = len(file_df)
        if file_df_len > 1:
            compress_idx = [
                np.abs(file_df.compress_us - file_df.compress_us.mean()) <= 3 * file_df.compress_us.std()]
            decompress_idx = [
                np.abs(file_df.decompress_us - file_df.decompress_us.mean()) <= 3 * file_df.decompress_us.std()]
            # Element-wise AND; `and` between the two lists would return only
            # the decompress mask and ignore the compress-time filter.
            comp_and_decomp = [compress_idx[0] & decompress_idx[0]]
        else:
            # quick and dirty way to handle the 1 block example
            comp_and_decomp = [[True]]
        comp_and_decomp_len = sum(1 for c in comp_and_decomp[0] if c)
        print('{}, {}, Removed {} of {} records'.format(file_df.alg.iloc[0], file_df.file.iloc[0],
                                                        file_df_len - comp_and_decomp_len, file_df_len))
        kept = file_df[comp_and_decomp[0]]
        for m in range(len(metrics)):
            axarr[f, m].bar(i, kept[metrics[m]].mean())
# Label every row with its file and every column with its metric; the leading
# '' tick label corresponds to the unused x position 0.
for f in range(len(files)):
    axarr[f, 0].set_ylabel(files[f])
    for m in range(len(metrics)):
        axarr[f, m].set_title(metrics[m])
        axarr[f, m].set_xticks(range(len(algs) + 1))
        axarr[f, m].set_xticklabels([''] + algs, minor=False)
plt.subplots_adjust(top=0.92, bottom=0.08, hspace=0.3)
#plt.show()
plt.savefig('compression_algorithms_by_file_smallest.png')