# Nature of Data

In [354]:
import os
import numpy as np
import scipy as sp
import pickle as pkl
import pandas as pd

from pybdm import BDM
from pybdm import PartitionIgnore
from pyinform.blockentropy import block_entropy
from compress import Compressor

from misc.database import Database

# plotting tools
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
from bokeh.models import CustomJS, Slider, ColumnDataSource, Whisker, Range1d, Span

output_notebook()

In [355]:
## Explore the nature of our data

data_path = "/media/arjun/SSD/chaos/data"

with open(os.path.join(data_path, 
         "avrora-small-HotSpot-d-l64-p4096-w100000i.analyzed-1.pkl"),
         "rb") as f:
    cache_data = pkl.load(f)

cache_data = cache_data.to_numpy()
cache_bins = np.argmax(cache_data, axis=1)

plot_options = dict(width=900,
                    plot_height=450,
                    tools='pan,xwheel_zoom,reset,save')
lamda_series = [ float((cache_bins[i+1]+1)*101/((cache_bins[i]+1)*(99-cache_bins[i]))) 
                       for i in range(cache_bins.shape[0]-1) ]
lamda_running_mean = [ np.mean(lamda_series[max(0, i-1000):i]) 
                       for i in range(1, len(lamda_series)) ]
# print(lamda_series)

plot = figure(**plot_options)
plot.line(range(cache_bins.shape[0]),
                cache_bins,
                line_width=2)

plot.xaxis.axis_label = 'time step'
plot.yaxis.axis_label = 'bin'

show(plot)

plot = figure(**plot_options)
plot.line(range(cache_bins.shape[0]),
                lamda_series,
                line_width=2)
plot.line(range(cache_bins.shape[0]),
                lamda_running_mean,
                line_width=2,
                line_color='green')
plot.y_range = Range1d(0, 10)
hline = Span(location=3.5, dimension='width', line_color='red', line_width=2)

plot.renderers.extend([hline])
plot.xaxis.axis_label = 'time step'
plot.yaxis.axis_label = 'lambda'

show(plot)



In [386]:
random_walk = np.random.choice(range(100), size=1000)

lamda_series = [ float((random_walk[i+1]+1)*101/((random_walk[i]+1)*(101-random_walk[i]))) 
                       for i in range(len(random_walk)-1) ]
lamda_running_mean = [ np.mean(lamda_series[max(0, i-1000):i]) 
                       for i in range(1, len(lamda_series)) ]
# print(lamda_series)
# print(lamda_running_mean)

plot = figure(**plot_options)
plot.line(range(random_walk.shape[0]),
                random_walk,
                line_width=2)

plot.xaxis.axis_label = 'time step'
plot.yaxis.axis_label = 'bin'

show(plot)

plot = figure(**plot_options)
plot.line(range(cache_bins.shape[0]),
                lamda_series,
                line_width=2)
plot.line(range(cache_bins.shape[0]),
                lamda_running_mean,
                line_width=2,
                line_color='green')
plot.y_range = Range1d(0, 10)
hline = Span(location=3.5, dimension='width', line_color='red', line_width=2)

plot.renderers.extend([hline])
plot.xaxis.axis_label = 'time step'
plot.yaxis.axis_label = 'lambda'

show(plot)



In [356]:
x_list = np.array(list(range(100)))
y_list = np.array([ len(list(filter(lambda x: x>=val, cache_bins))) for val in x_list ])
y_list = np.array([ np.log(np.clip(val/cache_bins.shape[0], 1e-10, 1)) for val in y_list ])

indices = filter(lambda i: y_list[i]>np.min(y_list) and y_list[i]<-0.1, 
                 range(y_list.shape[0]))
indices = list(indices)
x_list = x_list[indices]
y_list = y_list[indices]

# find m_0 and fit curve
for i in range(x_list.shape[0]):
    # determine best fit line
    par = np.polyfit(x_list[i:], y_list[i:], 1, full=True)
#     print(x_list[i], par)
    slope=par[0][0]
    intercept=par[0][1]
#     print(i)
    if par[1] < 2 or x_list[i] >= 72:
        break
    y_predicted = [slope*i + intercept  for i in x_list]

plot_options = dict(width=900,
                    plot_height=450,
                    tools='pan,xwheel_zoom,reset,save')

plot = figure(**plot_options)
plot.x(x_list,
       y_list,
       line_width=2)
plot.line(x_list[i:],
         y_predicted[i:],
         color='red',
         legend_label='y='+str(round(slope,2))+'x+'+str(round(intercept,2)))


plot.xaxis.axis_label = 'miss rate (m) log scale'
plot.yaxis.axis_label = 'log(P(M>m))'

show(plot)



In [357]:
df = pd.read_csv("/media/arjun/SSD/chaos/ratios.txt")

In [358]:
import sys
import os

os.remove("/media/arjun/SSD/chaos/ratios.csv")

with open("/media/arjun/SSD/chaos/ratios.txt") as f:
    for line in f.readlines():
        r = line.strip("\n").split(None, 10)
        fn = r.pop()
        with open("/media/arjun/SSD/chaos/ratios.csv", 'a') as the_file:
            the_file.write(",".join(r) + ",\"" + fn.replace("\"", "\"\"") + "\"\n")
        print(",".join(r) + ",\"" + fn.replace("\"", "\"\"") + "\"")

compressed,uncompressed,ratio,"uncompressed_name"
3038541,19647925,84.5%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00000016384"
5852453,19647925,70.2%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00000032768"
8223576,19647925,58.1%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00000065536"
10692399,19647925,45.6%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00000131072"
6154643,19647925,68.7%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00000262144"
4313266,19647925,78.0%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00000524288"
3766088,19647925,80.8%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00001048576"
3132136,19647925,84.1%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00002097152"
2875336,19647925,85.4%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00004194304"
2695843,19647925,86.3%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00008388608"
2558320,19647925,87.0%,"508-namd-ref-namd-b-l4096-hists-100k-analyzed-bin-00016

17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-00134217728"
17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-00268435456"
17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-00536870912"
17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-01073741824"
17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-02147483648"
17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-04294967296"
17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-08589934592"
17351,17385775,99.9%,"526-blender-ref-blender-i-l4096-hists-100k-analyzed-bin-17179869184"
47301,173858,72.8%,"526-blender-ref-blender-i-l4096-hists-10m-analyzed-bin-00000016384"
55798,173858,67.9%,"526-blender-ref-blender-i-l4096-hists-10m-analyzed-bin-00000032768"
55661,173858,68.0%,"526-blender-ref-blender-i-l4096-hists-10m-analyzed-bin-00000065536"
60045,17

32409,51808,37.5%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00002097152"
33073,51808,36.2%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00004194304"
33904,51808,34.6%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00008388608"
35375,51808,31.8%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00016777216"
35915,51808,30.7%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00033554432"
34808,51808,32.8%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00067108864"
33769,51808,34.9%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00134217728"
32881,51808,36.6%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00268435456"
32542,51808,37.2%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-00536870912"
32334,51808,37.6%,"602-gcc-ref-gcc2-b-l64-hists-10m-analyzed-bin-01073741824"
153977,518079,70.3%,"602-gcc-ref-gcc2-b-l64-hists-1m-analyzed-bin-00000004096"
182954,518079,64.7%,"602-gcc-ref-gcc2-b-l64-hists-1m-analyzed-bin-00000008192"
202296,518079,61.0%,"602-gcc-ref-gcc2-b-l64-hists-1m-analyzed-

1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-00008388608"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-00016777216"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-00033554432"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-00067108864"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-00134217728"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-00268435456"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-00536870912"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-01073741824"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-02147483648"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-04294967296"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyzed-bin-08589934592"
1060,969897,99.9%,"607-cactus-ref-cactus-i-l4096-hists-10m-analyz

34837,116793,70.2%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00000262144"
36588,116793,68.7%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00000524288"
36423,116793,68.8%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00001048576"
36319,116793,68.9%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00002097152"
37435,116793,68.0%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00004194304"
40850,116793,65.0%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00008388608"
49064,116793,58.0%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00016777216"
3197,116793,97.3%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00033554432"
642,116793,99.5%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00067108864"
618,116793,99.5%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00134217728"
618,116793,99.5%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00268435456"
618,116793,99.5%,"625-x264-ref-ldecod-b-l64-hists-1m-analyzed-bin-00536870912"
618,116793,99.5%,"625-x264-ref-ldecod

2130,2123273,99.9%,"641-leela-ref-leela-i-l4096-hists-1m-analyzed-bin-04294967296"
2130,2123273,99.9%,"641-leela-ref-leela-i-l4096-hists-1m-analyzed-bin-08589934592"
2130,2123273,99.9%,"641-leela-ref-leela-i-l4096-hists-1m-analyzed-bin-17179869184"
5626968,21232722,73.5%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000004096"
10406167,21232722,51.0%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000008192"
10446783,21232722,50.8%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000016384"
201139,21232722,99.1%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000032768"
136788,21232722,99.4%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000065536"
21582,21232722,99.9%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000131072"
20903,21232722,99.9%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000262144"
20903,21232722,99.9%,"641-leela-ref-leela-i-l64-hists-100k-analyzed-bin-00000524288"
20903,21232722,99.9%,"641-leela-ref-leela-i-l64-hists-100k-an

946650,3305609,71.4%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00002097152"
1050802,3305609,68.2%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00004194304"
1072601,3305609,67.6%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00008388608"
1098152,3305609,66.8%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00016777216"
1108087,3305609,66.5%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00033554432"
1067953,3305609,67.7%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00067108864"
1029374,3305609,68.9%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00134217728"
1050358,3305609,68.2%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00268435456"
40153,3305609,98.8%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-00536870912"
39836,3305609,98.8%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-01073741824"
22963,3305609,99.3%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-02147483648"
10062,3305609,99.7%,"657-xz-ref-xz2-b-l4096-hists-1m-analyzed-bin-04294967296"
9949,3305609,99.7%,"657-xz-ref-xz2-b-

In [359]:
df = pd.read_csv("/media/arjun/SSD/chaos/ratios.csv")
df = df.drop(9720)

In [360]:
trace_names = df["uncompressed_name"]

In [361]:
trace_names

0       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
1       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
2       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
3       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
4       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
5       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
6       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
7       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
8       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
9       508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
10      508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
11      508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
12      508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
13      508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
14      508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
15      508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
16      508-namd-ref-namd-b-l4096-hists-100k-analyzed-...
17      508-na

In [362]:
prog_names = [ '-'.join(name.split('-')[:4]) for name in trace_names ]
miss_type = [ name.split('-')[4] for name in trace_names ]
cache_size = [ name.split('-')[5][1:] for name in trace_names ]
window_size = [ name.split('-')[7] for name in trace_names ]

In [363]:
print(pd.Series(cache_size).value_counts())
print(pd.Series(miss_type).value_counts())
print(pd.Series(prog_names).value_counts())
print(pd.Series(window_size).value_counts())

4096    4158
64      3762
        1800
dtype: int64
b        2640
i        2640
d        2640
trunc    1800
dtype: int64
600-perl-ref-perl3             360
620-omnetpp-ref-omnetpp        360
605-mcf-ref-mcf                360
619-lbm-ref-lbm                360
648-exchange2-ref-exchange2    360
641-leela-ref-leela            360
625-x264-ref-x264              360
602-gcc-ref-gcc2               360
510-parest-ref-parest          360
621-wrf-ref-wrf                360
603-bwaves-ref-bwaves          360
607-cactus-ref-cactus          360
508-namd-ref-namd              360
644-nab-ref-nab                360
602-gcc-ref-gcc1               360
602-gcc-ref-gcc3               360
657-xz-ref-xz2                 360
628-pop2-ref-pop2              360
631-deepsjeng-ref-deepsjeng    360
654-roms-ref-roms              360
600-perl-ref-perl1             360
511-povray-ref-povray          360
623-xalan-ref-xalan            360
657-xz-ref-xz1                 360
625-x264-ref-ldecod            360
600-

In [364]:
df["prog_name"] = prog_names
df["miss_type"] = miss_type
df["cache_size"] = cache_size
df["window_size"] = window_size

In [365]:
df1 = df[df["miss_type"] == "d"]
df1 = df1[df1["cache_size"] != ""]

In [366]:
df2[:5]

Unnamed: 0,compressed,uncompressed,ratio,uncompressed_name,prog_name,miss_type,cache_size,window_size
141,15327,196480,92.2%,508-namd-ref-namd-d-l4096-hists-10m-analyzed-b...,508-namd-ref-namd,d,4096,10m
142,31569,196480,83.9%,508-namd-ref-namd-d-l4096-hists-10m-analyzed-b...,508-namd-ref-namd,d,4096,10m
143,38536,196480,80.4%,508-namd-ref-namd-d-l4096-hists-10m-analyzed-b...,508-namd-ref-namd,d,4096,10m
144,77232,196480,60.7%,508-namd-ref-namd-d-l4096-hists-10m-analyzed-b...,508-namd-ref-namd,d,4096,10m
145,73319,196480,62.7%,508-namd-ref-namd-d-l4096-hists-10m-analyzed-b...,508-namd-ref-namd,d,4096,10m


In [367]:

# np.unique(x)

In [368]:
df2 = df1[df1["cache_size"] == "4096"]

y = df2["ratio"]
x = df2["window_size"]

x_list = []
for val in x:
    if val == "100k":
        x_list.append(1e5)
    elif val == "1m":
        x_list.append(1e6)
    elif val == "10m":
        x_list.append(1e7)

y_list = [ float(val[:4]) for val in y ]

plot_options = dict(width=900,
                    plot_height=450,
                    tools='pan,xwheel_zoom,reset,save',
                    x_axis_type="log")

plot = figure(**plot_options)
plot.x(x_list,
       y_list)

plot.xaxis.axis_label = 'window_size'
plot.yaxis.axis_label = 'compression_ratio'

show(plot)

In [369]:
df2 = df1[df1["window_size"] == "10m"]

y = df2["ratio"]
x = df2["cache_size"]

x_list = [ float(val) for val in x ]
y_list = [ float(val[:4]) for val in y ]

plot_options = dict(width=900,
                    plot_height=450,
                    tools='pan,xwheel_zoom,reset,save')

plot = figure(**plot_options)
plot.x(x_list,
       y_list)

plot.xaxis.axis_label = 'cache_size'
plot.yaxis.axis_label = 'compression_ratio'

show(plot)

In [370]:
def plot_prog_ratios(df, window_size, cache_size):
    df2 = df[df["window_size"] == window_size]
    df2 = df2[df2["cache_size"] == cache_size]

    y = df2["ratio"]
    x = df2["prog_name"]

    # x_list = [ float(val) for val in x ]
    y_list = np.array([ float(val[:4]) for val in y ])
    # ratio_argsorted = np.argsort(y_list)
    y_list = y_list

    x_list = np.array(x)

    mean_ratio_list = []
    progs = np.unique(x_list)
    for prog in progs:
        ratios = df2[df2["prog_name"] == prog]["ratio"]
        ratios = np.array([ float(val[:4]) for val in ratios ])
        mean_ratio_list.append(np.mean(ratios))
    ratio_argsorted = np.argsort(mean_ratio_list)
    progs = progs[ratio_argsorted]

    plot_options = dict(width=900,
                        plot_height=450,
                        y_range = progs,
                        tools='pan,xwheel_zoom,reset,save')

    plot = figure(**plot_options)
    plot.x(y_list,
           x_list)

    plot.xaxis.axis_label = 'compression ratio'
    plot.yaxis.axis_label = 'program_name'

    show(plot)

In [371]:
plot_prog_ratios(df, "100k", "64")

In [372]:
plot_prog_ratios(df, "100k", "4096")

In [373]:
plot_prog_ratios(df, "1m", "64")

In [374]:
plot_prog_ratios(df, "1m", "4096")

In [375]:
plot_prog_ratios(df, "10m", "64")

In [376]:
plot_prog_ratios(df, "10m", "4096")