In [150]:
import os
import pickle as pkl
import numpy as np
import colorcet as cc

# plotting tools
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
from bokeh.palettes import Inferno, all_palettes
from bokeh.models import CustomJS, Slider, ColumnDataSource 
from bokeh.models import Whisker, HoverTool, Span, ColorBar
from bokeh.transform import linear_cmap, log_cmap, factor_cmap
from compress import Compressor
from pyinform import entropy_rate

from misc.database import Database

# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [2]:
def plot_histogram(array):
    """Display a 50-bin histogram of ``array`` as an inline Bokeh figure.

    Parameters
    ----------
    array : array_like
        Values to bin. Passed unchanged to ``np.histogram``.

    Returns
    -------
    None
        The figure is rendered with ``show`` and is not returned.
    """
    plot_options = dict(width=450,
                        plot_height=250,
                        tools='pan,wheel_zoom,reset,save')

    gate_hist = figure(**plot_options)

    # np.histogram is called without density=True, so `hist` holds raw
    # per-bin counts; the y-axis label below reflects that.
    hist, edges = np.histogram(array, bins=50)

    # One rectangle per bin: [left edge, right edge] x [0, count].
    gate_hist.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
                   fill_color="#036564", line_color="#033649")

    gate_hist.xaxis.axis_label = 'X'
    gate_hist.yaxis.axis_label = 'Count'
    show(gate_hist)

In [172]:
def plot_training(l0_regs, tloss_plot, ce_plot, l0_plot, palette,
                  param_plot=None, max_params=0, seed=0,
                  log_dir="/home/arjun/research/train_infos"):
    """Overlay the training curves of several jobs on shared Bokeh figures.

    For every ``jobid -> l0 coefficient`` pair the pickled training log
    ``train_info_<jobid>_<seed>.pkl`` is read from ``log_dir`` and its
    ``"total_loss"``, ``"cross_entropy"`` and ``"l0_penalty"`` series are
    drawn as lines on the corresponding figure.

    Parameters
    ----------
    l0_regs : dict
        Maps job id to its L0 regularization coefficient. Iteration order
        determines which palette entry each job gets.
    tloss_plot, ce_plot, l0_plot : bokeh figure
        Targets for the total-loss, cross-entropy and L0-penalty curves.
        Only ``tloss_plot`` receives legend entries.
    palette : sequence
        Colours indexed by job position; needs at least ``len(l0_regs)``
        entries.
    param_plot : bokeh figure, optional
        When given, ``max_params - weights_pruned`` is drawn on it, using
        the ``"weights_pruned"`` series of the log.
    max_params : number, optional
        Offset the pruned-weight counts are subtracted from.
    seed : int, optional
        Seed component of the log file name. Previously this was read from
        an undefined name inside the function, so the call only worked when
        a global ``seed`` happened to exist in the kernel.
    log_dir : str, optional
        Directory holding the ``train_info_*.pkl`` files.

    Side effects
    ------------
    Appends the x/y series of every job to the notebook-level lists
    ``xs_total``, ``ys_total``, ``xs_ce``, ``ys_ce``, ``xs_l0`` and
    ``ys_l0``, which must exist before this function is called.
    """
    for idx, (jobid, l0_reg) in enumerate(l0_regs.items()):
        log_path = os.path.join(log_dir,
                                "train_info_{}_{}.pkl".format(jobid, seed))

        # NOTE: pickle.load can execute arbitrary code; only point this at
        # log files produced by trusted training runs.
        with open(log_path, 'rb') as f:
            train_info = pkl.load(f)

        color = palette[idx]
        total_loss = train_info["total_loss"]
        cross_entropy = train_info["cross_entropy"]
        l0_penalty = train_info["l0_penalty"]

        # Total loss (the only curve that carries a legend entry).
        xs_total.append(list(range(len(total_loss))))
        ys_total.append(total_loss)
        tloss_plot.line(list(range(len(total_loss))),
                        total_loss,
                        color=color,
                        legend_label="l0 reg: {}".format(l0_reg),
                        line_alpha=0.8,
                        line_width=2)

        # Cross-entropy term.
        xs_ce.append(list(range(len(cross_entropy))))
        ys_ce.append(cross_entropy)
        ce_plot.line(list(range(len(cross_entropy))),
                     cross_entropy,
                     color=color,
                     line_alpha=0.8,
                     line_width=2)

        # L0 penalty term.
        xs_l0.append(list(range(len(l0_penalty))))
        ys_l0.append(l0_penalty)
        l0_plot.line(list(range(len(l0_penalty))),
                     l0_penalty,
                     color=color,
                     line_alpha=0.8,
                     line_width=2)

        if param_plot is not None:
            # Weights remaining = max_params - weights pruned. The x-axis is
            # sized from this series itself so x and y always match in length.
            weights_remaining = max_params - np.asarray(train_info["weights_pruned"])
            param_plot.line(list(range(len(weights_remaining))),
                            weights_remaining,
                            color=color,
                            line_alpha=0.8,
                            line_width=2)

In [152]:
def color_to_cmap(palette, n_colors):
    """Pick at most ``n_colors`` evenly spaced entries from ``palette``.

    Parameters
    ----------
    palette : sequence
        Ordered colours to sample from.
    n_colors : int
        Number of colours wanted; must be a positive integer.

    Returns
    -------
    list
        ``palette[0], palette[step], palette[2*step], ...`` with
        ``step = len(palette) // n_colors`` (at least 1), truncated to
        ``n_colors`` entries. If the palette has fewer than ``n_colors``
        entries, the whole palette is returned.

    Raises
    ------
    ValueError
        If ``n_colors`` is smaller than 1.
    """
    if n_colors < 1:
        raise ValueError(
            "n_colors must be a positive integer, got {!r}".format(n_colors))

    # A step of 0 (palette shorter than n_colors) would make range() raise,
    # so fall back to taking every entry.
    color_div = max(1, len(palette) // n_colors)

    # When len(palette) is not a multiple of n_colors the stride yields
    # extra trailing entries; drop them so the caller gets what it asked for.
    return [palette[i] for i in range(0, len(palette), color_div)][:n_colors]

In [174]:
# # 700x1
# Job id -> L0 regularization coefficient. plot_training uses the job id to
# build the training-log file name and the coefficient for the legend label.
l0_regs = {
    7382177: 0.5,
    7382178: 0.1,
    7382179: 0.05,
    7382180: 0.01,
    7382181: 0.005,
    7382183: 0.001,
    7382184: 0.0005,
    7382185: 0.0001,
    7382186: 0.00005,
    7382187: 0.00001,
    7382188: 0.0
}

# Notebook-level accumulators that plot_training appends to; they are reset
# every time this cell runs.
xs_total = []
ys_total = []
xs_ce = []
ys_ce = []
xs_l0 = []
ys_l0 = []

# Options shared by all four figures (the gridplot call below overrides the
# per-figure size).
plot_options = dict(title="LSTM - 700x1",
                        width=1000,
                        plot_height=1000,
                        tools='pan,wheel_zoom,reset,save')

# Only the total-loss figure is left with an automatic x-range; the others
# start out showing epochs 0-5000.
tloss_plot = figure( **plot_options)
ce_plot = figure(x_range=(0, 5000), **plot_options)
l0_plot = figure(x_range=(0, 5000), **plot_options)
params_plot = figure(x_range=(0, 5000), **plot_options)

# palette = all_palettes['Inferno'][len(l0_regs)]
# One colour per job, sampled from colorcet's `kg` map and reversed.
palette = list(reversed(color_to_cmap(cc.kg, len(l0_regs))))
# 2806400 is passed as max_params, i.e. the value the pruned-weight counts
# are subtracted from on params_plot.
plot_training(l0_regs, tloss_plot, ce_plot, l0_plot, palette, params_plot, 2806400)

# NOTE(review): `l0_regs_700` is not defined anywhere in the visible notebook;
# on a fresh kernel the next two lines raise NameError. Confirm where this
# dict is supposed to come from (or remove the overlay).
palette = color_to_cmap(cc.blues, len(l0_regs_700))
plot_training(l0_regs_700, tloss_plot, ce_plot, l0_plot, palette)

# Clicking a legend entry hides the corresponding total-loss curve.
tloss_plot.legend.click_policy = 'hide'
tloss_plot.xaxis.axis_label = 'epoch'
tloss_plot.yaxis.axis_label = 'total loss'
ce_plot.xaxis.axis_label = 'epoch'
ce_plot.yaxis.axis_label = 'cross entropy loss'
l0_plot.xaxis.axis_label = 'epoch'
l0_plot.yaxis.axis_label = 'l0 penalty'
params_plot.xaxis.axis_label = 'epoch'
params_plot.yaxis.axis_label = 'weights remaining'
# 2x2 layout: total loss | weights remaining / cross entropy | l0 penalty.
grid = gridplot([[tloss_plot, params_plot], [ce_plot, l0_plot]], 
                plot_width=600, 
                plot_height=350)
show(grid)

In [6]:
# Mean parameter count per L0 regularization coefficient, read from the
# results database and plotted against the coefficient.
db = Database()
db.open(os.path.join("/home/arjun/research/result_dbs",
                     "result_cmr_lstm_s200_d2_omax.db"))
l0_reg_vals = list(l0_regs.values())
l0_params = []
try:
    for l0_reg in l0_reg_vals:
        # NOTE(review): the query text is built with str.format. The values
        # come from the l0_regs dict defined in this notebook, so this is not
        # untrusted input, but switch to bound parameters if Database.query
        # supports them.
        db.query("SELECT * FROM RESULTS WHERE l0_reg = {}".format(l0_reg))
        rows = db.cursor.fetchall()
        n_params_list = []
        for row in rows:
            # Rows have seven columns; only n_params is needed. Underscore
            # names keep the unpacking from rebinding the loop variable or
            # leaking a `seed` variable into notebook scope.
            (_trace, _seed, n_params, _oracle_size,
             _accuracy, _bin_tolerance, _row_l0_reg) = row
            n_params_list.append(n_params)
        # np.mean of an empty list is nan (with a RuntimeWarning), which shows
        # up as a gap in the line for coefficients that have no rows.
        l0_params.append(np.mean(n_params_list))
finally:
    # Release the database handle even if a query fails.
    db.close()

plot_options = dict(title="Number of Parameters vs L0 regularization coefficient",
                    width=450,
                    plot_height=250,
                    tools='pan,wheel_zoom,reset,save')
ctm_plot = figure(**plot_options)
ctm_plot.line(l0_reg_vals,
              l0_params,
              line_width=2,
              line_color='red')
ctm_plot.xaxis.axis_label = 'l0_regularization coefficient'
ctm_plot.yaxis.axis_label = 'number of parameters in model'
show(ctm_plot)

In [7]:
# Inspect the gate and weight distributions saved for one model.
folder_name = "/home/arjun/research/distributions/lstm_s700_d1_fixed_mini24_seq50"
filename = "model_0_0.0001.pkl"

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(os.path.join(folder_name, filename+'.gate_distribution'), 
          'rb') as f:
    gate_distribution = pkl.load(f)

with open(os.path.join(folder_name, filename+'.weight_distribution'), 
          'rb') as f:
    weight_distribution = pkl.load(f)

# Product of gates and weights (presumably NumPy arrays of equal shape, so
# this is element-wise — TODO confirm).
mul_weights = gate_distribution*weight_distribution

print("Distribution of gates")
plot_histogram(gate_distribution)

print("Distribution of weights")
plot_histogram(weight_distribution)

print("Distribution of multiplied weights")
plot_histogram(mul_weights)

# Histogram of |gate * weight| with 100 bins. Averaging each pair of adjacent
# edges (2-tap mean, mode='valid') turns the 101 bin edges into 100 bin
# midpoints, so x and y below have the same length.
hist, edges = np.histogram(np.abs(mul_weights), bins=100)
edges = np.convolve(edges, [1/2, 1/2], mode='valid')
# Running total of the bin counts.
cumweights = np.cumsum(hist)

plot_options = dict(width=500,
                    plot_height=250,
                    tools='pan,xwheel_zoom,reset,save')
cumweights_plot = figure(x_axis_type='log',**plot_options)

source = ColumnDataSource(data=dict(
    x=edges,
    y=cumweights,
))

# NOTE(review): x and y are not passed explicitly; this relies on the line
# glyph defaulting to the 'x' and 'y' columns of `source` — confirm for the
# installed Bokeh version.
cumweights_plot.line(source = source, line_width=2)

hover = HoverTool(
    tooltips=[
        ( 'magnitude',   '$x'            ),
        ( 'number of parameters',  '$y' ), # use @{ } for field names with spaces
    ],
    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline'
)
cumweights_plot.add_tools(hover)
cumweights_plot.xaxis.axis_label = 'magnitude'
cumweights_plot.yaxis.axis_label = 'cumulative number of parameters'
show(cumweights_plot)

Distribution of gates


Distribution of weights


Distribution of multiplied weights


In [None]:
# Scratch check of the array shapes used for the cumulative-weights plot.
# Depends on `mul_weights` from the distribution cell and rebinds the
# notebook-level `hist`, `edges` and `cumweights`.
hist, edges = np.histogram(np.abs(mul_weights), bins=100)
cumweights = np.cumsum(hist)

print(hist.shape, edges.shape)
print(cumweights.shape)
print(edges)
# Averaging adjacent edges yields one value per bin (one fewer than the
# number of edges).
edges = np.convolve(edges, [1/2, 1/2], mode='valid')
print(edges.shape)

In [8]:
# Compare one model's predictions with the ground truth it was evaluated on.
folder_name = "/home/arjun/research/predictions/lstm_s700_d1_fixed_mini24_seq50"
filename = "model_5_1e-05.pkl.0.0"

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(os.path.join(folder_name, filename+'.actual'), 
          'rb') as f:
    actual = pkl.load(f)

with open(os.path.join(folder_name, filename+'.predictions'), 
          'rb') as f:
    predictions = pkl.load(f)

# Absolute difference between actual and predicted values (presumably bin
# indices stored as equal-length NumPy arrays — TODO confirm).
diff_arr = np.abs(actual-predictions)
# print(diff_arr)
# print("Accuracy: {}. AE(bin): {}".format(np.mean(diff_arr <= 1), 1))
# print("Accuracy: {}. AE(bin): {}".format(np.mean(diff_arr <= 2), 2))
# print("Accuracy: {}. AE(bin): {}".format(np.mean(diff_arr <= 2), 3))
# print("Accuracy: {}. AE(bin): {}".format(np.mean(diff_arr <= 2), 2))
    
# The y-range is fixed to 45-80, so values outside that window are not
# visible until the user pans.
plot_options = dict(width=600,
                    plot_height=350,
                    y_range=(45, 80),
                    tools='pan,xwheel_zoom,reset,save')
predictions_plot = figure(**plot_options)
predictions_plot.line(range(len(actual)),
                actual,
#                 line_alpha=0.5,
                legend_label="actual",
                line_width=2)
# Predictions are drawn half-transparent in red on top of the actual series.
predictions_plot.line(range(len(actual)),
                predictions,
                line_alpha=0.5,
                legend_label="prediction",
                line_width=2,
             line_color='red')
predictions_plot.xaxis.axis_label = 'Time index'
predictions_plot.yaxis.axis_label = 'Bin'
show(predictions_plot)

# Fraction of samples whose absolute error is within each tolerance 0..19.
for bin_err in range(20):
    print("Accuracy: {}. AE(bin): {}".format(np.mean(diff_arr <= bin_err), bin_err))

# for diff in diff_arr:
#     print(diff)

Accuracy: 0.9944972486243121. AE(bin): 0
Accuracy: 0.9979989994997499. AE(bin): 1
Accuracy: 0.9984992496248124. AE(bin): 2
Accuracy: 0.9989994997498749. AE(bin): 3
Accuracy: 0.9989994997498749. AE(bin): 4
Accuracy: 0.9994997498749375. AE(bin): 5
Accuracy: 0.9994997498749375. AE(bin): 6
Accuracy: 0.9994997498749375. AE(bin): 7
Accuracy: 0.9994997498749375. AE(bin): 8
Accuracy: 1.0. AE(bin): 9
Accuracy: 1.0. AE(bin): 10
Accuracy: 1.0. AE(bin): 11
Accuracy: 1.0. AE(bin): 12
Accuracy: 1.0. AE(bin): 13
Accuracy: 1.0. AE(bin): 14
Accuracy: 1.0. AE(bin): 15
Accuracy: 1.0. AE(bin): 16
Accuracy: 1.0. AE(bin): 17
Accuracy: 1.0. AE(bin): 18
Accuracy: 1.0. AE(bin): 19


In [9]:
def plot_information_metrics(files_path, plot, bin_tolerance=1, 
                                 color='green', legend='compression',
                                normalize=False):
    """Scatter prediction error against the entropy rate of the predictions.

    Every file in ``files_path`` whose name contains ``'predictions'`` is
    unpickled together with its sibling file, whose name is the same except
    that the last dot-separated component is ``'actual'``.

    Parameters
    ----------
    files_path : str
        Directory containing the pickled prediction / actual files.
    plot : bokeh figure
        Figure the points are added to. The previous implementation ignored
        this argument and drew on the notebook-level ``rkc_plot`` instead.
    bin_tolerance : number, optional
        A prediction counts as correct when it is within this absolute
        distance of the actual value.
    color : str, optional
        Marker colour.
    legend : str, optional
        Legend label of the scatter glyph.
    normalize : bool, optional
        Divide the entropy rates by their maximum before plotting.

    Notes
    -----
    The ``"accuracy"`` column holds the *error* rate (1 - fraction within
    tolerance) and ``"compressed_size"`` holds the entropy rate; the keys are
    kept for compatibility with the other plotting helpers in this notebook.
    """
    source_dict = {
        "accuracy": [],
        "compressed_size": []
    }
    for filename in os.listdir(files_path):
        if 'predictions' not in filename:
            continue

        # NOTE: pickle.load can execute arbitrary code; only point this at
        # directories with trusted files.
        with open(os.path.join(files_path, filename), 'rb') as f:
            predictions = pkl.load(f)

        # Sibling ground-truth file: same name, last component -> 'actual'.
        name_parts = filename.split('.')
        name_parts[-1] = 'actual'
        actual_name = '.'.join(name_parts)
        with open(os.path.join(files_path, actual_name), 'rb') as f:
            actual = pkl.load(f)

        error = 1 - np.sum(np.abs(predictions-actual) <= bin_tolerance)/actual.shape[0]
        source_dict["compressed_size"].append(entropy_rate(predictions, k=5))
        source_dict["accuracy"].append(error)

    # Skip normalization when nothing was found; np.max of an empty sequence
    # raises ValueError.
    if normalize and source_dict["compressed_size"]:
        sizes = np.asarray(source_dict["compressed_size"])
        source_dict["compressed_size"] = sizes / np.max(sizes)

    plot.scatter('accuracy','compressed_size', 
                 source=source_dict, 
                 fill_alpha=0.5, 
                 color=color,
                 legend_label=legend)

In [14]:
def plot_compression_performance(files_path, plot, bin_tolerance=1, 
                                 color='green', legend='compression',
                                normalize=False):
    """Scatter prediction error against the compressed size of the predictions.

    Every file in ``files_path`` whose name contains ``'predictions'`` is
    unpickled together with its sibling file, whose name is the same except
    that the last dot-separated component is ``'actual'``. The predictions
    are joined into a space-separated string, UTF-8 encoded and compressed;
    the length of the compressed output is the y value.

    Parameters
    ----------
    files_path : str
        Directory containing the pickled prediction / actual files.
    plot : bokeh figure
        Figure the points are added to. The previous implementation ignored
        this argument and drew on the notebook-level ``rkc_plot`` instead.
    bin_tolerance : number, optional
        A prediction counts as correct when it is within this absolute
        distance of the actual value.
    color : str, optional
        Marker colour.
    legend : str, optional
        Legend label of the scatter glyph.
    normalize : bool, optional
        Divide the compressed sizes by their maximum before plotting.

    Notes
    -----
    The ``"accuracy"`` column holds the *error* rate (1 - fraction within
    tolerance); the key is kept for compatibility with the other plotting
    helpers in this notebook.
    """
    c = Compressor()
    source_dict = {
        "accuracy": [],
        "compressed_size": []
    }
    for filename in os.listdir(files_path):
        if 'predictions' not in filename:
            continue

        # NOTE: pickle.load can execute arbitrary code; only point this at
        # directories with trusted files.
        with open(os.path.join(files_path, filename), 'rb') as f:
            predictions = pkl.load(f)

        # Sibling ground-truth file: same name, last component -> 'actual'.
        name_parts = filename.split('.')
        name_parts[-1] = 'actual'
        actual_name = '.'.join(name_parts)
        with open(os.path.join(files_path, actual_name), 'rb') as f:
            actual = pkl.load(f)

        error = 1 - np.sum(np.abs(predictions-actual) <= bin_tolerance)/actual.shape[0]

        # Presumably selects zlib as the compression backend; kept inside the
        # loop exactly as before because Compressor's semantics are not
        # visible here — TODO confirm whether it can be hoisted.
        c.use_zlib()
        serialized = ' '.join(map(str, predictions))
        source_dict["compressed_size"].append(len(c.compress(serialized.encode('utf-8'))))
        source_dict["accuracy"].append(error)

    # Skip normalization when nothing was found; np.max of an empty sequence
    # raises ValueError.
    if normalize and source_dict["compressed_size"]:
        sizes = np.asarray(source_dict["compressed_size"])
        source_dict["compressed_size"] = sizes / np.max(sizes)

    plot.scatter('accuracy','compressed_size', 
                 source=source_dict, 
                 fill_alpha=0.5, 
                 color=color,
                 legend_label=legend)

In [154]:
def plot_rkc(folder_name, filename, rkc_plot, legend, color='blue', 
             tolerance=1, normalize=False):
    """Scatter error against parameter count for the rows of a results database.

    Parameters
    ----------
    folder_name, filename : str
        Joined to form the path of the results database.
    rkc_plot : bokeh figure
        Figure the points are added to.
    legend : str
        Legend label of the scatter glyph.
    color : str, optional
        Marker colour.
    tolerance : number, optional
        Only rows with ``bin_tolerance <= tolerance`` are selected.
    normalize : bool, optional
        Divide the parameter counts by their maximum before plotting.

    Returns
    -------
    bokeh figure
        The ``rkc_plot`` that was passed in.

    Notes
    -----
    The ``"accuracy"`` column holds ``1 - accuracy`` from the database, i.e.
    the error.
    """
    db_path = os.path.join(folder_name, filename)

    db = Database()
    db.open(db_path)
    try:
        # NOTE(review): the query text is built with str.format; callers in
        # this notebook pass a literal number, but switch to bound parameters
        # if Database.query supports them.
        db.query("SELECT * FROM RESULTS WHERE bin_tolerance<={}".format(tolerance))
        rows = db.cursor.fetchall()
    finally:
        # The handle used to be left open; release it even if the query fails.
        db.close()

    data_dict = {
        "n_params": [],
        "accuracy": []
    }

    for row in rows:
        # Rows in this database have eight columns.
        trace, seed, n_params, oracle_size, accuracy, \
        bin_tolerance, gate_thres, l0_reg = row

        data_dict["n_params"].append(n_params)
        data_dict["accuracy"].append(1-accuracy)

    data_dict["n_params"] = np.array(data_dict["n_params"])

    # np.max raises on an empty array, so only normalize when rows came back.
    if normalize and data_dict["n_params"].size:
        data_dict["n_params"] = data_dict["n_params"]/np.max(data_dict["n_params"])

    print("Number of points: {}".format(len(data_dict["n_params"])))
    source = ColumnDataSource(data=data_dict)

    rkc_plot.scatter('accuracy','n_params', 
                     source=source, 
                     fill_alpha=0.5, 
                     color=color,
                     legend_label=legend)
    return rkc_plot

In [155]:
def rkc_approx(e, a=2, b=1, min_acc=0.5895):
    """Evaluate ``exp(sqrt(a*L**2 + b*L + min_acc))`` with ``L = log(e + min_acc)``.

    Parameters
    ----------
    e : number or array_like
        Point(s) at which to evaluate the curve.
    a, b : number, optional
        Coefficients of the quadratic and linear terms in ``L``.
    min_acc : number, optional
        Offset added to ``e`` inside the logarithm and used as the constant
        term of the polynomial.
    """
    shifted_log = np.log(e + min_acc)
    polynomial = a*(shifted_log**2) + b*shifted_log + min_acc
    return np.exp(np.sqrt(polynomial))

def rkc_approx_exp(e, a=2, e_0=0.5895):
    """Evaluate ``exp(a * exp(log(1 - e)))``.

    Parameters
    ----------
    e : number or array_like
        Point(s) at which to evaluate the curve.
    a : number, optional
        Scale applied inside the outer exponential.
    e_0 : number, optional
        Accepted but not used by the computation.
    """
    inner = np.exp(np.log(1 - e))
    return np.exp(a * inner)

def print_summary(arr, trace_size):
    """Print the position and value of the maximum of ``arr``, plus max / trace_size.

    Parameters
    ----------
    arr : array_like
        Values to summarize.
    trace_size : number
        Divisor used for the "bin frequency" line.
    """
    peak_index = np.argmax(arr)
    peak_value = arr[peak_index]
    frequency = np.max(arr)/trace_size

    print("argmax: ", peak_index)
    print("max_val: ", peak_value)
    print("bin frequency: ", frequency)

    
def get_baseline(trace_name="pmd-small-J9-d-l64-p4096-w100000i.analyzed-1.pkl",
                tolerance=1, data_dir="/home/arjun/ssd/chaos/data",
                max_rows=2000):
    """Return the error of always predicting the most frequent bin.

    The pickled trace is truncated to its first ``max_rows`` rows and summed
    over axis 0. That sum is then smoothed with a box kernel of width
    ``2*tolerance + 1``, so each position also counts its neighbours within
    ``tolerance``.

    Parameters
    ----------
    trace_name : str, optional
        File name of the pickled trace inside ``data_dir``.
    tolerance : int, optional
        Half-width of the box kernel.
    data_dir : str, optional
        Directory holding the trace files (previously hard-coded).
    max_rows : int, optional
        Number of leading rows of the trace to use (previously hard-coded
        to 2000).

    Returns
    -------
    float
        ``1 - max(smoothed column sums) / number of rows used``.

    Notes
    -----
    Assumes the trace is a 2-D array with one row per time step, presumably
    one-hot over bins — TODO confirm. A summary of the smoothed counts is
    printed via ``print_summary``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted traces.
    with open(os.path.join(data_dir, trace_name), "rb") as f:
        trace = pkl.load(f)
    trace = trace[:max_rows]

    n_rows = trace.shape[0]
    kernel = np.ones(2*tolerance+1)
    # mode="same" keeps the smoothed array aligned with the column positions.
    arr = np.convolve(np.sum(trace, axis=0), kernel, mode="same")
    print_summary(arr, n_rows)

    return 1 - np.max(arr)/n_rows
    
    
# Settings shared by every series drawn on the plot below.
tolerance = 1
normalize = False

# Baseline error for each trace (see get_baseline); drawn as vertical lines
# further down.
jikes_baseline = get_baseline("pmd-small-JikesRVM-d-l64-p4096-w100000i.analyzed-1.pkl",
                             tolerance=tolerance)
j9_baseline = get_baseline("pmd-small-J9-d-l64-p4096-w100000i.analyzed-1.pkl",
                             tolerance=tolerance)

plot_options = dict(width=500,
                        plot_height=250,
                        x_range=(1e-4, 1),
                        tools='pan,xwheel_zoom,reset,save')
# Both axes are logarithmic.
rkc_plot = figure(
                  x_axis_type="log", 
                  y_axis_type="log", 
                  **plot_options)

# Error vs. parameter count from the results databases (h=700 and h=400).
folder_name = "/home/arjun/research/result_dbs"
filename = "result_cmr_lstm_s700_d1_omax.db"
rkc_plot = plot_rkc(folder_name, 
                    filename, 
                    rkc_plot, 
                    tolerance=tolerance,
                    normalize=normalize,
                    legend="pmd-small-jikes, h=700")

folder_name = "/home/arjun/research/result_dbs"
filename = "result_cmr_lstm_s400_d1_omax.db"
rkc_plot = plot_rkc(folder_name, 
                    filename, 
                    rkc_plot, 
                    tolerance=tolerance,
                    normalize=normalize,
                    legend="pmd-small-jikes, h=400", 
                    color='orange')

# Error vs. compressed size of the predictions, one series per trace.
# NOTE(review): these share the y-axis with the parameter counts above even
# though the quantities differ — confirm that is intended.
plot_compression_performance("/home/arjun/research/"
                             "predictions/lstm_s700_d1_fixed_mini24_seq50",
                            rkc_plot,
                            bin_tolerance = tolerance,
                            normalize=normalize,
                            legend="pmd-jikes-compression")
plot_compression_performance("/home/arjun/research/"
                             "predictions/lstm_j9_s700_d1_fixed_mini24_seq50",
                            rkc_plot,
                            bin_tolerance = tolerance,
                            normalize=normalize,
                            color="red",
                            legend="pmd-j9-compression")

# plot_information_metrics("/home/arjun/research/"
#                              "predictions/lstm_s700_d1_fixed_mini24_seq50",
#                             rkc_plot,
#                             bin_tolerance = tolerance,
#                             normalize=normalize,
#                             legend="pmd-jikes-compression")
# plot_information_metrics("/home/arjun/research/"
#                              "predictions/lstm_j9_s700_d1_fixed_mini24_seq50",
#                             rkc_plot,
#                             bin_tolerance = tolerance,
#                             normalize=normalize,
#                             color="red",
#                             legend="pmd-j9-compression")
# plot envelope
x_axis = np.arange(1e-4, 1, 0.01)

# linear in log-log
# NOTE: y_axis is computed but never drawn; the rkc_plot.line call that would
# use it is commented out below.
y_axis = [ rkc_approx(e, a=-1, b=100.0, min_acc=1) for e in x_axis ]
# y_axis = [ rkc_approx_exp(e, a=10, e_0=0.595) for e in x_axis ]
# print(y_axis)
# rkc_plot.line(x_axis, y_axis, line_color='red')

# # exponential in log-log
# y_axis = [ rkc_approx_exp(e, a=2, m=2.2, c=1) for e in x_axis ]
# rkc_plot.line(x_axis, y_axis, line_color='green')

## Plot baseline lines
# dimension='height' makes each Span a vertical line at x = baseline error.
jikes_line = Span(location=jikes_baseline, dimension='height', 
                  line_color='green', line_width=3)
j9_line = Span(location=j9_baseline, dimension='height', 
                  line_color='red', line_width=3)

# Empty line glyphs whose only purpose is to create legend entries for the
# two baseline Spans.
rkc_plot.line([], [], color="green", legend_label="pmd-jikes")
rkc_plot.line([], [], color="red", legend_label="pmd-j9")

# NOTE(review): the Spans are appended to `renderers` directly; Bokeh's
# documented way to attach annotations is add_layout — confirm this still
# works on the installed version.
rkc_plot.renderers.extend([jikes_line, j9_line])
# rkc_plot.legend.location = "top_left"
# The legend is hidden, so none of the legend labels set above are shown.
rkc_plot.legend.visible = False
# rkc_plot.legend.location = "top_right"
rkc_plot.yaxis.axis_label="Sparsity(log)"
rkc_plot.xaxis.axis_label="error(log)"
show(rkc_plot)

argmax:  63
max_val:  631.0
bin frequency:  0.3155
argmax:  71
max_val:  661.0
bin frequency:  0.3305
Number of points: 498
Number of points: 256
['pmd-small-JikesRVM-d-l64-p4096-w100000i.analyzed-1.pkl.actual']
['pmd-small-J9-d-l64-p4096-w100000i.analyzed-1.pkl.actual']


In [None]:
# plot_options = dict(width=500,
#                         plot_height=250,
#                         x_range=(1e-4, 1),
#                         tools='pan,xwheel_zoom,reset,save')
# rkc_plot = figure(
#                   x_axis_type="log", 
#                   y_axis_type="log", 
#                   **plot_options)

