In [71]:
import os
import pickle as pkl
import numpy as np
import colorcet as cc

from prediction_model.utils import compute_parameters

# plotting tools
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
from bokeh.palettes import Inferno, all_palettes, Viridis256
from bokeh.models import (CustomJS, Slider, ColumnDataSource, Legend, 
                         BasicTicker, ColorBar, ColumnDataSource,
                          LinearColorMapper, PrintfTickFormatter)
from bokeh.models import Whisker, HoverTool, Span, ColorBar
from bokeh.transform import linear_cmap, log_cmap, factor_cmap, transform
# from bokeh._legacy_charts import HeatMap
from compress import Compressor
from pyinform import entropy_rate

from misc.database import Database

from sklearn.linear_model import LinearRegression

output_notebook()

In [12]:
research_folder_path = "/media/arjun/Shared/research"

# Training

In [13]:
def plot_histogram(array):
    plot_options = dict(width=450,
                        plot_height=250,
                        tools='pan,wheel_zoom,reset,save')

    gate_hist = figure(**plot_options)

    hist, edges = np.histogram(array, bins=50)

    x = np.linspace(0, 1, 1000)

    gate_hist.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
         fill_color="#036564", line_color="#033649",\
    )

    
    
    gate_hist.xaxis.axis_label = 'X'
    gate_hist.yaxis.axis_label = 'Density'
    show(gate_hist)

In [14]:
def plot_training(l0_regs, tloss_plot, ce_plot, l0_plot, accuracy_plot, palette, 
                  param_plot=None, max_params=0, seed=0):
    xs_total = []
    ys_total = []
    xs_ce = []
    ys_ce = []
    xs_l0 = []
    ys_l0 = []
    xs_acc = []
    ys_acc = []
    idx=-1
    for key, val in l0_regs.items():
        idx+=1
        jobid = key
        l0_reg = val
    #     print("L0 regularization coefficient: {}".format(l0_reg))
#         with open("/home/arjun/research/train_infos/train_info_{}_0.pkl".format(jobid), 
#                   'rb') as f:
#             train_info = pkl.load(f)
#         seed = 0
#         train_info["cross_entropy"] = train_info["cross_entropy"]
#         train_info["l0_penalty"] = train_info["l0_penalty"]
#         train_info["total_loss"] = train_info["total_loss"]
        
#         if "weights_pruned" in train_info.keys():
            

        log_path = os.path.join(research_folder_path,
                                "train_infos", 
                                 "train_info_{}_{}.pkl".format(jobid, seed))

        with open(log_path, 'rb') as f:
            train_info = pkl.load(f)

    #     print(train_info["total_loss"])

        xs_total.append(list(range(len(train_info["total_loss"]))))
        ys_total.append(train_info["total_loss"])

        tloss_plot.line(list(range(len(train_info["total_loss"]))),
                        train_info["total_loss"], 
                        color=palette[idx],
                        legend_label="l0 reg: {}".format(val),
                        line_alpha=0.8,
                        line_width=2)

        xs_ce.append(list(range(len(train_info["cross_entropy"]))))
        ys_ce.append(train_info["cross_entropy"])

        ce_plot.line(list(range(len(train_info["cross_entropy"]))),
                        train_info["cross_entropy"], 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)

        xs_l0.append(list(range(len(train_info["l0_penalty"]))))
        ys_l0.append(train_info["l0_penalty"])
        l0_plot.line(list(range(len(train_info["l0_penalty"]))),
                        train_info["l0_penalty"], 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)
        
        xs_acc.append(list(range(len(train_info["accuracy"]))))
        ys_acc.append(train_info["accuracy"])
        accuracy_plot.line(list(range(len(train_info["accuracy"]))),
                        train_info["accuracy"], 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)
        
        if param_plot is not None:
                param_plot.line(list(range(len(train_info["l0_penalty"]))),
                        -np.array(train_info["weights_pruned"])+max_params, 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)

In [15]:
def color_to_cmap(palette, n_colors):
    color_div = int(len(palette)/n_colors)
    
    return [ palette[i] for i in range(0, len(palette), color_div) ]

In [145]:
# sunflow 400x1
l0_regs = {
    7456767: 0.5,
    7456768: 0.1,
    7456769: 0.05,
    7456770: 0.01,
    7456771: 0.005,
    7456772: 0.001,
    7456773: 0.0005,
    7456774: 0.0001,
    7456775: 0.00005,
    7456776: 0.00001,
    7456777: 0.0
}

plot_options = dict(width=1000,
                        plot_height=1000,
                        tools='pan,wheel_zoom,reset,save')

tloss_plot = figure( **plot_options)
ce_plot = figure(**plot_options)
l0_plot = figure(**plot_options)
params_plot = figure(**plot_options)
accuracy_plot = figure(**plot_options)
# params_plot=None

# palette = all_palettes['Inferno'][len(l0_regs)]
palette = list(reversed(color_to_cmap(cc.kg, len(l0_regs))))
plot_training(l0_regs, tloss_plot, ce_plot, l0_plot, accuracy_plot, 
              palette, params_plot, 2806400)

# palette = color_to_cmap(cc.blues, len(l0_regs))
# plot_training(l0_regs, tloss_plot, ce_plot, l0_plot, palette)

tloss_plot.legend.click_policy = 'hide'
tloss_plot.legend.visible = False
tloss_plot.xaxis.axis_label = 'epoch'
tloss_plot.yaxis.axis_label = 'total loss'
ce_plot.xaxis.axis_label = 'epoch'
ce_plot.yaxis.axis_label = 'cross entropy loss'
l0_plot.xaxis.axis_label = 'epoch'
l0_plot.yaxis.axis_label = 'l1 penalty'
params_plot.xaxis.axis_label = 'epoch'
params_plot.yaxis.axis_label = 'weights remaining'
grid = gridplot([[tloss_plot, params_plot], [ce_plot, l0_plot], [accuracy_plot, None]], 
                plot_width=600, 
                plot_height=350)
show(grid)

FileNotFoundError: [Errno 2] No such file or directory: '/media/arjun/Shared/research/train_infos/train_info_7456767_0.pkl'

In [17]:
db = Database()
db.open(os.path.join(research_folder_path,
                     "result_dbs/old", 
                     "result_cmr_jikes_s_lstm_s700_d1_tprune200_gthres_0_001_gdecay1_01.db"))
l0_reg_vals = l0_regs.values()
l0_params = []
# print(l0_reg_vals)
# print(l0_reg_vals)
l0_reg_vals_final = []
for l0_reg in l0_reg_vals:
    db.query("SELECT * FROM RESULTS WHERE l0_reg = {}".format(l0_reg))
    rows = db.cursor.fetchall()
    n_params_list = []
    for row in rows:
#         print(row)
        trace, seed, n_params, oracle_size, accuracy, bin_tolerance, \
        gate_threshold, l0_reg = row
        n_params_list.append(n_params)
    if len(n_params_list) > 0:
#         print(n_params_list)
        l0_reg_vals_final.append(l0_reg)
        l0_params.append(np.mean(n_params_list))

# print(l0_params)
plot_options = dict(title="Number of Parameters vs L0 regularization coefficient".format(l0_reg),
                    width=500,
                    plot_height=300,
                    y_axis_type='log',
                    tools='pan,wheel_zoom,reset,save')
# Total loss
ctm_plot = figure(**plot_options)
ctm_plot.line(
    l0_reg_vals_final,
                l0_params,
                line_width=2,
             line_color='red')
ctm_plot.xaxis.axis_label = 'l0_regularization coefficient'
ctm_plot.yaxis.axis_label = 'number of non zero parameters in model'
show(ctm_plot)
db.close()

# Distributions

In [18]:
def extract_distribution_data(folder_name, filename):

    with open(os.path.join(folder_name, filename+'.gate_distribution'), 
              'rb') as f:
        gate_distribution = pkl.load(f)

    with open(os.path.join(folder_name, filename+'.weight_distribution'), 
              'rb') as f:
        weight_distribution = pkl.load(f)

    mul_weights = gate_distribution*weight_distribution
    
    return gate_distribution, weight_distribution, mul_weights

In [19]:
folder_name = os.path.join(research_folder_path, 
                           "distributions/pmd_s_jikes_lstm_fixed_s20_d1")
filename = "model_0_0.0001.pkl.0.01"

gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
                                                                               filename)

print("Distribution of gates")
plot_histogram(gate_distribution)

print("Distribution of weights")
plot_histogram(weight_distribution)

print("Distribution of multiplied weights")
plot_histogram(mul_weights)

Distribution of gates


Distribution of weights


Distribution of multiplied weights


# Cumulative weights

In [20]:
def plot_weights_cumulative(mul_weights, cumweights_plot, color='blue'):
    total_parameters = mul_weights.shape[0]
    print(total_parameters)
    hist, edges = np.histogram(np.abs(mul_weights), bins=100)
    edges = np.convolve(edges, [1/2, 1/2], mode='valid')
    cumweights = np.cumsum(hist)
    cumweights = (cumweights)/total_parameters

    source = ColumnDataSource(data=dict(
        x=edges,
        y=cumweights,
    ))

    p = cumweights_plot.line(source = source, line_width=2, color=color)
    
    return p


def plot_params_vs_error(folder, 
                         model_size, 
                         seed, 
                         l0_reg, 
                         param_err_plot, 
                         color='blue',
                         bin_tolerance=2):
    db_path = os.path.join(folder, "result_cmr_lstm_s{}_d1_omax.db".format(model_size))

    db = Database()
    db.open(db_path)

    db.query("SELECT * FROM RESULTS WHERE "
             "bin_tolerance<={} AND "
             "seed={} AND "
             "l0_reg={}".format(bin_tolerance,
                                   seed,
                                   l0_reg))

    rows = db.cursor.fetchall()
    
    params = []
    error_list = []
    gate_threshold_list = []
    for row in rows:
        _, _, n_params, _, accuracy, _, gate_threshold, _ = row
        params.append(n_params)
        gate_threshold_list.append(gate_threshold)
#         error_list.append(gate_threshold)
        error_list.append(1-accuracy)
    
    params = np.array(params)
    error_list = np.array(error_list)
    
    sorted_idx = np.argsort(params)
    params = params[sorted_idx]
    error_list = error_list[sorted_idx]
    
    source = ColumnDataSource(data=dict(
        x=gate_threshold_list,
        y=error_list,
    ))
    
    p = param_err_plot.scatter(source = source, color=color)
    
    return p

In [21]:
plot_options = dict(width=650,
                    plot_height=350,
                    tools='pan,xwheel_zoom,reset,save')
cumweights_plot = figure(
                        y_axis_type='log',
                        x_axis_type='log',
                        **plot_options)
param_err_plot = figure(
#                         y_axis_type='log',
#                         x_axis_type='log',
                        **plot_options)
legend_items = []
palette = all_palettes['Set1'][7]

# size=20
seed = 0
l0_reg = 5e-5
filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
folder_name = os.path.join(research_folder_path,
                          "distributions/pmd_s_jikes_lstm_fixed_s20_d1")
gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
                                                                                filename)
p = plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[0])
p1 = plot_params_vs_error(folder_name, 20, seed, l0_reg, param_err_plot, color=palette[0])
legend_items.append(('hidden=20', [p]))

# size = 50
seed = 0
l0_reg = 0.0
filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
folder_name = os.path.join(research_folder_path,
                          "distributions/pmd_s_jikes_lstm_fixed_s50_d1")
gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
                                                                                filename)
p=plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[1])
p1 = plot_params_vs_error(folder_name, 50, seed, l0_reg, param_err_plot, color=palette[1])
legend_items.append(('hidden=50', [p]))

# # size = 400
# seed = 0
# l0_reg = 0.0001
# filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
# folder_name = "/home/arjun/research/distributions/pmd_s_jikes_lstm_fixed_s400_d1"
# gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
#                                                                                 filename)
# p=plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[2])
# p1 = plot_params_vs_error(folder_name, 400, seed, l0_reg, param_err_plot, color=palette[2])
# legend_items.append(('hidden=400', [p]))

# # size = 700
# seed = 0
# l0_reg = 0.005
# filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
# folder_name = "/home/arjun/research/distributions/pmd_s_jikes_lstm_fixed_s700_d1"
# gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
#                                                                                 filename)
# p=plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[3])
# p1 = plot_params_vs_error(folder_name, 700, seed, l0_reg, param_err_plot, color=palette[3])
# legend_items.append(('hidden=700', [p]))

hover = HoverTool(
    tooltips=[
        ( 'magnitude',   '$x'            ),
        ( 'parameter count',  '$y' ), # use @{ } for field names with spaces
    ],
    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline'
)
legend = Legend(items=legend_items, location="top_left")

cumweights_plot.add_layout(legend, 'right')
cumweights_plot.legend.click_policy='hide'
cumweights_plot.add_tools(hover)
cumweights_plot.xaxis.axis_label = 'magnitude cutoff'
cumweights_plot.yaxis.axis_label = 'fraction of parameters below magnitude'
show(cumweights_plot)

param_err_plot.xaxis.axis_label = 'magnitude cutoff'
param_err_plot.yaxis.axis_label = 'error'
show(param_err_plot)

12280
38050


In [22]:
hist, edges = np.histogram(np.abs(mul_weights), bins=100)
cumweights = np.cumsum(hist)

print(hist.shape, edges.shape)
print(cumweights.shape)
print(edges)
edges = np.convolve(edges, [1/2, 1/2], mode='valid')
print(edges.shape)

(100,) (101,)
(100,)
[8.48870764e-06 5.72977526e-02 1.14587017e-01 1.71876281e-01
 2.29165544e-01 2.86454808e-01 3.43744072e-01 4.01033336e-01
 4.58322600e-01 5.15611864e-01 5.72901128e-01 6.30190392e-01
 6.87479656e-01 7.44768920e-01 8.02058184e-01 8.59347448e-01
 9.16636712e-01 9.73925976e-01 1.03121524e+00 1.08850450e+00
 1.14579377e+00 1.20308303e+00 1.26037230e+00 1.31766156e+00
 1.37495082e+00 1.43224009e+00 1.48952935e+00 1.54681861e+00
 1.60410788e+00 1.66139714e+00 1.71868641e+00 1.77597567e+00
 1.83326493e+00 1.89055420e+00 1.94784346e+00 2.00513273e+00
 2.06242199e+00 2.11971125e+00 2.17700052e+00 2.23428978e+00
 2.29157905e+00 2.34886831e+00 2.40615757e+00 2.46344684e+00
 2.52073610e+00 2.57802537e+00 2.63531463e+00 2.69260389e+00
 2.74989316e+00 2.80718242e+00 2.86447169e+00 2.92176095e+00
 2.97905021e+00 3.03633948e+00 3.09362874e+00 3.15091801e+00
 3.20820727e+00 3.26549653e+00 3.32278580e+00 3.38007506e+00
 3.43736432e+00 3.49465359e+00 3.55194285e+00 3.60923212e+00
 3.

# Prediction Performance

In [23]:
def get_heatmap(predictions, actual):
    heatmap_arr = np.zeros((100, 100))
    for i in range(predictions.shape[0]):
        heatmap_arr[actual[i], predictions[i]] += 1
    
    div = np.sum(heatmap_arr, axis=0)
    div[div==0] = 1
    return heatmap_arr/div

In [24]:
folder_name = os.path.join(research_folder_path,
                          "predictions/lstm_pmd_jikes")
filename = "model_0_0.0005.pkl.0.07"

actual_filename = [ file for file in os.listdir(folder_name) if 'actual' in file ][0]
with open(os.path.join(folder_name, actual_filename), 
          'rb') as f:
    actual = pkl.load(f)

print(actual)
with open(os.path.join(folder_name, filename+'.predictions'), 
          'rb') as f:
    predictions = pkl.load(f)

diff_arr = np.abs(actual-predictions)

plot_options = dict(width=600,
                    plot_height=350,
                    y_range=(45, 90),
                    tools='pan,xwheel_zoom,reset,save')
predictions_plot = figure(**plot_options)
predictions_plot.line(range(len(actual)),
                actual,
#                 line_alpha=0.5,
                legend_label="actual",
                line_width=2)
predictions_plot.line(range(len(actual)),
                predictions,
                line_alpha=0.5,
                legend_label="prediction",
                line_width=2,
             line_color='red')
predictions_plot.xaxis.axis_label = 'Time index'
predictions_plot.yaxis.axis_label = 'Bin'
show(predictions_plot)

for bin_err in range(20):
    print("Accuracy: {}. AE(bin): {}".format(np.mean(diff_arr <= bin_err), bin_err))

## Heatmap analysis
heatmap = get_heatmap(predictions, actual)
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LinearColorMapper(palette=colors, low=np.min(heatmap), high=np.max(heatmap))

accuracy_rates = [ heatmap[actual[i], predictions[i]] for i in range(predictions.shape[0]) ]

# print(heatmap)
# print(np.min(heatmap))
# print(np.max(heatmap))
# prin

source=ColumnDataSource(data={
    "actual": actual,
    "predictions": predictions,
    "heatmap_score": accuracy_rates
})

p = figure(plot_width=500, plot_height=400,
        x_range=(45, 90), y_range=(45,90),
        tools="reset,save")

p.rect(x="actual", y="predictions", width=1, height=1, source=source,
       line_color=None, fill_color=transform('heatmap_score', mapper))

color_bar = ColorBar(color_mapper=mapper, label_standoff=12)

p.add_layout(color_bar, 'right')

p.xaxis.axis_label="actual"
p.yaxis.axis_label="predictions"

show(p)
    
# for diff in diff_arr:
#     print(diff)

[74 73 73 ... 69 67 68]


Accuracy: 0.7963981990995498. AE(bin): 0
Accuracy: 0.8919459729864933. AE(bin): 1
Accuracy: 0.919959979989995. AE(bin): 2
Accuracy: 0.9404702351175588. AE(bin): 3
Accuracy: 0.9634817408704353. AE(bin): 4
Accuracy: 0.9794897448724362. AE(bin): 5
Accuracy: 0.9854927463731866. AE(bin): 6
Accuracy: 0.9879939969984992. AE(bin): 7
Accuracy: 0.9904952476238119. AE(bin): 8
Accuracy: 0.991495747873937. AE(bin): 9
Accuracy: 0.992496248124062. AE(bin): 10
Accuracy: 0.9944972486243121. AE(bin): 11
Accuracy: 0.9954977488744372. AE(bin): 12
Accuracy: 0.9964982491245623. AE(bin): 13
Accuracy: 0.9969984992496248. AE(bin): 14
Accuracy: 0.9984992496248124. AE(bin): 15
Accuracy: 0.9994997498749375. AE(bin): 16
Accuracy: 0.9994997498749375. AE(bin): 17
Accuracy: 1.0. AE(bin): 18
Accuracy: 1.0. AE(bin): 19


In [25]:
a=np.array([ [ 1, 2 ], [3, 4] ])
print(a/np.sum(a, axis=0))

[[0.25       0.33333333]
 [0.75       0.66666667]]


# Accuracy Parameter Tradeoff

In [26]:
def plot_information_metrics(files_path, plot, bin_tolerance=1, 
                                 color='green', legend='compression',
                                normalize=False):
    source_dict = {
        "accuracy": [],
        "compressed_size": []
    }
    for filename in os.listdir(files_path):
        if 'predictions' in filename:
            with open(os.path.join(files_path, filename), 'rb') as f:
                predictions = pkl.load(f)
            
            filename = filename.split('.')
            filename[-1] = 'actual'
            filename = '.'.join(filename)
            with open(os.path.join(files_path, filename), 'rb') as f:
                actual = pkl.load(f)
            
#             print(np.abs(predictions-actual))
            error = 1 - np.sum(np.abs(predictions-actual) <= bin_tolerance)/actual.shape[0]
#             print(error)
#             c.use_zlib()
#             predictions = ' '.join(list(map(str, predictions)))
            source_dict["compressed_size"].append(entropy_rate(predictions, k=5))
            source_dict["accuracy"].append(error)
    
    if normalize:
        source_dict["compressed_size"] /= np.max(source_dict["compressed_size"])
    
    rkc_plot.scatter('accuracy','compressed_size', 
                     source=source_dict, 
                     fill_alpha=0.5, 
                     color=color,
                    legend_label=legend)    

In [27]:
def plot_compression_performance(files_path, plot, bin_tolerance=1, 
                                 color='green', legend='compression',
                                normalize=False):
#     print(os.listdir(files_path))
    c = Compressor()
    source_dict = {
        "accuracy": [],
        "compressed_size": []
    }
    actual_filename = [ file for file in os.listdir(files_path) if 'actual' in file ][0]
    for filename in os.listdir(files_path):
        if 'predictions' in filename:
            with open(os.path.join(files_path, filename), 'rb') as f:
                predictions = pkl.load(f)
            
            filename = filename.split('.')
            filename[-1] = 'actual'
            filename = '.'.join(filename)
            with open(os.path.join(files_path, actual_filename), 'rb') as f:
                actual = pkl.load(f)
            
#             print(np.abs(predictions-actual))
            error = 1 - np.sum(np.abs(predictions-actual) <= bin_tolerance)/actual.shape[0]
#             print(error)
            c.use_zlib()
            predictions = ' '.join(list(map(str, predictions)))
            source_dict["compressed_size"].append(len(c.compress(predictions.encode('utf-8'))))
            source_dict["accuracy"].append(error)
    
    if normalize:
        source_dict["compressed_size"] /= np.max(source_dict["compressed_size"])
    
    rkc_plot.scatter('accuracy','compressed_size', 
                     source=source_dict, 
                     fill_alpha=0.5, 
                     marker='x',
                     color=color,
                    legend_label=legend)

In [28]:
def plot_rkc(folder_name, filename, rkc_plot, legend, color='blue', 
             tolerance=1, normalize=False, plot_envelope=False):
    db_path = os.path.join(folder_name, filename)

    db = Database()
    db.open(db_path)

    db.query("SELECT * FROM RESULTS WHERE bin_tolerance<={}".format(tolerance))

    rows = db.cursor.fetchall()

    data_dict = {
        "n_params": [],
        "accuracy": [],
        "l0_reg": [],
        "gate_thres": []
    }

    for row in rows:
        trace, seed, n_params, oracle_size, accuracy, \
        bin_tolerance, gate_thres, l0_reg = row

        data_dict["n_params"].append(n_params)
        data_dict["l0_reg"].append(l0_reg)
        data_dict["gate_thres"].append(gate_thres)
        data_dict["accuracy"].append(1-accuracy)
        
    data_dict["n_params"] = np.array(data_dict["n_params"])
        
    if normalize:
        data_dict["n_params"] = data_dict["n_params"]/np.max(data_dict["n_params"])
        
    print("Number of points: {}".format(len(data_dict["n_params"])))
    source = ColumnDataSource(data=data_dict)

    p = rkc_plot.scatter('accuracy','n_params', 
                     source=source, 
                     fill_alpha=0.2, 
                     line_alpha=0.7,
                     color=color,
#                      marker='x'
#                     legend_label=legend
                    )
    error_scale = np.linspace(0, 1, 1000)
#     print(error_scale)
    error_list = []
    min_param_list = []
    for error_id in range(error_scale.shape[0]-1):
        min_err = error_scale[error_id]
        max_err = error_scale[error_id+1]
        mask = np.logical_and(data_dict["accuracy"] >= min_err,
                              data_dict["accuracy"] < max_err)
#         print(mask)
        if np.sum(mask) > 0:
            error_list.append((min_err+max_err)/2)
            if normalize:
                shift = 0
            else:
                shift = 0
            if len(min_param_list) > 0:
                min_param_list.append(min(np.min(data_dict["n_params"][mask]), 
                                          min_param_list[-1])-shift)
            else:
                min_param_list.append(np.min(data_dict["n_params"][mask])-shift)
            
#             print("error_range: {} - {}".format(min_err, max_err))
#             plot_histogram(data_dict["n_params"][mask])
            
#     print(min_param_list)
    if plot_envelope:
        rkc_plot.line(error_list,
                     min_param_list,
                     color=color,
                     line_width=2)
    db.close()
    return rkc_plot, p

In [29]:
def rkc_approx_parabolic(e, e_0, a=1, b=1, p_0=1):
    return p_0*np.exp(np.sqrt(np.log(b*e_0/e)/a))


def rkc_approx_sigmoid(e, e_0, a=1, b=1, p_0=1):
    return p_0*np.exp(b/(1+np.exp(a*np.log(e/e_0))) - b/2)


def rkc_approx_reciprocal(e, e_0, a=1, b=1, p_0=1):
    return p_0*np.exp(b/(1+a*np.log(e/e_0)))/np.exp(b)


def plot_rkc_approximated(e_0, rkc_plot, color='blue', a=1, b=1, p_0=1, 
                          approx_type='parabolic', line_dash='dashed'):
    error_array = np.linspace(2e-3, 0.5, 100)
    
    if approx_type == 'parabolic':
        rkc_approx = rkc_approx_parabolic
    elif approx_type == 'sigmoid':
        rkc_approx = rkc_approx_sigmoid
    elif approx_type == 'reciprocal':
        rkc_approx = rkc_approx_reciprocal
    
    p_approx = np.array(list(map(lambda x: rkc_approx(x, e_0, a, b, p_0), 
                                 error_array)))
#     print(error_array)
#     print(p_approx)
    rkc_plot.line(error_array,
                  p_approx,
                  color=color,
                  line_width=2,
                  line_dash=line_dash)

In [30]:
## Performance extrapolation
def plot_estimated_boundary(error_range, tolerance, folder_name, dbs):
    hidden_sizes = list(dbs.keys())
    max_sparsity = []
    max_params = []
    for hidden_size in hidden_sizes:
        db_path = os.path.join(folder_name, dbs[hidden_size])
        db = Database()
        db.open(db_path)

        # Find max number of parameters
        db.query("SELECT MAX(n_params) FROM RESULTS")
        rows = db.cursor.fetchall()

        for row in rows:
            max_params.append(row[0])
            break

        # To find min_size achieved
        db.query("SELECT MIN(n_params) FROM RESULTS WHERE bin_tolerance<={} "
                 "AND ACCURACY >= {} AND ACCURACY < {}".format(tolerance,
                                                              1-error_range[1],
                                                              1-error_range[0]))
        rows = db.cursor.fetchall()

        for row in rows:
            max_sparsity.append(row[0]/max_params[-1])
#             print("hidden: {} sparsity: {}".format(hidden_size, max_sparsity[-1]))
            break
    
    # regression
    X = np.array(max_sparsity)
    y = np.array(max_params)
    mask = np.ones(len(y), dtype=bool)
    mask[1] = False

    X = X[mask].reshape((-1, 1))
    y = y[mask]

    X = np.log(X)
    y = np.log(y)
    reg = LinearRegression().fit(X, y)

    # Plotting
    sparsity = np.logspace(-3, 0, 100)
    estimated = np.exp(reg.predict(np.log(sparsity).reshape((-1, 1))))

    # Return estimated intercept
    return np.exp(reg.predict([[ 0 ]]))

In [139]:
# def rkc_approx(e, a=2, b=1, min_acc=0.5895):
# #     return np.power(e, -m) + c
# #     return np.exp(np.sqrt(4*a*np.log(b*(e+min_acc))))
# #     print(a*(np.log(e)**2) + b*np.log(e) + min_acc)
# #     print(np.sqrt(a*(np.log(e)**2) + b*np.log(e) + min_acc))
#     return np.exp(np.sqrt(a*(np.log(e+min_acc)**2) + b*np.log(e+min_acc) + min_acc))

# def rkc_approx_exp(e, a=2, e_0=0.5895):
#     return np.exp(a*np.exp(np.log(1-e)))

def print_summary(arr, trace_size):
    print("argmax: ", np.argmax(arr))
    print("max_val: ", arr[np.argmax(arr)])
    print("bin frequency: ", np.max(arr)/trace_size)

    
def get_baseline(trace_name="pmd-small-J9-d-l64-p4096-w100000i.analyzed-1.pkl",
                tolerance=1, trace_length=2000):
    with open(os.path.join("/home/arjun/ssd/chaos/data", trace_name), "rb") as f:
        trace = pkl.load(f)
    if trace_length is not None:
        trace = trace[:2000]
    kernel = np.ones(2*tolerance+1)
    arr = np.convolve(np.sum(trace, axis=0), kernel, mode="same")
    print_summary(arr, trace.shape[0])
    
    return 1 - np.max(arr)/trace.shape[0]

palette = all_palettes['Set1'][9]
legend_items = []
tolerance = 2
normalize = False
plot_envelope = True
dbs = {
    700: "result_cmr_jikes_s_lstm_s700_d1_tprune200_gthres_0_001_gdecay1_01.db",
    400: "result_cmr_lstm_s400_d1_omax.db",
    100: "result_cmr_lstm_pmd_jikes_s100_d1_omax.db",
    50: "result_cmr_lstm_s50_lstm_new.db",
#     20: "result_cmr_lstm_pmd_jikes_s20_d1_gthres_0_001_gdecay1_11.db"
}

def plot_all(plot_list):
    palette = all_palettes['Set1'][9]
    legend_items = []
    tolerance = 3
    normalize = False

    for idx, model_info in enumerate(plot_list):
        filename, tag = model_info
        folder_name = "/home/arjun/research/result_dbs"
        filename = "result_cmr_lstm_s700_d1_omax.db"
        rkc_plot, p = plot_rkc(folder_name, 
                            filename, 
                            rkc_plot, 
                            tolerance=tolerance,
                            normalize=normalize,
                            color=palette[0],
                            legend="h=700")
        legend_items.append(( "h=700", [p] ))
    

luindex_jikes_baseline = get_baseline("luindex-small-JikesRVM-d-l64-p4096-w100000i.analyzed-1.pkl",
                             tolerance=tolerance, trace_length=None)
luindex_j9_baseline = get_baseline("luindex-small-J9-d-l64-p4096-w100000i.analyzed-1.pkl",
                             tolerance=tolerance, trace_length=None)
luindex_hotspot_baseline = get_baseline("luindex-small-HotSpot-d-l64-p4096-w100000i.analyzed-1.pkl",
                             tolerance=tolerance, trace_length=None)
    
pmd_jikes_baseline = get_baseline("pmd-small-JikesRVM-d-l64-p4096-w100000i.analyzed-1.pkl",
                                 tolerance=tolerance)
pmd_j9_baseline = get_baseline("pmd-small-J9-d-l64-p4096-w100000i.analyzed-1.pkl",
                                 tolerance=tolerance)
pmd_hotspot_baseline = get_baseline("pmd-small-HotSpot-d-l64-p4096-w100000i.analyzed-1.pkl",
                                 tolerance=tolerance)


if normalize:
    y_range=(1e-4, 1)
else:
    y_range=(1, 5e6)

plot_options = dict(width=900,
                        plot_height=400,
                        y_range=y_range,
                        x_range=(2e-3, 1.0),
                        tools='pan,wheel_zoom,reset,save')
TOOLTIPS = [
    ("error:", "$x"),
    ("parameters:", "$y"),
    ("l1_reg:", "@l0_reg"),
    ("gate_thres:", "@gate_thres"),
]
rkc_plot = figure(
                  x_axis_type="log", 
                  y_axis_type="log",
                  tooltips=TOOLTIPS,
                  **plot_options)

palette_idx = 0
folder_name = os.path.join(research_folder_path,
                          "result_dbs")

# extrapolated_errors = [ ]
# extrapolated_parameters = []

# for error in np.arange(0.0, 0.9, 0.03):
#     error_range = [ error, error+0.05 ]
    
#     estimated_parameters = plot_estimated_boundary(error_range, tolerance, folder_name, dbs)
    
#     extrapolated_errors.append( (error_range[0]+error_range[1])/2 )
#     extrapolated_parameters.append(estimated_parameters)

# filename = "cmr_lstm_pmd_jikes_d1_combined.db"
# rkc_plot, p = plot_rkc(folder_name, 
#                     filename, 
#                     rkc_plot, 
#                     tolerance=tolerance,
#                     normalize=normalize,
#                     plot_envelope=True,
#                     color=palette[palette_idx],
#                     legend="pmd jikes d=1")
# legend_items.append(( "pmd-jikes d=1", [p] ))
# palette_idx+=1
filename = "cmr_lstm_pmd_jikes_d1_combined.db"
rkc_plot, p = plot_rkc(folder_name, 
                    filename, 
                    rkc_plot, 
                    tolerance=tolerance,
                    normalize=normalize,
                    plot_envelope=True,
                    color=palette[palette_idx],
                    legend="pmd jikes d=1")
legend_items.append(( "pmd-jikes d=1", [p] ))
palette_idx+=1

filename = "cmr_lstm_pmd_j9_d1_combined.db"
rkc_plot, p = plot_rkc(folder_name, 
                    filename, 
                    rkc_plot, 
                    tolerance=tolerance,
                    normalize=normalize,
                    plot_envelope=True,
                    color=palette[palette_idx],
                    legend="pmd j9 d=1")
legend_items.append(( "pmd j9 d=1", [p] ))
palette_idx+=1

filename = "result_cmr_lstm_pmd_hotspot.db"
rkc_plot, p = plot_rkc(folder_name, 
                    filename, 
                    rkc_plot, 
                    tolerance=tolerance,
                    normalize=normalize,
                    plot_envelope=True,
                    color=palette[palette_idx],
                    legend="pmd hotspot d=1")
legend_items.append(( "pmd hotspot d=1", [p] ))
palette_idx+=1

############################################ 
# filename = "result_cmr_luindex_small_jikes_lstm_combined.db"
# rkc_plot, p = plot_rkc(folder_name, 
#                     filename, 
#                     rkc_plot, 
#                     tolerance=tolerance,
#                     normalize=normalize,
#                     plot_envelope=True,
#                     color=palette[palette_idx],
#                     legend="luindex jikes")
# legend_items.append(( "luindex jikes", [p] ))
# palette_idx+=1

# filename = "result_cmr_luindex_small_j9_lstm_combined.db"
# rkc_plot, p = plot_rkc(folder_name, 
#                     filename, 
#                     rkc_plot, 
#                     tolerance=tolerance,
#                     normalize=normalize,
#                     plot_envelope=True,
#                     color=palette[palette_idx],
#                     legend="luindex j9")
# legend_items.append(( "luindex j9", [p] ))
# palette_idx+=1

# filename = "result_cmr_luindex_small_hotspot_lstm_combined.db"
# rkc_plot, p = plot_rkc(folder_name, 
#                     filename, 
#                     rkc_plot, 
#                     tolerance=tolerance,
#                     normalize=normalize,
#                     plot_envelope=True,
#                     color=palette[palette_idx],
#                     legend="luindex hotspot")
# legend_items.append(( "luindex hotspot", [p] ))
# palette_idx+=1

## Plot baseline lines
pmd_jikes_line = Span(location=pmd_jikes_baseline, dimension='height', 
                  line_color=palette[0], line_width=3)
pmd_j9_line = Span(location=pmd_j9_baseline, dimension='height', 
                  line_color=palette[1], line_width=3)
pmd_hotspot_line = Span(location=pmd_hotspot_baseline, dimension='height', 
                  line_color=palette[2], line_width=3)

luindex_jikes_line = Span(location=luindex_jikes_baseline, dimension='height', 
                  line_color=palette[0], line_width=3)
luindex_j9_line = Span(location=luindex_j9_baseline, dimension='height', 
                  line_color=palette[1], line_width=3)
luindex_hotspot_line = Span(location=luindex_hotspot_baseline, dimension='height', 
                  line_color=palette[2], line_width=3)

legend = Legend(items=legend_items, location="center")

# rkc_plot.line(extrapolated_errors,
#              extrapolated_parameters,
#              line_color="red")
rkc_plot.add_layout(legend, 'right')
rkc_plot.renderers.extend([pmd_jikes_line, pmd_j9_line, pmd_hotspot_line,])
rkc_plot.legend.location = "top_left"
rkc_plot.legend.click_policy='hide'
rkc_plot.legend.border_line_alpha = 1.0
# rkc_plot.legend.visible = False
# rkc_plot.legend.location = "top_right"
if normalize:
    rkc_plot.yaxis.axis_label="Fraction of total parameters remaining"
else:
    rkc_plot.yaxis.axis_label="Number of non zero parameters in model(log)"
rkc_plot.xaxis.axis_label="error(log)"
show(rkc_plot)

argmax:  66
max_val:  10262.0
bin frequency:  0.33306286715783323
argmax:  72
max_val:  36486.0
bin frequency:  0.6750915886467084
argmax:  73
max_val:  36820.0
bin frequency:  0.7125164486415357
argmax:  64
max_val:  950.0
bin frequency:  0.475
argmax:  72
max_val:  978.0
bin frequency:  0.489
argmax:  73
max_val:  1469.0
bin frequency:  0.7345
Number of points: 960
Number of points: 1728
Number of points: 1728


## Boundary comparison on architectural changes

In [142]:
def show_comparison(folder_name, filename1, filename2, translation_plot, 
                    label1, label2, legend, color='blue', tolerance=1):
    db_path1 = os.path.join(folder_name, filename1)
    db_path2 = os.path.join(folder_name, filename2)
    
    print("db path-------------", db_path1)
    db1 = Database()
    db1.open(db_path1)
    db2 = Database()
    db2.open(db_path2)

    error_scale = np.linspace(0, 0.5, 50)
    
    data_dict = {}
    data_dict[label1] = []
    data_dict[label2] = []
    data_dict["error"] = []
    
    for error_id in range(error_scale.shape[0]-1):
        min_err = error_scale[error_id]
        max_err = error_scale[error_id+1]
        query = "SELECT MIN(n_params) FROM RESULTS WHERE bin_tolerance<={} AND " \
                  "accuracy>={} AND accuracy<{}".format(tolerance,
                                                       1-max_err,
                                                       1-min_err)
        query = "SELECT MIN(n_params) FROM RESULTS WHERE bin_tolerance<={} AND " \
                  "accuracy>={} AND accuracy<=0.99".format(tolerance, 1-max_err)
        db1.query(query)
        db2.query(query)

        rows1 = db1.cursor.fetchall()
        rows2 = db2.cursor.fetchall()
        
        for row in rows1:
            min_param1 = row[0]
        
        for row in rows2:
            min_param2 = row[0]
        
        if min_param1 is not None and min_param2 is not None:
            data_dict[label1].append(min_param1)
            data_dict[label2].append(min_param2)
            data_dict["error"].append((min_err+max_err)/2)

    source = ColumnDataSource(data=data_dict)
    
    p = translation_plot.scatter(label1,label2, 
                         source=source, 
                         fill_alpha=0.7, 
                         line_alpha=0.9,
                         color=color)
#     translation_plot.line(label1,label1, 
#                          source=source, 
#                          line_dash='dotdash',
#                          line_alpha=0.7)
    
    db1.close()
    db2.close()
    return translation_plot, p, data_dict

In [143]:
folder_name = "/media/arjun/Shared/research/result_dbs"
plot_options = dict(width=900,
                        plot_height=400,
#                         y_range=y_range,
#                         x_range=(2e-3, 1.0),
                        tools='pan,wheel_zoom,reset,save')

translation_plot = figure(
#                   x_axis_type="log", 
#                   y_axis_type="log",
                  **plot_options)

palette = Viridis256
colormapper = linear_cmap(field_name = "error", 
                          palette=palette, 
                          low=0, 
                          high=0.5)
color_bar = ColorBar(color_mapper=colormapper['transform'], label_standoff=12)

filename1 = "cmr_lstm_pmd_jikes_d4_combined.db"
filename2 = "cmr_lstm_pmd_jikes_d1_combined.db"
translation_plot, p, data_dict = show_comparison(folder_name, 
                                                filename1,
                                                filename2,
                                                translation_plot, 
                                                "lstm depth=1",
                                                "lstm depth=4",
                                                tolerance=tolerance,
                                                color=colormapper,
                                                legend="pmd j9")
filename1 = "cmr_lstm_pmd_j9_d4_combined.db"
filename2 = "cmr_lstm_pmd_j9_d1_combined.db"

final_data_dict = data_dict
print(final_data_dict)
translation_plot, p, data_dict = show_comparison(folder_name, 
                                                filename1,
                                                filename2,
                                                translation_plot, 
                                                "lstm depth=1",
                                                "lstm depth=4",
                                                tolerance=tolerance,
                                                color=colormapper,
                                                legend="pmd jikes")

final_data_dict["lstm depth=1"].extend(data_dict["lstm depth=1"])
final_data_dict["lstm depth=4"].extend(data_dict["lstm depth=4"])
final_data_dict["error"].extend(data_dict["error"])

X = np.array(final_data_dict["lstm depth=1"]).reshape((-1, 1))
y = np.array(final_data_dict["lstm depth=4"])

reg = LinearRegression().fit(X, y)
d1_model_parameters = np.linspace(min(final_data_dict["lstm depth=1"]), 
                                  max(final_data_dict["lstm depth=1"]), 50)
d4_predicted = reg.predict(d1_model_parameters.reshape((-1, 1)))
translation_plot.line(d1_model_parameters,
                      d4_predicted, 
                      line_alpha=0.7,
                      color="blue")
translation_plot.line(d1_model_parameters,
                      d1_model_parameters, 
                      line_dash='dotdash',
                      line_alpha=0.7,
                      color="green")

translation_plot.add_layout(color_bar, 'right')
show(translation_plot)

db path------------- /media/arjun/Shared/research/result_dbs/cmr_lstm_pmd_jikes_d4_combined.db
{'lstm depth=1': [13418, 9814, 8798, 3929, 3929, 3626, 3626, 3544, 3544, 3544, 3544, 1697, 1697, 1249, 1249, 1211, 1211, 1211, 1182, 1182, 1182, 1182, 1182, 1167, 1167, 440, 440, 440, 440, 429, 429, 429, 429, 423, 423, 418, 418, 418, 418, 196, 188, 188, 176, 176, 145, 145, 138, 138], 'lstm depth=4': [4384, 4244, 4212, 4212, 4154, 4154, 4077, 3862, 3658, 3380, 3380, 3294, 3294, 1499, 1499, 1441, 1441, 1441, 1244, 1244, 1244, 1244, 1244, 1244, 1216, 1216, 1216, 1216, 1216, 1216, 1216, 1078, 1078, 1078, 1078, 1078, 1078, 1078, 1078, 1078, 1046, 1046, 490, 237, 214, 106, 106, 106], 'error': [0.015306122448979591, 0.025510204081632654, 0.03571428571428571, 0.04591836734693877, 0.05612244897959183, 0.06632653061224489, 0.07653061224489796, 0.086734693877551, 0.09693877551020408, 0.10714285714285712, 0.1173469387755102, 0.12755102040816324, 0.1377551020408163, 0.14795918367346939, 0.1581632653061224

In [32]:
error_range = [ 0.25, 0.35 ]
tolerance = 1
folder_name = "/home/arjun/research/result_dbs/old"
dbs = {
    700: "result_cmr_jikes_s_lstm_s700_d1_tprune200_gthres_0_001_gdecay1_01.db",
    400: "result_cmr_lstm_s400_d1_omax.db",
    100: "result_cmr_lstm_pmd_jikes_s100_d1_omax.db",
    50: "result_cmr_lstm_s50_lstm_new.db",
    20: "result_cmr_lstm_pmd_jikes_s20_d1_gthres_0_001_gdecay1_11.db"
}

hidden_sizes = list(dbs.keys())
max_sparsity = []
max_params = []
for hidden_size in hidden_sizes:
    db_path = os.path.join(folder_name, dbs[hidden_size])
    db = Database()
    db.open(db_path)
    
    # Find max number of parameters
    db.query("SELECT MAX(n_params) FROM RESULTS")
    rows = db.cursor.fetchall()
    
    for row in rows:
        max_params.append(row[0])
        break
    
    # To find min_size achieved
    db.query("SELECT MIN(n_params) FROM RESULTS WHERE bin_tolerance<={} "
             "AND ACCURACY >= {} AND ACCURACY < {}".format(tolerance,
                                                          1-error_range[1],
                                                          1-error_range[0]))
    rows = db.cursor.fetchall()
    
    for row in rows:
        max_sparsity.append(row[0]/max_params[-1])
        print("hidden: {} sparsity: {}".format(hidden_size, max_sparsity[-1]))
        break

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Error connecting to database!
Traceback (most recent call last):
  File "/media/arjun/SSD/chaos/cache-management/misc/database.py", line 71, in open
    self.conn = sqlite3.connect(name);
sqlite3.OperationalError: unable to open database file

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/arjun/anaconda3/envs/cache_analysis/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-32-d86a611f764e>", line 18, in <module>
    db.open(db_path)
  File "/media/arjun/SSD/chaos/cache-management/misc/database.py", line 76, in open
    sys.exit()
SystemExit

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/arjun/anaconda3/envs/cache_analysis/lib/python3.7/site-packages/IPython/core/ultratb.py", line 1169, in get_records
    return _fixed_getinnerframes(

TypeError: object of type 'NoneType' has no len()

In [None]:
# regression
X = np.array(max_sparsity)
y = np.array(max_params)
mask = np.ones(len(y), dtype=bool)
mask[1] = False

X = X[mask].reshape((-1, 1))
y = y[mask]

X = X[:2]
y = y[:2]
X = np.log(X)
y = np.log(y)
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))

# Plotting
sparsity = np.logspace(-3, 0, 100)
estimated = np.exp(reg.predict(np.log(sparsity).reshape((-1, 1))))

print("Intercept: {}".format(np.exp(reg.predict([[ 0 ]]))))

plot_options = dict(width=600,
                    x_axis_type="log",
                    y_axis_type="log",
                        plot_height=300,
                        tools='pan,wheel_zoom,reset,save')
performance_plot = figure(y_range=(1e-3,1),
                          x_range=(1, 1e7),
                          **plot_options)
performance_plot.scatter(max_params, max_sparsity)
performance_plot.line(estimated,
                     sparsity)

performance_plot.yaxis.axis_label="Fraction of total parameters remaining"
performance_plot.xaxis.axis_label="Total Number of Parameters before training"

show(performance_plot)

In [None]:
# magnitude fisher information relation
save_dir = "/media/arjun/Shared/cluster/experiments/pmd-small-jikes-lstm"
model_dict_file = "model_0_0.0005.pkl"

with open(os.path.join(save_dir, "{}_{}_fisher".format(model_dict_file,
                                             0.0)), 'rb') as f:
    fisher_information = pkl.load(f)

with open(os.path.join(save_dir, "{}_{}_param".format(model_dict_file,
                                            0.0)), 'rb') as f:
    params = pkl.load(f)

for name, param in params.items():
    print(name)
    x = param.flatten().data.numpy()
    y = fisher_information[name].flatten()
    print(np.min(y), np.max(y))
    
    plot_options = dict(width=600,
                        plot_height=300,
                        tools='pan,wheel_zoom,reset,save')
    performance_plot = figure(**plot_options)
    performance_plot.scatter(x, y)

    performance_plot.yaxis.axis_label="Fisher Information"
    performance_plot.xaxis.axis_label="Magnitude of parameters"

    show(performance_plot)

In [None]:
compute_parameters.get_count(100, 100, [400, 400], [ 'lstm', 'fc' ])