In [164]:
import os
import pickle as pkl
import numpy as np
import colorcet as cc

import torch.nn as nn
from prediction_model.utils import compute_parameters
from prediction_model.models.fully_connected import FCN
# plotting tools
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
from bokeh.palettes import Inferno, all_palettes, Viridis256
from bokeh.models import (CustomJS, Slider, ColumnDataSource, Legend, 
                         BasicTicker, ColorBar, ColumnDataSource,
                          LinearColorMapper, LogColorMapper, 
                          PrintfTickFormatter)
from bokeh.models import Whisker, HoverTool, Span, ColorBar
from bokeh.transform import linear_cmap, log_cmap, factor_cmap, transform
# from bokeh._legacy_charts import HeatMap
from compress import Compressor
from pyinform import entropy_rate

from misc.database import Database

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

from prediction_model.data.mnist import MNISTData

import copy
import torch

output_notebook()

In [86]:
research_folder_path = "/media/arjun/Shared/cluster/results"
data_path = "/media/arjun/Shared/chaos/data/mnist"

# Training

In [5]:
def plot_histogram(array):
    plot_options = dict(width=450,
                        plot_height=250,
                        tools='pan,wheel_zoom,reset,save')

    gate_hist = figure(**plot_options)

    hist, edges = np.histogram(array, bins=50)

    x = np.linspace(0, 1, 1000)

    gate_hist.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
         fill_color="#036564", line_color="#033649",\
    )

    
    
    gate_hist.xaxis.axis_label = 'X'
    gate_hist.yaxis.axis_label = 'Density'
    show(gate_hist)

In [87]:
def plot_training(result_folder_path, l1_regs, 
                  model_size, model_depth, model_seed, model_lstm_depth,
                  tloss_plot, ce_plot, l0_plot, accuracy_plot, palette, 
                  param_plot=None, max_params=0, seed=0, model_epoch=None):
    xs_total = []
    ys_total = []
    xs_ce = []
    ys_ce = []
    xs_l0 = []
    ys_l0 = []
    xs_acc = []
    ys_acc = []
    idx=-1
    for val in l1_regs:
        idx+=1
#         print
        try:
            l1_reg = val
        #     print("L0 regularization coefficient: {}".format(l0_reg))
    #         with open("/home/arjun/research/train_infos/train_info_{}_0.pkl".format(jobid), 
    #                   'rb') as f:
    #             train_info = pkl.load(f)
    #         seed = 0
    #         train_info["cross_entropy"] = train_info["cross_entropy"]
    #         train_info["l0_penalty"] = train_info["l0_penalty"]
    #         train_info["total_loss"] = train_info["total_loss"]

    #         if "weights_pruned" in train_info.keys():


#             if model_lstm_depth is not None:
#                 log_path = os.path.join(result_folder_path,
#                         "model_lstm_s{}_d{}_lstmd{}_l1{}_seed{}.train_info.pkl".format(model_size, 
#                                                                                     model_depth,
#                                                                                     model_lstm_depth,
#                                                                                     l1_reg, 
#                                                                                     model_seed))
#             else:
            log_path = os.path.join(result_folder_path,
                "model_fcn_s{}_d{}_l1{}_seed{}.train_info.pkl".format(model_size, 
                                                                        model_depth,
                                                                        l1_reg, 
                                                                        model_seed))


            with open(log_path, 'rb') as f:
                train_info = pkl.load(f)
            
        except FileNotFoundError:
            print("not found: ", log_path)
            continue
    #     print(train_info["total_loss"])
        print(train_info["epoch"])
        xs_total.append(list(range(len(train_info["total_loss"]))))
        ys_total.append(train_info["total_loss"])

        tloss_plot.line(list(range(len(train_info["total_loss"]))),
                        train_info["total_loss"], 
                        color=palette[idx],
                        legend_label="l0 reg: {}".format(val),
                        line_alpha=0.8,
                        line_width=2)

        xs_ce.append(list(range(len(train_info["cross_entropy"]))))
        ys_ce.append(train_info["cross_entropy"])

        ce_plot.line(list(range(len(train_info["cross_entropy"]))),
                        train_info["cross_entropy"], 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)

        xs_l0.append(list(range(len(train_info["l0_penalty"]))))
        ys_l0.append(train_info["l0_penalty"])
        l0_plot.line(list(range(len(train_info["l0_penalty"]))),
                        train_info["l0_penalty"], 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)

        xs_acc.append(list(range(len(train_info["accuracy"]))))
        ys_acc.append(train_info["accuracy"])
        accuracy_plot.line(list(range(len(train_info["accuracy"]))),
                        train_info["accuracy"], 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)

        if param_plot is not None:
                param_plot.line(list(range(len(train_info["l0_penalty"]))),
                        -np.array(train_info["weights_pruned"])+max_params, 
                        color=palette[idx],
                        line_alpha=0.8,
                        line_width=2)

In [7]:
def color_to_cmap(palette, n_colors):
    color_div = int(len(palette)/n_colors)
    
    return [ palette[i] for i in range(0, len(palette), color_div) ]

In [88]:
# sunflow 400x1
l0_regs = {
    7456767: 0.5,
    7456768: 0.1,
    7456769: 0.05,
    7456770: 0.01,
    7456771: 0.005,
    7456772: 0.001,
    7456773: 0.0005,
    7456774: 0.0001,
    7456775: 0.00005,
    7456776: 0.00001,
    7456777: 0.0
}

result_folder_path = os.path.join(research_folder_path,
                                 'mnist',
                                 'full_data_generalization') 
l0_regs = [0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001, 0.0]
model_size = 40
model_depth=4
model_seed=0
model_lstm_depth=None

plot_options = dict(width=1000,
                        plot_height=1000,
                        tools='pan,wheel_zoom,reset,save')

tloss_plot = figure( **plot_options)
ce_plot = figure(**plot_options)
l0_plot = figure(**plot_options)
params_plot = figure(**plot_options)
accuracy_plot = figure(**plot_options)
# params_plot=None

# palette = all_palettes['Inferno'][len(l0_regs)]
palette = list(reversed(color_to_cmap(cc.kg, len(l0_regs))))
plot_training(result_folder_path, l0_regs,
              model_size, model_depth, model_seed, model_lstm_depth,
              tloss_plot, ce_plot, l0_plot, accuracy_plot, 
              palette, params_plot)

# palette = color_to_cmap(cc.blues, len(l0_regs))
# plot_training(l0_regs, tloss_plot, ce_plot, l0_plot, palette)

tloss_plot.legend.click_policy = 'hide'
tloss_plot.legend.visible = False
tloss_plot.xaxis.axis_label = 'epoch'
tloss_plot.yaxis.axis_label = 'total loss'
ce_plot.xaxis.axis_label = 'epoch'
ce_plot.yaxis.axis_label = 'cross entropy loss'
l0_plot.xaxis.axis_label = 'epoch'
l0_plot.yaxis.axis_label = 'l1 penalty'
params_plot.xaxis.axis_label = 'epoch'
params_plot.yaxis.axis_label = 'weights remaining'
grid = gridplot([[tloss_plot, params_plot], [ce_plot, l0_plot], [accuracy_plot, None]], 
                plot_width=600, 
                plot_height=350)
show(grid)

[0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300, 320, 340, 360, 380, 400, 420, 440, 460, 480, 500, 520, 540, 560, 580, 600, 620, 640, 660, 680, 700, 720, 740, 760, 780, 800, 820, 840, 860, 880, 900, 920, 940, 960, 980]
[0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300, 320, 340, 360, 380, 400, 420, 440, 460, 480, 500, 520, 540, 560, 580, 600, 620, 640, 660, 680, 700, 720, 740, 760, 780, 800, 820, 840, 860, 880, 900, 920, 940, 960, 980]
[0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300, 320, 340, 360, 380, 400, 420, 440, 460, 480, 500, 520, 540, 560, 580, 600, 620, 640, 660, 680, 700, 720, 740, 760, 780, 800, 820, 840, 860, 880, 900, 920, 940, 960, 980]
[0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300, 320, 340, 360, 380, 400, 420, 440, 460, 480, 500, 520, 540, 560, 580, 600, 620, 640, 660, 680, 700, 720, 740, 760, 780, 800, 820, 840, 860, 880, 900, 920, 940, 960, 980]
[0, 20, 40, 60, 80, 



In [7]:
db = Database()
db.open(os.path.join(research_folder_path,
                     "result_dbs/old", 
                     "result_cmr_jikes_s_lstm_s700_d1_tprune200_gthres_0_001_gdecay1_01.db"))
l0_reg_vals = l0_regs.values()
l0_params = []
# print(l0_reg_vals)
# print(l0_reg_vals)
l0_reg_vals_final = []
for l0_reg in l0_reg_vals:
    db.query("SELECT * FROM RESULTS WHERE l0_reg = {}".format(l0_reg))
    rows = db.cursor.fetchall()
    n_params_list = []
    for row in rows:
#         print(row)
        trace, seed, n_params, oracle_size, accuracy, bin_tolerance, \
        gate_threshold, l0_reg = row
        n_params_list.append(n_params)
    if len(n_params_list) > 0:
#         print(n_params_list)
        l0_reg_vals_final.append(l0_reg)
        l0_params.append(np.mean(n_params_list))

# print(l0_params)
plot_options = dict(title="Number of Parameters vs L0 regularization coefficient".format(l0_reg),
                    width=500,
        data_dict["oracle_param_sum"].append(1/(n_params+oracle_size))
                    plot_height=300,
                    y_axis_type='log',
                    tools='pan,wheel_zoom,reset,save')
# Total loss
ctm_plot = figure(**plot_options)
ctm_plot.line(
    l0_reg_vals_final,
                l0_params,
                line_width=2,
             line_color='red')
ctm_plot.xaxis.axis_label = 'l0_regularization coefficient'
ctm_plot.yaxis.axis_label = 'number of non zero parameters in model'
show(ctm_plot)
db.close()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Error connecting to database!
Traceback (most recent call last):
  File "/media/arjun/SSD/chaos/cache-management/misc/database.py", line 71, in open
    self.conn = sqlite3.connect(name);
sqlite3.OperationalError: unable to open database file

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/arjun/anaconda3/envs/cache_analysis/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-52d223222cdb>", line 4, in <module>
    "result_cmr_jikes_s_lstm_s700_d1_tprune200_gthres_0_001_gdecay1_01.db"))
  File "/media/arjun/SSD/chaos/cache-management/misc/database.py", line 76, in open
    sys.exit()
SystemExit

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/arjun/anaconda3/envs/cache_analysis/lib/python3.7/site-packages/IPython/core/ultratb.py", line 

TypeError: object of type 'NoneType' has no len()

# Distributions

In [8]:
def extract_distribution_data(folder_name, filename):

    with open(os.path.join(folder_name, filename+'.gate_distribution'), 
              'rb') as f:
        gate_distribution = pkl.load(f)

    with open(os.path.join(folder_name, filename+'.weight_distribution'), 
              'rb') as f:
        weight_distribution = pkl.load(f)

    mul_weights = gate_distribution*weight_distribution
    
    return gate_distribution, weight_distribution, mul_weights

In [9]:
folder_name = os.path.join(research_folder_path, 
                           "distributions/pmd_s_jikes_lstm_fixed_s20_d1")
filename = "model_0_0.0001.pkl.0.01"

gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
                                                                               filename)

print("Distribution of gates")
plot_histogram(gate_distribution)

print("Distribution of weights")
plot_histogram(weight_distribution)

print("Distribution of multiplied weights")
plot_histogram(mul_weights)

FileNotFoundError: [Errno 2] No such file or directory: '/media/arjun/Shared/cluster/results/distributions/pmd_s_jikes_lstm_fixed_s20_d1/model_0_0.0001.pkl.0.01.gate_distribution'

# Cumulative weights

In [10]:
def plot_weights_cumulative(mul_weights, cumweights_plot, color='blue'):
    total_parameters = mul_weights.shape[0]
    print(total_parameters)
    hist, edges = np.histogram(np.abs(mul_weights), bins=100)
    edges = np.convolve(edges, [1/2, 1/2], mode='valid')
    cumweights = np.cumsum(hist)
    cumweights = (cumweights)/total_parameters

    source = ColumnDataSource(data=dict(
        x=edges,
        y=cumweights,
    ))

    p = cumweights_plot.line(source = source, line_width=2, color=color)
    
    return p


def plot_params_vs_error(folder, 
                         model_size, 
                         seed, 
                         l0_reg, 
                         param_err_plot, 
                         color='blue',
                         bin_tolerance=2):
    db_path = os.path.join(folder, "result_cmr_lstm_s{}_d1_omax.db".format(model_size))

    db = Database()
    db.open(db_path)

    db.query("SELECT * FROM RESULTS WHERE "
             "bin_tolerance<={} AND "
             "seed={} AND "
             "l0_reg={}".format(bin_tolerance,
                                   seed,
                                   l0_reg))

    rows = db.cursor.fetchall()
    
    params = []
    error_list = []
    gate_threshold_list = []
    for row in rows:
        _, _, n_params, _, accuracy, _, gate_threshold, _ = row
        params.append(n_params)
        gate_threshold_list.append(gate_threshold)
#         error_list.append(gate_threshold)
        error_list.append(1-accuracy)
    
    params = np.array(params)
    error_list = np.array(error_list)
    
    sorted_idx = np.argsort(params)
    params = params[sorted_idx]
    error_list = error_list[sorted_idx]
    
    source = ColumnDataSource(data=dict(
        x=gate_threshold_list,
        y=error_list,
    ))
    
    p = param_err_plot.scatter(source = source, color=color)
    
    return p

In [None]:
plot_options = dict(width=650,
                    plot_height=350,
                    tools='pan,xwheel_zoom,reset,save')
cumweights_plot = figure(
                        y_axis_type='log',
                        x_axis_type='log',
                        **plot_options)
param_err_plot = figure(
#                         y_axis_type='log',
#                         x_axis_type='log',
                        **plot_options)
legend_items = []
palette = all_palettes['Set1'][7]

# size=20
seed = 0
l0_reg = 5e-5
filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
folder_name = os.path.join(research_folder_path,
                          "distributions/pmd_s_jikes_lstm_fixed_s20_d1")
gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
                                                                                filename)
p = plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[0])
p1 = plot_params_vs_error(folder_name, 20, seed, l0_reg, param_err_plot, color=palette[0])
legend_items.append(('hidden=20', [p]))

# size = 50
seed = 0
l0_reg = 0.0
filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
folder_name = os.path.join(research_folder_path,
                          "distributions/pmd_s_jikes_lstm_fixed_s50_d1")
gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
                                                                                filename)
p=plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[1])
p1 = plot_params_vs_error(folder_name, 50, seed, l0_reg, param_err_plot, color=palette[1])
legend_items.append(('hidden=50', [p]))

# # size = 400
# seed = 0
# l0_reg = 0.0001
# filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
# folder_name = "/home/arjun/research/distributions/pmd_s_jikes_lstm_fixed_s400_d1"
# gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
#                                                                                 filename)
# p=plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[2])
# p1 = plot_params_vs_error(folder_name, 400, seed, l0_reg, param_err_plot, color=palette[2])
# legend_items.append(('hidden=400', [p]))

# # size = 700
# seed = 0
# l0_reg = 0.005
# filename = "model_{}_{}.pkl.0.0".format(seed, l0_reg)
# folder_name = "/home/arjun/research/distributions/pmd_s_jikes_lstm_fixed_s700_d1"
# gate_distribution, weight_distribution, mul_weights = extract_distribution_data(folder_name,
#                                                                                 filename)
# p=plot_weights_cumulative(mul_weights, cumweights_plot, color=palette[3])
# p1 = plot_params_vs_error(folder_name, 700, seed, l0_reg, param_err_plot, color=palette[3])
# legend_items.append(('hidden=700', [p]))

hover = HoverTool(
    tooltips=[
        ( 'magnitude',   '$x'            ),
        ( 'parameter count',  '$y' ), # use @{ } for field names with spaces
    ],
    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline'
)
legend = Legend(items=legend_items, location="top_left")

cumweights_plot.add_layout(legend, 'right')
cumweights_plot.legend.click_policy='hide'
cumweights_plot.add_tools(hover)
cumweights_plot.xaxis.axis_label = 'magnitude cutoff'
cumweights_plot.yaxis.axis_label = 'fraction of parameters below magnitude'
show(cumweights_plot)

param_err_plot.xaxis.axis_label = 'magnitude cutoff'
param_err_plot.yaxis.axis_label = 'error'
show(param_err_plot)

In [None]:
hist, edges = np.histogram(np.abs(mul_weights), bins=100)
cumweights = np.cumsum(hist)

print(hist.shape, edges.shape)
print(cumweights.shape)
print(edges)
edges = np.convolve(edges, [1/2, 1/2], mode='valid')
print(edges.shape)

# Prediction Performance

In [12]:
def get_heatmap(predictions, actual):
    heatmap_arr = np.zeros((100, 100))
    for i in range(predictions.shape[0]):
        heatmap_arr[actual[i], predictions[i]] += 1
    
    div = np.sum(heatmap_arr, axis=0)
    div[div==0] = 1
    return heatmap_arr/div

In [334]:
# folder_name = os.path.join(research_folder_path,
#                           "predictions/lstm_pmd_jikes")
# filename = "model_0_0.0005.pkl.0.07"

# actual_filename = [ file for file in os.listdir(folder_name) if 'actual' in file ][0]
folder_name = "/media/arjun/SSD/chaos/data"
data_path = "/media/arjun/Shared/chaos/data/mnist"
train_metrics = True

mnist_data = MNISTData(args.data, train=train_metrics)
feature_dimension = mnist_data.feature_dimension
output_dimension = mnist_data.output_dimension

## Train Validation Split
data_loader = torch.utils.data.DataLoader(mnist_data,
                                          batch_size=1)



# with open(os.path.join("/media/arjun/Shared/cluster/results", 
#                        "pmd-small-JikesRVM-d-l64-p4096-w100000i",
#                        "full_trace_generalization",
#                        "results_generalization",
#                        "model_lstm_s120_d4_lstmd1_l10.0_seed9.pkl.0.02.train.predictions"
#                       ), 
#           'rb') as f:
#     prediction = pkl.load(f)

# actual = actual.to_numpy()
# # print(actual.shape)
# actual = np.argmax(actual, axis=1)

# folder_name = os.path.join(research_folder_path,
#                            "pmd-small-JikesRVM-d-l64-p4096-w100000i",
#                            "full_trace_generalization")
# with open(os.path.join(folder_name, 
#                        "model_lstm_s120_d4_lstmd1_l10.0_seed9.train_mask.pkl"), 
#           'rb') as f:
#     train_mask = pkl.load(f)

# with open(os.path.join(folder_name, 
#                        "model_lstm_s120_d4_lstmd1_l10.0_seed9.test_mask.pkl"), 
#           'rb') as f:
#     test_mask = pkl.load(f)
    
# print(actual.shape)
# print(prediction.shape)
# print(np.sum(test_mask))
# data_mask = train_mask
# actual = actual[actual.shape[0]-51]*data_mask
# predictions = np.zeros(actual.shape)
# print(predictions[data_mask].shape)
# print(prediction[:-1])
# predictions[data_mask] = prediction
# # with open(os.path.join(folder_name, filename+'.predictions'), 
# #           'rb') as f:
# #     predictions = pkl.load(f)

# # diff_arr = np.abs(actual-predictions)
# # print(actual)
# plot_options = dict(width=600,
#                     plot_height=350,
# #                     y_range=(45, 90),
#                     tools='pan,xwheel_zoom,reset,save')
# predictions_plot = figure(**plot_options)
# predictions_plot.line(range(len(actual)),
#                 actual,
# #                 line_alpha=0.5,
#                 legend_label="actual",
#                 color='red',
#                 line_width=2)
# predictions_plot.line(range(len(actual)),
#                 predictions,
# #                 line_alpha=0.5,
#                 legend_label="predictions",
#                 line_width=2)
# # predictions_plot.line(range(len(actual)),
# #                 predictions,
# #                 line_alpha=0.5,
# #                 legend_label="prediction",
# #                 line_width=2,
# #              line_color='red')
# predictions_plot.xaxis.axis_label = 'Time index'
# predictions_plot.yaxis.axis_label = 'Bin'
# show(predictions_plot)

# # for bin_err in range(20):
# #     print("Accuracy: {}. AE(bin): {}".format(np.mean(diff_arr <= bin_err), bin_err))

# # ## Heatmap analysis
# predictions = predictions[data_mask].astype(int)
# actual = actual[data_mask].astype(int)
# print(actual)

# predictions_plot = figure(**plot_options)
# predictions_plot.line(range(len(actual)),
#                 actual,
# #                 line_alpha=0.5,
#                 legend_label="actual",
#                 color='red',
#                 line_width=2)
# predictions_plot.line(range(len(actual)),
#                 predictions,
# #                 line_alpha=0.5,
#                 legend_label="predictions",
#                 line_width=2)
# # predictions_plot.line(range(len(actual)),
# #                 predictions,
# #                 line_alpha=0.5,
# #                 legend_label="prediction",
# #                 line_width=2,
# #              line_color='red')
# predictions_plot.xaxis.axis_label = 'Time index'
# predictions_plot.yaxis.axis_label = 'Bin'
# show(predictions_plot)

# print("Error: ", 1 - (np.sum(np.abs(predictions-actual) <= 2)/actual.shape[0]))
# heatmap = get_heatmap(predictions, actual)
# # colors = ["#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
# # colors = list(all_palettes['Inferno'][7]).reverse()
# colors = all_palettes['Inferno'][7]
# colors = list(colors)
# colors.reverse()
# # print(colors)
# # print()
# # mapper = LinearColorMapper(palette=colors, low=np.min(heatmap), high=np.max(heatmap))

# accuracy_rates = [ heatmap[actual[i], predictions[i]] for i in range(predictions.shape[0]) ]

# # print(heatmap)
# # print(np.min(heatmap))
# # print(np.max(heatmap))
# # prin
# data_dict = {
#     "actual": actual,
#     "predictions": predictions,
#     "heatmap_score": accuracy_rates,
#     "inv_heatmap_score": 1-np.array(accuracy_rates)
# }
# source=ColumnDataSource(data=data_dict)

# # mapper = log_cmap(field_name='heatmap_score', palette=Viridis256 ,
# #                           low=np.min(data_dict['heatmap_score'])+1e-5,
# #                           high=np.max(data_dict['heatmap_score']))
# # print(Viridis256)
# colors = list(Viridis256)
# colors.reverse()
# mapper = LinearColorMapper(palette=colors, 
#                            low=np.clip(np.min(heatmap), 1e-5, 1), 
#                         high=np.max(heatmap))

# p = figure(plot_width=500, plot_height=400,
# #         x_range=(45, 90), y_range=(45,90),
#         tools="reset,save")

# p.rect(x="actual", y="predictions", width=1, height=1, source=source,
#        line_color=None, fill_color=transform('heatmap_score', mapper))

# color_bar = ColorBar(color_mapper=mapper, label_standoff=12)

# p.add_layout(color_bar, 'right')

# p.xaxis.axis_label="actual"
# p.yaxis.axis_label="predictions"

# show(p)
    
# for diff in diff_arr:
#     print(diff)

(58767,)
(18888,)
4458
(18888,)
[71 73 54 ... 69 63 64]


[71 71 71 ... 71 71 71]


Error:  0.7874311732316814


In [182]:
def get_slopes(x, net):
  activations = []
  def get_activation_sums():
      def hook(model, input, output):
          out = output.detach().view(output.shape[0],-1)
          out[out < 0] = 0
          activations.append(out)
      return hook 

  hooks = []
  def register_hooks(module):    
    if any([isinstance(module, nn.Linear), isinstance(module, nn.Conv2d)]):
      handle = module.register_forward_hook(get_activation_sums())
      hooks.append(handle)
    if hasattr(module, 'children'):
      children = list(module.children())
      for i in range(len(children)):
        register_hooks(children[i])  

  register_hooks(net)
  if len(hooks) > 1:
    hooks[-1].remove()

  output = net(x)

  acts = np.array([a.cpu().data.numpy() for a in activations])

  #De-registering hooks:
  for hook in hooks:
    hook.remove()

  def threshold(M):
      Mabs = np.abs(M)
      M[Mabs<0.0000001] = 0
      return M
  
#   C = np.array([np.ones(len(acts)),np.arange(1,len(acts)+1)]).transpose()
#   Cf = np.linalg.inv((C.T).dot(C)).dot(C.T)
#   Cf = threshold(Cf)
#   Cf = Cf[1,:]

#   S = 0
#   for j in range(len(Cf)):
#       S += acts[j]*Cf[j]
  
  return acts

In [211]:
# folder_name = os.path.join(research_folder_path,
#                           "predictions/lstm_pmd_jikes")
# filename = "model_0_0.0005.pkl.0.07"

# actual_filename = [ file for file in os.listdir(folder_name) if 'actual' in file ][0]
folder_name = "/media/arjun/SSD/chaos/data"
results_path = os.path.join("/media/arjun/Shared/cluster",
                           "results/mnist",
                           "full_data_generalization")
data_path = "/media/arjun/Shared/chaos/data/mnist"
train_metrics = False
model_size = 40
model_depth = 4
weight_threshold = 0.02
l1_reg = 0.0005
epoch = 860
seed = 0

model_filename = "model_{}_s{}_d{}_l1{}_seed{}_{}.pkl_{}_pruned.pkl".format('fcn',
                                            model_size,
                                            model_depth,
                                            l1_reg,
                                            seed,
                                            epoch,
                                            weight_threshold)

mnist_data = MNISTData(data_path, train=train_metrics)
feature_dimension = mnist_data.feature_dimension
output_dimension = mnist_data.output_dimension

## Train Validation Split
data_loader = torch.utils.data.DataLoader(mnist_data,
                                          batch_size=1)

## load model
model = FCN(input_bins=feature_dimension,
            output_bins=output_dimension,
            fc_config=[model_size] * model_depth,
            enable_gpu=True)

with open(os.path.join(results_path, model_filename), 
          'rb') as f:
        state_dict_pruned = pkl.load(f)

model.cuda()
model.load_state_dict(state_dict_pruned)

# test data
# 61,84 - 8
# 2  - 1
im_idx = 84

plot_options = dict(width=350,
                    plot_height=350,
#                     y_range=(45, 90),
                    tools='pan,wheel_zoom,reset,save')
input_plot = figure(**plot_options)
activation_plot = figure(**plot_options)
model_output_plot = figure(**plot_options)

sample_idx = -1
# confusion_matrix()
for sample in data_loader:
    sample_idx += 1
    print(sample_idx, np.argmax(sample[1].cpu().data.numpy().flatten()))
    if im_idx == sample_idx:
        x, y = sample
        x = x.cuda()
        
        activations = np.array(get_slopes(x, model))
#         activations = 
        
        out = model.forward(x)
        out = torch.tanh(out) * 9  # bounding before passing into softmax

        # Softmax
        out = torch.exp(out)
        out = out / torch.sum(out, dim=1)
        
        out = out.cpu().data.numpy().flatten()
        y_pred = np.argmax(out)
        
        x = x.cpu().data.numpy().flatten()
        y = y.cpu().data.numpy()
        y = np.argmax(y)
#         x = np.flip(x)
#         plt.imshow(x.reshape(28, 28), cmap='gray')
        input_plot.image(image=[np.flip(x.reshape(28, 28), 0)], 
                x=0, y=0, dw=10, dh=10, palette="Greys9", level="image")
#         print(activations.shape)
#         print(x.shape)
        activations = activations.squeeze(1)
#         print(activations.shape)
        activation_plot.image(image=[activations.T], 
                x=0, y=0, dw=10, dh=10, palette="Inferno10", level="image")
        model_output_plot.quad(top=out, bottom=0, left=range(10), right=range(1,11),
         fill_color="#036564", line_color="#033649")
        break
grid = gridplot([[activation_plot, input_plot, model_output_plot]], 
                plot_width=300, 
                plot_height=200)
show(grid)
print("actual: {} pred: {}".format(y, y_pred))
print(data_loader)

fc0
fc1
fc2
fc3
0 7
1 2
2 1
3 0
4 4
5 1
6 4
7 9
8 5
9 9
10 0
11 6
12 9
13 0
14 1
15 5
16 9
17 7
18 3
19 4
20 9
21 6
22 6
23 5
24 4
25 0
26 7
27 4
28 0
29 1
30 3
31 1
32 3
33 4
34 7
35 2
36 7
37 1
38 2
39 1
40 1
41 7
42 4
43 2
44 3
45 5
46 1
47 2
48 4
49 4
50 6
51 3
52 5
53 5
54 6
55 0
56 4
57 1
58 9
59 5
60 7
61 8
62 9
63 3
64 7
65 4
66 6
67 4
68 3
69 0
70 7
71 0
72 2
73 9
74 1
75 7
76 3
77 2
78 9
79 7
80 7
81 6
82 2
83 7
84 8


actual: 8 pred: 8
<torch.utils.data.dataloader.DataLoader object at 0x7fba338f7a50>


In [14]:
a=np.array([ [ 1, 2 ], [3, 4] ])
print(a/np.sum(a, axis=0))

[[0.25       0.33333333]
 [0.75       0.66666667]]


# Accuracy Parameter Tradeoff

In [15]:
def plot_information_metrics(files_path, plot, bin_tolerance=1, 
                                 color='green', legend='compression',
                                normalize=False):
    source_dict = {
        "accuracy": [],
        "compressed_size": []
    }
    for filename in os.listdir(files_path):
        if 'predictions' in filename:
            with open(os.path.join(files_path, filename), 'rb') as f:
                predictions = pkl.load(f)
            
            filename = filename.split('.')
            filename[-1] = 'actual'
            filename = '.'.join(filename)
            with open(os.path.join(files_path, filename), 'rb') as f:
                actual = pkl.load(f)
            
#             print(np.abs(predictions-actual))
            error = 1 - np.sum(np.abs(predictions-actual) <= bin_tolerance)/actual.shape[0]
#             print(error)
#             c.use_zlib()
#             predictions = ' '.join(list(map(str, predictions)))
            source_dict["compressed_size"].append(entropy_rate(predictions, k=5))
            source_dict["accuracy"].append(error)
    
    if normalize:
        source_dict["compressed_size"] /= np.max(source_dict["compressed_size"])
    
    rkc_plot.scatter('accuracy','compressed_size', 
                     source=source_dict, 
                     fill_alpha=0.5, 
                     color=color,
                    legend_label=legend)    

In [16]:
def plot_compression_performance(files_path, plot, bin_tolerance=1, 
                                 color='green', legend='compression',
                                normalize=False):
#     print(os.listdir(files_path))
    c = Compressor()
    source_dict = {
        "accuracy": [],
        "compressed_size": []
    }
    actual_filename = [ file for file in os.listdir(files_path) if 'actual' in file ][0]
    for filename in os.listdir(files_path):
        if 'predictions' in filename:
            with open(os.path.join(files_path, filename), 'rb') as f:
                predictions = pkl.load(f)
            
            filename = filename.split('.')
            filename[-1] = 'actual'
            filename = '.'.join(filename)
            with open(os.path.join(files_path, actual_filename), 'rb') as f:
                actual = pkl.load(f)
            
#             print(np.abs(predictions-actual))
            error = 1 - np.sum(np.abs(predictions-actual) <= bin_tolerance)/actual.shape[0]
#             print(error)
            c.use_zlib()
            predictions = ' '.join(list(map(str, predictions)))
            source_dict["compressed_size"].append(len(c.compress(predictions.encode('utf-8'))))
            source_dict["accuracy"].append(error)
    
    if normalize:
        source_dict["compressed_size"] /= np.max(source_dict["compressed_size"])
    
    rkc_plot.scatter('accuracy','compressed_size', 
                     source=source_dict, 
                     fill_alpha=0.5, 
                     marker='x',
                     color=color,
                    legend_label=legend)

In [222]:
def plot_rkc(folder_name, filename, rkc_plot, legend, color='blue',
             tolerance=1, normalize=False, plot_envelope=False, model_type='lstm',
            model_depth=1, additional_query="", color_map=None, colormap_type='log',
            x_axis_key="error", y_axis_key="n_params", cmap_range=None):
    db_path = os.path.join(folder_name, filename)

    db = Database()
    db_test = Database()
    db.open(db_path)
    db_test.open(db_path)

#     print("SELECT seed, n_params, accuracy, gate_threshold, l1_reg, model_size "\
#              "FROM RESULTS WHERE model_type='{}' "\
#              "AND model_depth={} {}".format(model_type,
#                                         model_depth,
#                                         additional_query))
    
    db.query("SELECT epoch, seed, n_params, accuracy, gate_threshold, l1_reg, model_size "\
             "FROM RESULTS WHERE model_type='{}' "\
             "AND model_depth={} {}".format(model_type,
                                        model_depth,
                                        additional_query))

    rows = db.cursor.fetchall()

    data_dict = {
        "n_params": [],
        "error": [],
        "l1_reg": [],
        "gate_thres": [],
        "model_size": [],
        "model_depth": [],
        "test_error": [],
        "error_gap": [],
        "error_delta": [],
        "seed": [],
        "epoch": []
    }

    for row in tqdm(rows):
        epoch, seed, n_params, accuracy, gate_thres, l1_reg, model_size = row
        
        db_test.query("SELECT accuracy FROM RESULTS WHERE "\
                 "epoch={} "\
                 "AND model_type='{}' "\
                 "AND model_depth={} "\
                 "AND model_size='{}' "\
                 "AND l1_reg={} "\
                 "AND n_params={} "\
                 "AND seed={} "\
                 "AND gate_threshold={} "\
                 "AND metric_type='test'".format(epoch,
                                            model_type,
                                            model_depth,
                                            model_size,
                                            l1_reg,
                                            n_params,
                                            seed,
                                            gate_thres))
        rows_test = db_test.cursor.fetchall()
        i=0
        for row_test in rows_test:
#             print(i, row_test)
#             i+=1
            test_accuracy = row_test[0]
            break
#         print(test_accuracy)
        data_dict["n_params"].append(n_params)
        data_dict["l1_reg"].append(l1_reg)
        data_dict["gate_thres"].append(gate_thres)
        data_dict["error"].append(1-accuracy)
        data_dict["model_size"].append(model_size)
        data_dict["model_depth"].append(model_depth)
        data_dict["seed"].append(seed)
        data_dict["epoch"].append(epoch)
        data_dict["test_error"].append(1-test_accuracy)
        data_dict["error_gap"].append(np.abs(test_accuracy-accuracy))
        data_dict["error_delta"].append(test_accuracy-accuracy)
#         data_dict["accuracy"].append
    
    data_dict["n_params"] = np.array(data_dict["n_params"])
    
    if normalize:
        data_dict["n_params"] = data_dict["n_params"]/np.max(data_dict["n_params"])
        
    print("Number of points: {}".format(len(data_dict["n_params"])))
    source = ColumnDataSource(data=data_dict)
    
    #Use the field name of the column source
    if color_map is not None:
        if cmap_range is None:
            cmap_min = np.min(data_dict[color_map])
            cmap_max = np.max(data_dict[color_map])
        else:
            cmap_min, cmap_max = cmap_range
        print("Color Mapped '{}': {}-{}".format(color_map,
                                               cmap_min,
                                               cmap_min))
        if colormap_type == 'log':
            mapping_function = log_cmap
        else:
            mapping_function = linear_cmap
        mapper = mapping_function(field_name=color_map, palette=Viridis256 ,
                          low=max(cmap_min, 1e-5),
                          high=cmap_max)
        color = mapper
        
        color_bar = ColorBar(color_mapper=mapper['transform'], label_standoff=12)
        rkc_plot.add_layout(color_bar, 'right')
    
    print("Plotting")
    
    p = rkc_plot.scatter(x_axis_key,y_axis_key, 
                     source=source, 
                     fill_alpha=0.2, 
                     line_alpha=0.7,
                     color = color
#                      marker='x'
#                     legend_label=legend
                    )
    error_scale = np.linspace(0, 1, 5000)
#     print(error_scale)
    error_list = []
    min_param_list = []
    for error_id in range(error_scale.shape[0]-1):
        min_err = error_scale[error_id]
        max_err = error_scale[error_id+1]
        mask = np.logical_and(data_dict["error"] >= min_err,
                              data_dict["error"] < max_err)
#         print(mask)
        if np.sum(mask) > 0:
            error_list.append((min_err+max_err)/2)
            if normalize:
                shift = 0
            else:
                shift = 0
            if len(min_param_list) > 0:
                min_param_list.append(min(np.min(data_dict["n_params"][mask]), 
                                          min_param_list[-1])-shift)
            else:
                min_param_list.append(np.min(data_dict["n_params"][mask])-shift)
            
#             print("error_range: {} - {}".format(min_err, max_err))
#             plot_histogram(data_dict["n_params"][mask])
            
#     print(min_param_list)
    if plot_envelope:
        rkc_plot.line(error_list,
                     min_param_list,
                     color=color,
                     line_width=2)
    db.close()
    return rkc_plot, p

In [225]:
palette = all_palettes['Set1'][9]
legend_items = []
tolerance = 2
normalize = False
plot_envelope = True

if normalize:
    y_range=(1e-4, 1)
else:
    y_range=(1, 6e5)

plot_options = dict(width=900,
                        plot_height=400,
                        tools='pan,wheel_zoom,reset,save')
TOOLTIPS = [
    ("error:", "$x"),
    ("parameters:", "$y"),
    ("l1_reg:", "@l1_reg"),
    ("gate_thres:", "@gate_thres"),
    ("model_size:", "@model_size x @model_depth"),
    ("seed:", "@seed"),
    ("test_error:", "@test_error"),
    ("error_gap:", "@error_gap"),
    ("epoch:", "@epoch")
]
rkc_plot = figure(
                  x_axis_type="log", 
#                   y_axis_type="log",
                  tooltips=TOOLTIPS,
                  **plot_options)

palette_idx = 0
x_axis_key = "error"
# program_names = [ "pmd-small-J9" ]
folder_name = os.path.join("/media/arjun/Shared/cluster/results",
                       "mnist",
                       "full_data_generalization")
rkc_plot, p = plot_rkc(folder_name, 
                    "result_cmr.db", 
                    rkc_plot, 
                    model_type='fcn',
                    tolerance=tolerance,
                    normalize=normalize,
                    plot_envelope=False,
                    model_depth=1,
                    color=palette[palette_idx],
                    x_axis_key=x_axis_key,
                    y_axis_key="n_params",
                    legend="mnist",
                        color_map="error_gap", # error gap
                        cmap_range=(1e-3, 0.2),
#                     color_map="test_error", # test error
#                     cmap_range=(1e-2, 0.2),
#                     colormap_type='linear',
                    additional_query=" AND metric_type='train' AND accuracy>0.1 AND epoch = 880")
palette_idx+=1

legend = Legend(items=legend_items, location="center")

# rkc_plot.line(extrapolated_errors,
#              extrapolated_parameters,
#              line_color="red")
rkc_plot.add_layout(legend, 'right')
rkc_plot.legend.location = "top_left"
rkc_plot.legend.click_policy='hide'
rkc_plot.legend.border_line_alpha = 1.0
# rkc_plot.legend.visible = False
# rkc_plot.legend.location = "top_right"
if normalize:
    rkc_plot.yaxis.axis_label="Fraction of total parameters remaining"
else:
    rkc_plot.yaxis.axis_label="Number of non zero parameters in model(log)"
rkc_plot.xaxis.axis_label=x_axis_key
show(rkc_plot)

100%|██████████| 548/548 [00:03<00:00, 162.61it/s]


Number of points: 548
Color Mapped 'error_gap': 0.001-0.001
Plotting


In [None]:
def plot_rkc_best_test(folder_name, filename, rkc_plot, legend, color='blue',
             tolerance=1, normalize=False, plot_envelope=False, model_type='lstm',
            model_depth=1, additional_query="", color_map=None, colormap_type='log',
            x_axis_key="error", y_axis_key="n_params", cmap_range=None):
    db_path = os.path.join(folder_name, filename)

    db = Database()
    db_test = Database()
    db.open(db_path)
    db_test.open(db_path)

#     print("SELECT seed, n_params, accuracy, gate_threshold, l1_reg, model_size "\
#              "FROM RESULTS WHERE model_type='{}' "\
#              "AND model_depth={} {}".format(model_type,
#                                         model_depth,
#                                         additional_query))
    
    db.query("SELECT epoch, seed, n_params, accuracy, gate_threshold, l1_reg, model_size "\
             "FROM RESULTS WHERE model_type='{}' "\
             "AND model_depth={} {}".format(model_type,
                                        model_depth,
                                        additional_query))

    rows = db.cursor.fetchall()

    data_dict = {
        "n_params": [],
        "error": [],
        "l1_reg": [],
        "gate_thres": [],
        "model_size": [],
        "model_depth": [],
        "test_error": [],
        "error_gap": [],
        "error_delta": [],
        "seed": [],
        "epoch": []
    }

    for row in tqdm(rows):
        epoch, seed, n_params, accuracy, gate_thres, l1_reg, model_size = row
        
        db_test.query("SELECT accuracy FROM RESULTS WHERE "\
#                  "epoch={} "\
                 "AND model_type='{}' "\
                 "AND model_depth={} "\
                 "AND model_size='{}' "\
                 "AND l1_reg={} "\
                 "AND n_params={} "\
                 "AND seed={} "\
                 "AND gate_threshold={} "\
                 "AND metric_type='test'".format(epoch,
                                            model_type,
                                            model_depth,
                                            model_size,
                                            l1_reg,
                                            n_params,
                                            seed,
                                            gate_thres))
        rows_test = db_test.cursor.fetchall()
        best_test = 0
        i=0
        for row_test in rows_test:
#             print(i, row_test)
#             i+=1
            test_accuracy = row_test[0]
            break
#         print(test_accuracy)
        data_dict["n_params"].append(n_params)
        data_dict["l1_reg"].append(l1_reg)
        data_dict["gate_thres"].append(gate_thres)
        data_dict["error"].append(1-accuracy)
        data_dict["model_size"].append(model_size)
        data_dict["model_depth"].append(model_depth)
        data_dict["seed"].append(seed)
        data_dict["epoch"].append(epoch)
        data_dict["test_error"].append(1-test_accuracy)
        data_dict["error_gap"].append(np.abs(test_accuracy-accuracy))
        data_dict["error_delta"].append(test_accuracy-accuracy)
#         data_dict["accuracy"].append
    
    data_dict["n_params"] = np.array(data_dict["n_params"])
    
    if normalize:
        data_dict["n_params"] = data_dict["n_params"]/np.max(data_dict["n_params"])
        
    print("Number of points: {}".format(len(data_dict["n_params"])))
    source = ColumnDataSource(data=data_dict)
    
    #Use the field name of the column source
    if color_map is not None:
        if cmap_range is None:
            cmap_min = np.min(data_dict[color_map])
            cmap_max = np.max(data_dict[color_map])
        else:
            cmap_min, cmap_max = cmap_range
        print("Color Mapped '{}': {}-{}".format(color_map,
                                               cmap_min,
                                               cmap_min))
        if colormap_type == 'log':
            mapping_function = log_cmap
        else:
            mapping_function = linear_cmap
        mapper = mapping_function(field_name=color_map, palette=Viridis256 ,
                          low=max(cmap_min, 1e-5),
                          high=cmap_max)
        color = mapper
        
        color_bar = ColorBar(color_mapper=mapper['transform'], label_standoff=12)
        rkc_plot.add_layout(color_bar, 'right')
    
    print("Plotting")
    
    p = rkc_plot.scatter(x_axis_key,y_axis_key, 
                     source=source, 
                     fill_alpha=0.2, 
                     line_alpha=0.7,
                     color = color
#                      marker='x'
#                     legend_label=legend
                    )
    error_scale = np.linspace(0, 1, 5000)
#     print(error_scale)
    error_list = []
    min_param_list = []
    for error_id in range(error_scale.shape[0]-1):
        min_err = error_scale[error_id]
        max_err = error_scale[error_id+1]
        mask = np.logical_and(data_dict["error"] >= min_err,
                              data_dict["error"] < max_err)
#         print(mask)
        if np.sum(mask) > 0:
            error_list.append((min_err+max_err)/2)
            if normalize:
                shift = 0
            else:
                shift = 0
            if len(min_param_list) > 0:
                min_param_list.append(min(np.min(data_dict["n_params"][mask]), 
                                          min_param_list[-1])-shift)
            else:
                min_param_list.append(np.min(data_dict["n_params"][mask])-shift)
            
#             print("error_range: {} - {}".format(min_err, max_err))
#             plot_histogram(data_dict["n_params"][mask])
            
#     print(min_param_list)
    if plot_envelope:
        rkc_plot.line(error_list,
                     min_param_list,
                     color=color,
                     line_width=2)
    db.close()
    return rkc_plot, p

In [None]:
palette = all_palettes['Set1'][9]
legend_items = []
tolerance = 2
normalize = False
plot_envelope = True

if normalize:
    y_range=(1e-4, 1)
else:
    y_range=(1, 6e5)

plot_options = dict(width=900,
                        plot_height=400,
                        tools='pan,wheel_zoom,reset,save')
TOOLTIPS = [
    ("error:", "$x"),
    ("parameters:", "$y"),
    ("l1_reg:", "@l1_reg"),
    ("gate_thres:", "@gate_thres"),
    ("model_size:", "@model_size x @model_depth"),
    ("seed:", "@seed"),
    ("test_error:", "@test_error"),
    ("error_gap:", "@error_gap"),
    ("epoch:", "@epoch")
]
rkc_plot = figure(
                  x_axis_type="log", 
#                   y_axis_type="log",
                  tooltips=TOOLTIPS,
                  **plot_options)

palette_idx = 0
x_axis_key = "error"
# program_names = [ "pmd-small-J9" ]
folder_name = os.path.join("/media/arjun/Shared/cluster/results",
                       "mnist",
                       "full_data_generalization")
rkc_plot, p = plot_rkc(folder_name, 
                    "result_cmr.db", 
                    rkc_plot, 
                    model_type='fcn',
                    tolerance=tolerance,
                    normalize=normalize,
                    plot_envelope=False,
                    model_depth=1,
                    color=palette[palette_idx],
                    x_axis_key=x_axis_key,
                    y_axis_key="n_params",
                    legend="mnist",
                        color_map="error_gap", # error gap
                        cmap_range=(1e-3, 0.2),
#                     color_map="test_error", # test error
#                     cmap_range=(1e-2, 0.2),
#                     colormap_type='linear',
                    additional_query=" AND metric_type='train' AND accuracy>0.1 AND epoch = 880")
palette_idx+=1

legend = Legend(items=legend_items, location="center")

# rkc_plot.line(extrapolated_errors,
#              extrapolated_parameters,
#              line_color="red")
rkc_plot.add_layout(legend, 'right')
rkc_plot.legend.location = "top_left"
rkc_plot.legend.click_policy='hide'
rkc_plot.legend.border_line_alpha = 1.0
# rkc_plot.legend.visible = False
# rkc_plot.legend.location = "top_right"
if normalize:
    rkc_plot.yaxis.axis_label="Fraction of total parameters remaining"
else:
    rkc_plot.yaxis.axis_label="Number of non zero parameters in model(log)"
rkc_plot.xaxis.axis_label=x_axis_key
show(rkc_plot)

## Boundary Analysis

In [367]:
def plot_boundaries(folder_name, filename, rkc_plot, legend, color='blue', 
             tolerance=1, normalize=False, model_type='lstm', model_depth=1, 
                    baseline=None, plot_envelope=True):
    print(color)
    db_path = os.path.join(folder_name, filename)

    db = Database()
    db.open(db_path)

    boundary_data = {
            "n_params": [],
            "error": [],
            "l1_reg": [],
            "gate_thres": [],
            "oracle_size": [],
            "model_size": [],
            "accuracy": []
        }
    
    for seed in range(0, 15):
        db.query("SELECT trace, seed, n_params, accuracy, gate_threshold, "\
                 "l1_reg, oracle_size, model_size "\
                 "FROM RESULTS WHERE bin_tolerance={} AND seed={} AND model_type= '{}' "\
                 "AND model_depth={} AND metric_type='train'".format(tolerance,
                                           seed,
                                           model_type,
                                           model_depth))

        rows = db.cursor.fetchall()

        data_dict = {
            "n_params": [],
            "error": [],
            "l1_reg": [],
            "gate_thres": [],
            "oracle_size": [],
            "model_size": [],
            "accuracy": []
        }

        for row in rows:
            trace, seed, n_params, accuracy, gate_thres, \
            l1_reg, oracle_size, model_size = row

            data_dict["n_params"].append(n_params)
            data_dict["l1_reg"].append(l1_reg)
            data_dict["gate_thres"].append(gate_thres)
            data_dict["error"].append(1-accuracy)
            data_dict["oracle_size"].append(oracle_size)
            data_dict["model_size"].append(model_size)
            data_dict["accuracy"].append(accuracy)

        for key in data_dict.keys():
            data_dict[key] = np.array(data_dict[key])

        if normalize:
            data_dict["n_params"] = data_dict["n_params"]/np.max(data_dict["n_params"])

        print("Number of points: {}".format(len(data_dict["n_params"])))
        source = ColumnDataSource(data=data_dict)
# Error
        error_scale = np.linspace(0, 1, 1000)
    #     print(error_scale)
        error_list = []
        min_param_list = []
        min_oracle_list = []
        for error_id in range(error_scale.shape[0]-1):
            min_err = error_scale[error_id]
            max_err = error_scale[error_id+1]
#             mask = np.logical_and(data_dict["error"] >= min_err,
#                                   data_dict["error"] < max_err)
            mask = data_dict["error"] < max_err
            if np.sum(mask) > 0:
#                 print(np.min(data_dict["error"][mask]),
#                   np.max(data_dict["error"][mask]),
#                   np.min(data_dict["n_params"][mask]))
#                 print(mask)
                argmin_idx = np.argmin(data_dict["n_params"][mask]).astype(int)
                if len(min_param_list) > 0 and np.min(data_dict["n_params"][mask]
                                                     ) < min_param_list[-1]:
#                     error_list.append(data_dict["error"][mask][argmin_idx])
                    if data_dict["error"][mask][argmin_idx] < baseline-0.2*baseline:
                        for key in boundary_data.keys():
                            boundary_data[key].append(data_dict[key][mask][argmin_idx])
#                 else:
                
                error_list.append((min_err+max_err)/2)
                if normalize:
                    shift = 0
                else:
                    shift = 0
                
#                 if len(min_param_list) > 0 and np.min(data_dict["n_params"][mask]) > min_param_list[-1]:

                if len(min_param_list) > 0:
                    min_param_list.append(min(np.min(data_dict["n_params"][mask]), 
                                              min_param_list[-1])-shift)
                else:
                    min_param_list.append(np.min(data_dict["n_params"][mask])-shift)
        if plot_envelope:
            rkc_plot.line(error_list,
                         min_param_list,
                         color=color,
                         line_width=2)
    db.close()
    return rkc_plot, p, boundary_data

In [380]:
plot_options = dict(width=900,
                        plot_height=400,
                        y_range=(1e2, 1e6),
#                         x_range=(1e-2, 1.0),
                        tools='pan,wheel_zoom,reset,save')
TOOLTIPS = [
    ("error:", "$x"),
    ("parameters:", "$y"),
    ("l1_reg:", "@l0_reg"),
    ("gate_thres:", "@gate_thres"),
]
palette = all_palettes['Set1'][9]
boundaries_plot = figure(
                  x_axis_type="log", 
                  y_axis_type="log",
                  tooltips=TOOLTIPS,
                  **plot_options)

palette_idx = 0

program_names = [ 
                 "gcc-ref-cc15",
                 "pmd-small-JikesRVM",
                 "pmd-small-J9",
                 "pmd-small-HotSpot"
                ]

baselines = { program_name: get_baseline("{}-d-l64-p4096-w100000i.analyzed-1.pkl".format(program_name),
                             tolerance=tolerance, trace_length=None) 
             for program_name in program_names }
boundary_points = {}
for program_name in program_names:
    folder_name = os.path.join("/media/arjun/Shared/cluster/results",
                              "{}-d-l64-p4096-w100000i".format(program_name),
                              "full_trace_generalization/results_generalization")
    boundaries_plot, p, boundary_data = plot_boundaries(folder_name, 
                        "result_cmr.db", 
                        boundaries_plot, 
                        tolerance=2,
                        normalize=False,
                        color=palette[palette_idx],
                        model_depth=4,
                        legend="{}".format(program_name),
                        baseline=baselines[program_name],
                        plot_envelope=True)
    boundary_points[program_name] = boundary_data
    boundaries_plot.line([], [], 
                         color=palette[palette_idx], 
                         legend_label="{}".format(program_name))
    source = ColumnDataSource(data=boundary_data)
    print(baselines)
    boundaries_plot.scatter("error", "n_params", 
                            source=source,
                           color=palette[palette_idx])
    
    palette_idx += 1

## Plot baseline lines
baseline_lines = [ Span(location=baselines[program_name],
                        dimension='height', 
                        line_color=palette[idx], 
                        line_width=3) for idx, program_name in enumerate(program_names) ]

# pmd_jikes_line = Span(location=pmd_jikes_baseline, dimension='height', 
#                   line_color=palette[1], line_width=3)
# pmd_hotspot_line = Span(location=pmd_hotspot_baseline, dimension='height', 
#                   line_color=palette[0], line_width=3)
# pmd_j9_line = Span(location=pmd_j9_baseline, dimension='height', 
#                   line_color=palette[2], line_width=3)
boundaries_plot.renderers.extend(baseline_lines)
boundaries_plot.xaxis.axis_label = "Error"
boundaries_plot.yaxis.axis_label = "number of parameters"
boundaries_plot.legend.location = "bottom_left"
show(boundaries_plot)

argmax:  74
max_val:  29976.0
bin frequency:  0.3751877440672875
argmax:  68
max_val:  8594.0
bin frequency:  0.35202556015237785
argmax:  72
max_val:  39663.0
bin frequency:  0.6749195977334218
argmax:  73
max_val:  16571.0
bin frequency:  0.6701852301221386
#e41a1c
Number of points: 880
Number of points: 859
Number of points: 870
Number of points: 809
Number of points: 694
Number of points: 683
Number of points: 704
Number of points: 684
Number of points: 685
Number of points: 682
Number of points: 670
Number of points: 679
Number of points: 683
Number of points: 685
Number of points: 622
{'gcc-ref-cc15': 0.6248122559327125, 'pmd-small-JikesRVM': 0.6479744398476222, 'pmd-small-J9': 0.3250804022665782, 'pmd-small-HotSpot': 0.3298147698778614}
#377eb8
Number of points: 1002
Number of points: 925
Number of points: 896
Number of points: 798
Number of points: 764
Number of points: 766
Number of points: 714
Number of points: 605
Number of points: 509
Number of points: 446
Number of points:

## Parameterization of RKC curve

In [325]:
def preprocess(x):
    x = np.log(x)
    return x

def postprocess(x):
    x = np.exp(x)
    return x

def fit(X, y):
    X = preprocess(X)
    y = preprocess(y)
    
    return np.poly1d(np.polyfit(X, y, 3))

def predict(X, p):
    print("predict", p)
    X = preprocess(X)
    y_pred = p(X)
    
    return postprocess(y_pred)

def marginal(X, p):
#     print("marginal", p)
    pre_X = preprocess(X)
#     print(p[-1])
    return np.exp(p(pre_X))*(3*p[-3]*(pre_X**2) + 2*p[-2]*pre_X + p[-1])

In [383]:
# print(boundary_data)
plot_options = dict(width=900,
                        plot_height=400,
#                         y_range=(1e2, 1e6),
#                         x_range=(1e-2, 1.0),
                        tools='pan,wheel_zoom,reset,save')
boundaries_plot = figure(
                  x_axis_type="log", 
                  y_axis_type="log",
                  tooltips=TOOLTIPS,
                  **plot_options)
marginals_plot = figure(
#                  x_axis_type="log", 
#                   y_axis_type="log",
                  tooltips=TOOLTIPS,
                  **plot_options)
palette_idx = 0
for program_name in program_names:
    print("program_name: {}".format(program_name))
    boundary_data = boundary_points[program_name]
    source = ColumnDataSource(data=boundary_data)
    # print(baselines)
#     boundaries_plot.scatter("error", "n_params", 
#                             source=source,
#                            color=palette[palette_idx])
    boundaries_plot.line([], [], 
                         color=palette[palette_idx], 
                         legend_label="{}".format(program_name))
    #preprocessing for fitting
    X = np.array(boundary_data["error"])
    y = np.array(boundary_data["n_params"])

    p = fit(X, y)

    x_test = np.linspace(np.min(X), np.max(X), 500)
    z = predict(x_test, p)
    
    print(p)
    print("{}: {}".format(baselines[program_names[-1]],
                          predict(baselines[program_names[-1]], p)))
    
    marginal_z = marginal(x_test, p)
    
    print(marginal_z)
    
    boundaries_plot.line(x_test,
                           z,
                           color=palette[palette_idx],
                           line_width=2)
    marginals_plot.line(x_test,
                           marginal_z,
                           color=palette[palette_idx],
                           line_width=2)
    palette_idx+=1

## Plot baseline lines
baseline_lines = [ Span(location=baselines[program_name],
                        dimension='width', 
                        line_color=palette[idx], 
                        line_width=3) for idx, program_name in enumerate(program_names) ]
boundaries_plot.renderers.extend(baseline_lines)

boundaries_plot.legend.location = "bottom_left"
show(boundaries_plot)
show(marginals_plot)

program_name: gcc-ref-cc15
predict           3          2
-0.01145 x - 0.4676 x - 3.869 x + 2.645
          3          2
-0.01145 x - 0.4676 x - 3.869 x + 2.645
predict           3          2
-0.01145 x - 0.4676 x - 3.869 x + 2.645
0.3298147698778614: 588.124020395975
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 

## Plot Changes depending on number of parameters in model

In [125]:
def plot_rkc_model_configuration(folder_name, filename, rkc_plot, layer_id,
                                 legend, color='blue',
             tolerance=1, normalize=False, plot_envelope=False, model_type='lstm',
            model_depth=1, additional_query="", colormap_type='log',
            x_axis_key="error", y_axis_key="n_params", cmap_range=None):
    color_map='layer_param_count'
    db_path = os.path.join(folder_name, filename)
    db_path_layer = os.path.join(folder_name, "result_layer_test.db")
    db = Database()
    db_test = Database()
    db.open(db_path)
    db_test.open(db_path_layer)

    db.query("SELECT trace, seed, n_params, accuracy, gate_threshold, l1_reg, model_size, oracle_size "\
             "FROM RESULTS WHERE bin_tolerance={} AND model_type='{}' "\
             "AND model_depth={} {}".format(tolerance,
                                        model_type,
                                        model_depth,
                                        additional_query))

    rows = db.cursor.fetchall()

    data_dict = {
        "n_params": [],
        "error": [],
        "l1_reg": [],
        "gate_thres": [],
        "model_size": [],
        "oracle_size": [],
        "layer_param_count": [],
        "oracle_param_sum": [],
        "seed": []
    }

    for row in tqdm(rows):
        trace, seed, n_params, accuracy, gate_thres, l1_reg, model_size, oracle_size = row
        
        db_test.query("SELECT {} FROM RESULTS WHERE trace='{}' "\
                 "AND model_type='{}' "\
                 "AND model_depth={} "\
                 "AND model_size='{}' "\
                 "AND l1_reg={} "\
                 "AND n_params={} "\
                 "AND seed={} "\
                 "AND gate_threshold={}".format(layer_id,
                                            trace,
                                            model_type,
                                            model_depth,
                                            model_size,
                                            l1_reg,
                                            n_params,
                                            seed,
                                            gate_thres))
        rows_test = db_test.cursor.fetchall()
        i=0
        layer_param_count = None
        for row_test in rows_test:
#             print(i, row_test)
#             i+=1
            layer_param_count = row_test[0]
            break
#         print(test_accuracy)
        data_dict["n_params"].append(n_params)
        data_dict["l1_reg"].append(l1_reg)
        data_dict["gate_thres"].append(gate_thres)
        data_dict["error"].append(1-accuracy)
        data_dict["model_size"].append(model_size)
        data_dict["oracle_size"].append(oracle_size)
        data_dict["seed"].append(seed)
        data_dict["oracle_param_sum"].append(1/(n_params+oracle_size))
        if layer_param_count is not None:
            data_dict["layer_param_count"].append(layer_param_count)
        else:
            data_dict["layer_param_count"].append(0)
    
    data_dict["n_params"] = np.array(data_dict["n_params"])
        
    if normalize:
        data_dict["n_params"] = data_dict["n_params"]/np.max(data_dict["n_params"])
        
    print("Number of points: {}".format(len(data_dict["n_params"])))
    source = ColumnDataSource(data=data_dict)
    
    #Use the field name of the column source
    if color_map is not None:
        if cmap_range is None:
            cmap_min = np.min(data_dict[color_map])
            cmap_max = np.max(data_dict[color_map])
        else:
            cmap_min, cmap_max = cmap_range
        print("Color Mapped '{}': {}-{}".format(color_map,
                                               cmap_min,
                                               cmap_min))
        if colormap_type == 'log':
            mapping_function = log_cmap
        else:
            mapping_function = linear_cmap
        mapper = mapping_function(field_name=color_map, palette=Viridis256 ,
                          low=max(cmap_min, 1e-5),
                          high=cmap_max)
        color = mapper
        
        color_bar = ColorBar(color_mapper=mapper['transform'], label_standoff=12)
        rkc_plot.add_layout(color_bar, 'right')
    
    print("Plotting")
    
    p = rkc_plot.scatter(x_axis_key,y_axis_key, 
                     source=source, 
                     fill_alpha=0.2, 
                     line_alpha=0.7,
                     color = color
#                      marker='x'
#                     legend_label=legend
                    )
    error_scale = np.linspace(0, 1, 5000)
#     print(error_scale)
    error_list = []
    min_param_list = []
    for error_id in range(error_scale.shape[0]-1):
        min_err = error_scale[error_id]
        max_err = error_scale[error_id+1]
        mask = np.logical_and(data_dict["error"] >= min_err,
                              data_dict["error"] < max_err)
#         print(mask)
        if np.sum(mask) > 0:
            error_list.append((min_err+max_err)/2)
            if normalize:
                shift = 0
            else:
                shift = 0
            if len(min_param_list) > 0:
                min_param_list.append(min(np.min(data_dict["n_params"][mask]), 
                                          min_param_list[-1])-shift)
            else:
                min_param_list.append(np.min(data_dict["n_params"][mask])-shift)
            
#             print("error_range: {} - {}".format(min_err, max_err))
#             plot_histogram(data_dict["n_params"][mask])
            
#     print(min_param_list)
    if plot_envelope:
        rkc_plot.line(error_list,
                     min_param_list,
                     color=color,
                     line_width=2)
    db.close()
    return rkc_plot, p

In [260]:
def print_summary(arr, trace_size):
    print("argmax: ", np.argmax(arr))
    print("max_val: ", arr[np.argmax(arr)])
    print("bin frequency: ", np.max(arr)/trace_size)

    
def get_baseline(trace_name="pmd-small-J9-d-l64-p4096-w100000i.analyzed-1.pkl",
                tolerance=1, trace_length=2000):
    with open(os.path.join("/home/arjun/ssd/chaos/data", trace_name), "rb") as f:
        trace = pkl.load(f)
    if trace_length is not None:
        trace = trace[:2000]
    kernel = np.ones(2*tolerance+1)
    arr = np.convolve(np.sum(trace, axis=0), kernel, mode="same")
    print_summary(arr, trace.shape[0])
    
    return 1 - np.max(arr)/trace.shape[0]

palette = all_palettes['Set1'][9]
legend_items = []
tolerance = 2
normalize = False
plot_envelope = True

if normalize:
    y_range=(1e-4, 1)
else:
    y_range=(1, 6e5)

plot_options = dict(width=900,
                        plot_height=400,
#                         y_range=(1e2, 5e5),
#                         x_range=(1e-3, 0.6),
                        tools='pan,wheel_zoom,reset,save')
TOOLTIPS = [
    ("error:", "$x"),
    ("parameters:", "$y"),
    ("l1_reg:", "@l1_reg"),
    ("gate_thres:", "@gate_thres"),
    ("model_size:", "@model_size"),
    ("seed:", "@seed"),
    ("layer_size:", "@layer_param_count")
]

layer_ids = [ "lstm_ih_l0",
              "lstm_hh_l0",
              "fc0_weight",
              "fc1_weight",
              "fc2_weight",
              "fc3_weight",
              "output_weight",
              "lstm_bias_ih_l0",
              "lstm_bias_hh_l0",
              "fc0_bias",
              "fc1_bias",
              "fc2_bias",
              "fc3_bias",
              "output_bias"
            ]

for layer_id in layer_ids:
    print("Layer: {}".format(layer_id))
    rkc_plot = figure(
                      x_axis_type="log", 
                      y_axis_type="log",
                      tooltips=TOOLTIPS,
                      **plot_options)

    palette_idx = 0
    folder_name = os.path.join(research_folder_path,
                              "result_dbs")

    x_axis_key = "error"
    program_names = [ "pmd-small-J9" ]
    folder_name = os.path.join("/media/arjun/Shared/cluster/results",
                               "{}-d-l64-p4096-w100000i".format(program_name),
                               "full_trace_generalization/results_generalization")
    baselines = { program_name: get_baseline("{}-d-l64-p4096-w100000i.analyzed-1.pkl".format(program_name),
                                 tolerance=tolerance, trace_length=None) 
                 for program_name in program_names }

    for program_name in program_names:
        rkc_plot, p = plot_rkc_model_configuration(folder_name, 
                            "result_cmr.db", 
                            rkc_plot, 
                            layer_id,
                            tolerance=tolerance,
                            normalize=normalize,
                            plot_envelope=False,
                            model_depth=4,
                            color=palette[palette_idx],
                            x_axis_key=x_axis_key,
                            y_axis_key="n_params",
                            legend=program_name,
    #                         cmap_range=(1e-3, 0.2),
    #                         color_map="test_error", # test error
                            cmap_range=(1, 1000),
                            colormap_type='linear',
                            additional_query="AND metric_type='train' AND accuracy>0.1")
        legend_items.append(( program_name, [p] ))
        palette_idx+=1

    ## Plot baseline lines
    baseline_lines = [ Span(location=baselines[program_name],
                            dimension='height', 
                            line_color=palette[idx], 
                            line_width=3) for idx, program_name in enumerate(program_names) ]

    legend = Legend(items=legend_items, location="center")

    # rkc_plot.line(extrapolated_errors,
    #              extrapolated_parameters,
    #              line_color="red")
    rkc_plot.add_layout(legend, 'right')
    rkc_plot.renderers.extend([pmd_hotspot_line])
    rkc_plot.legend.location = "top_left"
    rkc_plot.legend.click_policy='hide'
    rkc_plot.legend.border_line_alpha = 1.0
    # rkc_plot.legend.visible = False
    # rkc_plot.legend.location = "top_right"
    if normalize:
        rkc_plot.yaxis.axis_label="Fraction of total parameters remaining"
    else:
        rkc_plot.yaxis.axis_label="Number of non zero parameters in model(log)"
    rkc_plot.xaxis.axis_label=x_axis_key
    show(rkc_plot)


  0%|          | 0/5930 [00:00<?, ?it/s][A

Layer: lstm_ih_l0
argmax:  72
max_val:  39663.0
bin frequency:  0.6749195977334218


OperationalError: no such table: RESULTS

## Boundary comparison on architectural changes

In [27]:
def show_comparison(folder_name, filename1, filename2, translation_plot, 
                    label1, label2, legend, color='blue', tolerance=1):
    db_path1 = os.path.join(folder_name, filename1)
    db_path2 = os.path.join(folder_name, filename2)
    
    print("db path-------------", db_path1)
    db1 = Database()
    db1.open(db_path1)
    db2 = Database()
    db2.open(db_path1)

    error_scale = np.linspace(0, 0.5, 50)
    
    data_dict = {}
    data_dict[label1] = []
    data_dict[label2] = []
    data_dict["error"] = []
    
    for seed in range(10):
        for error_id in range(error_scale.shape[0]-1):
            min_err = error_scale[error_id]
            max_err = error_scale[error_id+1]
            query1 = "SELECT MIN(n_params) FROM RESULTS WHERE bin_tolerance<={} AND " \
                      "accuracy>={} AND accuracy<{} AND model_depth=1 AND seed={}".format(tolerance,
                                                           1-max_err,
                                                           1-min_err,
                                                            seed)
            query2 = "SELECT MIN(n_params) FROM RESULTS WHERE bin_tolerance<={} AND " \
                      "accuracy>={} AND accuracy<{} and model_depth=4 AND seed={}".format(tolerance, 
                                                                                        1-max_err,
                                                                                        1-min_err,
                                                                                        seed)
            db1.query(query1)
            db2.query(query2)

            rows1 = db1.cursor.fetchall()
            rows2 = db2.cursor.fetchall()

            for row in rows1:
                min_param1 = row[0]

            for row in rows2:
                min_param2 = row[0]

            if min_param1 is not None and min_param2 is not None:
                data_dict[label1].append(min_param1)
                data_dict[label2].append(min_param2)
                data_dict["error"].append((min_err+max_err)/2)

    source = ColumnDataSource(data=data_dict)
    
    p = translation_plot.scatter(label1,label2, 
                         source=source, 
                         fill_alpha=0.7, 
                         line_alpha=0.9,
                         color=color)
#     translation_plot.line(label1,label1, 
#                          source=source, 
#                          line_dash='dotdash',
#                          line_alpha=0.7)
    translation_plot.xaxis.axis_label = label1
    translation_plot.yaxis.axis_label = label2
    
    db1.close()
    db2.close()
    return translation_plot, p, data_dict

In [28]:
# folder_name = "/media/arjun/Shared/research/result_dbs"
plot_options = dict(width=900,
                        plot_height=400,
                        y_range=(1e2, 1e4),
                        x_range=(1e2, 1e4),
                        tools='pan,wheel_zoom,reset,save')

translation_plot = figure(
#                   x_axis_type="log", 
#                   y_axis_type="log",
                  **plot_options)

palette = Viridis256
colormapper = linear_cmap(field_name = "error", 
                          palette=palette, 
                          low=1e-5, 
                          high=0.5)
color_bar = ColorBar(color_mapper=colormapper['transform'], label_standoff=12)

# filename1 = "cmr_lstm_pmd_jikes_d4_combined.db"
# filename2 = "cmr_lstm_pmd_jikes_d1_combined.db"
# translation_plot, p, data_dict = show_comparison(folder_name, 
#                                                 filename1,
#                                                 filename2,
#                                                 translation_plot, 
#                                                 "lstm depth=1",
#                                                 "lstm depth=4",
#                                                 tolerance=tolerance,
#                                                 color=colormapper,
#                                                 legend="pmd j9")
filename1 = "result_cmr.db"
filename2 = "cmr_lstm_pmd_j9_d1_combined.db"

translation_plot, p, data_dict = show_comparison(folder_name, 
                                                filename1,
                                                filename2,
                                                translation_plot, 
                                                "lstm depth=1",
                                                "lstm depth=4",
                                                tolerance=tolerance,
                                                color=colormapper,
                                                legend="pmd jikes")
final_data_dict = data_dict

final_data_dict["lstm depth=1"].extend(data_dict["lstm depth=1"])
final_data_dict["lstm depth=4"].extend(data_dict["lstm depth=4"])
final_data_dict["error"].extend(data_dict["error"])

X = np.array(final_data_dict["lstm depth=1"]).reshape((-1, 1))
y = np.array(final_data_dict["lstm depth=4"])

reg = LinearRegression().fit(X, y)
d1_model_parameters = np.linspace(min(final_data_dict["lstm depth=1"]), 
                                  max(final_data_dict["lstm depth=1"]), 50)
d4_predicted = reg.predict(d1_model_parameters.reshape((-1, 1)))
translation_plot.line(d1_model_parameters,
                      d4_predicted, 
                      line_alpha=0.7,
                      color="blue")
translation_plot.line(d1_model_parameters,
                      d1_model_parameters, 
                      line_dash='dotdash',
                      line_alpha=0.7,
                      color="green")
print("m: {}, c={}".format(reg.coef_, reg.intercept_))
translation_plot.add_layout(color_bar, 'right')
show(translation_plot)

db path------------- /media/arjun/Shared/cluster/results/pmd-small-HotSpot-d-l64-p4096-w100000i/full_trace_generalization/results_generalization/result_cmr.db


ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [137]:
error_range = [ 0.25, 0.35 ]
tolerance = 1
folder_name = "/home/arjun/research/result_dbs/old"
dbs = {
    700: "result_cmr_jikes_s_lstm_s700_d1_tprune200_gthres_0_001_gdecay1_01.db",
    400: "result_cmr_lstm_s400_d1_omax.db",
    100: "result_cmr_lstm_pmd_jikes_s100_d1_omax.db",
    50: "result_cmr_lstm_s50_lstm_new.db",
    20: "result_cmr_lstm_pmd_jikes_s20_d1_gthres_0_001_gdecay1_11.db"
}

hidden_sizes = list(dbs.keys())
max_sparsity = []
max_params = []
for hidden_size in hidden_sizes:
    db_path = os.path.join(folder_name, dbs[hidden_size])
    db = Database()
    db.open(db_path)
    
    # Find max number of parameters
    db.query("SELECT MAX(n_params) FROM RESULTS")
    rows = db.cursor.fetchall()
    
    for row in rows:
        max_params.append(row[0])
        break
    
    # To find min_size achieved
    db.query("SELECT MIN(n_params) FROM RESULTS WHERE bin_tolerance<={} "
             "AND ACCURACY >= {} AND ACCURACY < {}".format(tolerance,
                                                          1-error_range[1],
                                                          1-error_range[0]))
    rows = db.cursor.fetchall()
    
    for row in rows:
        max_sparsity.append(row[0]/max_params[-1])
        print("hidden: {} sparsity: {}".format(hidden_size, max_sparsity[-1]))
        break

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Error connecting to database!
Traceback (most recent call last):
  File "/media/arjun/SSD/chaos/cache-management/misc/database.py", line 71, in open
    self.conn = sqlite3.connect(name);
sqlite3.OperationalError: unable to open database file

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/arjun/anaconda3/envs/cache_analysis/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-137-d86a611f764e>", line 18, in <module>
    db.open(db_path)
  File "/media/arjun/SSD/chaos/cache-management/misc/database.py", line 76, in open
    sys.exit()
SystemExit

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/arjun/anaconda3/envs/cache_analysis/lib/python3.7/site-packages/IPython/core/ultratb.py", line 1169, in get_records
    return _fixed_getinnerframes

TypeError: object of type 'NoneType' has no len()

In [None]:
# regression
X = np.array(max_sparsity)
y = np.array(max_params)
mask = np.ones(len(y), dtype=bool)
mask[1] = False

X = X[mask].reshape((-1, 1))
y = y[mask]

X = X[:2]
y = y[:2]
X = np.log(X)
y = np.log(y)
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))

# Plotting
sparsity = np.logspace(-3, 0, 100)
estimated = np.exp(reg.predict(np.log(sparsity).reshape((-1, 1))))

print("Intercept: {}".format(np.exp(reg.predict([[ 0 ]]))))

plot_options = dict(width=600,
                    x_axis_type="log",
                    y_axis_type="log",
                        plot_height=300,
                        tools='pan,wheel_zoom,reset,save')
performance_plot = figure(y_range=(1e-3,1),
                          x_range=(1, 1e7),
                          **plot_options)
performance_plot.scatter(max_params, max_sparsity)
performance_plot.line(estimated,
                     sparsity)

performance_plot.yaxis.axis_label="Fraction of total parameters remaining"
performance_plot.xaxis.axis_label="Total Number of Parameters before training"

show(performance_plot)

In [705]:
# magnitude fisher information relation
save_dir = "/media/arjun/Shared/cluster/experiments/pmd-small-jikes-lstm"
model_dict_file = "model_0_0.0005.pkl"

with open(os.path.join(save_dir, "{}_{}_fisher".format(model_dict_file,
                                             0.0)), 'rb') as f:
    fisher_information = pkl.load(f)

with open(os.path.join(save_dir, "{}_{}_param".format(model_dict_file,
                                            0.0)), 'rb') as f:
    params = pkl.load(f)

for name, param in params.items():
    print(name)
    x = param.flatten().data.numpy()
    y = fisher_information[name].flatten()
    print(np.min(y), np.max(y))
    
    plot_options = dict(width=600,
                        plot_height=300,
                        tools='pan,wheel_zoom,reset,save')
    performance_plot = figure(**plot_options)
    performance_plot.scatter(x, y)

    performance_plot.yaxis.axis_label="Fisher Information"
    performance_plot.xaxis.axis_label="Magnitude of parameters"

    show(performance_plot)

lstm._origin.weight_ih_l0
nan nan


lstm._origin.weight_hh_l0
nan nan


lstm._origin.bias_ih_l0
nan nan


lstm._origin.bias_hh_l0
nan nan


linear.fc0.weight
nan nan


linear.fc0.bias
nan nan


KeyboardInterrupt: 

## Miscellaneous

In [None]:
compute_parameters.get_count(100, 100, [400, 400], [ 'lstm', 'fc' ])

In [723]:
plot_options = dict(width=600,
                        plot_height=300,
                        tools='pan,wheel_zoom,reset,save')

def exponent_function(oracle):
    return np.exp(-(oracle-1))

def returns_function(oracle):
    return 1/(oracle+exponent_function(oracle))

x = np.arange(0, 2, 0.01)
y = np.array(list(map(returns_function, x)))

performance_plot = figure(**plot_options)
performance_plot.line(x, y, line_width=2)

performance_plot.yaxis.axis_label="Information per parameter"
performance_plot.xaxis.axis_label="Oracle Size"
performance_plot.xaxis.major_label_text_font_size = '0pt'
performance_plot.yaxis.major_label_text_font_size = '0pt'
performance_plot.xaxis.axis_label_text_font_size = '15pt'
performance_plot.yaxis.axis_label_text_font_size = '15pt'

show(performance_plot)

In [72]:
plot_options = dict(width=600,
                        plot_height=300,
                        tools='pan,wheel_zoom,reset,save')

def compute_ed(f, m, c=1):
    return (1.25*f + 0.4)**2 * f * (m*1e-4 + 200000/(f*1e9))**2

def best_frequency(x):
    best_ed = None
    for f in np.arange(0.01, 8, 0.1):
        estimated_ed = compute_ed(f, x)
        if best_ed is None or best_ed < estimated_ed:
            best_ed = estimated_ed
    return best_ed

x = np.arange(0, 1, 0.1)
y = np.array(list(map(best_frequency, x)))

performance_plot = figure(**plot_options)
performance_plot.line(x, y, line_width=2)

performance_plot.yaxis.axis_label="Best Frequency"
performance_plot.xaxis.axis_label="Misses"
performance_plot.xaxis.major_label_text_font_size = '0pt'
performance_plot.yaxis.major_label_text_font_size = '0pt'
performance_plot.xaxis.axis_label_text_font_size = '15pt'
performance_plot.yaxis.axis_label_text_font_size = '15pt'

show(performance_plot)