In [43]:
import csv
import random
import numpy as np
import json
import warnings
import operator

import h5py
from keras.models import model_from_json
from keras import backend as K

warnings.filterwarnings("ignore")




In [44]:
def read_file(file_path):
    """
    Parse the JSON document stored at `file_path` and return the decoded object.
    """
    with open(file_path, 'r') as handle:
        # json.load reads the whole stream and decodes it in one step
        return json.load(handle)

In [45]:
def create_model(model_path):
    """
    Load the stored weights into the model and build the reverse dictionary.

    NOTE: `model_path` is accepted but not used. The function reads the
    module-level names `trained_model` (the opened HDF5 file), `loaded_model`
    (the model the weights are set on) and `dictionary` (tool name -> id).

    Returns a tuple `(loaded_model, dictionary, reverse_dictionary)` where
    `reverse_dictionary` maps `str(id)` back to the tool name.
    """
    reverse_dictionary = dict((str(v), k) for k, v in dictionary.items())
    model_weights = list()
    weight_ctr = 0
    for item in trained_model.keys():
        if "weight_" in item:
            # Fetch by running counter ("weight_0", "weight_1", ...) so the
            # weights are appended in numeric order regardless of the order in
            # which the file lists its keys.
            d_key = "weight_" + str(weight_ctr)
            # Dataset.value was removed in h5py 3.0; indexing with an empty
            # tuple reads the whole dataset and works on old and new versions.
            weights = trained_model.get(d_key)[()]
            model_weights.append(weights)
            weight_ctr += 1
    # set the model weights
    loaded_model.set_weights(model_weights)
    return loaded_model, dictionary, reverse_dictionary


def get_predicted_tools(base_tools, predictions, topk):
    """
    Keep the predicted tools that also occur in `base_tools` and score them.

    Parameters:
        base_tools: collection of reference tools to compare against.
        predictions: sequence of predicted tools (hashable items).
        topk: maximum number of matching tools to return.

    Returns:
        (matches, precision) where `matches` holds at most `topk` predicted
        tools that are present in `base_tools`, in the order they appear in
        `predictions` (duplicates removed), and `precision` is the number of
        distinct matches divided by `len(predictions)`.
        `precision` is NaN and `matches` is empty when either input is empty.
    """
    precision = np.nan
    intersection = list()
    # An empty prediction list used to raise ZeroDivisionError; treat it like
    # the "no base tools" case instead.
    if len(base_tools) > 0 and len(predictions) > 0:
        allowed = set(base_tools)
        # dict.fromkeys de-duplicates while keeping the prediction order, so
        # the result no longer depends on set iteration order.
        intersection = [tool for tool in dict.fromkeys(predictions) if tool in allowed]
        precision = len(intersection) / float(len(predictions))
    return intersection[:topk], precision


def sort_by_usage(t_list, class_weights, d_dict):
    """
    Look up the class weight of every tool in `t_list`.

    Each tool name is translated to its id through `d_dict` and the weight is
    read from `class_weights` under the string form of that id. Returns two
    parallel lists: the tool names and their weights.

    Note: despite the name, no sorting is applied; the tools come back in
    their first-seen order from `t_list`, with duplicates collapsed.
    """
    weight_by_tool = {
        name: class_weights[str(d_dict[name])]
        for name in t_list
    }
    names = list(weight_by_tool.keys())
    weights = list(weight_by_tool.values())
    return names, weights


def separate_predictions(base_tools, predictions, last_tool_name, weight_values, topk):
    """
    Turn a score vector into tool names and compare them with the tools
    listed for `last_tool_name` in `base_tools`.

    The scores are multiplied by `weight_values`, the `topk` highest-scoring
    positions are translated to tool names, and the names are matched against
    the entry of `base_tools` for the last tool (an empty list if there is no
    entry). Returns `(tools, weights, precision)`.

    Reads the module-level names `reverse_dictionary`, `class_weights` and
    `dictionary`.
    """
    weighted_scores = predictions * weight_values
    # argsort is ascending, so the best-scoring positions sit at the end
    ranked_positions = np.argsort(weighted_scores, axis=-1)
    best_positions = ranked_positions[-topk:]
    # translate positions to tool names
    pred_tool_ids = [reverse_dictionary[str(pos)] for pos in best_positions]
    reference_tools = list()
    if last_tool_name in base_tools:
        reference_tools = base_tools[last_tool_name]
        # entries stored as a comma-separated string are split into a list
        if type(reference_tools).__name__ == "str":
            reference_tools = reference_tools.split(",")
    matched_tools, precision = get_predicted_tools(reference_tools, pred_tool_ids, topk)
    sorted_c_t, sorted_c_v = sort_by_usage(matched_tools, class_weights, dictionary)
    return sorted_c_t, sorted_c_v, precision


def compute_recommendations(model, tool_sequence, labels, dictionary, reverse_dictionary, class_weights, topk=10, max_seq_len=25):
    """
    Predict the next tools for one tool sequence and return two precisions.

    Parameters:
        model: object with a `predict(array, verbose=0)` method.
        tool_sequence: comma-separated tool names.
        labels: not used.
        dictionary: tool name -> id mapping.
        reverse_dictionary: not used here (kept for interface compatibility).
        class_weights: mapping whose values are used, in iteration order, as
            the per-position weights for the scores.
        topk: number of top-scoring positions to evaluate.
        max_seq_len: length of the zero-initialised input vector.

    Returns:
        (pub_prec, c_prec): precision of the first half of the prediction
        vector against the module-level `standard_connections`, and of the
        second half against the module-level `compatible_tools`.

    Raises:
        KeyError if a tool name is missing from `dictionary`.
        IndexError if the sequence holds more than `max_seq_len` tools.
    """
    tl_seq = tool_sequence.split(",")
    tl_seq_ids = [str(dictionary[t]) for t in tl_seq]
    last_tool_name = tl_seq[-1]
    # ids fill the vector from the front; unused trailing slots stay zero
    sample = np.zeros(max_seq_len)
    for idx, tool_id in enumerate(tl_seq_ids):
        sample[idx] = int(tool_id)
    sample_reshaped = np.reshape(sample, (1, max_seq_len))
    weight_val = list(class_weights.values())
    weight_val = np.reshape(weight_val, (len(weight_val),))
    # predict next tools for a test path
    prediction = model.predict(sample_reshaped, verbose=0)
    nw_dimension = prediction.shape[1]
    prediction = np.reshape(prediction, (nw_dimension,))

    # the prediction vector is split in two equal halves
    half_len = nw_dimension // 2

    # first half is evaluated against standard_connections
    _, _, pub_prec = separate_predictions(standard_connections, prediction[:half_len], last_tool_name, weight_val, topk)
    # second half is evaluated against compatible_tools
    _, _, c_prec = separate_predictions(compatible_tools, prediction[half_len:], last_tool_name, weight_val, topk)

    return pub_prec, c_prec

In [46]:
# Location of the run whose outputs are analysed in this notebook.
base_path = "../output_files/data_20_05/gru_wc/run10/"
original_freq = read_file(base_path + "freq_dict_names.txt")
balanced_freq = read_file(base_path + "generated_tool_frequencies.txt")
test_paths = read_file(base_path + "test_paths_dict.txt")

# The HDF5 file stores the model config, the dictionaries and the weights.
# Dataset.value was removed in h5py 3.0; indexing with an empty tuple reads
# the whole dataset and works on old and new h5py versions alike.
model_path = base_path + "tool_recommendation_model.hdf5"
trained_model = h5py.File(model_path, 'r')
model_config = json.loads(trained_model.get('model_config')[()])
dictionary = json.loads(trained_model.get('data_dictionary')[()])
class_weights = json.loads(trained_model.get('class_weights')[()])
standard_connections = json.loads(trained_model.get('standard_connections')[()])
compatible_tools = json.loads(trained_model.get('compatible_tools')[()])
# create_model reads `loaded_model`, `trained_model` and `dictionary` from the
# notebook namespace, so they must be defined before it is called.
loaded_model = model_from_json(model_config)
model, dictionary, reverse_dictionary = create_model(model_path)

In [47]:
print("Number of tool sequences: %d" % len(test_paths))
num_calibrations = 50
step = 1
topk = 1
calibrations_p_prec = list()
calibrations_c_prec = list()
calibrations_freq = list()
calibrations_n_paths = list()

# Translate every test path to tool names once and bucket it by the frequency
# of its last tool, instead of repeating the translation and the scan over all
# paths for each calibration value. Path order inside a bucket is preserved.
paths_by_last_tool_freq = dict()
for t_seq in test_paths:
    tool_ids = t_seq.split(",")
    last_tool_name = reverse_dictionary[tool_ids[-1]]
    t_seq_n = ",".join([reverse_dictionary[tid] for tid in tool_ids])
    if last_tool_name in original_freq:
        last_tool_freq = original_freq[last_tool_name]
        paths_by_last_tool_freq.setdefault(last_tool_freq, list()).append((t_seq_n, last_tool_freq))

for i in range(1, num_calibrations + 1, step):
    complete_p_prec = list()
    complete_c_prec = list()
    freq = list()
    # evaluate only the paths whose last tool has frequency i
    for t_seq_n, last_tool_freq in paths_by_last_tool_freq.get(i, list()):
        p_prec, c_prec = compute_recommendations(model, t_seq_n, "", dictionary, reverse_dictionary, class_weights, topk)
        complete_p_prec.append(p_prec)
        complete_c_prec.append(c_prec)
        freq.append(last_tool_freq)
    ctr = len(freq)
    # nanmean skips paths without a precision; an empty bucket yields nan
    mean_p_prec = np.nanmean(complete_p_prec)
    mean_c_prec = np.nanmean(complete_c_prec)
    mean_freq = np.mean(freq)
    calibrations_p_prec.append(str(mean_p_prec))
    calibrations_c_prec.append(str(mean_c_prec))
    calibrations_freq.append(str(mean_freq))
    calibrations_n_paths.append(str(ctr))
    print("Last tool frequency: %s" % str(i))
    print("Published precision: %s" % str(mean_p_prec))
    print("Normal precision: %s" % str(mean_c_prec))
    print("Mean frequency of last tools: %s" % str(mean_freq))
    print("Number of paths used : %s" % ctr)
    print("===========================================================")

# One line with four tab-separated fields, each a comma-separated series.
# The with-block closes the file, so no explicit close() is needed.
with open(base_path + "test_paths_low_freq_tool_perf.txt", "w") as f:
    results = "\t".join([
        ",".join(calibrations_p_prec),
        ",".join(calibrations_c_prec),
        ",".join(calibrations_freq),
        ",".join(calibrations_n_paths),
    ])
    f.write(results)

Number of tool sequences: 45955
Last tool frequency: 1
Published precision: 0.8666666666666667
Normal precision: 0.7872340425531915
Mean frequency of last tools: 1.0
Number of paths used : 47
Last tool frequency: 2
Published precision: 1.0
Normal precision: 0.8148148148148148
Mean frequency of last tools: 2.0
Number of paths used : 54
Last tool frequency: 3
Published precision: 0.9473684210526315
Normal precision: 0.859375
Mean frequency of last tools: 3.0
Number of paths used : 64
Last tool frequency: 4
Published precision: 1.0
Normal precision: 0.9555555555555556
Mean frequency of last tools: 4.0
Number of paths used : 45
Last tool frequency: 5
Published precision: 1.0
Normal precision: 0.9464285714285714
Mean frequency of last tools: 5.0
Number of paths used : 56
Last tool frequency: 6
Published precision: 0.9615384615384616
Normal precision: 0.875
Mean frequency of last tools: 6.0
Number of paths used : 48
Last tool frequency: 7
Published precision: 0.8333333333333334
Normal precis

Last tool frequency: 42
Published precision: 1.0
Normal precision: 1.0
Mean frequency of last tools: 42.0
Number of paths used : 14
Last tool frequency: 43
Published precision: 1.0
Normal precision: 1.0
Mean frequency of last tools: 43.0
Number of paths used : 38
Last tool frequency: 44
Published precision: 1.0
Normal precision: 1.0
Mean frequency of last tools: 44.0
Number of paths used : 21
Last tool frequency: 45
Published precision: 0.7142857142857143
Normal precision: 1.0
Mean frequency of last tools: 45.0
Number of paths used : 35
Last tool frequency: 46
Published precision: nan
Normal precision: nan
Mean frequency of last tools: nan
Number of paths used : 0
Last tool frequency: 47
Published precision: nan
Normal precision: 1.0
Mean frequency of last tools: 47.0
Number of paths used : 3
Last tool frequency: 48
Published precision: nan
Normal precision: 1.0
Mean frequency of last tools: 48.0
Number of paths used : 11
Last tool frequency: 49
Published precision: 1.0
Normal precisio