# Set Variables

In [None]:
# -------- dataset
# Exactly one project is active at a time; the alternatives stay commented out
# so the notebook can be switched to another project quickly.
# software_name = "Camel"
software_name = "CloudStack"
# software_name = "Geode"
# software_name = "Hbase"


# -------- feature mode (True: keyword-based, False: not keyword-based)
my_keyword_Based = True
# my_keyword_Based = False


# -------- my_conv_kernel_output_channel & output_version
# The version number of the result files encodes the number of filters:
#   v2: 64 filters
#   v3: 8  filters (any value other than 64 is mapped to v3)
my_conv_kernel_output_channel = 64
if my_conv_kernel_output_channel == 64:
    output_version = 2
else:
    output_version = 3


# -------- my_balance_train & file_subversion
# The sub-version suffix of the result files encodes the training-set balance.
# my_balance_train = True
my_balance_train = False
if my_balance_train:
    file_subversion = ".2-balanced"
else:
    file_subversion = ".1-imbalance"

In [None]:
# Base name (without extension) of the dataset file of each project.
dataset_file_names = dict(
    Camel="Camel_DE - v.02",
    CloudStack="CloudStack_DE - v.01",
    Geode="Geode_DE - v.01",
    Hbase="Hbase_DE - v.01",
)

# File name of the project selected in the configuration cell.
dataset_file_name = dataset_file_names[software_name]

# Google Colab

In [None]:
# Libs
!pip install --upgrade matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# load data from google drive
# Colab only: mount Google Drive at /content/gdrive. force_remount=True asks
# Colab to mount again even when the drive is already mounted.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
# Sanity check: list the Drive root to confirm that the mount worked.
!ls "/content/gdrive/My Drive/"

Mounted at /content/gdrive
'Colab Notebooks'


In [None]:
# project folder path
# NOTE: this is a relative path. Drive is mounted at "/content/gdrive", so it
# only resolves while the working directory is "/content".
project_folder = "gdrive/MyDrive/Colab Notebooks/paper/"

# data folder path
# (relative to project_folder; the dataset CSVs and the word2vec binary are read from here)
data_folder =    "00- My Data/one-phase method/"

# output folder path
# (relative to project_folder; the per-dataset result folders live below it)
output_folder =  "01- Jupyter Notebook/50- one-phase method _ CNN/00. Output/"

In [None]:
# dataset folder path: one sub-folder per project
dataset_folder = f"{software_name}/"

# output data-folder path: <project>/<output>/<project name>/<dataset file name>/
output_data_folder = f"{project_folder}{output_folder}{dataset_folder}{dataset_file_name}/"

# Libs

In [None]:
import json
import os.path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from operator import truediv

from matplotlib.ticker import MaxNLocator

# Config

In [None]:
# Every file location used by the notebook.
#   "data"   : input files (the dataset CSV of the selected project, word2vec vectors).
#   "output" : result files, grouped by feature mode. The "performance" and
#              "model" entries are templates with two "{}" slots that are
#              filled with (output_version, file_subversion).
mypaths = {
    "data": {
        "dataset":           project_folder + data_folder + dataset_folder + dataset_file_name + ".csv",
        "w2v_word_vectors":  project_folder + data_folder + "w2vGoogle.bin"
    },
    "output": {
        "not_keyword_based": {
            "performance":         output_data_folder + "performances/N_KB-performance-v{}{}.json",
            "model":               output_data_folder + "models/N_KB-model-v{}{}.pth"
        },
        "keyword_based": {
            "tfidf_word_weights":  output_data_folder + "tfidf-word-weights-v01.json",
            "performance":         output_data_folder + "performances/KB-performance-v{}{}.json",
            "model":               output_data_folder + "models/KB-model-v{}{}.pth"
        }
    }
}

# Dataset schema and feature mode.
# NOTE(review): "num_bug_classes" repeats len("bug_classes"); keep the two in sync.
# NOTE(review): the integer keys of "columns_dtype" are presumably column
# positions matching "columns_name" -- confirm against the code that reads the CSV.
preprocessing_params = {
    "data":{
        "dataset": {
            "columns_name":   ["text", "bug_class_2"],
            "columns_dtype" : {0: "str", 1: "int64"},
            "bug_classes": [0, 1],
            "num_bug_classes": 2
        }
    },
    "keyword_Based": my_keyword_Based,
}

In [None]:
# Preview every output path with the version placeholders filled in.
# (Templates without placeholders, such as the tf-idf weights file, are printed unchanged.)
for group_name, templates in mypaths["output"].items():
    for file_kind, template in templates.items():
        print(group_name, file_kind)
        print(template.format(output_version, file_subversion))
        print("-" * 10)

not_keyword_based performance
gdrive/MyDrive/Colab Notebooks/paper/01- Jupyter Notebook/one-phased method/00. Output/CloudStack/CloudStack_DE - v.01/performances/N_KB-performance-v2.1-imbalance.json
----------
not_keyword_based model
gdrive/MyDrive/Colab Notebooks/paper/01- Jupyter Notebook/one-phased method/00. Output/CloudStack/CloudStack_DE - v.01/models/N_KB-model-v2.1-imbalance.pth
----------
keyword_based tfidf_word_weights
gdrive/MyDrive/Colab Notebooks/paper/01- Jupyter Notebook/one-phased method/00. Output/CloudStack/CloudStack_DE - v.01/tfidf-word-weights-v01.json
----------
keyword_based performance
gdrive/MyDrive/Colab Notebooks/paper/01- Jupyter Notebook/one-phased method/00. Output/CloudStack/CloudStack_DE - v.01/performances/KB-performance-v2.1-imbalance.json
----------
keyword_based model
gdrive/MyDrive/Colab Notebooks/paper/01- Jupyter Notebook/one-phased method/00. Output/CloudStack/CloudStack_DE - v.01/models/KB-model-v2.1-imbalance.pth
----------


In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook --
# presumably a palette for the class-distribution bar charts; confirm or remove.
bcd_colours = ["blue", "green", "red"]

# my_whole_datas

In [None]:
# Load the performance JSON of every project so the runs can be compared on
# shared axes. Entries are keyed by the lower-cased project name ("camel",
# "cloudstack", ...), which is how the later cells and `colours` address them.
#
# The earlier version iterated `mypaths.items()`; its values are nested dicts,
# so `open(v)` raised a TypeError and nothing was ever loaded.
_perf_group = "keyword_based" if preprocessing_params["keyword_Based"] else "not_keyword_based"

# Performance-file path relative to a project's output folder: take the
# configured template, strip the folder of the currently selected project and
# fill in the version placeholders.
_perf_relpath = (
    mypaths["output"][_perf_group]["performance"][len(output_data_folder):]
    .format(output_version, file_subversion)
)

my_whole_datas = {}
for _project_name, _file_name in dataset_file_names.items():
    _result_path = project_folder + output_folder + _project_name + "/" + _file_name + "/" + _perf_relpath
    if not os.path.isfile(_result_path):
        # Same best-effort policy as flow_plot_on_data(): report and carry on.
        print("There is no such file:", _result_path)
        continue
    with open(_result_path) as filehandle:
        my_whole_datas[_project_name.lower()] = json.load(filehandle)

TypeError: ignored

In [None]:
# Camel's result file is used as the reference for the values shared by the
# comparison plots.
# NOTE(review): assumes every project was trained with the same classes and
# number of epochs -- confirm.
_reference_run = my_whole_datas["camel"]
class_labels = _reference_run["preprocessing_params"]["bug_classes"]
epochs = _reference_run["train_params"]["epochs"]

In [None]:
# Show the reference settings, one per line.
print(class_labels, epochs, sep="\n")

In [None]:
# Per project: the best validation step as stored in the file (used as the
# x position of the marker), the same step minus one (used as list index),
# and the "model_results" section of the result file.
best_step_labels, best_step_indexs, datas = {}, {}, {}

for project_key, run in my_whole_datas.items():
    model_results = run["model_results"]
    best_at_step = model_results["validation"]["best_at_step"]

    best_step_labels[project_key] = best_at_step
    best_step_indexs[project_key] = best_at_step - 1
    datas[project_key] = model_results

In [None]:
# Show the best step of every project: stored value first, list index second.
print(best_step_labels, best_step_indexs, sep="\n")

In [None]:
# Styling of the per-epoch curves in the cross-project comparison plots.
ls, lw, marker = "solid", 1, "."

# Styling of the scatter markers that highlight the best validation step.
lw_test, marker_test = 5, "o"

# Curve colour of each project, keyed like the entries of my_whole_datas.
colours = dict(
    camel="red",
    cloudstack="blue",
    geode="green",
    hbase="orange",
)

In [None]:
def plot_acc_cost_():
    """Compare the validation accuracy and cost curves of all loaded projects.

    Draws two stacked axes, one for "accuracy" and one for "cost". On each axis
    the best validation step of every project is marked in cyan, then the
    per-epoch validation curve of every project is drawn in that project's
    colour.

    Reads the notebook-level names ``epochs``, ``my_whole_datas``, ``datas``,
    ``best_step_labels``, ``best_step_indexs``, ``colours`` and the line-style
    constants. Returns nothing.
    """
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))
    epoch_numbers = range(1, epochs + 1, 1)

    for ax, metric in zip(axes, ("accuracy", "cost")):
        ax.get_xaxis().set_major_locator(MaxNLocator(integer=True))
        on_accuracy_axis = metric == "accuracy"

        # Marker at the best validation step of every project.
        for project_key in my_whole_datas:
            validation = datas[project_key]["validation"]
            if on_accuracy_axis:
                ax.set_ylim([0, 100])
                best_value = validation["best_acc"]
            else:
                ax.set_ylim([0, 300])
                best_value = validation["cost"][best_step_indexs[project_key]]
            ax.scatter(best_step_labels[project_key], best_value,
                       label="best on Validation Dataset", color="cyan",
                       lw=lw_test, marker=marker_test)

        # Per-epoch validation curve of every project; `metric` names the
        # series to take from the dict passed as `data`.
        for project_key in my_whole_datas:
            ax.plot(epoch_numbers, metric, data=datas[project_key]["validation"],
                    label="Validation Dataset - {}".format(project_key),
                    color=colours[project_key], linestyle=ls, lw=lw, marker=marker)

        ax.set_xlim([0, 30])
        ax.set_xlabel("epoch")
        ax.set_ylabel(metric)
        ax.legend()
        ax.grid(True)

        # one tick per epoch
        ax.get_xaxis().set_ticks(list(range(1, epochs + 1)))

    fig.tight_layout(pad=3.0)

In [None]:
# Draw the cross-project accuracy/cost comparison; the function reads the
# notebook-level tables prepared in the cells above.
plot_acc_cost_()

# Plot results

In [None]:
class PlotResults():
    """Plots the training/validation results stored in one performance JSON file.

    Usage: construct with a label for the training data, call
    ``load_results(path)``, then any of the ``plot_*`` methods.
    """

    def __init__(self, train_data_status):
        """Set the plot styling and declare the result attributes.

        Args:
            train_data_status: Label of the training data. It is only inserted
                into legends, titles and printed headers.
        """
        self.ls = "solid"
        self.lw = 1
        self.lw_test = 5
        self.marker = "."
        self.marker_test = "o"
        self.train_data_status = train_data_status

        # Filled in by load_results(). Declared here so that every attribute
        # exists on a fresh instance.
        self.class_labels = None
        self.class_distributions = None
        self.data = None
        self.epochs = None
        self.best_step_index = None
        self.best_step_label = None


    def _require_loaded(self):
        """Raise a clear error when results are requested before load_results()."""
        if self.data is None:
            raise RuntimeError("No results loaded; call load_results() first.")


    def load_results(self, result_path):
        """Read a performance JSON file and keep the parts used for plotting.

        Args:
            result_path: Path of the JSON file. It must contain the top-level
                keys "preprocessing_params", "train_params", "model_results"
                and "class_distributions".

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if one of the expected keys is missing.
        """
        with open(result_path) as filehandle:
            whole_data = json.load(filehandle)

        best_at_step = whole_data["model_results"]["validation"]["best_at_step"]

        self.class_labels = whole_data["preprocessing_params"]["bug_classes"]
        self.epochs = whole_data["train_params"]["epochs"]
        # The stored step is used as-is for the x position of the marker and
        # shifted by one to index the per-epoch lists.
        self.best_step_label = best_at_step
        self.best_step_index = best_at_step - 1

        self.class_distributions = whole_data["class_distributions"]
        self.data = whole_data["model_results"]


    # ******************************* acc, cost *******************************

    def plot_acc_cost_(self):
        """Plot accuracy and cost per epoch for train and validation data.

        Two stacked axes are drawn ("accuracy", then "cost"); the best
        validation step is marked in cyan. Afterwards the numeric summary of
        other_results() is printed.

        Raises:
            RuntimeError: if load_results() has not been called.
        """
        self._require_loaded()

        fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 7))
        epoch_numbers = range(1, self.epochs + 1, 1)
        validation = self.data["validation"]

        for ax, metric in zip(axes, ["accuracy", "cost"]):
            ax.get_xaxis().set_major_locator(MaxNLocator(integer=True))

            if metric == "accuracy":
                ax.set_ylim([0, 100])
                best_value = validation["best_acc"]
            else:
                ax.set_ylim([0, 300])
                best_value = validation["cost"][self.best_step_index]
            ax.scatter(self.best_step_label, best_value,
                       label="best on Validation Dataset", color="cyan",
                       lw=self.lw_test, marker=self.marker_test)

            # `metric` names the series to take from the dict passed as `data`.
            ax.plot(epoch_numbers, metric, data=self.data["train"],
                    label="Train-{} Dataset".format(self.train_data_status),
                    color="red", linestyle=self.ls, lw=self.lw, marker=self.marker)

            ax.plot(epoch_numbers, metric, data=validation, label="Validation Dataset",
                    color="blue", linestyle=self.ls, lw=self.lw, marker=self.marker)

            ax.set_xlabel("epoch")
            ax.set_ylabel(metric)
            ax.legend()
            ax.grid(True)

            # one tick per epoch
            ax.get_xaxis().set_ticks(list(range(1, self.epochs + 1)))

        fig.tight_layout(pad=3.0)
        self.other_results()


    def other_results(self):
        """Print train/validation accuracy and validation cost at the best step.

        Raises:
            RuntimeError: if load_results() has not been called.
        """
        self._require_loaded()

        print("** {} **".format(self.train_data_status))
        titles = ["validation_best_acc", "validation_best_acc_at_step", "epochs", "train accuracy", "cost"]
        # pad the titles so that the printed values line up in one column
        titles = self.set_strings_to_equal_len_(titles)

        # accuracy
        train_acc = round(self.data["train"]["accuracy"][self.best_step_index], 1)
        print("{}: {:2.1f}".format(titles[3], train_acc))

        validation_acc = round(self.data["validation"]["best_acc"], 1)
        print("{}: {:2.1f}".format(titles[0], validation_acc))
        print("{}: {}".format(titles[1], self.data["validation"]["best_at_step"]))
        print("{}: {}".format(titles[2], self.epochs))

        # cost
        print()
        print("** {} **".format(self.train_data_status))
        print("{}: {:2.4f}".format(titles[4], self.data["validation"]["cost"][self.best_step_index]))


    def set_strings_to_equal_len_(self, words):
        """Right-pad every string with spaces to the length of the longest one.

        Args:
            words: Iterable of strings.

        Returns:
            A new list with the padded strings, in the original order; an
            empty list for empty input.
        """
        words = list(words)
        width = max(map(len, words), default=0)
        return [word.ljust(width) for word in words]


    # ******************************* confusion matrix *******************************

    def plot_confusion_matrices(self):
        """Plot the best-step confusion matrices of validation and train data.

        Each matrix is divided by its total count and drawn as a heat map;
        precision, recall and F1 score of each data split are printed as well.

        Raises:
            RuntimeError: if load_results() has not been called.
        """
        self._require_loaded()

        fig_cf, axes_cd = plt.subplots(nrows=1, ncols=2, figsize=(7, 4))
        fig_cf.suptitle("Seaborn Confusion Matrix with labels")
        fig_cf.supxlabel("--Predicted-- Bug Report Category")
        fig_cf.supylabel("--Actual-- Bug Report Category")

        tick_labels = list(map(str, self.class_labels))

        for cfax, datasetName in zip(axes_cd, ["validation", "train"]):
            is_train = datasetName == "train"

            cf_matrix = np.array(self.data[datasetName]["confusion_matrix"][self.best_step_index])

            # NOTE(review): CalculateMetrics is not defined in the visible part
            # of the notebook; it must be defined before this method is called.
            calcmet = CalculateMetrics(cf_matrix)
            precision = [round(elem, 2) for elem in calcmet.calc_precision()]
            recall = [round(elem, 2) for elem in calcmet.calc_recall()]
            f1_score = [round(elem, 2) for elem in calcmet.calc_f1_score()]

            print("-" * 15, datasetName)
            print("precision: ", precision)
            print("recall:    ", recall)
            print("f1_score:  ", f1_score)

            # The colour bar is drawn once, next to the train heat map.
            snax = sns.heatmap(cf_matrix / np.sum(cf_matrix),
                               annot=True, fmt=".2%", cmap="Blues", ax=cfax, vmin=0, vmax=1,
                               cbar=is_train, annot_kws={"size": 12})

            if is_train:
                snax.set_title("[{}-{}]\n".format(datasetName, self.train_data_status))
            else:
                snax.set_title("[{}]\n".format(datasetName))

            snax.xaxis.set_ticklabels(tick_labels)
            snax.yaxis.set_ticklabels(tick_labels)

        fig_cf.tight_layout(w_pad=6.0)


    # ******************************* class distribution *******************************

    def plot_class_distribution(self):
        """Print and plot the class distribution of every stored data split.

        Raises:
            RuntimeError: if load_results() has not been called.
        """
        self._require_loaded()

        # one entry per key of the "class_distributions" section of the result file
        for dataName, v in self.class_distributions.items():
            print("-" * 15, dataName)
            # NOTE(review): ClassDistribution is not defined in the visible part
            # of the notebook; it must be defined before this method is called.
            tempCDist = ClassDistribution(v)
            tempCDist.calc_ratios()
            print("ratios     : ", tempCDist.ratios)
            tempCDist.calc_percentage()
            print("percentages: ", tempCDist.percentage)
            tempCDist.plot_data(dataName)
            print("\n")

## flow_plot_on_data

In [None]:
def flow_plot_on_data(file_subversion, train_data_status, mypath):
    """Load one result file and draw its accuracy/cost and confusion-matrix plots.

    Args:
        file_subversion: Value substituted into the placeholder of `mypath`.
        train_data_status: Label handed to PlotResults (shown in legends and titles).
        mypath: Path template of the result file, filled via
            ``mypath.format(file_subversion)``.

    Returns:
        The PlotResults object holding the loaded results, or None when the
        resolved path is not an existing file (a message is printed instead).
    """
    result_path = mypath.format(file_subversion)
    print(result_path)

    # Guard clause: nothing to plot without a result file.
    if not os.path.isfile(result_path):
        print("There is no such file.")
        return None

    plotter = PlotResults(train_data_status)
    plotter.load_results(result_path)

    plotter.plot_acc_cost_()
    plotter.plot_confusion_matrices()
    return plotter

In [None]:
# Pick the performance-file template that matches the feature mode of this run.
# The former `on_google_colab` switch was never defined (NameError), both of its
# branches were identical, and the "tfidf"/"w2v" + "output_performance" keys do
# not exist in `mypaths`; the templates live under mypaths["output"][<mode>].
if preprocessing_params["keyword_Based"]:
    temp = "keyword_based"
else:
    temp = "not_keyword_based"

# The template has two "{}" slots (output version, file sub-version). The
# version is filled in here and one slot is kept open, because
# flow_plot_on_data() fills exactly one placeholder via .format(file_subversion).
myp = mypaths["output"][temp]["performance"].format(output_version, "{}")

## on imbalance data

In [None]:
# `mypaths` has no "file_subversion" entry; ".1-imbalance" is the suffix that
# the configuration cell assigns to result files of imbalanced training data.
plr1 = flow_plot_on_data(".1-imbalance", "imbalance", myp)

## on balanced data

In [None]:
# `mypaths` has no "file_subversion" entry; ".2-balanced" is the suffix that
# the configuration cell assigns to result files of balanced training data.
plr2 = flow_plot_on_data(".2-balanced", "balanced", myp)

## class_distribution

In [None]:
# flow_plot_on_data() returns None when the result file does not exist, so
# guard the call instead of failing with an AttributeError.
if plr1 is not None:
    plr1.plot_class_distribution()
else:
    print("No results were loaded for the imbalance run; nothing to plot.")