# Set Variables

In [None]:
# Version tag that ends up in the name of the saved results file.
output_version = 1

# Sampling distribution of the random baseline: classes[i] is drawn with
# probability probabilities[i].
class_probability = dict(
    classes=[0, 1],
    probabilities=[0.67, 0.33],
)

# -------- dataset
# Project to evaluate; one of "Camel", "CloudStack", "Geode", "Hbase".
software_name = "Hbase"

In [None]:
# Base name (the ".csv" extension is appended later) of each project's dataset.
dataset_file_names = dict(
    [
        ("Camel", "Camel_DE - v.02"),
        ("CloudStack", "CloudStack_DE - v.01"),
        ("Geode", "Geode_DE - v.01"),
        ("Hbase", "Hbase_DE - v.01"),
    ]
)

# Base name of the dataset selected through `software_name`.
dataset_file_name = dataset_file_names[software_name]

# Google Colab

In [None]:
# Libs
!pip install --upgrade matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so the dataset can be read from it and results written
# back to it (only works inside the Colab runtime).
from google.colab import drive
# force_remount=True requests a fresh mount even if one already exists
drive.mount("/content/gdrive", force_remount=True)
# list the top-level Drive folder as a quick check that the mount succeeded
!ls "/content/gdrive/My Drive/"

Mounted at /content/gdrive
'Colab Notebooks'   SAVE


In [None]:
# project folder path (root of the project inside the mounted drive)
# NOTE(review): this is a relative path, while the drive is mounted at
# "/content/gdrive" — presumably relies on the working directory being
# /content; confirm.
project_folder = "gdrive/MyDrive/Colab Notebooks/paper/"

# data folder path (appended to project_folder)
data_folder =    "00- My Data/one-phase method/"

# output folder path (appended to project_folder)
output_folder =  "01- Jupyter Notebook/01- Random Labeling of bugs/00. Output/"

In [None]:
# Each project keeps its files in a sub-folder named after the project.
dataset_folder = f"{software_name}/"

# Folder that receives the results produced for the selected dataset.
output_data_folder = "".join(
    (project_folder, output_folder, dataset_folder, dataset_file_name, "/")
)

# Libs

In [None]:
import json
import os.path
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from operator import truediv

from torchvision import transforms

# Config

In [None]:
# Input and output locations, assembled from the folder settings.
mypaths = {
    "data": {
        # CSV file holding the selected dataset
        "dataset": "".join(
            (project_folder, data_folder, dataset_folder, dataset_file_name, ".csv")
        ),
    },
    "output": {
        # JSON file the results are written to; the name carries the version tag
        "performance": "".join(
            (output_data_folder, "random-performance-v{}.json".format(output_version))
        ),
    },
}

# How the CSV is parsed and which labels are considered valid.
preprocessing_params = {
    "data": {
        "dataset": {
            # names given to the CSV columns, in file order
            "columns_name": ["text", "bug_class_2"],
            # dtype per column, keyed by column position
            "columns_dtype": {0: "str", 1: "int64"},
            # label values that are kept during cleaning
            "bug_classes": [0, 1],
            # size of the confusion matrix
            "num_bug_classes": 2,
        },
    },
}

In [None]:
# Colour names, presumably a plot palette; nothing in the visible part of the
# notebook reads this list.
bcd_colours = ["blue", "green", "red"]

# Read File

In [None]:
# Load the raw dataset. The file is read without a header row, so the column
# names and dtypes are taken from the preprocessing configuration.
df_main = pd.read_csv(
    mypaths["data"]["dataset"],
    header=None,
    skip_blank_lines=True,
    dtype=preprocessing_params["data"]["dataset"]["columns_dtype"],
    names=preprocessing_params["data"]["dataset"]["columns_name"],
)

# Compose

In [None]:
class Rows(object):
    """Row-level cleaning step for the raw data frame.

    Calling an instance returns a new DataFrame in which

    1. blank cells (empty or whitespace-only strings) of the configured
       columns are replaced by None,
    2. rows with a missing value in any configured column are removed,
    3. rows whose label is not one of ``bug_classes`` are removed.

    The frame passed in is left unmodified.

    Args:
        columns_name: names of the columns that must hold a value.
        bug_classes: label values to keep.
        label_column: name of the column that holds the label. Defaults to
            "bug_class_2", so existing two-argument calls behave as before.
    """

    def __init__(self, columns_name, bug_classes, label_column="bug_class_2"):
        self.columns_name = columns_name
        self.bug_classes = bug_classes
        self.label_column = label_column

    def __call__(self, df):
        """Return the cleaned copy of ``df`` (see the class docstring)."""
        # 1. Set cells to None that are blank
        df = df.apply(self.white_spaces_to_None_, axis=1)

        # 2. Delete rows that have a missing value in ANY of the checked columns
        df = df.dropna(axis=0, how="any", subset=self.columns_name)

        # 3. Delete rows whose label is not one of the accepted classes
        df = df[df[self.label_column].isin(self.bug_classes)]

        return df

    def white_spaces_to_None_(self, row):
        """Return a copy of ``row`` whose blank string cells are set to None.

        Only ``str`` values are inspected, so missing values (None / NaN) and
        numeric labels pass through untouched.
        """
        # pandas does not support mutating the object handed to apply(), so
        # work on a copy of the row.
        row = row.copy()
        for name in self.columns_name:
            value = row[name]
            if isinstance(value, str) and not value.strip():
                row[name] = None
        return row

## Build and apply the preprocessing pipeline

In [None]:
# Wrap the row cleaner in a Compose pipeline, then run it on the loaded frame.
composed_pre = transforms.Compose(
    [
        Rows(
            columns_name=preprocessing_params["data"]["dataset"]["columns_name"],
            bug_classes=preprocessing_params["data"]["dataset"]["bug_classes"],
        )
    ]
)

df_main = composed_pre(df_main)

In [None]:
# Plain Python lists of the cleaned texts and their labels, in row order.
texts, labels = (
    df_main[column].tolist() for column in ("text", "bug_class_2")
)

# predict

In [None]:
# Seed the generator so the random baseline produces the same predictions
# (and therefore the same metrics) on every run; change the seed to draw a
# different sample.
random_seed = 42
rng = np.random.default_rng(random_seed)

# Draw one label per sample according to the configured class probabilities.
predicted = rng.choice(
    class_probability["classes"],
    size=len(labels),
    p=class_probability["probabilities"],
)

In [None]:
# Peek at the sampled labels; NumPy abbreviates the repr of a long array.
predicted

array([1, 0, 0, ..., 0, 0, 0])

# MyConfusionMatrix

In [None]:
class MyConfusionMatrix():
    """Accumulates a square confusion matrix of integer counts.

    Rows are indexed by the actual class, columns by the predicted class.

    Args:
        num_classes: number of classes, i.e. the side length of the matrix.
    """

    def __init__(self, num_classes):
        # rows: actual, columns: prediction
        self.confusion_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)

    def update(self, y, yhat_indices):
        """Count one (actual, predicted) pair per element of the two sequences.

        Args:
            y: actual class indices.
            yhat_indices: predicted class indices, aligned with ``y``.
        """
        for actual, pred in zip(y, yhat_indices):
            self.confusion_matrix[actual, pred] += 1

    def calc_accuracy(self):
        """Return the overall and the per-class accuracy as percentages.

        Returns:
            tuple: ``(accuracy, accuracy_per_class)`` where ``accuracy`` is a
            float and ``accuracy_per_class`` a list with one float per class.
            An empty matrix, or a class without any sample, yields 0.0
            instead of NaN.
        """
        diagon = self.confusion_matrix.diagonal()

        # accuracy
        total_samples = self.confusion_matrix.sum()
        total_corrects = diagon.sum()
        if total_samples:
            accuracy = 100 * float(total_corrects / total_samples)
        else:
            accuracy = 0.0

        # accuracy per class
        # sum(1): 1 refers to the sum of each row (samples of each actual class)
        samples_per_class = self.confusion_matrix.sum(1)
        # divide only where the class has samples; the other entries keep 0.0
        ratio_per_class = np.divide(
            diagon,
            samples_per_class,
            out=np.zeros(len(diagon), dtype=np.float64),
            where=samples_per_class != 0,
        )
        accuracy_per_class = 100 * ratio_per_class

        return accuracy, accuracy_per_class.tolist()

    def get_cf(self):
        """Return the matrix as a nested list (JSON-serialisable)."""
        return self.confusion_matrix.tolist()

## Build the confusion matrix

In [None]:
# Tally every (actual, predicted) pair into a fresh matrix.
confusion_matrix = MyConfusionMatrix(
    num_classes=preprocessing_params["data"]["dataset"]["num_bug_classes"]
)
confusion_matrix.update(y=labels, yhat_indices=predicted)

# Save

In [None]:
def save_to_file_results(dataset_name, preprocessing_params, result_path,
                         class_probs=None, cf_matrix=None):
    """Write the run configuration and the confusion matrix to a JSON file.

    Args:
        dataset_name: identifier of the dataset, stored under "dataset".
        preprocessing_params: preprocessing configuration, stored as is.
        result_path: path of the JSON file to write; missing parent folders
            are created.
        class_probs: class-probability configuration to store. Defaults to
            the notebook-level ``class_probability``.
        cf_matrix: confusion matrix as a nested list. Defaults to
            ``confusion_matrix.get_cf()`` of the notebook-level object.
    """
    # Fall back to the notebook-level objects when nothing is passed explicitly
    if class_probs is None:
        class_probs = class_probability
    if cf_matrix is None:
        cf_matrix = confusion_matrix.get_cf()

    tempStructure = {
        "dataset": dataset_name,
        "preprocessing_params": preprocessing_params,
        "class_probability": class_probs,
        "model_results": {
            "confusion_matrix": cf_matrix
        }
    }

    # open(..., "w") does not create folders, so make sure the target exists
    result_dir = os.path.dirname(result_path)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    with open(result_path, "w") as fout:
        json.dump(tempStructure, fout)

In [None]:
# Persist the configuration and the confusion matrix of this run.
save_to_file_results(
    dataset_name=mypaths["data"]["dataset"],
    preprocessing_params=preprocessing_params,
    result_path=mypaths["output"]["performance"],
)

# CalculateMetrics

In [None]:
class CalculateMetrics():
    """Per-class precision, recall and F1 plus overall accuracy.

    All values are fractions in [0, 1]. A ratio whose denominator is zero is
    reported as 0.0 instead of NaN.

    Args:
        cm: square confusion matrix (rows: actual class, columns: predicted
            class) as a numpy array.
    """

    def __init__(self, cm):
        self.cm = cm # it is a numpy object
        self.true_positives = np.diag(cm)

    @staticmethod
    def _safe_divide(numerator, denominator):
        """Element-wise division that yields 0.0 wherever the denominator is 0."""
        numerator = np.asarray(numerator, dtype=np.float64)
        denominator = np.asarray(denominator, dtype=np.float64)
        quotient = np.zeros_like(denominator)
        np.divide(numerator, denominator, out=quotient, where=denominator != 0)
        return quotient

    # calculate precision for each class
    def calc_precision(self):
        """Return the per-class precision as a list and store it in ``self.precision``."""
        # column sums: how often each class was predicted
        columns_sum = np.sum(self.cm, axis=0)
        prec = self._safe_divide(self.true_positives, columns_sum).tolist()
        self.precision = prec
        return prec

    # calculate recall for each class
    # recall = accuracy per class
    # how accurately each class is predicted
    def calc_recall(self):
        """Return the per-class recall as a list and store it in ``self.recall``."""
        # row sums: how many samples each actual class has
        rows_sum = np.sum(self.cm, axis=1)
        rec = self._safe_divide(self.true_positives, rows_sum).tolist()
        self.recall = rec
        return rec

    # calculate f1_score for each class
    def calc_f1_score(self):
        """Return the per-class F1 score as a numpy array (also ``self.f1_score``).

        Precision and recall are recomputed here, so this method can be called
        without calling ``calc_precision`` / ``calc_recall`` first.
        """
        tempPrec = np.array(self.calc_precision())
        tempRec = np.array(self.calc_recall())
        numerator = tempPrec * tempRec
        denominator = tempPrec + tempRec
        f1s = 2 * self._safe_divide(numerator, denominator)
        self.f1_score = f1s
        return f1s

    def calc_accuracy(self):
        """Return the overall accuracy (0.0 for an empty matrix)."""
        total_samples = np.sum(self.cm)
        if total_samples == 0:
            return 0.0
        sum_true_positives = np.sum(self.true_positives)
        acc = float(sum_true_positives / total_samples)
        return acc

## Compute the metrics

In [None]:
# Metrics are derived from the accumulated confusion matrix and rounded to
# whole percentages.
cf_matrix = np.array(confusion_matrix.get_cf())
calcmet = CalculateMetrics(cf_matrix)

# precision and recall are computed before the F1 score
precision = [round(100 * value) for value in calcmet.calc_precision()]
recall = [round(100 * value) for value in calcmet.calc_recall()]
f1_score = [round(100 * value) for value in calcmet.calc_f1_score()]

acc = round(100 * calcmet.calc_accuracy())

In [None]:
# One line per metric, preceded by a separator line.
print(
    "-" * 15,
    *(
        "{} {}".format(label, value)
        for label, value in (
            ("accuracy  :", acc),
            ("precision :", precision),
            ("recall    :", recall),
            ("f1_score  :", f1_score),
        )
    ),
    sep="\n",
)

---------------
accuracy  : 55
precision : [66, 32]
recall    : [66, 32]
f1_score  : [66, 32]


# results

In [None]:
# should have equal values
print("len(texts)     : ", len(texts))
print("len(labels)    : ", len(labels))
print("len(predicted) : ", len(predicted))

len(texts)     :  9201
len(labels)    :  9201
len(predicted) :  9201
