# Set Variables

In [None]:
# Version number embedded in the names of the output files.
output_version = 1

# -------- dataset
# Project to analyse; one of "Camel", "CloudStack", "Geode", "Hbase".
software_name = "Hbase"

# Upper bound on tokens per document; longer documents are dropped during tokenization.
token_threshold = 20000


# -------- my_keyword_Based & my_docMaxLen
# Set to False to embed every token instead of only the leading TF-IDF keywords.
my_keyword_Based = True

# Number of keywords kept per document; unused (None) when not keyword based.
if my_keyword_Based:
    my_docMaxLen = 100
else:
    my_docMaxLen = None

In [None]:
# Maps each supported project name to the base name (without extension) of its dataset file.
dataset_file_names = {
    "Camel":      "Camel_DE - v.02",
    "CloudStack": "CloudStack_DE - v.01",
    "Geode":      "Geode_DE - v.01",
    "Hbase":      "Hbase_DE - v.01"
}

# Base file name for the selected project; an unknown `software_name` raises KeyError here.
dataset_file_name = dataset_file_names[software_name]

# Google Colab

In [None]:
# Libs
!pip install enlighten
!pip install --upgrade matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# load data from google drive
# Colab-only: mounts Google Drive under /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
# Sanity check: list the Drive root to confirm the mount is visible.
!ls "/content/gdrive/My Drive/"

Mounted at /content/gdrive
'Colab Notebooks'   SAVE


In [None]:
# project folder path
# NOTE(review): this is a relative path, so it only resolves when the working
# directory is the parent of the "gdrive" mount point — confirm for non-default setups.
project_folder = "gdrive/MyDrive/Colab Notebooks/paper/"

# data folder path (relative to project_folder)
data_subfolder_1 =    "00- My Data/one-phase method/"

# output folder path (relative to project_folder)
output_subfolder_1 =  "01- Jupyter Notebook/10- Naive Bayes - MultinomialNB/00. Output/"

In [None]:
# dataset folder path
# Per-project and per-dataset sub-folders; each keeps its trailing "/" because the
# paths below are built by plain string concatenation.
subfolder_2 = software_name + "/"
subfolder_3 = dataset_file_name + "/"

# output data-folder path: <project>/<output>/<software>/<dataset>/
output_folder = project_folder + output_subfolder_1 + subfolder_2 + subfolder_3

# Folder holding the selected project's dataset, and the folder holding the
# word2vec binary (one level above the per-project folders).
data_folder_dataset = project_folder + data_subfolder_1 + subfolder_2
data_folder_w2v     = project_folder + data_subfolder_1

# Libs

In [None]:
import string
import re
import json
import os.path
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import enlighten

from collections import Counter, OrderedDict
from operator import truediv

from torchvision import transforms
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.text import TextCollection
from matplotlib.ticker import MaxNLocator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fetch the NLTK stop-word corpus that Preprocessing.tokenize reads via stopwords.words("english").
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Config

In [None]:
# All input and output file locations used by the notebook.
# Note that "output" is nested one level deeper than "data": the performance file
# is selected by mode ("keyword_based" / "not_keyword_based") and then "performance".
mypaths = {
    "data": {
        "dataset":            data_folder_dataset + dataset_file_name + ".csv",
        "tfidf_word_weights": data_folder_dataset + dataset_file_name + " _ tfidf-word-weights-v01.json",
        "w2v_word_vectors":   data_folder_w2v     + "w2vGoogle.bin"
    },
    "output": {
        "not_keyword_based": {
            "performance": output_folder + "MultiNB-w2v-NKB-performance-v{}.json".format(output_version)
        },
        "keyword_based": {
            "performance": output_folder + "MultiNB-w2v-KB-performance-v{}.json".format(output_version)
        }
    }
}

# Settings for reading and cleaning the dataset, plus the vectorization mode.
# "columns_name" / "columns_dtype" are passed to pd.read_csv, "bug_classes" lists the
# label values kept by Rows, and "num_bug_classes" sizes the confusion matrix.
preprocessing_params = {
    "data":{
        "dataset": {
            "columns_name":   ["text", "bug_class_2"],
            "columns_dtype" : {0: "str", 1: "int64"},
            "bug_classes": [0, 1],
            "num_bug_classes": 2
        }
    },
    "keyword_Based": my_keyword_Based,
    "docMaxLen": my_docMaxLen
}

In [None]:
# Colour names; not referenced by any other cell visible in this notebook
# (presumably intended for ProgressLines.progress_lines — TODO confirm).
bcd_colours = ["blue", "green", "red"]

# Read File

In [None]:
# Load the header-less CSV; column names and dtypes come from preprocessing_params.
df_main = pd.read_csv(
    mypaths["data"]["dataset"], 
    names=preprocessing_params["data"]["dataset"]["columns_name"], 
    dtype=preprocessing_params["data"]["dataset"]["columns_dtype"],
    header=None, 
    skip_blank_lines=True
)

# Compose

In [None]:
class Rows(object):
    """Row-level cleaning step for the raw dataset frame.

    Calling an instance with a DataFrame returns a cleaned copy in which

    1. blank cells (empty or whitespace-only) in the configured columns are
       replaced by None,
    2. rows with a missing value in any configured column are removed, and
    3. rows whose label is not one of the allowed classes are removed.

    The input frame is never modified and the original index labels are kept.

    Parameters
    ----------
    columns_name : list of str
        Columns that must be present and non-blank in every kept row.
    bug_classes : list
        Label values that are kept.
    label_column : str, optional
        Name of the column holding the label. Defaults to "bug_class_2".
    """

    def __init__(self, columns_name, bug_classes, label_column="bug_class_2"):
        self.columns_name = columns_name
        self.bug_classes = bug_classes
        self.label_column = label_column


    def __call__(self, df):
        """Return a cleaned copy of ``df`` (see the class docstring for the steps)."""
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()

        # 1. Set cells to None that are empty or have just white spaces
        for column in self.columns_name:
            blank = df[column].map(self.is_blank_).astype(bool)
            # Only assign when needed so untouched columns keep their dtype.
            if blank.any():
                df.loc[blank, column] = None

        # 2. Delete rows that have a NaN value in any of the configured columns
        df = df.dropna(axis=0, how="any", subset=self.columns_name)

        # 3. Delete rows with a class value other than the allowed ones.
        #    A boolean mask (rather than dropping by index label) stays correct
        #    even when index labels are duplicated.
        df = df[df[self.label_column].isin(self.bug_classes)]

        return df


    @staticmethod
    def is_blank_(value):
        """Return True when the text form of ``value`` is empty or only white space."""
        return len(str(value).strip()) == 0


    # set columns that are empty or just have white spaces to None
    def white_spaces_to_None_(self, row):
        """Row-wise variant of the blank-cell replacement; mutates and returns ``row``."""
        for i in self.columns_name:
            if self.is_blank_(row[i]):
                row[i] = None
        return row

## obj

In [None]:
# Wrap the row-cleaning step in a torchvision Compose pipeline (a single step here).
composed_pre = transforms.Compose([
    Rows(
        preprocessing_params["data"]["dataset"]["columns_name"], 
        preprocessing_params["data"]["dataset"]["bug_classes"]
    )
])

# NOTE: rebinds df_main to the cleaned frame, so the raw frame is no longer
# reachable under this name after the cell has run.
df_main = composed_pre(df_main)

In [None]:
# Parallel lists: texts[i] is the document and labels[i] its class.
# NOTE: Preprocessing.tokenize deletes entries from `labels` for documents it drops.
texts = df_main["text"].tolist()
labels = df_main["bug_class_2"].tolist()

# IV. ProgressLines

In [None]:
class ProgressLines():
    """Creates a group of enlighten progress counters with aligned descriptions."""

    def progress_lines(self, num, total, description, unit, colour):
        """Create ``num`` counters and store them in ``self.progresses``.

        ``total``, ``description``, ``unit`` and ``colour`` are indexed by counter
        position (0 .. num-1). Descriptions are right-padded to a common width.
        """
        padded = self.set_strings_to_equal_len_(description)
        manager = enlighten.get_manager()
        counters = []
        for idx in range(num):
            counter = manager.counter(
                total=total[idx],
                desc=padded[idx],
                unit=unit[idx],
                color=colour[idx],
            )
            counter.refresh()
            counters.append(counter)
        self.progresses = counters


    def set_strings_to_equal_len_(self, description):
        """Return the strings right-padded with spaces to the length of the longest one."""
        width = max((len(text) for text in description), default=-1)
        return [text + " " * (width - len(text)) for text in description]

# I. Preprocessing

In [None]:
class Preprocessing():
    """Tokenize documents and turn them into sequences of word2vec vectors.

    Typical call order: load_tfidf -> tokenize -> loadW2V -> vectorize_w2V
    (-> padding). All state lives on the instance, so creating a new object
    always starts from empty containers.

    Attributes
    ----------
    docMaxLen : int or None
        Max keywords allowed per document in keyword-based mode (None = all).
    token_threshold : int
        Documents with more tokens than this are dropped by tokenize().
    my_deleted_bug : dict
        {original document index: token count} for every dropped document.
    w2vDic : dict
        {"w1": vector, "w2": vector, ...}; words not found by loadW2V keep
        the zero paddingVector.
    bugRepTokens : list
        [[w1, w2, w3, ...], [w1, w2, ...], ...] tokens of every kept document.
    docMaxTokenNo_org : int
        Longest token count over all documents, dropped ones included.
    docMaxTokenNo_token_threshold : int
        Longest token count over the kept documents.
    docMaxTokenNo : int
        Max document length after vectorization.
    vector_tfidf : list
        Array of dictionaries: [{"w1": 0.1, "w2": 0.3, ...}, {}, ...].
    vector_em : list
        Array of matrices: [[w1Vector, w2Vector], [], ...].
    """

    # Shared all-zero vector: placeholder for words without a word2vec entry and
    # filler used by padding(). It is only ever read, never modified.
    # NOTE(review): the size 300 presumably matches the word2vec model — confirm.
    paddingVector = np.zeros(300, dtype="float32")


    def __init__(self, docMaxLen, token_threshold):
        self.docMaxLen = docMaxLen
        self.token_threshold = token_threshold

        # Mutable state is created per instance. As class attributes these
        # containers would be shared by every instance and would keep growing
        # each time the notebook cell is re-run.
        self.my_deleted_bug = {}
        self.w2vDic = {}
        self.bugRepTokens = []
        self.docMaxTokenNo_org = 0
        self.docMaxTokenNo_token_threshold = 0
        self.docMaxTokenNo = 0
        self.vector_tfidf = []
        self.vector_em = []


    # tfidf of corpuses words
    def load_tfidf(self, tfidf_path):
        """Load the per-document TF-IDF dictionaries from a JSON file into vector_tfidf."""
        with open(tfidf_path, "r") as filehandle:
            self.vector_tfidf = json.load(filehandle)


    def tokenize(self, texts, labels=None):
        """Tokenize ``texts`` and drop documents longer than ``token_threshold``.

        Tokens are lower-cased; punctuation, English stop words, URL markers,
        tokens without any word character and tokens starting with a digit are
        skipped. Every kept token is registered in ``w2vDic`` with the zero
        vector as placeholder.

        Parameters
        ----------
        texts : iterable of str
            The documents, in dataset order.
        labels : list, optional
            Label list parallel to ``texts``; entries of dropped documents are
            deleted from it in place. When omitted, a module-level list named
            ``labels`` is used if one exists (kept for the notebook's existing
            ``ds.tokenize(texts)`` call).

        For every dropped document the matching entry is also removed from
        ``vector_tfidf`` (when it has been loaded), so that ``bugRepTokens``,
        ``vector_tfidf`` and the labels stay aligned.
        """
        if labels is None:
            labels = globals().get("labels")

        stop_words = set(stopwords.words("english"))
        excludedTokens = {"http", "url", "https"}
        tokenizer = WordPunctTokenizer()
        has_word_char = re.compile(r"\w")
        starts_with_digit = re.compile(r"\A[0-9]")
        dropped = [] # original indices of the documents over the threshold

        for i, doc in enumerate(texts):
            thisTokens = []
            for token in tokenizer.tokenize(doc.lower()):
                if (token in string.punctuation or token in stop_words or token in excludedTokens or
                        (not has_word_char.search(token)) or starts_with_digit.match(token)):
                    continue
                thisTokens.append(token)
                self.w2vDic[token] = self.paddingVector
            if len(thisTokens) <= self.token_threshold:
                self.bugRepTokens.append(thisTokens)
                if (len(thisTokens) > self.docMaxTokenNo_token_threshold):
                    self.docMaxTokenNo_token_threshold = len(thisTokens)
            else:
                self.my_deleted_bug[i] = len(thisTokens)
                dropped.append(i)
            if (len(thisTokens) > self.docMaxTokenNo_org):
                self.docMaxTokenNo_org = len(thisTokens)

        # Delete from the back: removing an entry while still iterating would
        # shift every later index and delete the wrong label / tfidf entry.
        for i in reversed(dropped):
            if labels is not None:
                del labels[i]
            if self.vector_tfidf:
                del self.vector_tfidf[i]


    def loadW2V(self, w2vpath):
        """Read vectors for the words in ``w2vDic`` from a binary word2vec file.

        The file starts with a text header "<vocab size> <vector size>",
        followed by one record per word: the word, a space, then the vector as
        raw float32 values. Vectors of words that are not in ``w2vDic`` are
        skipped without being decoded.

        Raises
        ------
        EOFError
            If the file ends before the announced number of words was read.
        """
        print("loadW2V")
        with open(w2vpath, "rb") as f:
            header = f.readline()
            model_vocab_size, model_vector_size = map(int, header.split())
            binary_len = np.dtype("float32").itemsize * model_vector_size

            for line_no in range(model_vocab_size):
                # Read the word byte by byte up to the separating space.
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b" ":
                        break
                    if ch == b"":
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b"\n":
                        word.append(ch)
                word = b"".join(word).decode("utf-8")
                if word in self.w2vDic:
                    self.w2vDic[word] = np.frombuffer(f.read(binary_len), dtype="float32")
                else:
                    # whence=1: skip the vector relative to the current position
                    f.seek(binary_len, 1)


    def vectorize_w2V (self, keywordBased=False):
        """Fill ``vector_em`` with one list of word vectors per document.

        In keyword-based mode only the tokens that appear among the first
        ``docMaxLen`` keys of the document's TF-IDF dictionary are embedded;
        otherwise every token is. ``docMaxTokenNo`` is updated to the longest
        resulting document.
        """
        print("vectorize_w2V")
        if keywordBased:
            print("Keyword Based")
            for doc_tokens, doc_tfidf in zip(self.bugRepTokens, self.vector_tfidf):
                # A set makes the membership test below constant time.
                docKeywords = set(list(doc_tfidf.keys())[:self.docMaxLen])
                docAbs = [t for t in doc_tokens if t in docKeywords] # document abstract
                self._append_doc_vectors(docAbs)
        else:
            print("Not Keyword Based")
            for doc_tokens in self.bugRepTokens:
                self._append_doc_vectors(doc_tokens)


    def _append_doc_vectors(self, terms):
        """Embed ``terms``, append the result to vector_em and track the max length."""
        tempVec = [self.w2vDic[term] for term in terms]
        self.vector_em.append(tempVec)
        if (len(tempVec) > self.docMaxTokenNo):
            self.docMaxTokenNo = len(tempVec)


    def padding(self):
        """Extend every document in vector_em with zero vectors up to docMaxTokenNo."""
        for doc in self.vector_em:
            if (len(doc) < self.docMaxTokenNo):
                doc.extend([self.paddingVector] * (self.docMaxTokenNo - len(doc)))


    def freeMem(self):
        """Release the large containers by rebinding them to empty ones."""
        self.w2vDic = {}
        self.bugRepTokens = []
        self.vector_tfidf = []
        self.vector_em = []

## obj

In [None]:
# Build the preprocessing object and run its stages in order.
ds = Preprocessing(preprocessing_params["docMaxLen"], token_threshold)
ds.load_tfidf(mypaths["data"]["tfidf_word_weights"])
# NOTE: tokenize also prunes the module-level `labels` list for dropped documents.
ds.tokenize(texts)

# --- vectorize: w2v (keywordbased or no)
ds.loadW2V(mypaths["data"]["w2v_word_vectors"])
ds.vectorize_w2V(preprocessing_params["keyword_Based"])

loadW2V
vectorize_w2V
Keyword Based


In [None]:
# Collapse every document into one fixed-size feature vector: the element-wise
# sum of its word vectors.
embedding_dim = ds.paddingVector.shape[0]
our_input = []
for text_w2v in ds.vector_em:
    doc_matrix = np.array(text_w2v)
    if doc_matrix.size == 0:
        # A document without any vectors would sum to a scalar and break the
        # list() call below; represent it by the zero vector instead.
        doc_vector = np.zeros(embedding_dim, dtype="float32")
    else:
        doc_vector = doc_matrix.sum(axis=0)
    our_input.append(list(doc_vector))

# Naive Bayes | linear_kernel

In [None]:
# Hold out 25% of the documents for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(our_input, labels, random_state=0, train_size=0.75)

In [None]:
# clf = MultinomialNB().fit(X_train, y_train)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Rescale the features with MinMaxScaler before fitting MultinomialNB.
# NOTE(review): presumably needed because summed word2vec features can be negative,
# which MultinomialNB does not accept — confirm.
p = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
p.fit(X_train, y_train) 

Pipeline(steps=[('Normalizing', MinMaxScaler()),
                ('MultinomialNB', MultinomialNB())])

In [None]:
# Predict the held-out documents; the cell's value is the fraction predicted correctly.
predicted = p.predict(X_test)
np.mean(predicted == y_test)

0.6718817905258583

In [None]:
# Size check: number of test predictions vs. number of labels in the whole dataset.
print(len(predicted))
print(len(labels))

2301
9201


In [None]:
# Inspect the container type returned by the pipeline's predict().
type(predicted)

numpy.ndarray

# MyConfusionMatrix

In [None]:
class MyConfusionMatrix():
    """Accumulates a square confusion matrix (rows: actual class, columns: predicted class)."""

    def __init__(self, num_classes):
        shape = (num_classes, num_classes)
        self.confusion_matrix = np.zeros(shape, dtype=np.int32)


    def update(self, y, yhat_indices):
        """Count one sample for every (actual, predicted) pair of class indices."""
        for cell in zip(y, yhat_indices):
            # `cell` is the (row, column) position of this sample in the matrix.
            self.confusion_matrix[cell] += 1


    def calc_accuracy(self):
        """Return (overall accuracy, list of per-class accuracies), both in percent."""
        correct_per_class = self.confusion_matrix.diagonal()

        # overall: correctly classified samples over all samples
        sample_count = self.confusion_matrix.sum()
        correct_count = correct_per_class.sum()
        overall = 100 * (correct_count / sample_count)

        # per class: correct samples of a class over all samples of that class
        # (summing along axis 1 gives one total per row, i.e. per actual class)
        row_totals = self.confusion_matrix.sum(1)
        per_class = 100 * (np.divide(correct_per_class, row_totals))

        return overall, per_class.tolist()


    def get_cf(self):
        """Return the matrix as a nested Python list."""
        return self.confusion_matrix.tolist()

## obj

In [None]:
# Build the confusion matrix of the test split (rows: actual, columns: predicted).
confusion_matrix = MyConfusionMatrix(preprocessing_params["data"]["dataset"]["num_bug_classes"])
confusion_matrix.update(y_test, predicted)

# Save

In [None]:
def save_to_file_results(dataset_name, preprocessing_params, result_path, cm=None):
    """Write the run's settings and confusion matrix to a JSON file.

    Parameters
    ----------
    dataset_name : str
        Identifier of the dataset, stored under "dataset".
    preprocessing_params : dict
        JSON-serializable settings, stored under "preprocessing_params".
    result_path : str
        File to write; an existing file is overwritten.
    cm : object with a ``get_cf()`` method, optional
        Confusion matrix to store. When omitted, the module-level
        ``confusion_matrix`` object is used, as before.
    """
    if cm is None:
        cm = confusion_matrix

    results = {
        "dataset": dataset_name,
        "preprocessing_params": preprocessing_params,
        "model_results": {
            "confusion_matrix": cm.get_cf()
        }
    }

    with open(result_path, "w", encoding="utf-8") as fout:
        json.dump(results, fout)

In [None]:
# Disabled: uncomment to persist the results of this run.
# The performance path is nested by mode in mypaths["output"], so the mode key
# has to be selected first (there is no mypaths["output"]["performance"]).
# performance_key = "keyword_based" if preprocessing_params["keyword_Based"] else "not_keyword_based"
# save_to_file_results(
#     mypaths["data"]["dataset"], 
#     preprocessing_params, 
#     mypaths["output"][performance_key]["performance"]
# )

# CalculateMetrics

In [None]:
class CalculateMetrics():
    """Precision, recall, F1 per class and overall accuracy from a confusion matrix.

    The matrix is read with rows as actual classes and columns as predicted
    classes. All results are fractions in [0, 1].

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix.
    zero_division : float, optional
        Value reported when a metric's denominator is zero (for example the
        precision of a class that was never predicted). Defaults to 0.0;
        pass ``float("nan")`` to get nan for undefined metrics.
    """

    def __init__(self, cm, zero_division=0.0):
        self.cm = cm # it is a numpy object
        self.true_positives = np.diag(cm)
        self.zero_division = zero_division


    def _safe_divide(self, numerator, denominator):
        """Element-wise division that yields ``zero_division`` where the denominator is 0."""
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.full(np.broadcast(numerator, denominator).shape, self.zero_division, dtype=float)
        # Positions with a zero denominator are skipped and keep the fill value.
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result


    # calculate precision for each class
    def calc_precision(self):
        """Return the per-class precision (true positives / column sum) as a list."""
        columns_sum = np.sum(self.cm, axis=0)
        prec = self._safe_divide(self.true_positives, columns_sum).tolist()
        self.precision = prec
        return prec


    # calculate recall for each class
    # recall = accuracy per class
    # how accurately each class is predicted
    def calc_recall(self):
        """Return the per-class recall (true positives / row sum) as a list."""
        rows_sum = np.sum(self.cm, axis=1)
        rec = self._safe_divide(self.true_positives, rows_sum).tolist()
        self.recall = rec
        return rec


    # calculate f1_score for each class
    def calc_f1_score(self):
        """Return the per-class F1 score as a numpy array.

        Precision and recall are computed on demand when calc_precision /
        calc_recall have not been called yet.
        """
        precision = getattr(self, "precision", None)
        if precision is None:
            precision = self.calc_precision()
        recall = getattr(self, "recall", None)
        if recall is None:
            recall = self.calc_recall()

        tempPrec = np.array(precision)
        tempRec = np.array(recall)
        f1s = self._safe_divide(2 * (tempPrec * tempRec), tempPrec + tempRec)
        self.f1_score = f1s
        return f1s

    def calc_accuracy(self):
        """Return the overall accuracy: sum of true positives / number of samples."""
        total_samples = np.sum(self.cm)
        sum_true_positives = sum(self.true_positives)
        acc = float(self._safe_divide(sum_true_positives, total_samples))
        return acc

## obj

In [None]:
cf_matrix = confusion_matrix.get_cf()
cf_matrix = np.array(cf_matrix)

calcmet = CalculateMetrics(cf_matrix)

# Metrics are reported as whole percentages. Scale to percent first and round
# afterwards: rounding the fraction and then multiplying by 100 reintroduces
# floating-point noise (e.g. 0.57 * 100 == 56.99999999999999).
precision = calcmet.calc_precision()
precision = [float(round(elem * 100, 0)) for elem in precision]

recall = calcmet.calc_recall()
recall = [float(round(elem * 100, 0)) for elem in recall]

f1_score = calcmet.calc_f1_score()
f1_score = [float(round(elem * 100, 0)) for elem in f1_score]

acc = calcmet.calc_accuracy()
acc = round(acc * 100)

  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Report the metrics in percent; the per-class lists are ordered by class index.
print("-" * 15)
print("accuracy           :", acc)
print("precision          :", precision)
print("recall             :", recall)
print("f1_score           :", f1_score)

---------------
accuracy           : 67
precision          : [67.0, nan]
recall             : [100.0, 0.0]
f1_score           : [80.0, nan]


# results

In [None]:
# Size summary of the preprocessing and prediction stages.
# Only names defined in this notebook are reported; the former lines for
# `tfidf`, `gfno` and `vec` referenced undefined names and raised NameError.
print("len(ds.bugRepTokens)   : ", len(ds.bugRepTokens))
print("ds.docMaxTokenNo_org   : ", ds.docMaxTokenNo_org)
print("len(ds.w2vDic)         : ", len(ds.w2vDic))
print("len(texts)             : ", len(texts))
print("len(predicted)         : ", len(predicted))
print("len(labels)            : ", len(labels))

len(ds.bugRepTokens)   :  9201
ds.docMaxTokenNo_org   :  8463
len(ds.w2vDic)         :  27969


NameError: ignored

In [None]:
# Draws 5 names using the given per-name probabilities.
# NOTE(review): none of the other cells visible here use this result — it looks
# like a leftover scratch cell; it is also unseeded, so its output changes per run.
np.random.choice(
  ['pooh', 'rabbit', 'piglet', 'Christopher'], 
  5,
  p=[0.5, 0.1, 0.1, 0.3]
)