# Set Variables

In [None]:
# Version tag embedded in the file name of the saved performance JSON.
output_version = 1

# -------- dataset
# Project to analyse; must be one of the keys of `dataset_file_names`.
# Un-comment exactly one of the assignments below.
software_name = "Camel"
# software_name = "CloudStack"
# software_name = "Geode"
# software_name = "Hbase"

In [None]:
# Maps each project name to the base name (without extension) of its dataset file.
dataset_file_names = {
    "Camel":      "Camel_DE - v.02",
    "CloudStack": "CloudStack_DE - v.01",
    "Geode":      "Geode_DE - v.01",
    "Hbase":      "Hbase_DE - v.01"
}

# Raises KeyError when `software_name` is not one of the keys above.
dataset_file_name = dataset_file_names[software_name]

# Google Colab

In [None]:
# Libs
!pip install enlighten
!pip install --upgrade matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting enlighten
  Downloading enlighten-1.10.2-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 612 kB/s 
[?25hCollecting blessed>=1.17.7
  Downloading blessed-1.19.1-py2.py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 4.7 MB/s 
[?25hCollecting prefixed>=0.3.2
  Downloading prefixed-0.3.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: prefixed, blessed, enlighten
Successfully installed blessed-1.19.1 enlighten-1.10.2 prefixed-0.3.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting matplotlib
  Downloading matplotlib-3.5.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 7.9 MB/s 
Collecting fonttools>=4.22.0
  Downloading fonttools-4.34.4-py3-none-any.whl (944 kB)
[K     |██████████████████

In [None]:
# load data from google drive
# Mount point is /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
# List the Drive root as a quick visual check of the mount.
!ls "/content/gdrive/My Drive/"

Mounted at /content/gdrive
'Colab Notebooks'   SAVE


In [None]:
# project folder path
# NOTE(review): this is a relative path, so it only points into the Drive
# mounted at "/content/gdrive" while the working directory is "/content" — confirm.
project_folder = "gdrive/MyDrive/Colab Notebooks/paper/"

# data folder path (appended to project_folder)
data_folder =    "00- My Data/one-phase method/"

# output folder path (appended to project_folder)
output_folder =  "01- Jupyter Notebook/30- KNN_Cosine Similarity/00. Output/"

In [None]:
# dataset folder path: one sub-folder per software project
dataset_folder = "".join((software_name, "/"))

# output data-folder path: project / output / software / dataset file name /
output_data_folder = "".join(
    (project_folder, output_folder, dataset_folder, dataset_file_name, "/")
)

# Libs

In [None]:
import string
import re
import json
import os.path
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import enlighten

from collections import Counter, OrderedDict
from operator import truediv

from torchvision import transforms
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.text import TextCollection
from matplotlib.ticker import MaxNLocator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
import nltk
# Fetch the stop-word corpus that Preprocessing.tokenize reads via stopwords.words("english").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Config

In [None]:
# File locations derived from the folder settings defined earlier.
mypaths = {
    "data": {
        # input CSV with the bug reports
        "dataset": project_folder + data_folder + dataset_folder + dataset_file_name + ".csv"
    },
    "output": {
        # JSON file written by save_to_file_results; versioned through output_version
        "performance": output_data_folder + "CS-performance-v{}.json".format(output_version)
    }
}

# Settings used when reading and cleaning the dataset; also saved with the results.
preprocessing_params = {
    "data":{
        "dataset": {
            # names given to the CSV columns (text first, label second)
            "columns_name":   ["text", "bug_class_2"],
            # dtypes keyed by column position
            "columns_dtype" : {0: "str", 1: "int64"},
            # label values kept by the cleaning step
            "bug_classes": [0, 1],
            # size of the confusion matrix
            "num_bug_classes": 2
        }
    }
}

In [None]:
# Colour names; not referenced by any other cell visible in this notebook.
bcd_colours = ["blue", "green", "red"]

# Read File

In [None]:
# Read the CSV as header-less data; column names and dtypes come from preprocessing_params.
df_main = pd.read_csv(
    mypaths["data"]["dataset"], 
    names=preprocessing_params["data"]["dataset"]["columns_name"], 
    dtype=preprocessing_params["data"]["dataset"]["columns_dtype"],
    header=None, 
    skip_blank_lines=True
)

# Compose

In [None]:
class Rows(object):
    """Row-level cleaning step for the bug-report DataFrame.

    Calling an instance returns a cleaned copy of the frame in which a row is
    removed when any of ``columns_name`` is missing or blank, or when its
    label is not one of ``bug_classes``.

    Args:
        columns_name: Names of the columns that must hold a usable value.
        bug_classes: Label values to keep.
        label_column: Column compared against ``bug_classes``. Defaults to
            ``"bug_class_2"``, the column name used by this notebook.
    """

    def __init__(self, columns_name, bug_classes, label_column="bug_class_2"):
        self.columns_name = columns_name
        self.bug_classes = bug_classes
        self.label_column = label_column

    def __call__(self, df):
        """Return a cleaned copy of ``df``; the input frame is not modified."""
        cleaned = df.copy()

        # 1. Turn blank cells into missing values, one column at a time
        #    (much cheaper than a row-wise DataFrame.apply).
        for col in self.columns_name:
            blank = cleaned[col].map(self._is_blank).astype(bool)
            if blank.any():
                # Positional boolean mask: no index alignment involved.
                cleaned.loc[blank.to_numpy(), col] = None

        # 2. Delete rows with a missing value in ANY of the required columns
        cleaned = cleaned.dropna(axis=0, how="any", subset=self.columns_name)

        # 3. Keep only rows whose label is one of the accepted classes
        return cleaned[cleaned[self.label_column].isin(self.bug_classes)]

    @staticmethod
    def _is_blank(value):
        """True when the value's text form is empty or whitespace only.

        None and NaN stringify to "None" / "nan" and are therefore not blank
        here; they are handled by the dropna step instead.
        """
        return len(str(value).strip()) == 0

    # set columns that just have white spaces to None
    def white_spaces_to_None_(self, row):
        """Row-wise variant of the blank check, kept for existing callers."""
        for i in self.columns_name:
            if self._is_blank(row[i]):
                row[i] = None
        return row

## obj

In [None]:
# Cleaning pipeline: a Compose that wraps the single Rows step.
composed_pre = transforms.Compose([
    Rows(
        preprocessing_params["data"]["dataset"]["columns_name"], 
        preprocessing_params["data"]["dataset"]["bug_classes"]
    )
])

# Rebind df_main to the frame returned by the pipeline.
df_main = composed_pre(df_main)

In [None]:
# Parallel lists: texts[i] is a report's text and labels[i] its class label.
texts = df_main["text"].tolist()
labels = df_main["bug_class_2"].tolist()

# IV. ProgressLines

In [None]:
class ProgressLines():
    """Creates and keeps a group of enlighten progress counters.

    After ``progress_lines`` has run, the counters are available in creation
    order as ``self.progresses``.
    """

    def __init__(self):
        # Defined up front so the attribute exists before progress_lines() runs.
        self.progresses = []

    def progress_lines(self, num, total, description, unit, colour):
        """Create ``num`` counters and store them in ``self.progresses``.

        Args:
            num: Number of counters to create.
            total: Per-counter totals, indexed 0..num-1.
            description: Per-counter labels; padded to a common width.
            unit: Per-counter unit labels.
            colour: Per-counter colour names.
        """
        desc = self.set_strings_to_equal_len_(description)
        manager = enlighten.get_manager()
        progresses = []
        for i in range(num):
            prog = manager.counter(total=total[i], desc=desc[i], unit=unit[i], color=colour[i])
            prog.refresh()
            progresses.append(prog)
        self.progresses = progresses

    def set_strings_to_equal_len_(self, description):
        """Right-pad every string with spaces to the length of the longest one.

        Returns a new list; an empty input gives an empty list.
        """
        width = max(map(len, description), default=0)
        return [word.ljust(width) for word in description]

# I. Preprocessing

In [None]:
class Preprocessing():
    """Tokenises bug-report texts and collects simple corpus statistics.

    Instance attributes:
        w2vDic: set of every kept token seen so far, e.g. {"w1", "w2", ...}.
        bugRepTokens: one token list per processed document,
            e.g. [[w1, w2, w3, ...], [w1, w2, ...], ...].
        docMaxTokenNo_org: largest number of kept tokens in one document.
    """

    # Tokens dropped in addition to stop words and punctuation.
    _EXCLUDED_TOKENS = frozenset({"http", "url", "https"})
    # Raw strings: "\w" and "\A" are invalid escapes in a normal literal.
    _HAS_WORD_CHAR = re.compile(r"\w")
    _STARTS_WITH_DIGIT = re.compile(r"\A[0-9]")

    def __init__(self):
        # Created per instance. As class attributes the mutable set and list
        # were shared, so every new Preprocessing() inherited and extended the
        # tokens of earlier instances (e.g. when the cell was re-run).
        self.w2vDic = set()
        self.bugRepTokens = []
        self.docMaxTokenNo_org = 0

    # ************************** tokenize ************************** #

    def tokenize(self, texts):
        """Lower-case and tokenise each text, appending the results to this instance.

        A token is discarded when it is (a substring of) ``string.punctuation``,
        an English stop word, one of http/url/https, contains no word
        character, or starts with a digit.

        Args:
            texts: Iterable of strings, one per document.
        """
        stop_words = set(stopwords.words("english"))
        tokenizer = WordPunctTokenizer()  # built once instead of once per document

        for doc in texts:
            thisTokens = []
            for token in tokenizer.tokenize(doc.lower()):
                if (token in string.punctuation
                        or token in stop_words
                        or token in self._EXCLUDED_TOKENS
                        or not self._HAS_WORD_CHAR.search(token)
                        or self._STARTS_WITH_DIGIT.search(token)):
                    continue
                thisTokens.append(token)
                self.w2vDic.add(token)
            self.bugRepTokens.append(thisTokens)
            if len(thisTokens) > self.docMaxTokenNo_org:
                self.docMaxTokenNo_org = len(thisTokens)

## obj

In [None]:
# Tokenise every report; the results are stored on `ds`
# (bugRepTokens, w2vDic, docMaxTokenNo_org).
ds = Preprocessing()
ds.tokenize(texts)

# TF-IDF

In [None]:
def dummy_fun(doc):
    """Identity function: return the document unchanged.

    Passed to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` so
    the vectorizer works on the pre-computed token lists as they are.
    """
    return doc

In [None]:
# The documents are already token lists, so both the tokenizer and the
# preprocessor are the identity function; no regex token pattern is supplied.
tfidf = TfidfVectorizer(
    analyzer="word",
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None
)

## fit

In [None]:
# Learn vocabulary and IDF weights from the token lists.
# NOTE: fit() returns the fitted vectorizer itself, so despite its name
# `tfidf_matrix2` is not a matrix.
tfidf_matrix2 = tfidf.fit(ds.bugRepTokens)

In [None]:
# Show the size and a small sample only: the full token -> column-index
# mapping is far too large to print in a notebook cell.
print("vocabulary size:", len(tfidf.vocabulary_))
print(dict(list(tfidf.vocabulary_.items())[:10]))



In [None]:
# Vocabulary terms as an array, as returned by the fitted vectorizer.
gfno = tfidf.get_feature_names_out()
print(gfno)
print(type(gfno))

['_1' '_2' '__' ... '异常的超时' '穗' '者蘞㪑']
<class 'numpy.ndarray'>


In [None]:
# print(gfno[14492])
# print(tfidf.vocabulary_["license"])

## transform

In [None]:
# TF-IDF representation of the reports, one row per token list.
vec = tfidf.transform(ds.bugRepTokens)
# Dense copy of `vec`; in the visible notebook it is only read by the inspection cell below.
# NOTE(review): densifying can need a lot of memory for a large vocabulary — confirm it is required.
tfidf_matrices = vec.toarray()

In [None]:
# Quick inspection of the transformed data.
# NOTE(review): 14492 is a hard-coded column index, presumably that of one
# specific vocabulary term — confirm it still matches the fitted vocabulary.
print("type(vec)                : ", type(vec))
print("tfidf_matrices[0, 14492] : ", tfidf_matrices[0, 14492])
print("tfidf_matrices.shape     : ", tfidf_matrices.shape)
print("vec[0]                   : \n", vec[0])

type(vec)                :  <class 'scipy.sparse.csr.csr_matrix'>
tfidf_matrices[0, 14492] :  0.0
tfidf_matrices.shape     :  (9201, 27969)
vec[0]                   : 
   (0, 27253)	0.0928384252761043
  (0, 27224)	0.1572605791921827
  (0, 27024)	0.09509245521110912
  (0, 26333)	0.11528875275466925
  (0, 25973)	0.15509021137710743
  (0, 22577)	0.14135671194396263
  (0, 22139)	0.09274093296458373
  (0, 21600)	0.08375556569306297
  (0, 20524)	0.1890864603712154
  (0, 20519)	0.14422058797168186
  (0, 18311)	0.09269233809024882
  (0, 18123)	0.1324809501694133
  (0, 16847)	0.2243980507198006
  (0, 15956)	0.10393034710566403
  (0, 14819)	0.3974428505082399
  (0, 14285)	0.324629505996314
  (0, 13321)	0.07915863928767788
  (0, 13198)	0.13812321325857546
  (0, 12739)	0.14295024544497476
  (0, 11210)	0.1958457271176951
  (0, 10759)	0.13119671724125795
  (0, 10346)	0.15974092846824647
  (0, 10096)	0.177679156445736
  (0, 8509)	0.16641129277145597
  (0, 6852)	0.1524768968582382
  (0, 6407)	0.200129

# Cosine Similarity | linear_kernel

In [None]:
predicted = []

# --- ProgressLines
pl = ProgressLines()
pl.progress_lines(1, [len(labels)], ["Cosine_Similarity"], ["bug"], ["blue"])

# 1-nearest-neighbour prediction: every report receives the label of the most
# similar OTHER report.
for i in range(len(labels)):
    # TfidfVectorizer L2-normalises rows by default, so the linear kernel
    # (dot product) equals the cosine similarity.
    cosine_similarities = linear_kernel(vec[i], vec).flatten()

    # Exclude the query report explicitly. Taking "the second entry of the
    # sorted order" is only correct when the query sorts first; with duplicate
    # or tied reports the query itself could be chosen, leaking its own label.
    cosine_similarities[i] = -np.inf

    nearest = int(np.argmax(cosine_similarities))
    yhat = labels[nearest]
    predicted.append(yhat)
    pl.progresses[0].update()

In [None]:
# Both numbers should match: one prediction per label.
print(len(predicted))
print(len(labels))

9201
9201


# MyConfusionMatrix

In [None]:
class MyConfusionMatrix():
    """Square count matrix: rows are actual classes, columns are predicted classes."""

    def __init__(self, num_classes):
        shape = (num_classes, num_classes)
        self.confusion_matrix = np.zeros(shape, dtype=np.int32)

    def update(self, y, yhat_indices):
        """Add one count per (actual, predicted) pair of the zipped inputs."""
        pairs = zip(y, yhat_indices)
        for row, col in pairs:
            self.confusion_matrix[row, col] += 1

    def calc_accuracy(self):
        """Return ``(overall accuracy in %, list of per-class accuracy in %)``."""
        counts = self.confusion_matrix
        correct_per_class = counts.diagonal()

        # overall: correctly predicted samples over all samples
        overall = 100 * (correct_per_class.sum() / counts.sum())

        # per class: correct samples over the row total (samples of that class)
        row_totals = counts.sum(1)
        per_class = 100 * (np.divide(correct_per_class, row_totals))

        return overall, per_class.tolist()

    def get_cf(self):
        """Return the matrix as a nested Python list."""
        return self.confusion_matrix.tolist()

## obj

In [None]:
# Accumulate actual-vs-predicted counts over all reports.
# Note: save_to_file_results reads this object through the global name `confusion_matrix`.
confusion_matrix = MyConfusionMatrix(preprocessing_params["data"]["dataset"]["num_bug_classes"])
confusion_matrix.update(labels, predicted)

# Save

In [None]:
def save_to_file_results(dataset_name, preprocessing_params, result_path, cm=None):
    """Write the run configuration and the confusion matrix to a JSON file.

    Args:
        dataset_name: Identifier of the dataset, stored under "dataset".
        preprocessing_params: JSON-serialisable preprocessing settings.
        result_path: Destination file; missing parent folders are created.
        cm: Object exposing ``get_cf()``. When omitted, the notebook-level
            ``confusion_matrix`` object is used.
    """
    if cm is None:
        cm = confusion_matrix  # notebook-level global

    tempStructure = {
        "dataset": dataset_name,
        "preprocessing_params": preprocessing_params,
        "model_results": {
            "confusion_matrix": cm.get_cf()
        }
    }

    # open() does not create folders, so make sure the output folder exists.
    parent_folder = os.path.dirname(result_path)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    with open(result_path, "w") as fout:
        json.dump(tempStructure, fout)

In [None]:
# Persist the run; the dataset file path is used as the dataset identifier.
save_to_file_results(
    mypaths["data"]["dataset"], 
    preprocessing_params, 
    mypaths["output"]["performance"]
)

# CalculateMetrics

In [None]:
class CalculateMetrics():
    """Per-class precision, recall and F1 plus overall accuracy from a confusion matrix.

    Args:
        cm: Square numpy array whose diagonal holds the correctly classified
            counts; column sums are used for precision and row sums for recall.

    A ratio whose denominator is zero (for example the precision of a class
    that was never predicted) is reported as 0.0 instead of NaN/inf.
    """

    def __init__(self, cm):
        self.cm = cm # it is a numpy object
        self.true_positives = np.diag(cm)

    @staticmethod
    def _safe_divide(numerator, denominator):
        """Element-wise division that yields 0.0 wherever the denominator is 0."""
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.zeros_like(numerator)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    # calculate precision for each class
    def calc_precision(self):
        """Return (and cache in ``self.precision``) the per-class precision list."""
        columns_sum = np.sum(self.cm, axis=0)
        prec = list(self._safe_divide(self.true_positives, columns_sum))
        self.precision = prec
        return prec

    # calculate recall for each class
    # recall = accuracy per class
    # how accurately each class is predicted
    def calc_recall(self):
        """Return (and cache in ``self.recall``) the per-class recall list."""
        rows_sum = np.sum(self.cm, axis=1)
        rec = list(self._safe_divide(self.true_positives, rows_sum))
        self.recall = rec
        return rec

    # calculate f1_score for each class
    def calc_f1_score(self):
        """Return (and cache in ``self.f1_score``) the per-class F1 array.

        Precision and recall are computed on demand when they have not been
        calculated yet.
        """
        if not hasattr(self, "precision"):
            self.calc_precision()
        if not hasattr(self, "recall"):
            self.calc_recall()
        tempPrec = np.array(self.precision)
        tempRec = np.array(self.recall)
        numerator = tempPrec * tempRec
        denominator = tempPrec + tempRec
        f1s = 2 * self._safe_divide(numerator, denominator)
        self.f1_score = f1s
        return f1s

    def calc_accuracy(self):
        """Return the overall accuracy as a fraction; 0.0 for an empty matrix."""
        total_samples = np.sum(self.cm)
        if total_samples == 0:
            return 0.0
        sum_true_positives = sum(self.true_positives)
        acc = (sum_true_positives / total_samples)
        return acc

## obj

In [None]:
# Confusion matrix as a numpy array
cf_matrix = np.array(confusion_matrix.get_cf())

calcmet = CalculateMetrics(cf_matrix)

# Per-class scores as whole percentages. Precision and recall are computed
# before F1 because calc_f1_score reads the values they store on `calcmet`.
precision = [round(score * 100) for score in calcmet.calc_precision()]
recall = [round(score * 100) for score in calcmet.calc_recall()]
f1_score = [round(score * 100) for score in calcmet.calc_f1_score()]

# Overall accuracy as a whole percentage
acc = round(calcmet.calc_accuracy() * 100)

In [None]:
# Summary in whole percentages: accuracy is a single number, the other
# metrics list one value per class.
print("-" * 15)
print("accuracy           :", acc)
print("precision          :", precision)
print("recall             :", recall)
print("f1_score           :", f1_score)

---------------
accuracy           : 60
precision          : [70, 40]
recall             : [70, 40]
f1_score           : [70, 40]


# Results

In [None]:
# Size overview of the intermediate objects, useful as a consistency check
# (document counts should agree with each other, as should vocabulary sizes).
print("len(ds.bugRepTokens)   : ", len(ds.bugRepTokens))
print("ds.docMaxTokenNo_org   : ", ds.docMaxTokenNo_org)
print("len(ds.w2vDic)         : ", len(ds.w2vDic))
print("len(tfidf.vocabulary_) : ", len(tfidf.vocabulary_))
print("len(gfno)              : ", len(gfno))
print("vec.shape              : ", vec.shape)
print("len(texts)             : ", len(texts))
print("len(predicted)         : ", len(predicted))
print("len(labels)            : ", len(labels))

len(ds.bugRepTokens)   :  9201
ds.docMaxTokenNo_org   :  8463
len(ds.w2vDic)         :  27969
len(tfidf.vocabulary_) :  27969
len(gfno)              :  27969
vec.shape              :  (9201, 27969)
len(texts)             :  9201
len(predicted)         :  9201
len(labels)            :  9201


In [None]:
# NOTE(review): nothing else visible in this notebook uses this cell; it looks
# like leftover scratch code. No seed is set here, so the draw differs between runs.
np.random.choice(
  ['pooh', 'rabbit', 'piglet', 'Christopher'], 
  5,
  p=[0.5, 0.1, 0.1, 0.3]
)

array(['pooh', 'Christopher', 'piglet', 'Christopher', 'pooh'],
      dtype='<U11')