# Set Variables

In [None]:
# -------- dataset selection (leave exactly one line enabled)
# software_name = "Camel"
# software_name = "CloudStack"
software_name = "Geode"
# software_name = "Hbase"

# Upper bound on tokens per bug report; longer reports are dropped while tokenizing.
token_threshold = 20000

# -------- keyword-based mode switch
my_keyword_Based = True
# my_keyword_Based = False

# Cap of 100 keywords per document in keyword-based mode, no cap otherwise.
if my_keyword_Based:
    my_docMaxLen = 100
else:
    my_docMaxLen = None

In [None]:
# Base name (without extension) of the dataset file for every supported project.
dataset_file_names = dict(
    Camel="Camel_DE - v.02",
    CloudStack="CloudStack_DE - v.01",
    Geode="Geode_DE - v.01",
    Hbase="Hbase_DE - v.01",
)

# File name for the project selected in the configuration cell.
dataset_file_name = dataset_file_names[software_name]

# Google Colab

In [None]:
# Libs
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the upgraded package is the one this notebook actually imports.
%pip install --upgrade matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# load data from google drive
# Colab-only cell: mounts Google Drive at /content/gdrive so the data and
# output folders used by the rest of the notebook become reachable.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
# List the Drive root as a quick visual check that the mount succeeded.
!ls "/content/gdrive/My Drive/"

Mounted at /content/gdrive
'Colab Notebooks'


In [None]:
# The three values below are path fragments that are concatenated later on,
# which is why each of them keeps its trailing "/".

# project root (a relative path)
project_folder = "gdrive/MyDrive/Colab Notebooks/paper/"

# input data, appended to project_folder
data_folder = "00- My Data/one-phase method/"

# notebook outputs, appended to project_folder
output_folder = "01- Jupyter Notebook/50- one-phase method _ CNN/00. Output/"

In [None]:
# per-project sub-folder: "<software_name>/"
dataset_folder = "".join((software_name, "/"))

# output location: <project>/<output>/<project sub-folder>/<dataset file name>/
output_data_folder = "".join(
    (project_folder, output_folder, dataset_folder, dataset_file_name, "/")
)

# Libs

In [None]:
import string
import re
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torchvision import transforms
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.text import TextCollection

In [None]:
# Fetch the NLTK stop-word corpus; the tokenization step reads it through
# stopwords.words("english").
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Config

In [None]:
# Input / output file locations, all derived from the folder fragments defined above.
mypaths = dict(
    data=dict(
        dataset=project_folder + data_folder + dataset_folder + dataset_file_name + ".csv",
        w2v_word_vectors=project_folder + data_folder + "w2vGoogle.bin",
    ),
    output=dict(
        keyword_based=dict(
            tfidf_word_weights=output_data_folder + "tfidf-word-weights-v01.json",
        ),
    ),
)

# Settings consumed when reading the CSV, filtering rows and vectorizing.
preprocessing_params = dict(
    data=dict(
        dataset=dict(
            columns_name=["text", "bug_class_2"],
            # dtypes are keyed by column position, not by column name
            columns_dtype={0: "str", 1: "int64"},
            # rows whose class is not listed here are discarded
            bug_classes=[0, 1],
        ),
    ),
    keyword_Based=my_keyword_Based,
    docMaxLen=my_docMaxLen,
)

# I. Read Files

In [None]:
# Load the dataset CSV. header=None: no row is used as a header, so the column
# names and dtypes are taken from the preprocessing configuration instead.
df_main = pd.read_csv(
    mypaths["data"]["dataset"],
    header=None,
    skip_blank_lines=True,
    names=preprocessing_params["data"]["dataset"]["columns_name"],
    dtype=preprocessing_params["data"]["dataset"]["columns_dtype"],
)

In [None]:
# Row count of the raw frame, recorded before any row filtering.
len_df_main_before_compse = df_main.shape[0]

# II. Compose

In [None]:
class Rows(object):
    """Callable row filter for the bug-report DataFrame.

    Applied to a DataFrame it returns a new frame from which these rows are removed:

    * rows where any column listed in ``columns_name`` is missing or holds a
      string that is empty or whitespace-only;
    * rows whose value in ``label_column`` is not one of ``bug_classes``.

    The input frame is never modified.

    Args:
        columns_name: names of the columns that must hold real values.
        bug_classes: accepted class labels.
        label_column: name of the class-label column. Defaults to
            ``"bug_class_2"``, the column that used to be hard-coded here.
    """

    def __init__(self, columns_name, bug_classes, label_column="bug_class_2"):
        self.columns_name = columns_name
        self.bug_classes = bug_classes
        self.label_column = label_column


    def __call__(self, df):
        """Return a filtered copy of ``df`` (the original index is kept)."""
        cleaned = df.copy()

        # 1. Set cells to None that are empty or hold only white space
        #    (column-wise, so the other columns keep their dtype)
        for column in self.columns_name:
            blank = cleaned[column].map(self._is_blank).astype(bool)
            if blank.any():
                cleaned.loc[blank, column] = None

        # 2. Delete rows that have a missing value in ANY of the required columns
        cleaned = cleaned.dropna(axis=0, how="any", subset=self.columns_name)

        # 3. Delete rows whose class value is not one of the accepted classes
        return cleaned[cleaned[self.label_column].isin(self.bug_classes)]


    @staticmethod
    def _is_blank(value):
        """True for a string that is empty or consists only of white space."""
        return isinstance(value, str) and value.strip() == ""


    # set columns that just have white spaces to None
    def white_spaces_to_None_(self, row):
        """Row-wise variant of step 1, kept for callers that use it with ``df.apply``.

        Replaces blank string cells of ``row`` (only the columns in
        ``columns_name``) with None and returns the same row object.
        """
        for i in self.columns_name:
            if self._is_blank(row[i]):
                row[i] = None
        return row

## Build and apply the row filter

In [None]:
# Pre-processing pipeline; for now it consists of the single row-filter step.
composed_pre = transforms.Compose([
    Rows(
        columns_name=preprocessing_params["data"]["dataset"]["columns_name"],
        bug_classes=preprocessing_params["data"]["dataset"]["bug_classes"],
    ),
])

# Replace the raw frame with its filtered version.
df_main = composed_pre(df_main)

In [None]:
# Plain Python lists of the report texts and of their class labels (same order).
texts, labels = (
    df_main[column].tolist() for column in ("text", "bug_class_2")
)

In [None]:
# Sizes right after row filtering, i.e. before the token threshold is applied.
len_df_main_after_compse = df_main.shape[0]
len_texts_before_token_threshold, len_labels_before_token_threshold = (
    len(texts),
    len(labels),
)

In [None]:
# Row counts around the compose step, followed by the extracted list sizes.
print(f"len df_main before compose: {len_df_main_before_compse}")
print(f"len df_main after  compose: {len_df_main_after_compse}")
print(f"len(texts)                : {len(texts)}")
print(f"len(labels)               : {len(labels)}")

len df_main before compose: 3608
len df_main after  compose: 3608
len(texts)                : 3608
len(labels)               : 3608


# III. Preprocessing

In [None]:
class Preprocessing():
    """Turn bug-report texts into padded lists of word-embedding vectors.

    Intended call order: ``load_tfidf`` -> ``tokenize`` -> ``loadW2V`` ->
    ``vectorize_w2V`` -> ``padding`` (``freeMem`` releases the big containers).

    All mutable containers are created per instance in ``__init__``; they used
    to be class attributes, which made every instance (and every re-run of the
    notebook cell) share and accumulate the same dicts/lists.

    Attributes:
        my_deleted_bug: {original document index: token count} of the
            documents dropped because they exceed ``token_threshold``.
        docMaxLen: max number of keywords kept per document (None = no cap).
        w2vDic: {"w1": vector, "w2": vector, ...}; filled with the padding
            vector by ``tokenize`` and with real vectors by ``loadW2V``.
        bugRepTokens: [[w1, w2, ...], [w1, ...], ...] tokens of kept documents.
        docMaxTokenNo_org: longest token list over all documents.
        docMaxTokenNo_token_threshold: longest token list over kept documents.
        docMaxTokenNo: longest document after vectorization.
        vector_tfidf: list of dicts [{"w1": 0.1, "w2": 0.3, ...}, {...}, ...].
        vector_em: list of per-document vector lists [[w1Vector, ...], ...].
    """

    # Immutable defaults only; safe to keep at class level.
    docMaxLen = 0 # max keywords allowed
    docMaxTokenNo_org = 0
    docMaxTokenNo_token_threshold = 0
    docMaxTokenNo = 0 # max doc len after vectorization

    # All-zero vector used for unknown words and for padding. It is shared on
    # purpose and never written to.
    # NOTE(review): the size 300 is hard-coded; loadW2V does not check it against
    # the vector size announced in the embedding file header.
    paddingVector = np.zeros(300, dtype="float32")


    def __init__(self, docMaxLen, token_threshold):
        """Store the limits and create fresh, instance-owned containers.

        Args:
            docMaxLen: max keywords per document used by ``vectorize_w2V``
                in keyword-based mode; None disables the cap.
            token_threshold: documents with more tokens than this are dropped
                by ``tokenize``.
        """
        self.docMaxLen = docMaxLen
        self.token_threshold = token_threshold

        self.my_deleted_bug = {}
        self.w2vDic = {}
        self.bugRepTokens = []
        self.vector_tfidf = []
        self.vector_em = []
        self.docMaxTokenNo_org = 0
        self.docMaxTokenNo_token_threshold = 0
        self.docMaxTokenNo = 0


    # tfidf of corpuses words
    def load_tfidf(self, tfidf_path):
        """Load the per-document TF-IDF weights from the JSON file at ``tfidf_path``."""
        with open(tfidf_path, "r") as filehandle:
            self.vector_tfidf = json.load(filehandle)


    def tokenize(self, texts, labels=None):
        """Tokenize every document and drop the ones above ``token_threshold``.

        Tokens are lower-cased; punctuation, English stop words, "http"/"url"/
        "https", tokens without any word character and tokens starting with a
        digit are skipped. Every accepted token is registered in ``w2vDic``
        with the padding vector as placeholder.

        For each dropped document the matching entry is removed from ``labels``
        and, when TF-IDF weights are loaded, from ``vector_tfidf``. Removal
        happens after the loop, from the highest index down, so the indices
        always refer to the original positions (deleting while enumerating
        shifted them and removed the wrong entries after the first drop).

        Args:
            texts: list of document strings; not modified.
            labels: list aligned with ``texts``, filtered in place. When
                omitted, the module-level ``labels`` list is used if it exists,
                which keeps existing ``tokenize(texts)`` calls working.
        """
        if labels is None:
            # Backward compatibility with callers that rely on the global list.
            labels = globals().get("labels")

        stop_words = set(stopwords.words("english"))
        excludedTokens = {"http", "url", "https"}
        tokenizer = WordPunctTokenizer()
        has_word_char = re.compile(r"\w")
        starts_with_digit = re.compile(r"\A[0-9]")

        dropped = [] # original indices of documents above the threshold
        for i, doc in enumerate(texts):
            thisTokens = []
            for token in tokenizer.tokenize(doc.lower()):
                if (token in string.punctuation or token in stop_words or token in excludedTokens or
                        not has_word_char.search(token) or starts_with_digit.search(token)):
                    continue
                thisTokens.append(token)
                self.w2vDic[token] = self.paddingVector

            tokenCount = len(thisTokens)
            if tokenCount <= self.token_threshold:
                self.bugRepTokens.append(thisTokens)
                if tokenCount > self.docMaxTokenNo_token_threshold:
                    self.docMaxTokenNo_token_threshold = tokenCount
            else:
                self.my_deleted_bug[i] = tokenCount
                dropped.append(i)
            if tokenCount > self.docMaxTokenNo_org:
                self.docMaxTokenNo_org = tokenCount

        # Highest index first: earlier positions stay valid while deleting.
        for i in reversed(dropped):
            if labels is not None:
                del labels[i]
            # Nothing to keep aligned when no TF-IDF weights were loaded.
            if len(self.vector_tfidf) > 0:
                del self.vector_tfidf[i]


    def loadW2V(self, w2vpath):
        """Read embedding vectors for the known vocabulary from a binary file.

        The file is read as: one header line "<vocab size> <vector size>",
        then per word its bytes, a single space and ``vector size`` float32
        values (this looks like the word2vec binary format; confirm for other
        files). Only words already present in ``w2vDic`` are loaded, so
        ``tokenize`` has to run first; all other vectors are skipped.

        Raises:
            EOFError: if the file ends before all announced words were read.
        """
        with open(w2vpath, "rb") as f:
            header = f.readline()
            model_vocab_size, model_vector_size = map(int, header.split())
            binary_len = np.dtype("float32").itemsize * model_vector_size

            for line_no in range(model_vocab_size):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b" ":
                        break
                    if ch == b"":
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    # newlines between records are not part of the word
                    if ch != b"\n":
                        word.append(ch)
                word = b"".join(word).decode("utf-8")
                if word in self.w2vDic:
                    self.w2vDic[word] = np.frombuffer(f.read(binary_len), dtype="float32")
                else:
                    # jump over the vector of a word we do not need
                    f.seek(binary_len, 1)


    def vectorize_w2V (self, keywordBased=False):
        """Replace the tokens of every document by their vectors from ``w2vDic``.

        Appends one list of vectors per document to ``vector_em`` and tracks
        the longest one in ``docMaxTokenNo``.

        Args:
            keywordBased: when True, only tokens that are among the first
                ``docMaxLen`` keys of the document's TF-IDF dict are kept
                (presumably the dict is ordered by descending weight; verify
                against the code that writes the JSON file). When False all
                tokens are kept.
        """
        if keywordBased:
            print("Keyword Based")
            keywordSlice = slice(0, self.docMaxLen)
            for doc_tokens, doc_tfidf in zip(self.bugRepTokens, self.vector_tfidf):
                # set: constant-time membership test for every token
                docKeywords = set(list(doc_tfidf.keys())[keywordSlice])
                tempVec = [self.w2vDic[term] for term in doc_tokens if term in docKeywords]
                self.vector_em.append(tempVec)
                if (len(tempVec) > self.docMaxTokenNo):
                    self.docMaxTokenNo = len(tempVec)
        else:
            print("Not Keyword Based")
            for doc_tokens in self.bugRepTokens:
                tempVec = [self.w2vDic[term] for term in doc_tokens]
                self.vector_em.append(tempVec)
                if (len(tempVec) > self.docMaxTokenNo):
                    self.docMaxTokenNo = len(tempVec)


    def padding(self):
        """Extend every document in ``vector_em`` with padding vectors up to ``docMaxTokenNo``."""
        for doc in self.vector_em:
            if (len(doc) < self.docMaxTokenNo):
                doc.extend([self.paddingVector] * (self.docMaxTokenNo - len(doc)))


    def freeMem(self):
        """Drop the large containers so their memory can be reclaimed."""
        self.w2vDic = {}
        self.bugRepTokens = []
        self.vector_tfidf = []
        self.vector_em = []

## Run the preprocessing pipeline

In [None]:
# Build the preprocessor with the keyword cap and the per-document token limit.
ds = Preprocessing(preprocessing_params["docMaxLen"], token_threshold)
# TF-IDF weights are loaded before tokenizing because tokenize() deletes the
# TF-IDF entries of the documents it drops.
ds.load_tfidf(mypaths["output"]["keyword_based"]["tfidf_word_weights"])
# NOTE: tokenize() also deletes entries from the notebook-level `labels` list
# as a side effect; `texts` itself is left unchanged.
ds.tokenize(texts)

# --- vectorize: w2v (keywordbased or no)
# loadW2V() only reads vectors for words that tokenize() registered in
# ds.w2vDic, so it has to run after tokenize().
print("loadW2V")
ds.loadW2V(mypaths["data"]["w2v_word_vectors"])

print("vectorize_w2V")
ds.vectorize_w2V(preprocessing_params["keyword_Based"])

# Pad every document up to the longest vectorized document.
ds.padding()

loadW2V
vectorize_w2V
Keyword Based


In [None]:
# Sizes after tokenization, i.e. after the token threshold has been applied.
len_texts_after_token_threshold, len_labels_after_token_threshold = (
    len(texts),
    len(labels),
)

# See results

In [None]:
print(f"len df_main before compse         : {len_df_main_before_compse}")

print("-" * 50)

# the following values are expected to be identical:
print(f"len df_main after  compse         : {len_df_main_after_compse}")
print(f"df_main length                    : {len(df_main)}")
print(f"len texts  before token_threshold : {len_texts_before_token_threshold}")
print(f"len texts  after  token_threshold : {len_texts_after_token_threshold}")
print(f"len labels before token_threshold : {len_labels_before_token_threshold}")

len df_main before compse         : 3608
--------------------------------------------------
len df_main after  compse         : 3608
df_main length                    : 3608
len texts  before token_threshold : 3608
len texts  after  token_threshold : 3608
len labels before token_threshold : 3608


In [None]:
# should have same values:
print("len labels after token_threshold :", len_labels_after_token_threshold)
print("vector_tfidf                     :", len(ds.vector_tfidf))
print("bugRepTokens                     :", len(ds.bugRepTokens))
print("vector_em                        :", len(ds.vector_em))

len labels after token_threshold : 3606
vector_tfidf                     : 3606
bugRepTokens                     : 3606
vector_em                        : 3606


In [None]:
# longest document: original, then after applying token_threshold
print(f"docMaxTokenNo_org             : {ds.docMaxTokenNo_org}")
print(f"docMaxTokenNo_token_threshold : {ds.docMaxTokenNo_token_threshold}")

# the next two values are expected to be identical
# (longest document after keyword selection vs. padded length of the first document)
print(f"Max sentence length           : {ds.docMaxTokenNo}")
print(f"vector_em[0]                  : {len(ds.vector_em[0])}")

docMaxTokenNo_org             : 160706
docMaxTokenNo_token_threshold : 12782
Max sentence length           : 10883
vector_em[0]                  : 10883


In [None]:
# vocabulary size, then the dropped documents as {original index: token count}
print(f"w2vDic           : {len(ds.w2vDic)}")
print(f"my_deleted_bug   : {len(ds.my_deleted_bug)}")
print(f"my_deleted_bug   : {ds.my_deleted_bug}")

w2vDic           : 19299
my_deleted_bug   : 2
my_deleted_bug   : {2671: 25257, 2831: 160706}
