# Set Variables

In [None]:
# -------- dataset
# Project whose dataset is processed. The value is used as a key into
# `dataset_file_names` and as the dataset sub-folder name, so it must match
# one of the options below. Keep exactly one line uncommented.
software_name = "Camel"
# software_name = "CloudStack"
# software_name = "Geode"
# software_name = "Hbase"

In [None]:
# Maps each supported project name to the base name (without extension) of
# its dataset file.
dataset_file_names = {
    "Camel":      "Camel_DE - v.02",
    "CloudStack": "CloudStack_DE - v.01",
    "Geode":      "Geode_DE - v.01",
    "Hbase":      "Hbase_DE - v.01"
}

# Base file name for the selected project; raises KeyError when
# `software_name` is not one of the keys above.
dataset_file_name = dataset_file_names[software_name]

# Google Colab

In [None]:
# Libs
!pip install enlighten
!pip install --upgrade matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# load data from google drive
# Mounts Google Drive at /content/gdrive (re-mounting if it is already
# mounted) and lists the drive root as a quick check that the mount worked.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
!ls "/content/gdrive/My Drive/"

Mounted at /content/gdrive
'Colab Notebooks'


In [None]:
# project folder path
# Absolute path under the Drive mount point ("/content/gdrive") so that the
# notebook does not depend on the current working directory being /content.
project_folder = "/content/gdrive/MyDrive/Colab Notebooks/paper/"

# data folder path (relative to project_folder)
data_folder =    "00- My Data/one-phase method/"

# output folder path (relative to project_folder)
output_folder =  "01- Jupyter Notebook/50- one-phase method _ CNN/00. Output/"

In [None]:
# Sub-folder named after the selected project (with trailing slash).
dataset_folder = f"{software_name}/"

# Folder that receives this dataset's output files:
# <project>/<output>/<software>/<dataset file name>/
output_data_folder = f"{project_folder}{output_folder}{dataset_folder}{dataset_file_name}/"

# Libs

In [None]:
import string
import re
import json

import enlighten

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torchvision import transforms
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.text import TextCollection

In [None]:
# Fetch the NLTK stop-word corpus required by `stopwords.words("english")`.
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Config

In [None]:
# File locations used by the notebook, built from the folder variables above.
mypaths = {
    "data": {
        # input CSV of the selected project
        "dataset": project_folder + data_folder + dataset_folder + dataset_file_name + ".csv"
    },
    "output": {
        "keyword_based": {
            # JSON file that receives the per-document TF-IDF word weights
            "tfidf_word_weights": output_data_folder + "tfidf-word-weights-v01.json"
        }
    }
}

# Settings describing the dataset layout; passed to `pd.read_csv` and `Rows`.
preprocessing_params = {
    "data":{
        "dataset": {
            # column names assigned to the header-less CSV
            "columns_name":   ["text", "bug_class_2"],
            # dtype per column; keys are presumably column positions - TODO confirm
            "columns_dtype" : {0: "str", 1: "int64"},
            # class labels that are kept; rows with other labels are dropped
            "bug_classes": [0, 1]
        },
    }
}

# I. Read Files

In [None]:
# Load the header-less dataset CSV, assigning the configured column names
# and dtypes; blank lines in the file are skipped.
df_main = pd.read_csv(
    mypaths["data"]["dataset"], 
    names=preprocessing_params["data"]["dataset"]["columns_name"], 
    dtype=preprocessing_params["data"]["dataset"]["columns_dtype"],
    header=None, 
    skip_blank_lines=True
)

In [None]:
# Row count before cleaning, kept for the before/after comparison printed below.
len_df_main_before_compse = len(df_main)

# II. Compose

In [None]:
class Rows(object):
    """Row-level cleaning transform for a DataFrame.

    Calling an instance on a DataFrame returns a new DataFrame in which:
      1. cells of the configured columns that are empty or contain only
         white space are treated as missing,
      2. rows with a missing value in any configured column are removed,
      3. rows whose class label is not in ``bug_classes`` are removed.

    The input DataFrame is never modified.

    Args:
        columns_name: names of the columns to check for blank/missing values.
        bug_classes: class labels to keep.
        class_column: name of the column holding the class label
            (defaults to "bug_class_2").
    """

    def __init__(self, columns_name, bug_classes, class_column="bug_class_2"):
        self.columns_name = columns_name
        self.bug_classes = bug_classes
        self.class_column = class_column


    def __call__(self, df):
        """Return a cleaned copy of ``df`` (see the class docstring)."""
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()

        # 1. Set cells to None that are empty or have just white spaces
        for name in self.columns_name:
            blank = df[name].map(self.is_blank_).astype(bool)
            # Only assign when needed so untouched columns keep their dtype;
            # a positional mask avoids label alignment on duplicated indexes.
            if blank.any():
                df.loc[blank.to_numpy(), name] = None

        # 2. Delete rows that have a NaN value in any of the checked columns
        df = df.dropna(axis=0, how="any", subset=self.columns_name)

        # 3. Keep only rows whose class value is one of `bug_classes`
        #    (boolean filtering is per row, unlike dropping by index label,
        #    which removes every row sharing a duplicated label)
        df = df[df[self.class_column].isin(self.bug_classes)]

        return df


    @staticmethod
    def is_blank_(value):
        """Return True when ``value`` is a string that is empty or all white space."""
        return isinstance(value, str) and not value.strip()


    # set columns that are empty or just have white spaces to None
    def white_spaces_to_None_(self, row):
        """Blank out empty/white-space cells of ``row`` in place and return it."""
        for name in self.columns_name:
            if self.is_blank_(row[name]):
                row[name] = None
        return row

## obj

In [None]:
# Cleaning pipeline: a Compose holding a single `Rows` transform configured
# with the dataset's column names and accepted class labels.
composed_pre = transforms.Compose([
    Rows(
        preprocessing_params["data"]["dataset"]["columns_name"], 
        preprocessing_params["data"]["dataset"]["bug_classes"]
    )
])

# Replace df_main with its cleaned version.
df_main = composed_pre(df_main)

In [None]:
# Row count after cleaning, and the report texts as a plain Python list
# (input of `Preprocessing.tokenize`).
len_df_main_after_compse = len(df_main)
texts = df_main["text"].tolist()

In [None]:
# Compare row counts before and after cleaning to see how many rows were dropped.
print("len df_main before compose: ", len_df_main_before_compse)
print("len df_main after  compose: ", len_df_main_after_compse)

len df_main before compose:  9019
len df_main after  compose:  9019


# IV. ProgressLines

In [None]:
class ProgressLines():
    """Creates a group of `enlighten` progress counters with aligned labels.

    After `progress_lines` has been called, the created counters are available
    in `self.progresses` (in the order they were requested).
    """

    def __init__(self):
        # Defined up front so the attribute exists before `progress_lines` runs.
        self.progresses = []


    def progress_lines(self, num, total, description, unit, colour):
        """Create `num` counters and store them in `self.progresses`.

        Args:
            num: number of counters to create.
            total: per-counter totals; indexed 0..num-1.
            description: per-counter labels; padded to a common width.
            unit: per-counter unit labels; indexed 0..num-1.
            colour: per-counter colours; indexed 0..num-1.
        """
        desc = self.set_strings_to_equal_len_(description)
        manager = enlighten.get_manager()
        progresses = []
        for i in range(num):
            prog = manager.counter(total=total[i], desc=desc[i], unit=unit[i], color=colour[i])
            prog.refresh()
            progresses.append(prog)
        self.progresses = progresses


    def set_strings_to_equal_len_(self, description):
        """Return the strings right-padded with spaces to the longest one's length.

        An empty input yields an empty list.
        """
        description = list(description)
        width = max(map(len, description), default=0)
        return [text.ljust(width) for text in description]

# V. Preprocessing

In [None]:
class Preprocessing():
    """Tokenizes documents and computes per-document TF-IDF word weights.

    Instance attributes:
        w2vDic: dict mapping every kept token to `paddingVector`
            (all tokens share the same array object).
        bugRepTokens: list of token lists, one per tokenized document:
            [[w1, w2, w3, ...], [w1, w2, ...], ...]
        docMaxTokenNo_org: largest number of kept tokens in a single document.
        vector_tfidf: list of dictionaries, one per document, mapping each
            token to its TF-IDF weight, ordered by descending weight:
            [{"w1": 0.3, "w2": 0.1, ...}, {}, ...]
    """

    # All-zero float32 vector of length 300 stored as the value of every token
    # in `w2vDic`.
    paddingVector = np.zeros(300, dtype="float32")
    docMaxTokenNo_org = 0


    def __init__(self):
        # Containers are created per instance: as class attributes they were
        # shared by all instances, so creating a new object and re-running the
        # pipeline kept appending to the previous run's data.
        self.w2vDic = {}
        self.bugRepTokens = []
        self.docMaxTokenNo_org = 0
        self.vector_tfidf = []


    # ************************** tokenize ************************** #

    def tokenize(self, texts):
        """Tokenize each document and append its kept tokens to `bugRepTokens`.

        Documents are lower-cased and split with `WordPunctTokenizer`. A token
        is discarded when it is found in `string.punctuation`, is an English
        stop word, is one of "http"/"url"/"https", contains no word character,
        or starts with a digit. Also updates `w2vDic` and `docMaxTokenNo_org`.

        Args:
            texts: iterable of document strings.
        """
        stop_words = set(stopwords.words("english"))
        excludedTokens = {"http", "url", "https"}

        # Built once instead of once per document / token.
        tokenizer = WordPunctTokenizer()
        has_word_char = re.compile(r"\w")
        starts_with_digit = re.compile(r"\A[0-9]")

        for doc in texts:
            thisTokens = []
            for token in tokenizer.tokenize(doc.lower()):
                if (token in string.punctuation or token in stop_words or token in excludedTokens or
                        (not has_word_char.search(token)) or starts_with_digit.search(token)):
                    continue
                thisTokens.append(token)
                self.w2vDic[token] = self.paddingVector
            self.bugRepTokens.append(thisTokens)
            if len(thisTokens) > self.docMaxTokenNo_org:
                self.docMaxTokenNo_org = len(thisTokens)


    # calculate tfidf of corpuses words
    def vectorize_tfidf(self):
        """Compute TF-IDF weights of every token of every tokenized document.

        The result replaces `self.vector_tfidf`, so calling the method again
        does not duplicate entries. Each document's dictionary is ordered by
        descending weight; ties keep the order of first occurrence.
        """
        texts = TextCollection(self.bugRepTokens)

        # --- ProgressLines
        pl = ProgressLines()
        pl.progress_lines(1, [len(self.bugRepTokens)], ["TF-IDF_word-weights"], ["bug"], ["blue"])

        # --- vectorize_tfidf
        vector_tfidf = []
        for doc in self.bugRepTokens:
            # dict.fromkeys de-duplicates while keeping first-occurrence order,
            # so each distinct term is scored only once per document.
            weights = {term: texts.tf_idf(term, doc) for term in dict.fromkeys(doc)}
            ranked = dict(sorted(weights.items(), key=lambda item: item[1], reverse=True))
            vector_tfidf.append(ranked)
            pl.progresses[0].update()
        self.vector_tfidf = vector_tfidf


    def save_to_file_tfidf(self, vector_tfidf_path):
        """Write `self.vector_tfidf` as JSON to `vector_tfidf_path`.

        Missing parent folders are created first.
        """
        import os

        parent = os.path.dirname(vector_tfidf_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(vector_tfidf_path, "w") as fout:
            json.dump(self.vector_tfidf, fout)

## Vectorize

In [None]:
# Run the pipeline: tokenize the cleaned texts, compute the TF-IDF weights
# per document, then write them to the configured JSON file.
ds = Preprocessing()

ds.tokenize(texts)
ds.vectorize_tfidf()

ds.save_to_file_tfidf(mypaths["output"]["keyword_based"]["tfidf_word_weights"])

# See properties

In [None]:
# Row counts of df_main before/after cleaning and its current length.
# NOTE(review): the next cell prints the same three values again.
print("len df_main before compose:", len_df_main_before_compse)
print("len df_main after  compose:", len_df_main_after_compse)
print("df_main length            :", len(df_main))

len df_main before compose: 9019
len df_main after  compose: 9019
df_main length            : 9019


In [None]:
# should have same values
print("len df_main before compose :", len_df_main_before_compse)
print("len df_main after  compose :", len_df_main_after_compse)
print("df_main length             :", len(df_main))
print("vector_tfidf               :", len(ds.vector_tfidf))

print("-" * 40)

# should have same values | after applying token_threshold
print("bugRepTokens               :", len(ds.bugRepTokens))

len df_main before compose : 9019
len df_main after  compose : 9019
df_main length             : 9019
vector_tfidf               : 9019
----------------------------------------
bugRepTokens               : 9019


In [None]:
# Largest number of kept tokens found in a single document.
print("docMaxTokenNo_org             :", ds.docMaxTokenNo_org) # original

docMaxTokenNo_org             : 18427


In [None]:
# Number of distinct kept tokens across all documents.
print("w2vDic           :", len(ds.w2vDic)) # vocabulary

w2vDic           : 28870
