# PredictBugsClass

In [1]:
class PredictBugsClass():
    """Derive per-bug-report classes from the labels of each report's rows.

    The input frame is grouped by ``bug_report_id`` and the LAST column of
    every group is read as the label column.  Results are written in place
    into the ``bug_class_*`` columns of the predictions frame handed to the
    constructor.
    """

    def __init__(self, df, empty_bug_class_predictions, l1_th):
        """Store the grouped input, the output frame and the threshold.

        Args:
            df: DataFrame("bug_report_id", "label|prediction_th").
            empty_bug_class_predictions: frame with a ``bug_report_id``
                column; its ``bug_class_*`` columns are filled in place.
            l1_th: share of rows labelled 1 needed for class 1.  Any falsy
                value selects the presence-based rules instead.
        """
        self.gdf = df.groupby("bug_report_id")
        self.l1_th = l1_th
        self.bug_class_predictions = empty_bug_class_predictions

    # =============================================
    #                   bug class
    # =============================================

    def predict_bugs_class(self):
        """Run the per-group classification selected by ``l1_th``."""
        if self.l1_th:
            classify = self.sub_bug_class_l1th_yes_
        else:
            classify = self.sub_bug_class_l1th_no_
        self.gdf.apply(lambda group: classify(group))

    def _write_prediction(self, bug_report_id, column, value):
        """Store ``value`` in ``column`` for every row of ``bug_report_id``."""
        row_mask = self.bug_class_predictions["bug_report_id"] == bug_report_id
        self.bug_class_predictions.loc[row_mask, column] = value

    def sub_bug_class_l1th_yes_(self, g):
        """Threshold rule: fill ``bug_class_2`` from the share of 1-labels."""
        # --------- bc_2: {1, 0}
        labels = g.iloc[:, -1].to_list()
        share_of_ones = labels.count(1) / len(labels)
        if share_of_ones >= self.l1_th:
            verdict = 1
        else:
            verdict = 0
        self._write_prediction(g.name, "bug_class_2", verdict)

    def sub_bug_class_l1th_no_(self, g):
        """Presence rule: fill ``bug_class_2``, ``_3`` and ``_4``."""
        labels = g.iloc[:, -1].to_list()
        has_pos = 1 in labels
        has_neg = -1 in labels

        # --------- bc_2: {1, 0}
        self._write_prediction(g.name, "bug_class_2", 1 if has_pos else 0)

        # --------- bc_3: {1, -1, 0}  (a 1-label wins over a -1-label)
        if has_pos:
            three_way = 1
        else:
            three_way = -1 if has_neg else 0
        self._write_prediction(g.name, "bug_class_3", three_way)

        # --------- bc_4: {+-, +, -, 0}
        four_way = {
            (True, True): "+-",
            (True, False): "+",
            (False, True): "-",
            (False, False): "0",
        }[(has_pos, has_neg)]
        self._write_prediction(g.name, "bug_class_4", four_way)

    def free_memory(self):
        """Drop the reference to the grouped input frame."""
        self.gdf = None

# CalcBugClassDistribution

In [2]:
class CalcBugClassDistribution:
    """Per bug-class column: label counts, percentages and pairwise ratios.

    The bug-class column names come from the module-level
    ``my_bug_classes_config``.  Results are kept in three dictionaries keyed
    by column name: ``each_bug_class_num``, ``each_bug_class_per`` and
    ``each_bug_class_ratio``.
    """

    def __init__(self, df_bug_pred):
        """Keep the predictions frame and zero-initialise the result dicts."""
        self.df_bug_pred = df_bug_pred
        self.my_bug_classes = my_bug_classes_config

        # Labels are chosen by POSITION in the config: the first entry gets
        # the 2-label set, the second the 3-label set, all later entries the
        # 4-label set.
        label_sets = (
            ("0", "1"),                 # bug_label_2
            ("0", "1", "-1"),           # bug_label_3
            ("0", "+-", "+", "-"),      # bug_label_4
        )
        self.each_bug_class_num = {}
        self.each_bug_class_per = {}
        self.each_bug_class_ratio = {}
        for position, bug_class in enumerate(self.my_bug_classes):
            labels = label_sets[min(position, 2)]
            self.each_bug_class_ratio[bug_class] = {}
            self.each_bug_class_num[bug_class] = dict.fromkeys(labels, 0)
            self.each_bug_class_per[bug_class] = dict.fromkeys(labels, 0)

        # plot values
        self.colours = ["red", "blue", "green", "cyan"]

    # --------------------------- calc_bug_label_num_per

    def calc_bug_label_num_per(self):
        """Fill the count and percentage dicts for every bug-class column."""
        for column in self.my_bug_classes:
            by_label = self.df_bug_pred.groupby(column)
            by_label.apply(lambda group: self.sub_bln_(group, column))

    def sub_bln_(self, g, gby):
        """Record size and percentage of one label group ``g`` of ``gby``."""
        label = str(g.name)
        group_size = len(g)
        self.each_bug_class_num[gby][label] = group_size
        self.each_bug_class_per[gby][label] = (group_size / len(self.df_bug_pred)) * 100

    # --------------------------- calc_ratios

    def calc_bug_label_ratios(self):
        """Store ``larger/smaller`` count ratios for every pair of labels.

        Labels are ranked by count, largest first; each pair is keyed as
        ``"<label_a>/<label_b>"`` in ranking order.  Pairs whose denominator
        count is zero are left out.
        """
        for bug_class in self.my_bug_classes:
            ranked = sorted(
                self.each_bug_class_num[bug_class].items(),
                key=lambda pair: pair[1],
                reverse=True,
            )
            ratios = self.each_bug_class_ratio[bug_class]
            for position, (upper_label, upper_count) in enumerate(ranked):
                for lower_label, lower_count in ranked[position + 1:]:
                    pair_key = "/".join((upper_label, lower_label))
                    if lower_count == 0:
                        continue
                    ratios[pair_key] = upper_count / lower_count

    def free_memory(self):
        """Drop the reference to the predictions frame."""
        self.df_bug_pred = None

# ClassDistribution

In [3]:
class ClassDistribution:
    """Bar charts of per-label bug counts and percentages.

    ``bug_class_num`` and ``bug_class_per`` are dictionaries keyed by
    bug-class column name, each mapping label -> value.
    """

    def __init__(self, df_bug_pred, bug_class_num, bug_class_per):
        """Keep the predictions frame and the two statistics dictionaries."""
        self.my_bug_classes = my_bug_classes_config
        self.df_bug_pred = df_bug_pred
        self.bug_class_per = bug_class_per
        self.bug_class_num = bug_class_num

        # plot values
        self.colours = ["red", "blue", "green", "cyan"]

    # --------------------------- bar plot

    def plot_percent_num(self, bla, th):
        """Draw one row of (count, percentage) bar charts per bug class.

        The figure has a fixed 3x2 grid, so up to three bug classes can be
        drawn.  ``bla`` and ``th`` are only used in the figure title.
        """
        # figsize=(a, b) : a_width, b_height
        fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 10))

        total_bugs = len(self.df_bug_pred)
        # Tick step: 10 ** (number of digits - 1), i.e. 1 for totals < 10.
        tick_step = 10 ** (len(str(total_bugs)) - 1)
        # Top of the count axis: (leading digit + 1) * tick_step.
        count_axis_top = (total_bugs // tick_step + 1) * tick_step
        count_ticks = list(range(0, count_axis_top, tick_step))
        count_ticks.append(total_bugs)

        bar_width = 0.2
        bar_positions = [-0.3, -0.1, 0.1, 0.3]
        x_limits = [-0.5, 0.5]

        for row, key in enumerate(self.my_bug_classes):
            count_ax = axes[row][0]
            percent_ax = axes[row][1]

            for slot, label in enumerate(self.bug_class_per[key]):
                bar_style = dict(width=bar_width, label=label, color=self.colours[slot])
                count_ax.bar(bar_positions[slot], self.bug_class_num[key][label], **bar_style)
                percent_ax.bar(bar_positions[slot], self.bug_class_per[key][label], **bar_style)

            count_ax.set_ylabel("Number of Bug Reports")
            percent_ax.set_ylabel("Percentages (%)")

            count_ax.set_ylim([0, count_axis_top])
            percent_ax.set_ylim([0, 110])

            count_ax.get_yaxis().set_ticks(count_ticks)
            percent_ax.get_yaxis().set_ticks(list(range(0, 101, 10)))

            # Settings shared by both charts of the row.
            for ax in (count_ax, percent_ax):
                ax.set_xlabel(f"{key}")
                ax.set_xlim(x_limits)
                ax.get_xaxis().set_ticks([])
                ax.legend()
                ax.grid(axis="y")

        fig.suptitle(f"bla: {bla}, th: {th}", fontsize=16)
        fig.tight_layout(pad=3.0)

    def free_memory(self):
        """Drop the reference to the predictions frame."""
        self.df_bug_pred = None

# CalcNumtsNumbugs

In [4]:
class CalcNumtsNumbugs():
    """Relate bug reports to the number of rows ("timestamps") they have.

    ``df`` holds one row per (bug report, timestamp) with a
    ``bug_report_id`` column.  ``bcp`` holds one row per bug report; it gets
    a ``num_ts`` column with that report's row count in ``df``.  From this
    the class builds ``df_numts_numbugs``: one row per distinct ``num_ts``
    value with the number of bug reports having it.
    """

    def __init__(self, df, bcp):
        """Keep both frames and add a zero-filled ``num_ts`` column to ``bcp``.

        NOTE: ``bcp`` is not copied, so the ``num_ts`` column is added to
        the caller's frame.
        """
        self.df_main = df

        self.bug_class_predictions_numts = bcp
        # Numeric placeholder: a text placeholder ("") turns this into a
        # string column, which breaks later integer writes and sorting.
        # A report with no rows in ``df`` simply has 0 timestamps.
        self.bug_class_predictions_numts["num_ts"] = 0

        self.df_numts_numbugs = None

    # ---------------------- calc_bug_numts
    # brids: bug report ids
    # cids: class ids
    def calc_bug_numts(self):
        """Fill ``num_ts`` with each bug report's row count in ``df_main``.

        Bug reports that do not occur in ``df_main`` get 0.
        """
        # One grouped count plus an index lookup instead of one full
        # boolean-mask scan of the predictions frame per bug report.
        rows_per_bug = self.df_main.groupby("bug_report_id").size()
        bug_ids = self.bug_class_predictions_numts["bug_report_id"]
        self.bug_class_predictions_numts["num_ts"] = (
            bug_ids.map(rows_per_bug).fillna(0).astype(int)
        )

    def sub_calc_(self, g):
        """Write the row count of one ``bug_report_id`` group ``g``.

        No longer used by ``calc_bug_numts``; kept so that existing callers
        applying it to a groupby keep working.
        """
        self.bug_class_predictions_numts.loc[
            self.bug_class_predictions_numts["bug_report_id"] == g.name,
            "num_ts"] = len(g)

    # ---------------------- unique_num_ts

    def create_df_numts_numbugs(self):
        """Build ``df_numts_numbugs`` (columns: unique_num_ts, num_bugs)."""
        # Local import: the file's import section is not part of this cell.
        from collections import Counter

        num_ts_values = self.bug_class_predictions_numts["num_ts"].tolist()
        # Single counting pass instead of list.count() per distinct value.
        bugs_per_num_ts = Counter(num_ts_values)
        self.df_numts_numbugs = pd.DataFrame(
            data=[(value, bugs_per_num_ts[value]) for value in set(num_ts_values)],
            columns=["unique_num_ts", "num_bugs"],
        )

    def sort_df_numts_numbugs(self, base_unts=True):
        """Sort ascending by ``unique_num_ts`` (default) or by ``num_bugs``."""
        sort_column = "unique_num_ts" if base_unts else "num_bugs"
        self.df_numts_numbugs = self.df_numts_numbugs.sort_values(
            by=[sort_column], ascending=True)

    def add_columns_bugper_cumulative(self):
        """Add per_bugs, cum_num_bugs and cum_per_bugs columns.

        Percentages are relative to the number of rows in the predictions
        frame and rounded to two decimals.  The cumulative columns follow
        the current row order, so sort first.
        """
        total_bugs = len(self.bug_class_predictions_numts)
        table = self.df_numts_numbugs
        # Scale to percent BEFORE rounding; rounding the fraction first
        # would reduce every percentage to a whole number.
        table["per_bugs"] = (100 * table["num_bugs"] / total_bugs).round(2)
        table["cum_num_bugs"] = table["num_bugs"].cumsum()
        table["cum_per_bugs"] = (100 * table["cum_num_bugs"] / total_bugs).round(2)

    # ------------------------- find_max_numbugs_inwhich_numts
    def find_max_numbugs_unumts_numbugs(self):
        """Print the rows holding the two maxima and return the maxima.

        Returns:
            (largest unique_num_ts, largest num_bugs) as plain ints.

        Raises:
            ValueError: if ``df_numts_numbugs`` is empty.
        """
        table = self.df_numts_numbugs
        max_unique_num_ts = table.iloc[table["unique_num_ts"].argmax()]
        max_num_bugs = table.iloc[table["num_bugs"].argmax()]
        print("MAX _ unique_num_ts :")
        print(max_unique_num_ts)
        print("-" * 50)

        print("MAX _ num_bugs      :")
        print(max_num_bugs)
        # A row lookup is upcast to float once the float percentage columns
        # exist; convert back so the values stay usable as integers
        # (e.g. as a range() bound).
        return int(max_unique_num_ts["unique_num_ts"]), int(max_num_bugs["num_bugs"])

    def free_memory(self):
        """Drop the reference to the main frame."""
        self.df_main = None

# PlotNumtsNumbugs

In [6]:
class PlotNumtsNumbugs():

    # ******************************* plots_numts_bugclass

    def plots_numts_bugclass(self, max_unique_num_ts, bug_class_predictions_numts):
        """Scatter ``bug_class_2`` against ``num_ts`` for every bug report.

        Args:
            max_unique_num_ts: largest ``num_ts`` value; must be an integer
                because it is used as a ``range()`` bound for the x ticks.
            bug_class_predictions_numts: frame with ``num_ts`` and
                ``bug_class_2`` columns.
        """
        # figsize=(a, b) : a_width, b_height
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 3))

        axes.set_xlabel("Number of Timestamp", rotation=0, labelpad=10)
        axes.set_ylabel("Bug Class [0, 1]", rotation=90, labelpad=10)

        my_unit = 500
        # Right x-limit: the first multiple of my_unit strictly above the
        # maximum.  This equals ceil(max / unit) * unit when the maximum is
        # not a multiple of the unit and max + unit when it is; the former
        # "max / unit + unit" for the latter case placed the limit far left
        # of the data (e.g. 502 for a maximum of 1000).
        end_xlim = (max_unique_num_ts // my_unit + 1) * my_unit

        axes.set_xlim([-my_unit, end_xlim])
        axes.set_ylim([-0.5, 1.5])

        # Ticks every my_unit plus one exactly at the maximum.
        temp_xticks = list(range(0, max_unique_num_ts, my_unit))
        temp_xticks.append(max_unique_num_ts)
        axes.get_xaxis().set_ticks(temp_xticks)
        # Thousands separators on the x tick labels.
        axes.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

        axes.get_yaxis().set_ticks(list(range(0, 2, 1)))

        x = bug_class_predictions_numts["num_ts"].tolist()
        y = bug_class_predictions_numts["bug_class_2"].tolist()
        axes.scatter(x, y)

        axes.grid(True)
        fig.tight_layout(pad=3.0)
    

    # ******************************* plotdist

    def plotdist(self, max_unique_num_ts, df_numts_numbugs):
        """Plot bugs per distinct timestamp count and their cumulative sum.

        The top chart is a scatter of ``num_bugs`` and the bottom chart a
        line of ``cum_num_bugs``, both over ``unique_num_ts``.

        Args:
            max_unique_num_ts: largest ``unique_num_ts`` value; must be an
                integer because it is used as a ``range()`` bound for the
                x ticks.
            df_numts_numbugs: frame with ``unique_num_ts``, ``num_bugs`` and
                ``cum_num_bugs`` columns.
        """
        x = df_numts_numbugs["unique_num_ts"]
        y_ax1 = df_numts_numbugs["num_bugs"]
        y_ax2 = df_numts_numbugs["cum_num_bugs"]

        fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 7))
        ax1 = axes[0]
        ax2 = axes[1]

        # -------- set_xlabel
        tempXlabel = "Unique Number of Timestamp"
        ax1.set_xlabel(tempXlabel, rotation=0, labelpad=10)
        ax2.set_xlabel(tempXlabel, rotation=0, labelpad=10)

        # -------- set_ylabel
        ax1.set_ylabel("Number of Bug", rotation=90, labelpad=10)
        ax2.set_ylabel("Cumulative Number of Bug", rotation=90, labelpad=10)

        # -------- set_xlim
        my_unit = 500
        # Right x-limit: the first multiple of my_unit strictly above the
        # maximum.  This equals ceil(max / unit) * unit when the maximum is
        # not a multiple of the unit and max + unit when it is; the former
        # "max / unit + unit" for the latter case placed the limit far left
        # of the data (e.g. 502 for a maximum of 1000).
        end_xlim = (max_unique_num_ts // my_unit + 1) * my_unit

        tempXlim = [-my_unit, end_xlim]
        ax1.set_xlim(tempXlim)
        # The limits were previously applied to ax1 twice and never to ax2.
        ax2.set_xlim(tempXlim)

        # -------- get_xaxis().set_ticks
        # Ticks every my_unit plus one exactly at the maximum.
        temp_xticks = list(range(0, max_unique_num_ts, my_unit))
        temp_xticks.append(max_unique_num_ts)

        ax1.get_xaxis().set_ticks(temp_xticks)
        ax2.get_xaxis().set_ticks(temp_xticks)
        # Thousands separators on the x tick labels.
        ax1.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        ax2.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

        # -------- plot
        ax1.scatter(x, y_ax1)
        ax2.plot(x, y_ax2)

        # -------- grid
        ax1.grid(True)
        ax2.grid(True)

        # -------- fig.tight_layout
        fig.tight_layout(h_pad=5, w_pad=5)