In [None]:
%matplotlib inline

import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

from scipy import stats

# Output directories: the plotting helpers below write PDFs to FIGURE_PATH and
# PNGs to PNG_PATH. DATA_PATH names the input data directory.
FIGURE_PATH = '../figures/'
PNG_PATH = '../images/'
DATA_PATH = '../data/'

# Significance level before correcting for multiple testing.
SIGNIFICANCE_LEVEL = 0.05

# Number of statistical tests used for the multiple-testing correction:
# 4 size metrics and 14 code metrics, each for 2 groups (perfective, corrective)
# with 3 tests per metric and group (normality tests plus the Mann-Whitney U test).
# 4 * 2 * 3 + 14 * 2 * 3 + 14 * 2 * 3 = 192
# NOTE(review): the third term presumably accounts for the tests on the
# parent_* metrics (print_tables_parents) -- confirm.
NUM_TESTS = 192 

# Bonferroni-corrected threshold: a p-value must be below alpha / #tests.
THRESH = SIGNIFICANCE_LEVEL / NUM_TESTS

# The 14 code metric names. The helpers below look them up in the data frame
# with a 'parent_', 'current_' or 'delta_' prefix and use the part before the
# first underscore as the display label.
COLUMBUSQM_FEATURES = ['McCC_file', 'LLOC_file', 'NLE_method_sum', 'NUMPAR_method_sum', 'CC_class_sum', 'CLOC_class_sum', 'CD_class_sum', 'AD_class_sum', 'NOA_class_sum', 'CBO_class_sum', 'NII_class_sum', 'Minor', 'Major', 'Critical']

In [None]:
def ll_values(values):
    """Apply a sign-preserving log transform: sign(x) * log10(|x| + 1).

    Zero maps to zero and negative inputs stay negative, which makes the
    transform usable for metric deltas that can go in both directions.

    Args:
        values: numpy array (or array-like) of numbers.

    Returns:
        numpy array of the transformed values, same shape as the input.
    """
    magnitude = np.abs(values) + 1
    return np.log10(magnitude) * np.sign(values)

def cliffsd(u, n1, n2):
    """Return the magnitude of Cliff's delta computed from a Mann-Whitney U statistic.

    Uses |2U / (n1 * n2) - 1|; the sign (direction of the effect) is dropped.

    Args:
        u: Mann-Whitney U statistic.
        n1: size of the first sample.
        n2: size of the second sample.

    Returns:
        Non-negative effect size.
    """
    pairs = n1 * n2
    return abs((2 * u) / pairs - 1)

def cd_name(cd):
    """Map an effect size to a one-letter magnitude label.

    Returns 'n' below 0.1, 's' below 0.33, 'm' below 0.474 and 'l' from 0.474
    upwards (presumably negligible / small / medium / large). A value that
    compares false against every bound (e.g. NaN) yields None.
    """
    upper_bounds = ((0.1, 'n'), (0.33, 's'), (0.474, 'm'))
    for bound, label in upper_bounds:
        if cd < bound:
            return label
    if cd >= 0.474:
        return 'l'
    return None

def plot_parent_density(df, feature_names):
    """Plot and save density curves of the pre-change metric values.

    For every metric one figure is drawn with two density curves: rows flagged
    ``internal_quality`` (legend 'perf.') and rows flagged ``external_quality``
    (legend 'corr.'). Values are read from the ``parent_<metric>`` column and
    passed through ``ll_values`` before plotting. Each figure is saved below
    ``FIGURE_PATH`` as ``density_parent_<metric>.pdf``.

    Args:
        df: DataFrame with the boolean columns ``internal_quality`` and
            ``external_quality`` and one ``parent_<metric>`` column per metric.
        feature_names: iterable of metric names.
    """
    for metric in feature_names:
        column = 'parent_{}'.format(metric)

        groups = [
            ('perf.', ll_values(df[df['internal_quality'] == True][column].values)),
            ('corr.', ll_values(df[df['external_quality'] == True][column].values)),
        ]

        figure = plt.figure(figsize=(3, 2.3))
        axis = figure.gca()

        # Each group is plotted from its own single-column frame (the two
        # groups generally differ in length); the column name is the legend entry.
        for legend, values in groups:
            frame = pd.DataFrame()
            frame[legend] = values
            frame.plot(kind='density', ax=axis)

        axis.set_xlabel('log {} + 1'.format(metric.split('_')[0]))
        plt.tight_layout()
        plt.savefig('{}/density_parent_{}.pdf'.format(FIGURE_PATH, metric))
    
def print_boxes_size(df, save=False, predictions=False, only_changes=False):
    """Draw box plots of the change-size metrics for all / perfective / corrective rows.

    For each of lines added, lines deleted, files modified and hunks, rows with
    a zero value are dropped and three boxes are drawn: all remaining rows,
    those flagged ``internal_quality`` ('perf.') and those flagged
    ``external_quality`` ('corr.'). A metric is skipped (with a printed note)
    when either flagged group has fewer than three values.

    Args:
        df: DataFrame with the four size columns and the two boolean flags.
        save: when True, write each figure as PDF (FIGURE_PATH) and PNG (PNG_PATH).
        predictions: adds the suffix ``_predictions`` to the saved file name.
        only_changes: adds the suffix ``_only_changes`` to the saved file name.
    """
    labels = {'lines_added': '#lines added',
              'lines_deleted': '#lines deleted',
              'files_modified': '#files modified',
              'num_hunks': '#hunks'}

    for m in labels:
        is_nonzero = df[m] != 0

        everything = df[is_nonzero][m].values
        perfective = df[is_nonzero & (df['internal_quality'] == True)][m].values
        corrective = df[is_nonzero & (df['external_quality'] == True)][m].values

        # Too few observations in one of the groups for a meaningful box
        if min(len(perfective), len(corrective)) < 3:
            print('skipping', m)
            continue

        figure = plt.figure(figsize=(3, 2.3))
        axis = figure.add_subplot(111)
        axis.boxplot([everything, perfective, corrective], showfliers=False, positions=[1, 1.5, 2])

        axis.set_xticklabels(['all', 'perf.', 'corr.'])
        axis.set_ylabel('{}'.format(labels[m]))

        plt.tight_layout()

        if not save:
            continue

        # Build the file name stem from the metric plus optional suffixes
        stem = m
        if predictions:
            stem = '{}_predictions'.format(stem)
        if only_changes:
            stem = '{}_only_changes'.format(stem)

        plt.savefig('{}/boxplot_{}.pdf'.format(FIGURE_PATH, stem))
        plt.savefig('{}/boxplot_{}.png'.format(PNG_PATH, stem))

def print_boxes_metrics(df, feature_names, density=False, predictions=False, only_changes=False):
    """Draw and save box plots of metric deltas for all / perfective / corrective rows.

    For each metric the ``delta_<metric>`` column is used (or, with
    ``density=True``, that delta divided by lines added + lines deleted + 1).
    Zero values are dropped, the rest is transformed with ``ll_values`` and
    three boxes are drawn: all rows, rows flagged ``internal_quality``
    ('perf.') and rows flagged ``external_quality`` ('corr.'). Every figure is
    written as PDF to FIGURE_PATH and as PNG to PNG_PATH.

    Side effect: with ``density=True`` a ``density_delta_<metric>`` column is
    added to (or overwritten in) ``df``.

    Args:
        df: DataFrame with ``delta_<metric>`` columns, ``lines_added``,
            ``lines_deleted`` and the two boolean flags.
        feature_names: iterable of metric names.
        density: plot the size-normalised delta instead of the raw delta.
        predictions: adds the suffix ``_predictions`` to the file name.
        only_changes: adds the suffix ``_only_changes`` to the file name.
    """
    for m in feature_names:
        col = 'delta_{}'.format(m)

        # Decide on the column first so the filtering and the log transform
        # are done once, only for the values that are actually plotted.
        if density:
            raw_col = col
            col = 'density_{}'.format(raw_col)
            df[col] = df[raw_col] / (df['lines_added'] + df['lines_deleted'] + 1)

        changed = df[col] != 0
        a = ll_values(df[changed][col].values)
        quali = ll_values(df[changed & (df['internal_quality'] == True)][col].values)
        bugfix = ll_values(df[changed & (df['external_quality'] == True)][col].values)

        fig = plt.figure(figsize=(3, 2.3))
        ax = fig.add_subplot(111)
        ax.boxplot([a, quali, bugfix], showfliers=False, positions=[1, 1.5, 2])

        ax.set_xticklabels(['all', 'perf.', 'corr.'])
        ax.set_ylabel('log {} delta + 1'.format(m.split('_')[0]))
        ax.yaxis.set_major_formatter(FormatStrFormatter('% 1.2f'))

        plt.tight_layout()

        # File name stem: metric plus one suffix per active option
        stem = m
        if density:
            stem = '{}_density'.format(stem)
        if predictions:
            stem = '{}_predictions'.format(stem)
        if only_changes:
            stem = '{}_only_changes'.format(stem)

        plt.savefig('{}/boxplot_{}.pdf'.format(FIGURE_PATH, stem))
        plt.savefig('{}/boxplot_{}.png'.format(PNG_PATH, stem))

def print_tables_metrics(df, feature_names, density=False, thresh=0.05):
    """Print LaTeX table rows with one-sided Mann-Whitney U results for metric deltas.

    For every metric two comparisons are made on ``delta_<metric>`` (or on
    the size-normalised ``density_delta_<metric>`` when ``density=True``):
    rows with ``internal_quality == False`` vs. ``True`` and rows with
    ``external_quality == False`` vs. ``True``. Each row of the printed table
    holds the metric label, then p-value and effect size for both comparisons.
    A p-value below ``thresh`` is set in bold and accompanied by the effect
    size (``cliffsd``) with its magnitude label (``cd_name``); otherwise the
    effect size cell is '-'.

    Side effect: adds ``scaled_delta_<metric>`` and ``density_delta_<metric>``
    columns to ``df`` for every metric.

    Args:
        df: DataFrame with ``delta_<metric>`` columns, ``lines_added``,
            ``lines_deleted`` and the two boolean flags.
        feature_names: iterable of metric names.
        density: test the delta divided by (lines added + lines deleted + 1).
        thresh: significance threshold for the p-values.
    """
    tbl = ''

    # Direction of the one-sided test. The first sample passed to
    # mannwhitneyu is always the "other" changes, the second the flagged
    # (quality improving) changes.
    # For the comment/documentation metrics below we expect the flagged
    # changes to have the larger values, so the alternative is 'less'.
    # For every other metric we expect the flagged changes to have the lower
    # values, so the alternative is 'greater'.
    less = {'CLOC_class_sum', 'CD_class_sum', 'AD_class_sum'}

    for m in feature_names:
        f = 'delta_{}'.format(m)
        # NOTE(review): the scaled_ copy is not read in this function; it is
        # kept because other code may rely on the column being present.
        df['scaled_{}'.format(f)] = np.copy(df[f].values)
        df['density_{}'.format(f)] = df[f] / (df['lines_added'] + df['lines_deleted'] + 1)

    for m in feature_names:
        label = m.split('_')[0]
        col = 'delta_{}'.format(m)
        if density:
            col = 'density_' + col

        quali = df[(df['internal_quality'] == True)][col].values
        no_quali = df[(df['internal_quality'] == False)][col].values

        bugfix = df[(df['external_quality'] == True)][col].values
        no_bugfix = df[(df['external_quality'] == False)][col].values

        # The lookup must use the bare metric name: the prefixed column name
        # ('delta_...') never matches an entry of `less`.
        alternative = 'less' if m in less else 'greater'

        try:
            cells = []
            for rest, flagged in ((no_quali, quali), (no_bugfix, bugfix)):
                mwu, mwu_p = stats.mannwhitneyu(rest, flagged, alternative=alternative)

                p_txt = '{:.4f}'.format(mwu_p)
                if mwu_p < 0.0001:
                    p_txt = r'\textless0.0001'

                cd = cliffsd(mwu, len(rest), len(flagged))
                effect = '-'
                if mwu_p < thresh:
                    effect = '{:.2f} ({})'.format(cd, cd_name(cd))
                    p_txt = r'\textbf{' + p_txt + '}'

                cells.extend([p_txt, effect])

            tbl += r'{} & {} & {} & {} & {}\\'.format(label, *cells) + "\n"
        except ValueError as e:
            # mannwhitneyu rejected the input: report why and emit a
            # placeholder row so the table keeps one row per metric.
            print(e)
            tbl += r'{} & '.format(label) + r'\multicolumn{2}{c}{-}' + ' & ' + r'\multicolumn{2}{c}{-}\\' + "\n"
    print(tbl)

def print_tables_size(df, thresh=0.05):
    """Print LaTeX table rows with two-sided Mann-Whitney U results for the size metrics.

    For lines added, lines deleted, files modified and hunks, two comparisons
    are made: rows with ``internal_quality == False`` vs. ``True`` and rows
    with ``external_quality == False`` vs. ``True``. Each printed row holds
    the metric label, then p-value and effect size for both comparisons. A
    p-value below ``thresh`` is set in bold and accompanied by the effect size
    (``cliffsd``) with its magnitude label (``cd_name``); otherwise the effect
    size cell is '-'.

    Args:
        df: DataFrame with the four size columns and the two boolean flags.
        thresh: significance threshold for the p-values.
    """
    tbl = ''
    # Raw strings: '\#' is the LaTeX escape for '#', not a Python escape sequence.
    names = {'lines_added': r'\#lines add',
             'lines_deleted': r'\#lines del',
             'files_modified': r'\#files mod',
             'num_hunks': r'\#hunks'}

    for col, label in names.items():
        quali = df[(df['internal_quality'] == True)][col].values
        no_quali = df[(df['internal_quality'] == False)][col].values
        bugfix = df[(df['external_quality'] == True)][col].values
        no_bugfix = df[(df['external_quality'] == False)][col].values

        try:
            cells = []
            for rest, flagged in ((no_quali, quali), (no_bugfix, bugfix)):
                mwu, mwu_p = stats.mannwhitneyu(rest, flagged, alternative='two-sided')

                p_txt = '{:.4f}'.format(mwu_p)
                if mwu_p < 0.0001:
                    p_txt = r'\textless0.0001'

                cd = cliffsd(mwu, len(rest), len(flagged))
                effect = '-'
                if mwu_p < thresh:
                    effect = '{:.2f} ({})'.format(cd, cd_name(cd))
                    p_txt = r'\textbf{' + p_txt + '}'

                cells.extend([p_txt, effect])

            tbl += r'{} & {} & {} & {} & {}\\'.format(label, *cells) + "\n"
        except ValueError as e:
            # mannwhitneyu rejected the input: report why and emit a
            # placeholder row. The LaTeX label is used here as well, because
            # the raw column name contains underscores that are not valid in
            # LaTeX text mode.
            print(e)
            tbl += r'{} & '.format(label) + r'\multicolumn{2}{c}{-}' + ' & ' + r'\multicolumn{2}{c}{-}\\' + "\n"
    print(tbl)

def print_nz_table(df, feature_names):
    """Print LaTeX rows with the share of non-zero deltas per metric.

    Each row lists the metric label followed by three percentages: rows with a
    non-zero ``delta_<metric>`` among all rows, among rows flagged
    ``internal_quality`` (perfective) and among rows flagged
    ``external_quality`` (corrective).

    Args:
        df: DataFrame with ``delta_<metric>`` columns and the two boolean flags.
        feature_names: iterable of metric names.
    """
    total = len(df)
    row_format = r'{} & {:.2f} & {:.2f} & {:.2f}\\'

    rows = []
    for m in feature_names:
        nonzero = df['delta_{}'.format(m)] != 0
        perfective = df['internal_quality'] == True
        corrective = df['external_quality'] == True

        # Plain Python ints so that an empty group raises ZeroDivisionError
        share_all = (int(nonzero.sum()) * 100) / total
        share_perfective = (int((nonzero & perfective).sum()) * 100) / int(perfective.sum())
        share_corrective = (int((nonzero & corrective).sum()) * 100) / int(corrective.sum())

        rows.append(row_format.format(m.split('_')[0], share_all, share_perfective, share_corrective))

    print(''.join(row + "\n" for row in rows))
    
# how were the files before the change?
def parent_files(df, features, save=False, predictions=False):
    """Box-plot and tabulate the pre-change metric values per modified file.

    For every metric the ``parent_<metric>`` value is divided by
    ``files_modified`` and shown as three boxes: all rows, rows flagged
    ``internal_quality`` ('perf.') and rows flagged ``external_quality``
    ('corr.'). After all metrics are processed, a LaTeX table with the three
    medians per metric is printed.

    Side effect: adds the columns ``fp_<metric>`` (parent value per file) and
    ``fc_<metric>`` (current value per file) to ``df``.

    Args:
        df: DataFrame with ``parent_<metric>`` and ``current_<metric>``
            columns, ``files_modified`` and the two boolean flags.
        features: iterable of metric names.
        save: when True, write each figure as PDF (FIGURE_PATH) and PNG (PNG_PATH).
        predictions: adds the suffix ``_predictions`` to the saved file name.
    """
    rows = []
    for metric in features:
        per_file = 'fp_{}'.format(metric)
        df[per_file] = df['parent_{}'.format(metric)] / df['files_modified']
        df['fc_{}'.format(metric)] = df['current_{}'.format(metric)] / df['files_modified']

        everything = df[per_file].values
        perfective = df[(df['internal_quality'] == True)][per_file].values
        corrective = df[(df['external_quality'] == True)][per_file].values

        figure = plt.figure(figsize=(3, 2.3))
        axis = figure.add_subplot(111)
        axis.boxplot([everything, perfective, corrective], showfliers=False, positions=[1, 1.5, 2])

        axis.set_xticklabels(['all', 'perf.', 'corr.'])
        # Second '_'-separated part of 'fp_<metric>' is the short metric label
        axis.set_ylabel(r'$\sum ' + str(per_file.split('_')[1]) + r'/changed files$')
        plt.tight_layout()

        name = per_file
        if save:
            if predictions:
                name = '{}_predictions'.format(name)
            plt.savefig('{}/boxplot_parent_{}.pdf'.format(FIGURE_PATH, name))
            plt.savefig('{}/boxplot_parent_{}.png'.format(PNG_PATH, name))

        rows.append(r'{} & {:.2f} & {:.2f} & {:.2f}\\'.format(
            name.replace('fp_', '').split('_')[0],
            np.median(everything), np.median(perfective), np.median(corrective)))

    print(''.join(row + '\n' for row in rows))
    
def print_tables_parents(df, feature_names, thresh=0.05):
    """Print LaTeX table rows with two-sided Mann-Whitney U results for pre-change metrics.

    For every metric the ``parent_<metric>`` values are compared twice: rows
    with ``internal_quality == False`` vs. ``True`` and rows with
    ``external_quality == False`` vs. ``True``. The test is two-sided because
    there is no expectation about the direction. Each printed row holds the
    metric label, then p-value and effect size for both comparisons. A p-value
    below ``thresh`` is set in bold and accompanied by the effect size with
    its magnitude label; otherwise the effect size cell is '-'.

    Args:
        df: DataFrame with ``parent_<metric>`` columns and the two boolean flags.
        feature_names: iterable of metric names.
        thresh: significance threshold for the p-values.
    """
    row_format = r'{} & {} & {} & {} & {}\\'
    placeholder = r'\multicolumn{2}{c}{-}' + ' & ' + r'\multicolumn{2}{c}{-}\\'

    output = ''
    for m in feature_names:
        column = 'parent_{}'.format(m)
        label = m.split('_')[0]

        perfective = df[(df['internal_quality'] == True)][column].values
        not_perfective = df[(df['internal_quality'] == False)][column].values

        corrective = df[(df['external_quality'] == True)][column].values
        not_corrective = df[(df['external_quality'] == False)][column].values

        comparisons = ((not_perfective, perfective), (not_corrective, corrective))

        try:
            cells = []
            for rest, flagged in comparisons:
                u_stat, p_value = stats.mannwhitneyu(rest, flagged, alternative='two-sided')

                p_cell = '{:.4f}'.format(p_value)
                if p_value < 0.0001:
                    p_cell = r'\textless0.0001'

                delta = cliffsd(u_stat, len(rest), len(flagged))
                magnitude = cd_name(delta)

                effect_cell = '-'
                if p_value < thresh:
                    effect_cell = '{:.2f} ({})'.format(delta, magnitude)
                    p_cell = r'\textbf{' + p_cell + '}'

                cells += [p_cell, effect_cell]

            output += row_format.format(label, *cells) + "\n"
        except ValueError as e:
            # Test could not be computed: report why and emit a placeholder row
            print(e)
            output += r'{} & '.format(label) + placeholder + "\n"
    print(output)
    
def print_project_table(df):
    """Print a LaTeX overview table with one row per project plus a totals row.

    Per project (sorted by name) the row lists: first and last commit year,
    number of rows, number of manually labelled rows (``is_manual``), manually
    labelled rows flagged ``internal_quality`` / ``external_quality``, and all
    rows flagged ``internal_quality`` / ``external_quality``. After a
    ``\\midrule`` the column sums are printed.

    Side effect: ``df['committer_date']`` is converted to datetime in place.

    Args:
        df: DataFrame with the columns ``project``, ``committer_date``,
            ``is_manual``, ``internal_quality`` and ``external_quality``.
    """
    df['committer_date'] = pd.to_datetime(df['committer_date'])

    count_cells = r'{:,} & {:,} & {:,} & {:,} & {:,} & {:,}\\'
    totals = [0, 0, 0, 0, 0, 0]
    lines = []

    for project_name in sorted(df['project'].unique()):
        project = df[df['project'] == project_name]
        dates = project['committer_date']
        first_year, last_year = dates.min().year, dates.max().year

        manual = project['is_manual'] == True
        perfective = project['internal_quality'] == True
        corrective = project['external_quality'] == True

        # Order matches the table columns and the totals list
        counts = [len(project),
                  int(manual.sum()),
                  int((manual & perfective).sum()),
                  int((manual & corrective).sum()),
                  int(perfective.sum()),
                  int(corrective.sum())]

        lines.append('{} & {}-{} & '.format(project_name, first_year, last_year) + count_cells.format(*counts))
        totals = [running + count for running, count in zip(totals, counts)]

    lines.append(r'\midrule')
    lines.append(' & & ' + count_cells.format(*totals))
    print(''.join(line + "\n" for line in lines))

In [None]:
# Load the complete change data set. pandas infers the gzip compression from
# the file suffix. Every table and figure below is derived from this frame.
df = pd.read_csv('../data/all_changes_sebert.csv.gz')

In [None]:
# Copy the subset so that the helper functions, which add columns to the frame
# they receive, write into an independent frame and not into a slice of df.
gt = df[df['is_manual'] == True].copy()  # ground truth (only manually labeled)

In [None]:
# Project overview table (LaTeX rows on stdout).
# Note: this converts df['committer_date'] to datetime in place.
print_project_table(df)

In [None]:
# Two-sided MWU tests on the pre-change (parent_*) metrics, manually labelled rows only.
print_tables_parents(gt, COLUMBUSQM_FEATURES, thresh=THRESH)

In [None]:
# Same tests on the pre-change metrics, this time on all rows.
print_tables_parents(df, COLUMBUSQM_FEATURES, thresh=THRESH)

In [None]:
# Share of non-zero metric deltas (all / perfective / corrective), manually labelled rows only.
print_nz_table(gt, COLUMBUSQM_FEATURES)

In [None]:
# Share of non-zero metric deltas (all / perfective / corrective), all rows.
print_nz_table(df, COLUMBUSQM_FEATURES)

In [None]:
# MWU tests on the metric deltas normalised by change size
# (delta / (lines added + lines deleted + 1)), manually labelled rows only.
print_tables_metrics(gt, COLUMBUSQM_FEATURES, density=True, thresh=THRESH)

In [None]:
# MWU tests on the size-normalised metric deltas, all rows.
print_tables_metrics(df, COLUMBUSQM_FEATURES, density=True, thresh=THRESH)

In [None]:
# Two-sided MWU tests on the change-size metrics, all rows.
print_tables_size(df, thresh=THRESH)

In [None]:
# Two-sided MWU tests on the change-size metrics, manually labelled rows only.
print_tables_size(gt, thresh=THRESH)

In [None]:
# Box plots of the change-size metrics for all rows; files are saved with the '_predictions' suffix.
print_boxes_size(df, save=True, predictions=True)

In [None]:
# Box plots of the change-size metrics for the manually labelled rows; saved without suffix.
print_boxes_size(gt, save=True, predictions=False)

In [None]:
# Box plots of the size-normalised metric deltas for all rows
# (saved with the '_density_predictions' suffix).
print_boxes_metrics(df, COLUMBUSQM_FEATURES, density=True, predictions=True, only_changes=False)

In [None]:
# Box plots of the size-normalised metric deltas for the manually labelled rows
# (saved with the '_density' suffix).
print_boxes_metrics(gt, COLUMBUSQM_FEATURES, density=True, predictions=False, only_changes=False)

In [None]:
# Pre-change metric values per modified file: box plots plus a table of medians, all rows.
# Note: adds fp_* and fc_* columns to df.
parent_files(df, COLUMBUSQM_FEATURES, save=True, predictions=True)

In [None]:
# Pre-change metric values per modified file for the manually labelled rows.
# Note: adds fp_* and fc_* columns to gt.
parent_files(gt, COLUMBUSQM_FEATURES, save=True, predictions=False)

In [None]:
# Density plots of the log-transformed pre-change metrics (perfective vs. corrective), all rows.
# One PDF per metric is written to FIGURE_PATH.
plot_parent_density(df, COLUMBUSQM_FEATURES)