In [None]:
%matplotlib inline

import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

from scipy import stats

# Directory prefix the box-plot functions put in front of the PDF names they save.
FIGURE_PATH = '../figures/'
# NOTE(review): not referenced anywhere else in the visible notebook (the CSV
# path in the load cell is hard-coded); the name looks like a typo of
# DATA_PATH -- confirm before renaming or removing.
DATA_PATHZ = '../data/'

# Significance level handed to the table functions as their `thresh` argument.
THRESH = 0.00046  # bonferroni correction

In [None]:
def ll_values(values):
    """Signed log transform, element-wise: ``log10(|x| + 1) * sign(x)``.

    Large magnitudes are compressed, the sign is kept, and 0 maps to 0.
    """
    magnitude = np.log10(np.abs(values) + 1)
    return magnitude * np.sign(values)

def cliffsd(u, n1, n2):
    """Return the absolute effect size ``|2*u / (n1*n2) - 1|``.

    The callers in this notebook pass the Mann-Whitney U statistic as ``u``
    and the two sample sizes as ``n1`` and ``n2``.
    """
    pair_count = n1 * n2
    delta = 2 * u / pair_count - 1
    return abs(delta)

def cd_name(cd):
    """Translate an effect size into a one-letter magnitude label.

    ``'n'`` below 0.1, ``'s'`` in [0.1, 0.33), ``'m'`` in [0.33, 0.474) and
    ``'l'`` from 0.474 upwards. When no comparison holds (e.g. NaN) the
    function returns ``None``.
    """
    # Upper bounds are checked in ascending order, so the first hit wins.
    for upper_bound, label in ((0.1, 'n'), (0.33, 's'), (0.474, 'm')):
        if cd < upper_bound:
            return label
    if cd >= 0.474:
        return 'l'
    return None

def print_boxes_size(df, save=False):
    """Draw one box plot per change-size column (all / perf. / corr.).

    For each size column only the non-zero rows are plotted, in three boxes:
    every commit, commits with ``internal_quality == True`` and commits with
    ``external_quality == True``. A column is skipped (with a message) when
    either flagged group has fewer than three non-zero values.

    df   -- frame holding the size columns and the two boolean flag columns.
    save -- when true, each figure is also written as a PDF under FIGURE_PATH.
    """
    axis_labels = {'lines_added': '#lines added',
                   'lines_deleted': '#lines deleted',
                   'num_files': '#files modified',
                   'num_hunks': '#hunks'}

    for size_col, axis_label in axis_labels.items():
        nonzero = df[size_col] != 0

        everything = df.loc[nonzero, size_col].values
        perfective = df.loc[nonzero & (df['internal_quality'] == True), size_col].values
        corrective = df.loc[nonzero & (df['external_quality'] == True), size_col].values

        # Too few points in a flagged group to draw a meaningful box.
        if min(len(perfective), len(corrective)) < 3:
            print('skipping', size_col)
            continue

        fig, ax = plt.subplots(figsize=(3, 2.3))
        ax.boxplot([everything, perfective, corrective],
                   showfliers=False,
                   positions=[1, 1.5, 2])

        ax.set_xticklabels(['all', 'perf.', 'corr.'])
        ax.set_ylabel(axis_label)

        fig.tight_layout()

        if save:
            fig.savefig('{}/boxplot_{}.pdf'.format(FIGURE_PATH, size_col))

def print_boxes_metrics(df, density=False, save=True):
    """Draw one box plot per metric delta (all / perf. / corr.).

    For every metric the non-zero values of the plotted column are passed
    through ``ll_values`` (signed log10) and shown in three boxes: every
    commit, commits with ``internal_quality == True`` and commits with
    ``external_quality == True``.

    df      -- frame with ``delta_<metric>`` columns, ``lines_added``,
               ``lines_deleted`` and the two boolean flag columns.
    density -- when true, plot ``delta / (lines_added + lines_deleted + 1)``
               instead of the raw delta. The derived column is stored on
               ``df`` as ``density_delta_<metric>`` (side effect on the
               caller's frame, kept from the original implementation).
    save    -- when true (default, the previous unconditional behaviour) each
               figure is written as a PDF under FIGURE_PATH; density plots get
               a ``_density`` suffix in the file name.
    """
    metrics = ['McCC', 'LLOC', 'NLE', 'NUMPAR', 'CC', 'CLOC', 'CD', 'AD',
               'NOA', 'CBO', 'NII', 'Minor', 'Major', 'Critical']

    for m in metrics:
        col = 'delta_{}'.format(m)

        # Decide once which column is plotted instead of computing the raw
        # arrays first and throwing them away in the density case.
        if density:
            plot_col = 'density_{}'.format(col)
            df[plot_col] = df[col] / (df['lines_added'] + df['lines_deleted'] + 1)
            file_stem = '{}_density'.format(m)
        else:
            plot_col = col
            file_stem = m

        nonzero = df[plot_col] != 0
        a = ll_values(df[nonzero][plot_col].values)
        quali = ll_values(df[nonzero & (df['internal_quality'] == True)][plot_col].values)
        bugfix = ll_values(df[nonzero & (df['external_quality'] == True)][plot_col].values)

        fig = plt.figure(figsize=(3, 2.3))
        ax = fig.add_subplot(111)
        ax.boxplot([a, quali, bugfix], showfliers=False, positions=[1, 1.5, 2])

        ax.set_xticklabels(['all', 'perf.', 'corr.'])
        # The label always names the bare metric; the file name carries the
        # density suffix.
        ax.set_ylabel('log {} delta + 1'.format(m))
        ax.yaxis.set_major_formatter(FormatStrFormatter('% 1.2f'))

        plt.tight_layout()
        if save:
            plt.savefig('{}/boxplot_{}.pdf'.format(FIGURE_PATH, file_stem))

def print_tables_metrics(df, density=False, thresh=0.05):
    """Print LaTeX table rows comparing metric deltas between commit groups.

    For every metric two one-sided Mann-Whitney U tests are run: commits with
    ``internal_quality == False`` against ``== True``, and the same for
    ``external_quality``. Each row contains, per comparison, the difference of
    medians (unflagged minus flagged), the p-value (bold when below
    ``thresh``) and, only when significant, the effect size from ``cliffsd``
    with its ``cd_name`` label. If a test raises ``ValueError`` the message is
    printed and the row is filled with ``-`` placeholders.

    df      -- frame with ``delta_<metric>`` columns, ``lines_added``,
               ``lines_deleted`` and the two flag columns. Side effect:
               ``scaled_delta_<metric>`` and ``density_delta_<metric>``
               columns are (re)written on it.
    density -- when true, test the size-normalised ``density_`` columns
               instead of the raw deltas.
    thresh  -- significance level for bolding / reporting the effect size.
    """
    metrics = ['McCC', 'LLOC', 'NLE', 'NUMPAR', 'CC', 'CLOC', 'CD', 'AD',
               'NOA', 'CBO', 'NII', 'Minor', 'Major', 'Critical']

    # Direction of the one-sided test. The first sample handed to the test is
    # always the unflagged group. Most metrics are expected to be lower for
    # quality-improving changes, so the alternative is 'greater'. The metrics
    # listed here are expected to be higher instead, so the alternative is
    # 'less'.
    less = ['CLOC', 'CD', 'AD']

    # Derived columns are written to the caller's frame, as before. Nothing in
    # this function reads the scaled_ copies; they are kept so the frame ends
    # up with the same columns as it did previously.
    for m in metrics:
        f = 'delta_{}'.format(m)
        df['scaled_{}'.format(f)] = np.copy(df[f].values)
        df['density_{}'.format(f)] = df[f] / (df['lines_added'] + df['lines_deleted'] + 1)

    def _mwu_cells(unflagged, flagged, alternative):
        """Return the (p-value cell, effect-size cell) pair for one comparison."""
        u_stat, p_value = stats.mannwhitneyu(unflagged, flagged, alternative=alternative)
        p_cell = '{:.4f}'.format(p_value)
        effect_cell = '-'

        effect = cliffsd(u_stat, len(unflagged), len(flagged))
        label = cd_name(effect)
        if p_value < thresh:
            effect_cell = '{:.2f} ({})'.format(effect, label)
            p_cell = r'\textbf{' + p_cell + '}'
        return p_cell, effect_cell

    tbl = ''
    for m in metrics:
        col = 'delta_{}'.format(m)
        value_col = 'density_' + col if density else col

        quali = df[(df['internal_quality'] == True)][value_col].values
        no_quali = df[(df['internal_quality'] == False)][value_col].values

        bugfix = df[(df['external_quality'] == True)][value_col].values
        no_bugfix = df[(df['external_quality'] == False)][value_col].values

        # Look the direction up by metric name. The previous check used the
        # 'delta_'-prefixed column name, which never matched, so every metric
        # was tested with 'greater'.
        alternative = 'less' if m in less else 'greater'

        try:
            p1, qe = _mwu_cells(no_quali, quali, alternative)
            p2, bfe = _mwu_cells(no_bugfix, bugfix, alternative)

            tbl += r'{} & {:.4f} & {} & {} & {:.4f} & {} & {}\\'.format(
                m,
                np.median(no_quali) - np.median(quali), p1, qe,
                np.median(no_bugfix) - np.median(bugfix), p2, bfe) + "\n"
        except ValueError as e:
            # Report why the test could not be computed and keep the table
            # shape intact with placeholder cells.
            print(e)
            tbl += r'{} & '.format(m) + r'\multicolumn{3}{c}{-}' + ' & ' + r'\multicolumn{3}{c}{-}\\' + "\n"
    print(tbl)

def print_tables_size(df, thresh=0.05):
    """Print LaTeX table rows comparing change-size columns between groups.

    For each size column two two-sided Mann-Whitney U tests are run: commits
    with ``internal_quality == False`` against ``== True``, and the same for
    ``external_quality``. Each row contains, per comparison, the difference of
    medians (unflagged minus flagged), the p-value (bold when below
    ``thresh``) and, only when significant, the effect size from ``cliffsd``
    with its ``cd_name`` label. If a test raises ``ValueError`` the message is
    printed and the row is filled with ``-`` placeholders.

    df     -- frame with the four size columns and the two flag columns.
    thresh -- significance level for bolding / reporting the effect size.
    """
    # Raw strings: '\#' is not a valid Python escape sequence, so the plain
    # literals triggered an invalid-escape warning. The resulting text
    # (backslash followed by '#') is unchanged.
    names = {'lines_added': r'\#lines add',
             'lines_deleted': r'\#lines del',
             'num_files': r'\#files mod',
             'num_hunks': r'\#hunks'}

    def _mwu_cells(unflagged, flagged):
        """Return the (p-value cell, effect-size cell) pair for one comparison."""
        u_stat, p_value = stats.mannwhitneyu(unflagged, flagged, alternative='two-sided')
        p_cell = '{:.4f}'.format(p_value)
        effect_cell = '-'

        effect = cliffsd(u_stat, len(unflagged), len(flagged))
        label = cd_name(effect)
        if p_value < thresh:
            effect_cell = '{:.2f} ({})'.format(effect, label)
            p_cell = r'\textbf{' + p_cell + '}'
        return p_cell, effect_cell

    tbl = ''
    for m in ['lines_added', 'lines_deleted', 'num_files', 'num_hunks']:
        quali = df[(df['internal_quality'] == True)][m].values
        no_quali = df[(df['internal_quality'] == False)][m].values
        bugfix = df[(df['external_quality'] == True)][m].values
        no_bugfix = df[(df['external_quality'] == False)][m].values

        try:
            p1, qe = _mwu_cells(no_quali, quali)
            p2, bfe = _mwu_cells(no_bugfix, bugfix)

            tbl += r'{} & {} & {} & {} & {} & {} & {}\\'.format(
                names[m],
                np.median(no_quali) - np.median(quali), p1, qe,
                np.median(no_bugfix) - np.median(bugfix), p2, bfe) + "\n"
        except ValueError as e:
            # Report why the test could not be computed and keep the table
            # shape intact. The row uses the same display label as a normal
            # row; the raw column name contains '_' which LaTeX rejects in
            # text mode.
            print(e)
            tbl += r'{} & '.format(names[m]) + r'\multicolumn{3}{c}{-}' + ' & ' + r'\multicolumn{3}{c}{-}\\' + "\n"
    print(tbl)

def print_nz_table(df):
    """Print LaTeX rows with the share of non-zero deltas per metric.

    Each row holds the metric name followed by three percentages (two
    decimals): non-zero ``delta_<metric>`` values among all commits, among
    commits with ``internal_quality == True`` and among commits with
    ``external_quality == True``. When a group is empty its cell is ``-``
    instead of raising ``ZeroDivisionError``.

    df -- frame with the ``delta_<metric>`` columns and the two flag columns.
    """
    def _share(part, whole):
        # An empty group has no meaningful share; use the same '-' placeholder
        # the other table printers emit for missing cells.
        if whole == 0:
            return '-'
        return '{:.2f}'.format((part * 100) / whole)

    metrics = ['McCC', 'LLOC', 'NLE', 'NUMPAR', 'CC', 'CLOC', 'CD', 'AD',
               'NOA', 'CBO', 'NII', 'Minor', 'Major', 'Critical']

    # Group masks and sizes do not depend on the metric, so compute them once.
    is_quali = df['internal_quality'] == True
    is_bugfix = df['external_quality'] == True
    num = len(df)
    num_quali = int(is_quali.sum())
    num_bugfix = int(is_bugfix.sum())

    tbl = ''
    for m in metrics:
        nonzero = df['delta_{}'.format(m)] != 0

        tbl += r'{} & {} & {} & {}\\'.format(
            m,
            _share(int(nonzero.sum()), num),
            _share(int((nonzero & is_quali).sum()), num_quali),
            _share(int((nonzero & is_bugfix).sum()), num_bugfix)) + "\n"

    print(tbl)

In [None]:
# Load the commit sample. The functions above read the change-size columns,
# the delta_<metric> columns and the internal_quality / external_quality
# flags from this frame.
df = pd.read_csv('../data/sample_metrics_size_consensus.csv')

In [None]:
# Share of non-zero metric deltas: all commits vs. the two flagged groups.
print_nz_table(df)

In [None]:
# Mann-Whitney U table on size-normalised (density) metric deltas.
# Note: this call also adds scaled_/density_ columns to df.
print_tables_metrics(df, density=True, thresh=THRESH)

In [None]:
# Same table on the raw metric deltas.
print_tables_metrics(df, density=False, thresh=THRESH)

In [None]:
# Two-sided Mann-Whitney U table for the change-size columns.
print_tables_size(df, thresh=THRESH)

In [None]:
# Box plots of the change-size columns; save=True also writes the PDFs under FIGURE_PATH.
print_boxes_size(df, save=True)

In [None]:
# Box plots of the density metric deltas (PDF names end in _density).
print_boxes_metrics(df, density=True)

In [None]:
# Box plots of the raw metric deltas.
print_boxes_metrics(df, density=False)