In [None]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib as matplotlib

from scipy import stats

# const from Gierlappen
from const import PMD_RULES

# TANGLING projects: names of the study subjects, kept in alphabetical order.
PROJECTS = sorted([
    'ant-ivy', 'commons-bcel', 'commons-beanutils', 'commons-codec',
    'commons-collections', 'commons-compress', 'commons-configuration',
    'commons-dbcp', 'commons-digester', 'commons-io', 'commons-jcs',
    'commons-lang', 'commons-math', 'commons-net', 'commons-scxml',
    'commons-validator', 'commons-vfs', 'giraph', 'gora', 'opennlp',
    'parquet-mr', 'santuario-java', 'wss4j',
])

# Static Source Code Metrics from Sourcemeter homepage https://www.sourcemeter.com/resources/java/ 2018-07-24
STATIC = [
    'PDA', 'LOC', 'CLOC', 'PUA', 'McCC', 'LLOC', 'LDC', 'NOS', 'MISM', 'CCL', 'TNOS', 'TLLOC',
    'NLE', 'CI', 'HPL', 'MI', 'HPV', 'CD', 'NOI', 'NUMPAR', 'MISEI', 'CC', 'LLDC', 'NII', 'CCO',
    'CLC', 'TCD', 'NL', 'TLOC', 'CLLC', 'TCLOC', 'MIMS', 'HDIF', 'DLOC', 'NLM', 'DIT', 'NPA', 'TNLPM',
    'TNLA', 'NLA', 'AD', 'TNLPA', 'NM', 'TNG', 'NLPM', 'TNM', 'NOC', 'NOD', 'NOP', 'NLS', 'NG',
    'TNLG', 'CBOI', 'RFC', 'NLG', 'TNLS', 'TNA', 'NLPA', 'NOA', 'WMC', 'NPM', 'TNPM', 'TNS', 'NA',
    'LCOM5', 'NS', 'CBO', 'TNLM', 'TNPA',
]

# Just-in-time (change-level) feature column names.
JIT_FEATURES = [
    'comm', 'adev', 'add', 'del', 'own', 'minor', 'sctr', 'nadev', 'ncomm', 'nsctr', 'oexp', 'exp',
    'nd', 'entropy', 'la', 'ld', 'lt', 'age', 'nuc', 'cexp', 'sexp', 'rexp', 'fix_bug',
]

# Every static metric appears three times, in this order: parent_, current_, delta_.
STATIC_FEATURES = [
    prefix + metric
    for metric in STATIC
    for prefix in ('parent_', 'current_', 'delta_')
]
# Each PMD rule is expanded to parent_/current_/delta_ columns (in that order),
# followed by three aggregate warning-density columns.
PMD_FEATURES = [
    prefix + rule
    for rule in PMD_RULES
    for prefix in ('parent_', 'current_', 'delta_')
]
PMD_FEATURES.extend(['file_system_sum_WD', 'author_delta_sum_WD', 'system_WD'])

# Input data and output figure directories, relative to the notebook.
DATA_PATH = '../data/'
FIGURES_PATH = '../figures/'

# Seed NumPy's global random number generator.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
def load_project(file_path, project_name):
    """Load one project's CSV and collapse its per-issue columns into labels.

    Columns whose name has more than two ``__``-separated parts are treated as
    per-issue columns. They are grouped by labelling variant: names starting
    with ``adhoc``, or names containing ``JL+R``, ``JLMIV+R`` or ``JLMIVLV``.
    For each variant the columns are summed per row into ``label_<variant>``,
    a boolean ``label_<variant>_binary`` (sum > 0) is added, and the per-issue
    columns are dropped.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    project_name : str
        Value written to the added ``project`` column.

    Returns
    -------
    tuple or None
        ``(frame, num_commits, num_file_actions, num_issues)`` where
        ``num_commits`` is the number of distinct ``commit`` values,
        ``num_file_actions`` the number of rows and ``num_issues`` the number
        of distinct issue keys (second ``__`` part) among the ``JLMIVLV``
        columns. ``None`` is returned when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        # Kept from the original behaviour: a missing file yields None.
        return None

    frame = pd.read_csv(file_path)
    frame['project'] = project_name
    frame = frame.fillna(0)

    # remove bug matrix and only keep binary labels and counts
    adhoc_cols = []
    jlr_cols = []
    jlmivr_cols = []
    jlmivlv_cols = []
    for col in frame.columns:
        if len(col.split('__')) <= 2:
            continue
        if col.startswith('adhoc'):
            adhoc_cols.append(col)
            continue
        if 'JL+R' in col:
            jlr_cols.append(col)
        if 'JLMIV+R' in col:
            jlmivr_cols.append(col)
        if 'JLMIVLV' in col:
            jlmivlv_cols.append(col)

    # The issue key is the second '__'-separated part of the column name.
    issue_keys = {col.split('__')[1] for col in jlmivlv_cols}

    num_commits = frame['commit'].nunique()
    num_file_actions = len(frame)
    num_issues = len(issue_keys)

    # Fix: label_jlmivr was previously summed over the JL+R columns, which made
    # it a copy of label_jlr; it now uses its own JLMIV+R columns.
    label_columns = (
        ('label_jlr', jlr_cols),
        ('label_jlmivr', jlmivr_cols),
        ('label_jlmivlv', jlmivlv_cols),
        ('label_adhoc', adhoc_cols),
    )
    for label, cols in label_columns:
        frame[label] = frame[cols].sum(axis=1)
        frame[label + '_binary'] = frame[label] > 0

    frame = frame.drop(columns=adhoc_cols + jlr_cols + jlmivr_cols + jlmivlv_cols)
    return frame, num_commits, num_file_actions, num_issues

In [None]:
# Load every available project CSV, accumulate corpus-wide totals, and build
# one LaTeX-style table row per project plus a final "Sum" row.
dfs = []
aggs = []
tbl = ''
sums = {'commits': 0, 'file_actions': 0, 'issues': 0}
for project_name in PROJECTS:
    f = '{}/jit2_sn_{}_JLMIVLV_production_pmd6.csv'.format(DATA_PATH, project_name)

    # Projects without a CSV file are reported and skipped.
    if os.path.exists(f):
        print('loading', project_name)
    else:
        print('missing', project_name)
        continue

    df, num_commits, num_file_actions, num_issues = load_project(f, project_name)
    project_counts = (
        ('commits', num_commits),
        ('file_actions', num_file_actions),
        ('issues', num_issues),
    )
    for total_key, project_count in project_counts:
        sums[total_key] += project_count

    # Year of each change, used for the year span shown in the table.
    # Author's note: cutting off the first and last year was tried and made no difference.
    df['committer_date'] = pd.to_datetime(df['committer_date'], utc=True)
    df['year'] = df['committer_date'].dt.year
    min_year, max_year = df['year'].min(), df['year'].max()

    row = r'{} & {} & {} & {} & {}-{}\\'.format(project_name, num_commits, num_file_actions, num_issues, min_year, max_year)
    tbl += row + '\n'
    dfs.append(df)

sum_row = r'Sum & {} & {} & {} & \\'.format(sums['commits'], sums['file_actions'], sums['issues'])
tbl += sum_row + '\n'

# One frame for all projects; reset_index() keeps the old per-project index as a column.
df = pd.concat(dfs).reset_index()

# Study Subjects

In [None]:
# Print the accumulated LaTeX-style table rows (one per loaded project, then the "Sum" row).
print(tbl)

In [None]:
# Add difference columns to df: each target column is (first source column - second source column).
for diff_col, minuend_col, subtrahend_col in (
    ('diff_WD', 'current_WD', 'system_WD'),
    ('diff2_WD', 'parent_WD', 'parent_system_WD'),
    ('diff_default_WD', 'current_default_WD', 'current_system_default_WD'),
    ('diff2_default_WD', 'parent_default_WD', 'parent_system_default_WD'),
):
    df[diff_col] = df[minuend_col] - df[subtrahend_col]

In [None]:
# Boxplot of diff2_WD ("before") and diff_WD ("after") for rows with label_jlmivlv > 0.
inducing_mask = df['label_jlmivlv'] > 0
vs = df.loc[inducing_mask, 'diff_WD']
vs2 = df.loc[inducing_mask, 'diff2_WD']

fig, ax = plt.subplots()
ax.set_title("Bug inducing file changes: {}".format(len(vs)), fontsize=14)
ax.set_ylabel('fd(f)', fontsize=14)
ax.boxplot([vs2, vs], showfliers=False)
# Tick labels carry the median of each group.
tick_labels = [
    'before ({:.4f})'.format(np.median(vs2)),
    'after ({:.4f})'.format(np.median(vs)),
]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/inducing_wdf.pdf')
plt.show()

In [None]:
# Boxplot of decayed_file_system_sum_WD for rows with label_jlmivlv > 0.
inducing_mask = df['label_jlmivlv'] > 0
vs = df.loc[inducing_mask, 'decayed_file_system_sum_WD']

fig, ax = plt.subplots()
ax.set_title("Bug inducing file changes: {}".format(len(vs)), fontsize=14)
ax.set_ylabel('dfd(f)', fontsize=14)
ax.boxplot(vs, showfliers=False)
# The single tick label carries the group's median.
tick_labels = ['after ({:.4f})'.format(np.median(vs))]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/inducing_dfd.pdf')
plt.show()

In [None]:
# Boxplot of diff2_default_WD ("before") and diff_default_WD ("after") for rows with label_jlmivlv > 0.
inducing_mask = df['label_jlmivlv'] > 0
vs = df.loc[inducing_mask, 'diff_default_WD']
vs2 = df.loc[inducing_mask, 'diff2_default_WD']

fig, ax = plt.subplots()
ax.set_title("Bug inducing file changes: {}".format(len(vs)), fontsize=14)
ax.set_ylabel('fd(f) (only default)', fontsize=14)
ax.boxplot([vs2, vs], showfliers=False)
# Tick labels carry the median of each group.
tick_labels = [
    'before ({:.4f})'.format(np.median(vs2)),
    'after ({:.4f})'.format(np.median(vs)),
]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/inducing_wdf_default.pdf')
plt.show()

In [None]:
# Boxplot of decayed_file_system_sum_default_WD for rows with label_jlmivlv > 0.
inducing_mask = df['label_jlmivlv'] > 0
vs = df.loc[inducing_mask, 'decayed_file_system_sum_default_WD']

fig, ax = plt.subplots()
ax.set_title("Bug inducing file changes: {}".format(len(vs)), fontsize=14)
ax.set_ylabel('dfd(f) (only default)', fontsize=14)
ax.boxplot(vs, showfliers=False)
# The single tick label carries the group's median.
tick_labels = ['after ({:.4f})'.format(np.median(vs))]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/inducing_dfd_default.pdf')
plt.show()

In [None]:
# One box per project for decayed_file_system_sum_WD on rows with label_jlmivlv > 0,
# with projects ordered by ascending median.
fig, ax = plt.subplots(figsize=(12, 12))

dfd_column = 'decayed_file_system_sum_WD'
inducing_mask = df['label_jlmivlv'] > 0

# sort by median first
project_medians = {}
for project_name in df['project'].unique():
    project_values = df.loc[inducing_mask & (df['project'] == project_name), dfd_column].values
    project_medians[project_name] = np.median(project_values)

data = []
xticks = []
for project_name, median in sorted(project_medians.items(), key=lambda item: item[1]):
    project_rows = df[inducing_mask & (df['project'] == project_name)]
    data.append(project_rows[dfd_column])
    # Tick label: project name followed by its number of selected rows.
    xticks.append(project_name + ' ({})'.format(len(project_rows)))

ax.set_title('Bug inducing file changes', fontsize=14)
ax.set_ylabel('dfd(f)', fontsize=14)
ax.boxplot(data, showfliers=False)
ax.set_xticklabels(xticks, rotation=90, fontsize=14)
# Thin reference line at zero.
ax.axhline(y=0, color='gray', linewidth=0.5)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/inducing_dfd_all.pdf')
plt.show()

In [None]:
# Compare diff_WD between rows with label_jlmivlv > 0 ("bug inducing") and == 0 ("other").
label_counts = df['label_jlmivlv']
vs = df.loc[label_counts == 0, 'diff_WD']
vs2 = df.loc[label_counts > 0, 'diff_WD']

fig, ax = plt.subplots()
ax.set_ylabel('fd(f)', fontsize=14)
ax.boxplot([vs2, vs], showfliers=False)
# Tick labels carry the median of each group.
tick_labels = [
    'bug inducing ({:.4f})'.format(np.median(vs2)),
    'other ({:.4f})'.format(np.median(vs)),
]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/compare_wdf.pdf')
plt.show()

In [None]:
# Compare diff_default_WD between rows with label_jlmivlv > 0 ("bug inducing") and == 0 ("other").
label_counts = df['label_jlmivlv']
vs = df.loc[label_counts == 0, 'diff_default_WD']
vs2 = df.loc[label_counts > 0, 'diff_default_WD']

fig, ax = plt.subplots()
ax.set_ylabel('fd(f) (only default)', fontsize=14)
ax.boxplot([vs2, vs], showfliers=False)
# Tick labels carry the median of each group.
tick_labels = [
    'bug inducing ({:.4f})'.format(np.median(vs2)),
    'other ({:.4f})'.format(np.median(vs)),
]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/compare_default_wdf.pdf')
plt.show()

In [None]:
# Compare decayed_file_system_sum_WD between rows with label_jlmivlv > 0 ("bug inducing") and == 0 ("other").
label_counts = df['label_jlmivlv']
vs = df.loc[label_counts == 0, 'decayed_file_system_sum_WD']
vs2 = df.loc[label_counts > 0, 'decayed_file_system_sum_WD']

fig, ax = plt.subplots()
ax.set_ylabel('dfd(f)', fontsize=14)
ax.boxplot([vs2, vs], showfliers=False)
# Tick labels carry the median of each group.
tick_labels = [
    'bug inducing ({:.4f})'.format(np.median(vs2)),
    'other ({:.4f})'.format(np.median(vs)),
]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/compare_dfd.pdf')
plt.show()

In [None]:
# Compare decayed_file_system_sum_default_WD between rows with label_jlmivlv > 0 ("bug inducing") and == 0 ("other").
label_counts = df['label_jlmivlv']
vs = df.loc[label_counts == 0, 'decayed_file_system_sum_default_WD']
vs2 = df.loc[label_counts > 0, 'decayed_file_system_sum_default_WD']

fig, ax = plt.subplots()
ax.set_ylabel('dfd(f) (only default)', fontsize=14)
ax.boxplot([vs2, vs], showfliers=False)
# Tick labels carry the median of each group.
tick_labels = [
    'bug inducing ({:.4f})'.format(np.median(vs2)),
    'other ({:.4f})'.format(np.median(vs)),
]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/compare_default_dfd.pdf')
plt.show()

In [None]:
def cliffsd(u, n1, n2):
    """Return the absolute effect size ``|2*u / (n1*n2) - 1|``.

    ``u`` is a U statistic (the caller passes the Mann-Whitney U value) and
    ``n1``, ``n2`` are the two sample sizes. The sign is discarded, so the
    result does not say which sample tends to be larger.
    """
    pair_count = n1 * n2
    return abs(2 * u / pair_count - 1)

def cd_name(cd):
    """Map an effect size to a one-letter magnitude code.

    Returns 'n' below 0.1, 's' in [0.1, 0.33), 'm' in [0.33, 0.474) and 'l'
    from 0.474 upwards (presumably negligible / small / medium / large).
    Values that satisfy none of the comparisons (e.g. NaN) yield None.
    """
    for upper_bound, code in ((0.1, 'n'), (0.33, 's'), (0.474, 'm')):
        if cd < upper_bound:
            return code
    if cd >= 0.474:
        return 'l'
    return None


# Significance threshold: 0.05 divided by (4 + 8).
thresh = 0.05 / (4 + 8)

# Row selectors: label_jlmivlv > 0 ("inducing") versus label_jlmivlv == 0 ("non inducing").
inducing_mask = df['label_jlmivlv'] > 0
non_inducing_mask = df['label_jlmivlv'] == 0

inducing = df.loc[inducing_mask, 'diff_WD']
non_inducing = df.loc[non_inducing_mask, 'diff_WD']

default_inducing = df.loc[inducing_mask, 'diff_default_WD']
default_non_inducing = df.loc[non_inducing_mask, 'diff_default_WD']

dfd_inducing = df.loc[inducing_mask, 'decayed_file_system_sum_WD']
dfd_non_inducing = df.loc[non_inducing_mask, 'decayed_file_system_sum_WD']

default_dfd_inducing = df.loc[inducing_mask, 'decayed_file_system_sum_default_WD']
default_dfd_non_inducing = df.loc[non_inducing_mask, 'decayed_file_system_sum_default_WD']

def print_tblline(x, y, thresh, alternative='greater'):
    """Run a Mann-Whitney U test of ``x`` against ``y`` and print one table line.

    The printed cells, separated by ``&``, are: ``len(y)``, ``len(x)``, the U
    statistic, ``median(y)``, ``median(x)``, the p-value and the effect size.
    P-values below 0.0001 are printed as ``\\textless0.0001``. When the p-value
    is below ``thresh`` it is wrapped in ``\\textbf{}`` and the effect size
    (from ``cliffsd``, with its ``cd_name`` code) is shown; otherwise the
    effect-size cell is ``-``.

    ``alternative`` is passed through to ``scipy.stats.mannwhitneyu``.
    """
    u_stat, p_value = stats.mannwhitneyu(x, y, alternative=alternative)
    effect = cliffsd(u_stat, len(y), len(x))
    magnitude = cd_name(effect)

    p_text = r'\textless0.0001' if p_value < 0.0001 else '{:.4f}'.format(p_value)
    if p_value < thresh:
        effect_text = '{:.2f} ({})'.format(effect, magnitude)
        p_text = r'\textbf{' + p_text + '}'
    else:
        effect_text = '-'

    cells = (len(y), len(x), u_stat, np.median(y), np.median(x), p_text, effect_text)
    print('{:,} & {:,} & {:,} & {:.4f} & {:.4f} & {} & {}'.format(*cells))

print(thresh)

# One two-sided test per (inducing, non-inducing) pair, printed in this order.
comparisons = [
    (inducing, non_inducing),
    (default_inducing, default_non_inducing),
    # not significant either way
    (dfd_inducing, dfd_non_inducing),
    (default_dfd_inducing, default_dfd_non_inducing),
]
for inducing_sample, other_sample in comparisons:
    print_tblline(inducing_sample, other_sample, thresh, alternative='two-sided')

In [None]:
# normality test
# Print the scipy.stats.anderson result for each sample, in the same order as the table lines.
anderson_samples = [
    ('inducing', inducing),
    ('other', non_inducing),
    ('default_inducing', default_inducing),
    ('default_other', default_non_inducing),
    ('dfd_inducing', dfd_inducing),
    ('dfd_other', dfd_non_inducing),
    ('default_dfd_inducing', default_dfd_inducing),
    ('default_dfd_other', default_dfd_non_inducing),
]
for sample_label, sample in anderson_samples:
    print(sample_label, stats.anderson(sample))

In [None]:
# Compare the 'comm' column between rows with label_jlmivlv > 0 ("bug inducing") and == 0 ("other").
label_counts = df['label_jlmivlv']
vs = df.loc[label_counts > 0, 'comm']
vs2 = df.loc[label_counts == 0, 'comm']

fig, ax = plt.subplots()
ax.set_ylabel('#changes', fontsize=14)
ax.boxplot([vs, vs2], showfliers=False)
# Tick labels carry each group's median and sample count.
tick_labels = [
    'bug inducing, median: {},\nsamples: {}'.format(np.median(vs), len(vs)),
    'other, median: {},\nsamples: {}'.format(np.median(vs2), len(vs2)),
]
ax.set_xticklabels(tick_labels, fontsize=14)
fig.tight_layout()
fig.savefig(FIGURES_PATH + '/inducing_comm.pdf')
plt.show()