In [None]:
%matplotlib inline

import os
from dateutil.relativedelta import relativedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Project identifiers. Not referenced by the cells visible in this notebook
# section; presumably the projects covered by the result data -- TODO confirm.
PROJECTS = [
    'ant-ivy',
    'archiva',
    'calcite',
    'cayenne',
    'commons-bcel',
    'commons-beanutils',
    'commons-codec',
    'commons-collections',
    'commons-compress',
    'commons-configuration',
    'commons-dbcp',
    'commons-digester',
    'commons-io',
    'commons-jcs',
    'commons-jexl',
    'commons-lang',
    'commons-math',
    'commons-net',
    'commons-scxml',
    'commons-validator',
    'commons-vfs',
    'deltaspike',
    'eagle',
    'giraph',
    'gora',
    'jspwiki',
    'knox',
    'kylin',
    'lens',
    'mahout',
    'manifoldcf',
    'nutch',
    'opennlp',
    'parquet-mr',
    'santuario-java',
    'systemml',
    'tika',
    'wss4j',
]

# Input data and figure output directories, relative to the notebook.
DATA_PATH = '../data/'
FIGURES_PATH = '../figures/'

# Seed NumPy's global RNG: the box plots below draw random horizontal jitter
# for their scatter overlay, and the seed keeps that jitter repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load train/test data

In [None]:
# Train/test results table; every cell below reads from `r`.
# Columns used further down: project, label, metric_set, and the per-classifier
# score/bound columns lr_f1, lr_roc_auc, lr_ub, lr_lb, rf_f1, rf_roc_auc, rf_ub, rf_lb.
# os.path.join avoids the doubled separator that '{}/...'.format(DATA_PATH)
# produced, since DATA_PATH already ends with a slash.
r = pd.read_csv(os.path.join(DATA_PATH, 'train_test_all.csv'))

# Logistic Regression train/test plots

In [None]:
# One figure per (label, score) pair for the Logistic Regression results:
# a box plot of the score for each feature set, overlaid with the individual
# values as jittered, semi-transparent points.
lr_feature_sets = [('combined', 'jit_static_pmd'), ('jit', 'jit'), ('static', 'static'), ('pmd', 'pmd')]
lr_scores = [('F-measure', 'lr_f1'), ('AUC', 'lr_roc_auc')]

for lbl_name in ['bug_label', 'adhoc_label', 'pascarella_commit']:
    label_rows = r[r['label'] == lbl_name]
    for axis_label, score_col in lr_scores:
        fig, ax = plt.subplots(figsize=(5, 2))
        ax.set_title('Logistic Regression')

        tick_names = [shown_name for shown_name, _ in lr_feature_sets]
        samples = [
            label_rows.loc[label_rows['metric_set'] == set_key, score_col].values
            for _, set_key in lr_feature_sets
        ]

        ax.boxplot(samples)
        # Boxes sit at x = 1, 2, ...; scatter each sample around its box position.
        for box_pos, values in enumerate(samples, start=1):
            jitter_x = np.random.normal(box_pos, 0.05, size=len(values))
            ax.plot(jitter_x, values, 'k.', alpha=0.2)

        ax.set_xticklabels(tick_names)
        ax.set_ylabel(axis_label)
        ax.set_ylim([0, 1])
        # Optional export (disabled):
        #fig.tight_layout()
        #fig.savefig(FIGURES_PATH + '/tt_{}_{}.pdf'.format(score_col, lbl_name))
        plt.show()

# Random Forest train/test plots

In [None]:
# Random Forest counterpart of the train/test plots: for every label and every
# score column, draw one box per feature set and overlay the raw values as
# jittered points.
rf_feature_sets = {'combined': 'jit_static_pmd', 'jit': 'jit', 'static': 'static', 'pmd': 'pmd'}

for lbl_name in ('bug_label', 'adhoc_label', 'pascarella_commit'):
    for axis_label, score_col in (('F-measure', 'rf_f1'), ('AUC', 'rf_roc_auc')):
        fig, ax = plt.subplots(figsize=(5, 2))
        ax.set_title('Random Forest')

        # Collect the score values per feature set, in display order.
        per_set_values = []
        for set_key in rf_feature_sets.values():
            selected = (r['metric_set'] == set_key) & (r['label'] == lbl_name)
            per_set_values.append(r[selected][score_col].values)

        ax.boxplot(per_set_values)
        # The i-th box is drawn at x = i + 1; jitter the points around that position.
        for idx, values in enumerate(per_set_values):
            xs = np.random.normal(idx + 1, 0.05, size=len(values))
            ax.plot(xs, values, 'k.', alpha=0.2)

        ax.set_xticklabels(list(rf_feature_sets.keys()))
        ax.set_ylabel(axis_label)
        ax.set_ylim([0, 1])
        # Optional export (disabled):
        #fig.tight_layout()
        #fig.savefig(FIGURES_PATH + '/tt_{}_{}.pdf'.format(score_col, lbl_name))
        plt.show()

# Cost model

In [None]:
# Cost model summary, per classifier.
#
# Step 1: flatten `r` into one record per (project, label, feature set,
# classifier) carrying the `*_ub` / `*_lb` values of the first matching row
# (presumably the upper and lower bound of the cost model -- TODO confirm).
#
# Only the feature sets that are reported are kept. They are selected with an
# explicit whitelist so that a misspelled name cannot silently turn the filter
# into a no-op.
feature_sets = ['jit', 'static', 'pmd', 'jit_static_pmd']
classifiers = [('Random Forest', 'rf'), ('Logistic Regression', 'lr')]

dats = []
for project in r['project'].unique():
    for plot_name, lbl_name in [('bug', 'bug_label'), ('adhoc', 'adhoc_label')]:
        df = r[(r['project'] == project)
               & (r['label'] == lbl_name)
               & r['metric_set'].isin(feature_sets)]
        for metric_set in df['metric_set'].unique():
            tmp = df[df['metric_set'] == metric_set]
            for classifier, prefix in classifiers:
                dats.append({
                    'classifier': classifier,
                    'project': project,
                    'feature_set': metric_set,
                    'ub': tmp[prefix + '_ub'].values[0],
                    'lb': tmp[prefix + '_lb'].values[0],
                    'label': plot_name,
                })

test = pd.DataFrame(dats)

print('projects', test['project'].nunique())

# Step 2: for every label / classifier / feature set, count the records whose
# bounds are both finite and satisfy ub > lb. The validity mask does not depend
# on the loop variables, so it is computed once.
valid = test[np.isfinite(test['ub']) & np.isfinite(test['lb']) & (test['ub'] > test['lb'])]
for label in test['label'].unique():
    for cl in test['classifier'].unique():
        for ms in test['feature_set'].unique():
            hits = valid[(valid['label'] == label)
                         & (valid['classifier'] == cl)
                         & (valid['feature_set'] == ms)]
            print(label, cl, ms, len(hits))

In [None]:
# Cost model summary, pooled over both classifiers.
#
# Step 1: flatten `r` into one record per (project, label, feature set,
# classifier) carrying the `*_ub` / `*_lb` values of the first matching row
# (presumably the upper and lower bound of the cost model -- TODO confirm).
#
# Only the feature sets that are reported are kept. They are selected with an
# explicit whitelist so that a misspelled name cannot silently turn the filter
# into a no-op.
feature_sets = ['jit', 'static', 'pmd', 'jit_static_pmd']
classifiers = [('Random Forest', 'rf'), ('Logistic Regression', 'lr')]

dats = []
for project in r['project'].unique():
    for plot_name, lbl_name in [('bug', 'bug_label'), ('adhoc', 'adhoc_label')]:
        in_scope = ((r['project'] == project)
                    & (r['label'] == lbl_name)
                    & r['metric_set'].isin(feature_sets))
        df = r[in_scope]
        for metric_set in df['metric_set'].unique():
            tmp = df[df['metric_set'] == metric_set]
            for classifier, prefix in classifiers:
                dats.append({
                    'classifier': classifier,
                    'project': project,
                    'feature_set': metric_set,
                    'ub': tmp[prefix + '_ub'].values[0],
                    'lb': tmp[prefix + '_lb'].values[0],
                    'label': plot_name,
                })

test = pd.DataFrame(dats)

print('projects', test['project'].nunique())

# Step 2: for every label / feature set, count the records whose bounds are
# both finite and satisfy ub > lb. Each (project, feature set) contributes one
# record per classifier, so the count is halved to report the average over the
# two classifiers (which may be fractional).
valid = test[np.isfinite(test['ub']) & np.isfinite(test['lb']) & (test['ub'] > test['lb'])]
for label in test['label'].unique():
    for ms in test['feature_set'].unique():
        hits = valid[(valid['label'] == label) & (valid['feature_set'] == ms)]
        print(label, ms, len(hits) / 2)