In [137]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_auc_score, classification_report
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from new_JIT_defect_prediction_dir.my_util import *

import warnings
warnings.filterwarnings('ignore')

In [138]:
# Directory prefix for the RF prediction-result CSVs that eval_result reads
# (joined by plain string concatenation, so the trailing slash is required).
RF_data_dir = './text_metric_data/'

In [139]:
def get_recall_at_k_percent_effort(percent_effort, result_df_arg, real_buggy_commits):
    """Return the share of buggy commits reached within a LOC budget.

    Args:
        percent_effort: Budget as a percentage (0-100) of the total LOC, where
            the total is the 'cum_LOC' value of the last row of result_df_arg.
        result_df_arg: Ranked DataFrame carrying 'cum_LOC' and 'label' columns.
        real_buggy_commits: Collection of all truly buggy commits; only its
            length is used, as the recall denominator.

    Returns:
        Number of rows with label == 1 whose 'cum_LOC' fits in the budget,
        divided by len(real_buggy_commits).
    """
    total_loc = result_df_arg.iloc[-1]['cum_LOC']
    loc_budget = (percent_effort / 100) * total_loc

    # A row counts when it is both inside the budget and labelled buggy.
    inside_budget = result_df_arg['cum_LOC'] <= loc_budget
    labelled_buggy = result_df_arg['label'] == 1
    found_buggy = int((inside_budget & labelled_buggy).sum())

    return found_buggy / float(len(real_buggy_commits))

def eval_metrics(method, result_df):
    """Compute classification and effort-aware metrics for one model's predictions.

    Args:
        method: Name of the evaluated approach. It is not used in the
            computation and is kept only so existing callers keep working.
        result_df: DataFrame with the columns 'defective_commit_pred',
            'defective_commit_prob', 'label' and 'LOC'. The frame is copied
            first, so the caller's object is never modified.

    Returns:
        Tuple (prec, rec, f1, auc, FAR, dist_heaven, recall_20_percent_effort,
        effort_at_20_percent_LOC_recall, p_opt).

    Note:
        At least one row must have label == 1: the number of such rows is used
        as a divisor and the last of the top-20% buggy rows is indexed.
    """
    # Work on a copy: the density columns added below must not leak into the
    # DataFrame owned by the caller.
    result_df = result_df.copy()

    pred = result_df['defective_commit_pred']
    y_test = result_df['label']

    # Uses the hard predictions stored in the frame (original note: threshold = 0.5).
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, pred, average='binary')
    tn, fp, _fn, _tp = confusion_matrix(y_test, pred, labels=[0, 1]).ravel()

    FAR = fp/(fp+tn)  # false alarm rate
    dist_heaven = math.sqrt((pow(1-rec, 2)+pow(0-FAR, 2))/2.0)  # distance to heaven

    auc = roc_auc_score(y_test, result_df['defective_commit_prob'])

    result_df['defect_density'] = result_df['defective_commit_prob']/result_df['LOC']  # predicted defect density
    result_df['actual_defect_density'] = result_df['label']/result_df['LOC']  # actual defect density

    # Three rankings: by predicted density, by actual density (descending),
    # and by actual density ascending. sort_values returns new frames.
    result_df = result_df.sort_values(by='defect_density', ascending=False)
    actual_result_df = result_df.sort_values(by='actual_defect_density', ascending=False)
    actual_worst_result_df = result_df.sort_values(by='actual_defect_density', ascending=True)

    result_df['cum_LOC'] = result_df['LOC'].cumsum()
    actual_result_df['cum_LOC'] = actual_result_df['LOC'].cumsum()
    actual_worst_result_df['cum_LOC'] = actual_worst_result_df['LOC'].cumsum()

    real_buggy_commits = result_df[result_df['label'] == 1]

    # Recall when inspecting the first 20% of the total LOC in predicted order.
    recall_20_percent_effort = get_recall_at_k_percent_effort(20, result_df, real_buggy_commits)

    # Effort (fraction of total LOC) spent by the time the first 20% of the
    # buggy commits, in predicted order, have been reached.
    buggy_20_percent = real_buggy_commits.head(math.ceil(0.2 * len(real_buggy_commits)))
    buggy_20_percent_LOC = buggy_20_percent.iloc[-1]['cum_LOC']
    effort_at_20_percent_LOC_recall = int(buggy_20_percent_LOC) / float(result_df.iloc[-1]['cum_LOC'])

    # P_opt: recall-vs-effort curves sampled at 10%, 20%, ..., 100% effort.
    percent_steps = np.arange(10, 101, 10)
    percent_effort_list = [step/100 for step in percent_steps]
    predicted_recall_at_percent_effort_list = [
        get_recall_at_k_percent_effort(step, result_df, real_buggy_commits)
        for step in percent_steps
    ]
    actual_recall_at_percent_effort_list = [
        get_recall_at_k_percent_effort(step, actual_result_df, real_buggy_commits)
        for step in percent_steps
    ]
    actual_worst_recall_at_percent_effort_list = [
        get_recall_at_k_percent_effort(step, actual_worst_result_df, real_buggy_commits)
        for step in percent_steps
    ]

    # Each area is computed once and reused in the ratio below.
    actual_area = metrics.auc(percent_effort_list, actual_recall_at_percent_effort_list)
    predicted_area = metrics.auc(percent_effort_list, predicted_recall_at_percent_effort_list)
    actual_worst_area = metrics.auc(percent_effort_list, actual_worst_recall_at_percent_effort_list)

    p_opt = 1 - ((actual_area - predicted_area) / (actual_area - actual_worst_area))

    return prec, rec, f1, auc, FAR, dist_heaven, recall_20_percent_effort, effort_at_20_percent_LOC_recall, p_opt



In [145]:
def eval_result(proj_name, sampling_method='DE_SMOTE_min_df_3', output_path='./final_eval_result.csv'):
    """Evaluate every model's predictions for one project and save the metrics as CSV.

    Args:
        proj_name: Project identifier used to build the input file names
            (the notebook calls this with 'openstack' and 'qt').
        sampling_method: Infix of the RF prediction-result file name.
        output_path: Where the metrics table is written. The default is the
            same for every project, so pass a distinct path to keep the
            results of several projects side by side.

    Side effects:
        Reads five prediction-result CSVs and writes one CSV to output_path.

    NOTE(review): cc2vec_data_dir, deepJIT_data_dir, baseline_data_dir and
    prepare_data are not defined in the visible cells; presumably they come
    from the star import of my_util or from another cell - confirm.
    """
    cc2vec_result = pd.read_csv(cc2vec_data_dir+proj_name+'-CC2Vec-prediction-result.csv') # train+test
    cc2vec_result_train_only = pd.read_csv(cc2vec_data_dir+proj_name+'-CC2Vec-train-data-only-prediction-result.csv')
    deepJIT_result = pd.read_csv(deepJIT_data_dir+proj_name+'-DeepJIT-prediction-result.csv')
    LR_result = pd.read_csv(baseline_data_dir+proj_name+'_LR_baseline_prediction_result.csv')
    RF_result = pd.read_csv(RF_data_dir+proj_name+'_RF_'+sampling_method+'_prediction_result.csv')

    # Rename the columns of the new-format result files positionally.
    # (The former RF_baseline_result rename was dropped: that frame is never
    # loaded here, so touching it raised NameError.)
    RF_result.columns = ['Unnamed', 'defective_commit_prob','defective_commit_pred','label','test_commit']
    LR_result.columns = ['Unnamed', 'defective_commit_prob','defective_commit_pred','label','test_commit']

    test_combined_code, test_commit, test_label = prepare_data(proj_name, mode='test',use_text=True,
                                                          remove_python_common_tokens=False, data_dir='./new_JIT_defect_prediction_dir/dataset/')

    # Drop the LOC shipped with these files; LOC is recomputed below and merged in.
    # NOTE(review): cc2vec_result_train_only is not stripped of 'LOC' - confirm
    # that file has no such column, otherwise the merge yields LOC_x / LOC_y.
    cc2vec_result = cc2vec_result.drop('LOC',axis=1)
    deepJIT_result = deepJIT_result.drop('LOC',axis=1)

    # LOC of a commit = number of lines in its combined code text.
    RF_df = pd.DataFrame()
    RF_df['commit_id'] = test_commit
    RF_df['LOC'] = [len(code.splitlines()) for code in test_combined_code]

    # Attach LOC to every model's predictions via the commit id.
    all_df = {
        'RF': pd.merge(RF_df, RF_result,how='inner',left_on = 'commit_id', right_on='test_commit'),
        'LR': pd.merge(RF_df, LR_result,how='inner',left_on = 'commit_id', right_on='test_commit'),
        'deepJIT': pd.merge(RF_df, deepJIT_result,how='inner',left_on = 'commit_id', right_on='test_commit'),
        'cc2vec': pd.merge(RF_df, cc2vec_result,how='inner',left_on = 'commit_id', right_on='test_commit'),
        'cc2vec_train_only': pd.merge(RF_df, cc2vec_result_train_only,how='inner',left_on = 'commit_id', right_on='test_commit'),
    }

    # One record per method; key order fixes the CSV column order.
    rows = []
    for method, df in all_df.items():
        (prec, rec, f1, auc, FAR, dist_heaven, recall_20_percent_effort,
         effort_at_20_percent_LOC_recall, p_opt) = eval_metrics(method, df)
        rows.append({'methods': method, 'precision': prec, 'recall': rec, 'f1': f1, 'AUC': auc,
                     'FAR': FAR, 'D2H': dist_heaven, 'recall_20_percent_effort': recall_20_percent_effort,
                     'effort_20_percent_recall': effort_at_20_percent_LOC_recall, 'p_opt': p_opt})

    eval_result_df = pd.DataFrame(rows)
    eval_result_df.to_csv(output_path,index=False)


In [None]:
# Compute the metrics table for the 'openstack' project; eval_result writes it
# to ./final_eval_result.csv.
eval_result('openstack')

In [None]:
# Compute the metrics table for the 'qt' project.
# NOTE: eval_result writes to the same ./final_eval_result.csv by default, so
# this replaces the file produced by any earlier eval_result call.
eval_result('qt')