In [13]:
import numpy as np
import pandas as pd
import os

#importing base learners of Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#Importing three component ensembles
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

#importing SVC for second-step classification
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score


In [14]:
#defining ml techniques
base_learner1 = LogisticRegression(random_state=1)
base_learner2 = DecisionTreeClassifier()
base_learner3 = GaussianNB()

ensemble1 = VotingClassifier(estimators=[('logregression', base_learner1), 
                                         ('dtree', base_learner2), 
                                         ('gnb', base_learner3)], 
                                          voting='soft')
ensemble2 = RandomForestClassifier()
ensemble3 = AdaBoostClassifier(n_estimators=50)


In [66]:
#predicts bug and their probabilities for given datafile and classifier pair

def predict_util(datafile, classifiertype):
    ncols = datafile.columns
    #extracting relevant columns, software metrics in X, and labels in Y
    if classifier == ensemble1:
        ncols = ncols[ :-1]
        X     = datafile.iloc[ : , :-1]
        X1    = datafile.as_matrix(ncols)
    elif classifier == ensemble2:
        ncols = ncols[ :-1]
        X     = datafile.iloc[ : , :-1]
        X1    = datafile.as_matrix(ncols)
    else:
        ncols = ncols[ :-1]
        X     = datafile.iloc[ : , :-1]
        X1    = datafile.as_matrix(ncols)
    y = datafile['bug_binarized']
    Y = np.array(y)
    
    #performing leave-one out validation for instances less than 100
    #and 10 fold validation for others
    npoints = X.shape[0]
   
    if npoints <= 100:
        kf = KFold(n_splits = npoints)
    else:
        kf = KFold(n_splits = 10)
        
    kf.get_n_splits(X)
    train_X = []
    train_Y  = []
    prediction   = []
    predict_prob = [] 
    
    for train_index, test_index in kf.split(X):
        print(classifier)
        if classifier == ensemble1:
            classifier = VotingClassifier(estimators=[
                                         ('logregression', base_learner1), 
                                         ('dtree', base_learner2), 
                                         ('gnb', base_learner3)], 
                                          voting='soft')      
        elif classifier == ensemble2:
            classifier = RandomForestClassifier()
        else:
            classifier = AdaBoostClassifier(n_estimators=50)
            
        print(classifier)
        for i in train_index:
                train_X.append(X1[i])
                train_Y.append(Y[i])
        
        print(train_X, train_Y)
        classifier.fit(train_X, train_Y)
        
        for j in test_index:
            prediction.append(classifier.predict([X1[j]])[0])
            predict_prob.append(classifier.predict_proba([X1[j]])[0][1])
        
        train_X  = []
        train_Y  = []
    
    #print(prediction, predict_prob)
    return prediction, Y, predict_prob
    



In [67]:
def computePerformanceMeasures(predictions, labels, prediction_probability):
    
    precision = precision_score(y_true=labels, y_pred=predictions)
    recall    = recall_score(y_true=labels, y_pred=predictions)
    roc_score = roc_auc_score(labels, prediction_probability)
    accuracy  = accuracy_score(y_true=labels, y_pred=predictions)
    f_measure = 2*(precision*recall)/float(precision+recall) 
    
    metrics = [precision, recall, roc_score, accuracy, f_measure]
    
    return metrics  

In [68]:
def predict():
    directory = 'dataset/dataset/'
    
    for projectName in os.listdir(directory):
        performanceMetrics = []
    
        inputData = pd.read_csv(directory + projectName, dtype={'bug_binarized':np.bool})
        projectData = pd.read_csv(directory + projectName, dtype={'bug_binarized':np.bool})
        
        metricsFrame = pd.DataFrame(performanceMetrics, 
                                    index = ['Precision', 'Recall', 'Auc_Score', 'Accuracy', 'F_Measure'])
        
        predictionEnsemble1, YEnsemble1, predict_probEnsemble1 = predict_util(inputData, ensemble1)
        projectData['Voting_Prediction'] = predictionEnsemble1
        projectData['Voting_Pred_Prob']  = predict_probEnsemble1
        VotingMetrics = computePerformanceMeasures(predictionEnsemble1, YEnsemble1, predict_probEnsemble1)
        metricsFrame.insert(loc = 0, column = 'Voting', value = VotingMetrics)
                
            
        predictionEnsemble2, YEnsemble2, predict_probEnsemble2 = predict_util(inputData, ensemble2)
        projectData['RandomForest_Prediction'] = predictionEnsemble2
        projectData['RandomForest_Pred_Prob']  = predict_probEnsemble2
        RandomForestMetrics = computePerformanceMeasures(predictionEnsemble2, YEnsemble2, predict_probEnsemble2)
        metricsFrame.insert(loc = 1, column='RandomForest', value = RandomForestMetrics)
        
        predictionEnsemble3, YEnsemble3, predict_probEnsemble3 = predict_util(inputData, ensemble3)
        projectData['AdaBoost_Prediction'] = predictionEnsemble3
        projectData['AdaBoost_Pred_Prob']  = predict_probEnsemble3
        AdaBoostMetrics = computePerformanceMeasures(predictionEnsemble3, YEnsemble3, predict_probEnsemble3)
        metricsFrame.insert(loc = 2, column='AdaBoost', value = AdaBoostMetrics)
        
        metricsFrame.to_csv('dataset/metrics/' + projectName)
        print(projectName)
        print(metricsFrame)
        
        projectData.to_csv('dataset/annotated/' + projectName)

In [64]:
def predict():
    directory = 'dataset/dataset/'
    
    #for projectName in os.listdir(directory):
    for x in range(0,1):
        projectName = 'zuzel.csv'
        performanceMetrics = []
    
        inputData = pd.read_csv(directory + projectName, dtype={'bug_binarized':np.bool})
        projectData = pd.read_csv(directory + projectName, dtype={'bug_binarized':np.bool})
        
        metricsFrame = pd.DataFrame(performanceMetrics, 
                                    index = ['Precision', 'Recall', 'Auc_Score', 'Accuracy', 'F_Measure'])
        
        print('Voting')
        predictionEnsemble1, YEnsemble1, predict_probEnsemble1 = predict_util(inputData, ensemble1)
        projectData['Voting_Prediction'] = predictionEnsemble1
        projectData['Voting_Pred_Prob']  = predict_probEnsemble1
        VotingMetrics = computePerformanceMeasures(predictionEnsemble1, YEnsemble1, predict_probEnsemble1)
        metricsFrame.insert(loc = 0, column = 'Voting', value = VotingMetrics)
                
        print('Random Forest')
        predictionEnsemble2, YEnsemble2, predict_probEnsemble2 = predict_util(inputData, ensemble2)
        projectData['RandomForest_Prediction'] = predictionEnsemble2
        projectData['RandomForest_Pred_Prob']  = predict_probEnsemble2
        RandomForestMetrics = computePerformanceMeasures(predictionEnsemble2, YEnsemble2, predict_probEnsemble2)
        metricsFrame.insert(loc = 1, column='RandomForest', value = RandomForestMetrics)
        
        print('Adaboost')
        predictionEnsemble3, YEnsemble3, predict_probEnsemble3 = predict_util(inputData, ensemble3)
        projectData['AdaBoost_Prediction'] = predictionEnsemble3
        projectData['AdaBoost_Pred_Prob']  = predict_probEnsemble3
        AdaBoostMetrics = computePerformanceMeasures(predictionEnsemble3, YEnsemble3, predict_probEnsemble3)
        metricsFrame.insert(loc = 2, column='AdaBoost', value = AdaBoostMetrics)
        
        metricsFrame.to_csv('dataset/metrics/' + projectName)
        print(projectName)
        print(projectData)
        print(metricsFrame)
        
        #projectData.to_csv('dataset/annotated/' + projectName)

In [65]:
import warnings
warnings.filterwarnings('ignore')
predict()

Voting
VotingClassifier(estimators=[('logregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('dtre..., presort=False, random_state=None,
            splitter='best')), ('gnb', GaussianNB(priors=None))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)
VotingClassifier(estimators=[('logregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('dtre..., presort=False, random_state=None,
            splitter='best')), ('gnb', GaussianNB(priors=None))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)
[arra

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
[array([  0.,  27., 177.,   1.]), array([14.      , 11.      , 72.      ,  0.533333]), array([7.00000e+00, 4.90000e+01, 4.24000e+02, 3.55556e-01]), array([  4.      ,  32.      , 169.      ,   0.229814]), array([8.00000e+00, 9.30000e+01, 1.41200e+03, 2.18182e-01]), array([  5. ,  44. , 276. ,   0.5]), array([ 3.      , 12.      , 69.      ,  0.545455]), array([ 1.      , 14.      , 66.      ,  0.458333]), array([1.50000e+01, 4.40000e+01, 1.76000e+03, 3.15789e-01]), array([  8.      ,  36.      , 375.      ,   0.461538]), array([1.00000e+01, 8.00000e+01, 9.38000e+02, 2.30769e-01]), array([  2.   ,  17.   , 160.   ,   0.375]), array([ 6.      , 14.      , 66.      ,  0.458333]), array([1.00000e+01, 5.60000e+01, 1.60600e+03, 2.77778e-01]), a

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
[array([  0.,  27., 177.,   1.]), array([14.      , 11.      , 72.      ,  0.533333]), array([7.00000e+00, 4.90000e+01, 4.24000e+02, 3.55556e-01]), array([  4.      ,  32.      , 169.      ,   0.229814]), array([8.00000e+00, 9.30000e+01, 1.41200e+03, 2.18182e-01]), array([  5. ,  44. , 276. ,   0.5]), array([ 3.      , 12.      , 69.      ,  0.545455]), array([  0.,  42., 841.,   1.]), array([ 1.      , 14.      , 66.      ,  0.458333]), array([1.50000e+01, 4.40000e+01, 1.76000e+03, 3.15789e-01]), array([  8.      ,  36.      , 375.      ,   0.461538]), array([1.00000e+01, 8.00000e+01, 9.38000e+02, 2.30769e-01]), array([  2.   ,  17.   , 160.   ,   0.375]), array([ 6.      , 14.      , 66.      ,  0.458333]), array([1.00000e+01, 5.60000e+

[array([  0.,  27., 177.,   1.]), array([14.      , 11.      , 72.      ,  0.533333]), array([7.00000e+00, 4.90000e+01, 4.24000e+02, 3.55556e-01]), array([  4.      ,  32.      , 169.      ,   0.229814]), array([8.00000e+00, 9.30000e+01, 1.41200e+03, 2.18182e-01]), array([ 3.      , 12.      , 69.      ,  0.545455]), array([  0.,  42., 841.,   1.]), array([ 1.      , 14.      , 66.      ,  0.458333]), array([1.50000e+01, 4.40000e+01, 1.76000e+03, 3.15789e-01]), array([  8.      ,  36.      , 375.      ,   0.461538]), array([1.00000e+01, 8.00000e+01, 9.38000e+02, 2.30769e-01]), array([  2.   ,  17.   , 160.   ,   0.375]), array([ 6.      , 14.      , 66.      ,  0.458333]), array([1.00000e+01, 5.60000e+01, 1.60600e+03, 2.77778e-01]), array([1.10000e+01, 6.30000e+01, 8.36000e+02, 2.31579e-01]), array([2.00e+01, 6.70e+01, 6.65e+02, 2.30e-01])] [False, False, True, True, True, False, False, False, True, False, True, False, False, True, True, True]
AdaBoostClassifier(algorithm='SAMME.R', ba

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
[array([  0.,  27., 177.,   1.]), array([14.      , 11.      , 72.      ,  0.533333]), array([7.00000e+00, 4.90000e+01, 4.24000e+02, 3.55556e-01]), array([  4.      ,  32.      , 169.      ,   0.229814]), array([8.00000e+00, 9.30000e+01, 1.41200e+03, 2.18182e-01]), array([  5. ,  44. , 276. ,   0.5]), array([ 3.      , 12.      , 69.      ,  0.545455]), array([  0.,  42., 841.,   1.]), array([ 1.      , 14.      , 66.      ,  0.458333]), array([1.50000e+01, 4.40000e+01, 1.76000e+03, 3.15789e-01]), array([  8.      ,  36.      , 375.      ,   0.461538]), array([1.00000e+01, 8.00000e+01, 9.38000e+02, 2.30769e-01]), array([  2.   ,  17.   , 160.   ,   0.375]), array([ 6.      , 14.      , 66.      ,  0.458333]), array([1.10000e+01, 6.30000e+

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
[array([  0.,  27., 177.,   1.]), array([14.      , 11.      , 72.      ,  0.533333]), array([7.00000e+00, 4.90000e+01, 4.24000e+02, 3.55556e-01]), array([  4.      ,  32.      , 169.      ,   0.229814]), array([8.00000e+00, 9.30000e+01, 1.41200e+03, 2.18182e-01]), array([  5. ,  44. , 276. ,   0.5]), array([ 3.      , 12.      , 69.      ,  0.545455]), array([ 1.      , 14.      , 66.      ,  0.458333]), array([1.50000e+01, 4.40000e+01, 1.76000e+03, 3.15789e-01]), array([  8.      ,  36.      , 375.      ,   0.461538]), array([1.00000e+01, 8.00000e+01, 9.38000e+02, 2.30769e-01]), array([  2.   ,  17.   , 160.   ,   0.375]), array([ 6.      , 14.      , 66.      ,  0.458333]), array([1.00000e+01, 5.60000e+01, 1.60600e+03, 2.77778e-01]), a

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
[array([  0.,  27., 177.,   1.]), array([14.      , 11.      , 72.      ,  0.533333]), array([7.00000e+00, 4.90000e+01, 4.24000e+02, 3.55556e-01]), array([  4.      ,  32.      , 169.      ,   0.229814]), array([8.00000e+00, 9.30000e+01, 1.41200e+03, 2.18182e-01]), array([  5. ,  44. , 276. ,   0.5]), array([ 3.      , 12.      , 69.      ,  0.545455]), array([  0.,  42., 841.,   1.]), array([ 1.      , 14.      , 66.      ,  0.458333]), array([1.50000e+01, 4.40000e+01, 1.76000e+03, 3.15789e-01]), array([  8.      ,  36.      , 375.      ,   0.461538]), array([1.00000e+01, 8.00000e+01, 9.38000e+02, 2.30769e-01]), array([  2.   ,  17.   , 160.   ,   0.375]), array([ 6.      , 14.      , 66.      ,  0.458333]), array([1.00000e+01, 5.60000e+

In [13]:
def bestEnsembleSelector():
    annotated_directory   = 'dataset/annotated/'
    performance_directory = 'dataset/metrics/'

    for projectName in os.listdir(annotated_directory):
        print(projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName, dtype={'bug_binarized':np.bool})
        metricData    = pd.read_csv(performance_directory + projectName)


        predictionMatrix = annotatedData.as_matrix(columns = ['bug_binarized','Voting_Prediction','AdaBoost_Prediction','RandomForest_Prediction'])
        print(metricData)
        print(metricData.loc[0, 'Voting'])
        
       # defining constants
        auc_score_constant    = 2     # auc_score is at the 2nd row
        voting_constant       = 'Voting'
        adaBoost_constant     = 'AdaBoost'
        randomForest_constant = 'RandomForest'
        
        ensemble=[]
        
        for i in range(len(predictionMatrix)):
            if   predictionMatrix[i][0] == predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('Voting')
            elif predictionMatrix[i][0] == predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('AdaBoost')
            elif predictionMatrix[i][0] == predictionMatrix[i][3] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2]:
                ensemble.append('RandomForest')
            else:
                p_voting       = metricData.loc[auc_score_constant, voting_constant]
                p_adaBoost     = metricData.loc[auc_score_constant, adaBoost_constant]
                p_randomForest = metricData.loc[auc_score_constant, randomForest_constant]

                if p_voting > p_adaBoost and p_voting > p_randomForest:
                    ensemble.append('Voting')

                elif p_adaBoost>p_randomForest and p_adaBoost > p_voting:
                    ensemble.append('AdaBoost')

                else:
                    ensemble.append('RandomForest')
                
        annotatedData['selectedEnsemble'] = ensemble
        annotatedData.to_csv(annotated_directory + projectName)


In [14]:
bestEnsembleSelector()

camel-1.6.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.718750      0.755814  0.755814
1     Recall  0.370968      0.349462  0.349462
2  Auc_Score  0.723048      0.722837  0.722837
3   Accuracy  0.800554      0.803324  0.803324
4  F_Measure  0.489362      0.477941  0.477941
0.71875
e-learning.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.833333      0.829787  0.829787
1     Recall  0.816327      0.795918  0.795918
2  Auc_Score  0.909605      0.902063  0.902063
3   Accuracy  0.879433      0.872340  0.872340
4  F_Measure  0.824742      0.812500  0.812500
0.8333333333333334
intercafe.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.666667      0.666667  0.666667
1     Recall  0.666667      0.666667  0.666667
2  Auc_Score  0.746032      0.746032  0.746032
3   Accuracy  0.739130      0.739130  0.739130
4  F_Measure  0.666667      0.666667  0.666667
0.6666666666666666
ivy-2.0.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Pre

In [15]:
def svctrain():
    directory = 'dataset/dataset/'
    annotated_directory = 'dataset/annotated/'
    DSE_directory = 'dataset/DSE/'
    for projectName in os.listdir(directory):
        print(projectName)
        projectData = pd.read_csv(directory + projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName)
        
        #X contains software metrics and Y best ensemble selected
        X = np.array(projectData.iloc[ : , :-1])
        Y = np.array(annotatedData.iloc[ : , -1])
        
        npoints = X.shape[0]
        
        if npoints <= 100:
            kf = KFold(n_splits = npoints)
        else:
            kf = KFold(n_splits = 10)
        
        kf.get_n_splits(X)
        train_X = []
        train_Y = []
        
        predictedEnsemble = []
        predict_prob      = []
        final_prediction  = []  # this stores the prediction(bugginess) of the best ensemble predicted by SVC
        
        prediction_constant = '_Prediction'
        probab_constant = '_Pred_Prob'
        
        
        for train_index, test_index in kf.split(X):
            classifier = SVC(probability = True)
            
            for i in train_index:
                train_X.append(X[i])
                train_Y.append(Y[i])
            
            unique_labels = np.unique(train_Y)
            if unique_labels.size == 1:
                for j in test_index:
                    predictedEnsemble.append(unique_labels[0])
                    predict_prob.append(annotatedData.loc[j, unique_labels[0] + probab_constant])
                    final_prediction.append(annotatedData.loc[j, unique_labels[0] + prediction_constant])
           
            else:
                classifier.fit(train_X, train_Y)
                
                for j in test_index:
                    predictedBestEnsemble = classifier.predict([X[j]])[0]
                    predictedEnsemble.append(predictedBestEnsemble)
                    final_prediction.append(annotatedData.loc[j, predictedBestEnsemble + prediction_constant])
                    
            # total probability of available classifiers, i.e the classifiers reported in unique_labels predicting true
                    predict_proba_true = 0
                    
            # probability of classifiers being predicted
                    predict_proba_classifiers = classifier.predict_proba([X[j]])[0]
                    k = 0
            # class probabilities are always reported in a sorted by name fashion, i.e AdaBoost, RandomForest, Voting 
            # np.unique also reports labels in a sorted by name fashion
                    for classifierName in unique_labels:
                        predict_proba_true +=  predict_proba_classifiers[k] * annotatedData.loc[j, classifierName + probab_constant]
                        k += 1
                    predict_prob.append(predict_proba_true)
                    
        annotatedData['PredictedEnsemble'] = predictedEnsemble
        annotatedData['DSE_Prediction'] = final_prediction
        annotatedData['DSE_Pred_Prob'] = predict_prob
        annotatedData.to_csv(DSE_directory + projectName)    


In [16]:
svctrain()

camel-1.6.csv
e-learning.csv
intercafe.csv
ivy-2.0.csv
jedit-4.3.csv
kalkulator.csv
log4j-1.2.csv
lucene-2.4.csv
poi-2.5.csv
prop-6.csv
redaktor.csv
serapion.csv
sklebagd.csv
synapse-1.2.csv
systemdata.csv
szybkafucha.csv
termoproject.csv
tomcat.csv
velocity-1.6.csv
workflow.csv
wspomaganiepi.csv
xalan-2.7.csv
xerces-1.4.csv
zuzel.csv


In [17]:
def computePerformanceMeasuresDSE():
    DSEdirectory = 'dataset/DSE/'
    projectMetrics = []
    index = 0
    projectMetrics = pd.DataFrame(projectMetrics,
                                    columns = ['Project','Precision', 'Recall', 'ROCScore', 'Accuracy', 'Fmeasure'])
    for projectName in os.listdir(DSEdirectory):
        project = pd.read_csv(DSEdirectory + projectName)
        projectData = project.as_matrix(columns=[
                                         'DSE_Prediction',
                                         'DSE_Pred_Prob',
                                         'bug_binarized'])
      
        row = []
        row.append(projectName)
        row.extend(computePerformanceMeasures(project['DSE_Prediction'], 
                                                         project['bug_binarized'], 
                                                         project['DSE_Pred_Prob']))
        projectMetrics.loc[index] = row
        index = index + 1
  
    print(projectMetrics)
    projectMetrics.to_csv('dataset/' + 'results.csv')

In [18]:
computePerformanceMeasuresDSE()   

              Project  Precision    Recall  ROCScore  Accuracy  Fmeasure
0       camel-1.6.csv   0.718750  0.370968  0.723088  0.800554  0.489362
1      e-learning.csv   0.833333  0.816327  0.909605  0.879433  0.824742
2       intercafe.csv   0.666667  0.666667  0.746032  0.739130  0.666667
3         ivy-2.0.csv   0.756098  0.746988  0.883647  0.861486  0.751515
4       jedit-4.3.csv   0.833333  0.650685  0.953546  0.846491  0.730769
5      kalkulator.csv   0.727273  0.800000  0.792857  0.791667  0.761905
6       log4j-1.2.csv   0.932773  0.932773  0.964113  0.921569  0.932773
7      lucene-2.4.csv   0.640000  0.603774  0.625130  0.604061  0.621359
8         poi-2.5.csv   0.861111  0.880682  0.888549  0.827715  0.870787
9          prop-6.csv   0.686275  0.614035  0.841028  0.815832  0.648148
10       redaktor.csv   0.933333  0.823529  0.930104  0.911765  0.875000
11       serapion.csv   0.428571  0.428571  0.702857  0.750000  0.428571
12       sklebagd.csv   0.833333  0.714286  0.47619