In [1]:
import numpy as np
import pandas as pd
import os

#importing base learners of Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#Importing three component ensembles
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

#importing SVC for second-step classification
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score


In [2]:
#defining ml techniques
base_learner1 = LogisticRegression(random_state=1)
base_learner2 = DecisionTreeClassifier()
base_learner3 = GaussianNB()

ensemble1 = VotingClassifier(estimators=[('logregression', base_learner1), 
                                         ('dtree', base_learner2), 
                                         ('gnb', base_learner3)], 
                                          voting='soft')
ensemble2 = RandomForestClassifier()
ensemble3 = AdaBoostClassifier(n_estimators=50)


In [3]:
#predicts bug and their probabilities for given datafile and classifier pair

def predict_util(datafile, classifier):
    ncols = datafile.columns
    #extracting relevant columns, software metrics in X, and labels in Y
    if classifier == ensemble1:
        ncols = ncols[ :-1]
        X     = datafile.iloc[ : , :-1]
        X1    = datafile.as_matrix(ncols)
    elif classifier == ensemble2:
        ncols = ncols[ :-3]
        X     = datafile.iloc[ : , :-3]
        X1    = datafile.as_matrix(ncols)
    else:
        ncols = ncols[ :-5]
        X     = datafile.iloc[ : , :-5]
        X1    = datafile.as_matrix(ncols)
    y = datafile['bug_binarized']
    Y = np.array(y)
    
    #performing leave-one out validation for instances less than 100
    #and 10 fold validation for others
    npoints = X.shape[0]
   
    if npoints <= 100:
        kf = KFold(n_splits = npoints)
    else:
        kf = KFold(n_splits = 10)
        
    kf.get_n_splits(X)
    train_X = []
    train_Y  = []
    prediction   = []
    predict_prob = [] 
    
    for train_index, test_index in kf.split(X):

        if classifier == ensemble1:
            classifier = VotingClassifier(estimators=[('logregression', base_learner1), 
                                         ('dtree', base_learner2), 
                                         ('gnb', base_learner3)], 
                                          voting='soft')      
        elif classifier == ensemble2:
            classifier = RandomForestClassifier()
        else:
            classifier = AdaBoostClassifier(n_estimators=50)
            

        for i in train_index:
                train_X.append(X1[i])
                train_Y.append(Y[i])
        
        classifier.fit(train_X, train_Y)
        
        for j in test_index:
            prediction.append(classifier.predict([X1[j]])[0])
            predict_prob.append(classifier.predict_proba([X1[j]])[0][1])
        
        train_X  = []
        train_Y  = []
        
    return prediction, Y, predict_prob
    



In [4]:
#file=pd.read_csv('dataset/dataset/camel-1.6.csv' , dtype={'bug_binarized':np.bool})
#predict_util(file, ensemble3)

In [5]:
def computePerformanceMeasures(predictions, labels, prediction_probability):
    
    precision = precision_score(y_true=labels, y_pred=predictions)
    recall    = recall_score(y_true=labels, y_pred=predictions)
    roc_score = roc_auc_score(labels, prediction_probability)
    accuracy  = accuracy_score(y_true=labels, y_pred=predictions)
    f_measure = 2*(precision*recall)/float(precision+recall) 
    
    metrics = [precision, recall, roc_score, accuracy, f_measure]
    
    return metrics  

In [6]:
def predict():
    directory = 'dataset/dataset/'
    
    for projectName in os.listdir(directory):
        performanceMetrics = []
    
        projectData = pd.read_csv(directory + projectName, dtype={'bug_binarized':np.bool})
        
        metricsFrame = pd.DataFrame(performanceMetrics, 
                                    index = ['Precision', 'Recall', 'Auc_Score', 'Accuracy', 'F_Measure'])
        
        predictionEnsemble1, YEnsemble1, predict_probEnsemble1 = predict_util(projectData, ensemble1)
        projectData['Voting_Prediction'] = predictionEnsemble1
        projectData['Voting_Pred_Prob']  = predict_probEnsemble1
        VotingMetrics = computePerformanceMeasures(predictionEnsemble1, YEnsemble1, predict_probEnsemble1)

        metricsFrame.insert(loc = 0, column = 'Voting', value = VotingMetrics)
                
        predictionEnsemble2, YEnsemble2, predict_probEnsemble2 = predict_util(projectData, ensemble2)
        projectData['RandomForest_Prediction'] = predictionEnsemble2
        projectData['RandomForest_Pred_Prob']  = predict_probEnsemble2
        RandomForestMetrics = computePerformanceMeasures(predictionEnsemble2, YEnsemble2, predict_probEnsemble2)
        metricsFrame.insert(loc=1, column='RandomForest', value = RandomForestMetrics)
        
        predictionEnsemble3, YEnsemble3, predict_probEnsemble3 = predict_util(projectData, ensemble3)
        projectData['AdaBoost_Prediction'] = predictionEnsemble3
        projectData['AdaBoost_Pred_Prob']  = predict_probEnsemble3
        AdaBoostMetrics = computePerformanceMeasures(predictionEnsemble2, YEnsemble2, predict_probEnsemble2)
        metricsFrame.insert(loc=2, column='AdaBoost', value = AdaBoostMetrics)
        
        metricsFrame.to_csv('dataset/metrics/' + projectName)

        projectData.to_csv('dataset/annotated/' + projectName)

In [7]:
import warnings
warnings.filterwarnings('ignore')
predict()

In [8]:
def bestEnsembleSelector():
    annotated_directory   = 'dataset/annotated/'
    performance_directory = 'dataset/metrics/'

    for projectName in os.listdir(annotated_directory):
        print(projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName, dtype={'bug_binarized':np.bool})
        metricData    = pd.read_csv(performance_directory + projectName)


        predictionMatrix = annotatedData.as_matrix(columns = ['bug_binarized','Voting_Prediction','AdaBoost_Prediction','RandomForest_Prediction'])
        print(metricData)
        print(metricData.loc[0, 'Voting'])
        
       # defining constants
        f_measure_constant    = 4     # f-measure is at the 4th row
        auc_score_constant    = 2     # auc_score is at the 2nd row
        voting_constant       = 'Voting'
        adaBoost_constant     = 'AdaBoost'
        randomForest_constant = 'RandomForest'
        
        ensemble=[]
        
        for i in range(len(predictionMatrix)):
            if   predictionMatrix[i][0] == predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('Voting')
            elif predictionMatrix[i][0] == predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('AdaBoost')
            elif predictionMatrix[i][0] == predictionMatrix[i][3] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2]:
                ensemble.append('RandomForest')
            else:
                f_voting       = metricData.loc[f_measure_constant, voting_constant]
                f_adaboost     = metricData.loc[f_measure_constant, adaBoost_constant]
                f_randomForest = metricData.loc[f_measure_constant, randomForest_constant]
                flag=True
                if f_voting > f_adaboost and f_voting > f_randomForest:
                    ensemble.append('Voting')
                    flag=False
                elif f_adaboost > f_randomForest and f_randomForest > f_voting:
                    ensemble.append('AdaBoost')
                    flag=False
                else:
                    ensemble.append('RandomForest')
                    flag=False
            if flag == True:
                p_voting       = metricData[auc_score_constant][voting_constant]
                p_adaBoost     = metricData[auc_score_constant][adaBoost_constant]
                p_randomForest = metricData[auc_score_constant][randomForest_constant]

                if p_voting > p_adaBoost and p_voting > p_randomForest:
                    ensemble.append('Voting')
                    flag=False

                elif p_adaBoost>p_randomForest and p_adaBoost>p_voting:
                    ensemble.append('AdaBoost')
                    flag=False

                else:
                    ensemble.append('RandomForest')
                    flag=False
            if flag == True:
                ensemble.append('Voting')
                
        annotatedData['selectedEnsemble'] = ensemble
        annotatedData.to_csv(annotated_directory + projectName)


In [9]:
bestEnsembleSelector()

camel-1.6.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.718750      0.750000  0.750000
1     Recall  0.370968      0.354839  0.354839
2  Auc_Score  0.723068      0.730290  0.730290
3   Accuracy  0.800554      0.803324  0.803324
4  F_Measure  0.489362      0.481752  0.481752
0.71875
e-learning.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.833333      0.812500  0.812500
1     Recall  0.816327      0.795918  0.795918
2  Auc_Score  0.909605      0.899290  0.899290
3   Accuracy  0.879433      0.865248  0.865248
4  F_Measure  0.824742      0.804124  0.804124
0.8333333333333334
intercafe.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.666667      0.666667  0.666667
1     Recall  0.666667      0.666667  0.666667
2  Auc_Score  0.746032      0.746032  0.746032
3   Accuracy  0.739130      0.739130  0.739130
4  F_Measure  0.666667      0.666667  0.666667
0.6666666666666666
ivy-2.0.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Pre

In [10]:
def computePerformanceMeasuresDSE():
    annotated_directory = 'dataset/annotated/'
    for projectName in os.listdir(annotated_directory):
        DSEPrediction = []
        predict_prob  = []
        annotatedProject = pd.read_csv(annotated_directory + projectName)
        annotatedProjectData = annotatedProject.as_matrix(columns=[
                                         'Voting_Prediction',
                                         'Voting_Pred_Prob',
                                         'AdaBoost_Prediction',
                                         'AdaBoost_Pred_Prob',
                                         'RandomForest_Prediction',
                                         'RandomForest_Pred_Prob'])
        ensemble = np.array(annotatedProject['selectedEnsemble'])
        
        prediction_constant = '_Prediction'
        probab_constant     = '_Pred_Prob'
        
        for i in range(len(ensemble)):
            DSEPrediction.append(annotatedProject.loc[i, ensemble[i] + prediction_constant])
            predict_prob.append(annotatedProject.loc[i, ensemble[i] + probab_constant])
            
        annotatedProject['DSE_Prediction'] = DSEPrediction
        annotatedProject['DSE_Pred_Prob']  = predict_prob
        projectMetrics = []
        projectMetrics.append(projectName)
        projectMetrics.extend(computePerformanceMeasures(DSEPrediction, annotatedProject['bug_binarized'], predict_prob))
        print(projectMetrics)
#         metrics = [precision, recall, roc_score, accuracy, f_measure]
#         DSEmetricsFrame = pd.DataFrame(performanceMetrics, 
#                                     index = ['Precision', 'Recall', 'Auc_Score', 'Accuracy', 'F_Measure'])
                
computePerformanceMeasuresDSE()       

['camel-1.6.csv', 0.75, 0.3870967741935484, 0.7417850264805006, 0.8088642659279779, 0.5106382978723403]
['e-learning.csv', 0.8333333333333334, 0.8163265306122449, 0.9096051464063887, 0.8794326241134752, 0.8247422680412372]
['intercafe.csv', 0.6666666666666666, 0.6666666666666666, 0.746031746031746, 0.7391304347826086, 0.6666666666666666]
['ivy-2.0.csv', 0.7777777777777778, 0.7590361445783133, 0.9023134792691895, 0.8716216216216216, 0.7682926829268293]
['jedit-4.3.csv', 0.8392857142857143, 0.6438356164383562, 0.9558329650905877, 0.8464912280701754, 0.7286821705426357]
['kalkulator.csv', 0.7272727272727273, 0.8, 0.7928571428571429, 0.7916666666666666, 0.761904761904762]
['log4j-1.2.csv', 0.9327731092436975, 0.9327731092436975, 0.9644092931290164, 0.9215686274509803, 0.9327731092436976]
['lucene-2.4.csv', 0.6504854368932039, 0.6320754716981132, 0.6431681526021149, 0.6192893401015228, 0.6411483253588518]
['poi-2.5.csv', 0.861878453038674, 0.8863636363636364, 0.8929820179820179, 0.831460674

In [40]:
def svctrain():
    directory = 'dataset/dataset/'
    annotated_directory = 'dataset/annotated/'
    for projectName in os.listdir(directory):
        print(projectName)
        projectData = pd.read_csv(directory + projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName)
        
        #X contains software metrics and Y best ensemble selected
        X = np.array(projectData.iloc[ : ,0:-1])
        Y = np.array(annotatedData.iloc[ : ,-1])
        
        npoints = X.shape[0]
        
        if npoints <= 100:
            kf = KFold(n_splits = npoints)
        else:
            kf = KFold(n_splits = 10)
        
        kf.get_n_splits(X)
        train_X = []
        train_Y  = []
        predictedEnsemble = []
        predict_prob = [] 
    
        for train_index, test_index in kf.split(X):
            classifier = SVC()
            
            for i in train_index:
                train_X.append(X[i])
                train_Y.append(Y[i])
            
            unique_labels = np.unique(train_Y)
            if unique_labels.size == 1:
                for j in test_index:
                    predictedEnsemble.append(unique_labels[0])
                #predict_prob.append(classifier.predict_proba([X1[j]])[0])
            
           
            else:
                classifier.fit(train_X, train_Y)
            
                for j in test_index:
                    predictedEnsemble.append(classifier.predict([X[j]])[0])
               # predict_prob.append(classifier.predict_proba([X1[j]])[0])
                    print(classifier.predict_proba([X[j]]))
            
        print(predictedEnsemble)

        
        

In [41]:
svctrain()

camel-1.6.csv


AttributeError: predict_proba is not available when  probability=False