In [0]:
import numpy as np
import pandas as pd
import os
import math

#importing smote for correcting imbalanced data
from imblearn.over_sampling import SMOTE
from collections import Counter

#importing base learners of Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

#Importing three component ensembles
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#importing SVC for second-step classification
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score


In [0]:
#defining ml techniques
base_learner1 = LogisticRegression(random_state=1)
base_learner2 = DecisionTreeClassifier()
base_learner3 = GaussianNB()



In [0]:
#predicts bug and their probabilities for given datafile and classifier pair

def predict_util(datafile, classifiertype):
    ncols = datafile.columns
    #extracting relevant columns, software metrics in X, and labels in Y
    
    ncols = ncols[ :-1]
    X     = datafile.iloc[ : , :-1]
    X1    = datafile.as_matrix(ncols)
    y = datafile['bug']
    Y = np.array(y)
    
    #performing leave-one out validation for instances less than 100
    #and 10 fold validation for others
    npoints = X.shape[0]
   
    if npoints <= 100:
        kf = KFold(n_splits = npoints)
    else:
        kf = KFold(n_splits = 10)
        
    kf.get_n_splits(X)
    train_X = []
    train_Y  = []
    prediction   = []
    predict_prob = [] 
    
    for train_index, test_index in kf.split(X):
        if classifiertype == 'Voting':
            classifier = VotingClassifier(estimators=[
                                         ('logregression', base_learner1), 
                                         ('dtree', base_learner2), 
                                         ('gnb', base_learner3)], 
                                          voting='soft')      
        elif classifiertype == 'RandomForest':
            classifier = RandomForestClassifier()
        else:
            classifier = AdaBoostClassifier(base_estimator = RandomForestClassifier(), n_estimators = 100, learning_rate = 0.5)
            
        for i in train_index:
                train_X.append(X1[i])
                train_Y.append(Y[i])
#         smote = SMOTE(sampling_strategy='auto',k_neighbors=2)
#         train_X,train_Y = smote.fit_sample(train_X,train_Y)
        classifier.fit(train_X, train_Y)
        for j in test_index:
            prediction.append(classifier.predict([X1[j]])[0])
            predict_prob.append(classifier.predict_proba([X1[j]])[0][1])
        
        train_X  = []
        train_Y  = []
    
    return prediction, Y, predict_prob
    



In [0]:
def computePerformanceMeasures(predictions, labels, prediction_probability):
    
    precision = precision_score(y_true = labels, y_pred = predictions)
    recall    = recall_score(y_true = labels, y_pred = predictions)
    roc_score = roc_auc_score(labels, prediction_probability)
    accuracy  = accuracy_score(y_true = labels, y_pred = predictions)
    f_measure = 2*(precision * recall)/float(precision + recall) 
    g_mean = math.sqrt(precision * recall)
    
    metrics = [precision, recall, roc_score, accuracy, f_measure, g_mean]
    
    return metrics  

In [0]:
def predict():
    directory = 'dataset/dataset/'
    
    for projectName in os.listdir(directory):
        print(projectName)
        performanceMetrics = []
    
        inputData = pd.read_csv(directory + projectName, dtype={'bug':np.bool})
        projectData = pd.read_csv(directory + projectName, dtype={'bug':np.bool})
        
        metricsFrame = pd.DataFrame(performanceMetrics, 
                                    index = ['Precision', 'Recall', 'Auc_Score', 'Accuracy', 'F_Measure', 'GMean'])
        
        predictionEnsemble1, YEnsemble1, predict_probEnsemble1 = predict_util(inputData, 'Voting')
        projectData['Voting_Prediction'] = predictionEnsemble1
        projectData['Voting_Pred_Prob']  = predict_probEnsemble1
        VotingMetrics = computePerformanceMeasures(predictionEnsemble1, YEnsemble1, predict_probEnsemble1)
        metricsFrame.insert(loc = 0, column = 'Voting', value = VotingMetrics)
                
            
        predictionEnsemble2, YEnsemble2, predict_probEnsemble2 = predict_util(inputData, 'RandomForest')
        projectData['RandomForest_Prediction'] = predictionEnsemble2
        projectData['RandomForest_Pred_Prob']  = predict_probEnsemble2
        RandomForestMetrics = computePerformanceMeasures(predictionEnsemble2, YEnsemble2, predict_probEnsemble2)
        metricsFrame.insert(loc = 1, column='RandomForest', value = RandomForestMetrics)
        
        predictionEnsemble3, YEnsemble3, predict_probEnsemble3 = predict_util(inputData, 'AdaBoost')
        projectData['AdaBoost_Prediction'] = predictionEnsemble3
        projectData['AdaBoost_Pred_Prob']  = predict_probEnsemble3
        AdaBoostMetrics = computePerformanceMeasures(predictionEnsemble3, YEnsemble3, predict_probEnsemble3)
        metricsFrame.insert(loc = 2, column='AdaBoost', value = AdaBoostMetrics)
        
        metricsFrame.to_csv('dataset/metrics/metrics/' + projectName)
        print(projectName)
        print(metricsFrame)
        
        projectData.to_csv('dataset/annotated/annotated/' + projectName, index = False)

In [0]:
import warnings
warnings.filterwarnings('ignore')
predict()

ant-1.7.csv
ant-1.7.csv
             Voting  RandomForest  AdaBoost
Precision  0.636364      0.613861  0.629921
Recall     0.421687      0.373494  0.481928
Auc_Score  0.810288      0.789432  0.825957
Accuracy   0.817450      0.808054  0.821477
F_Measure  0.507246      0.464419  0.546075
GMean      0.518021      0.478825  0.550978
arc.csv
arc.csv
             Voting  RandomForest  AdaBoost
Precision  0.312500      0.400000  0.235294
Recall     0.185185      0.296296  0.148148
Auc_Score  0.735910      0.699678  0.780283
Accuracy   0.858974      0.867521  0.846154
F_Measure  0.232558      0.340426  0.181818
GMean      0.240563      0.344265  0.186704
camel-1.6.csv
camel-1.6.csv
             Voting  RandomForest  AdaBoost
Precision  0.581818      0.413333  0.425743
Recall     0.170213      0.164894  0.228723
Auc_Score  0.703545      0.673293  0.707508
Accuracy   0.814508      0.791710  0.789637
F_Measure  0.263374      0.235741  0.297578
GMean      0.314695      0.261067  0.312053
e-learni

workflow.csv
             Voting  RandomForest  AdaBoost
Precision  0.437500      0.421053  0.421053
Recall     0.350000      0.400000  0.400000
Auc_Score  0.447368      0.450000  0.418421
Accuracy   0.435897      0.410256  0.410256
F_Measure  0.388889      0.410256  0.410256
GMean      0.391312      0.410391  0.410391
wspomaganiepi.csv
wspomaganiepi.csv
             Voting  RandomForest  AdaBoost
Precision  0.923077      0.923077  0.909091
Recall     1.000000      1.000000  0.833333
Auc_Score  0.944444      0.861111  0.861111
Accuracy   0.944444      0.944444  0.833333
F_Measure  0.960000      0.960000  0.869565
GMean      0.960769      0.960769  0.870388
xalan-2.7.csv
xalan-2.7.csv
             Voting  RandomForest  AdaBoost
Precision  0.995560      0.995560  0.995560
Recall     0.998886      0.998886  0.998886
Auc_Score  0.938803      0.856955  0.751367
Accuracy   0.994499      0.994499  0.994499
F_Measure  0.997221      0.997221  0.997221
GMean      0.997222      0.997222  0.997222

In [0]:
def bestEnsembleSelector():
    annotated_directory   = 'dataset/annotated/annotated/'
    performance_directory = 'dataset/metrics/metrics/'

    for projectName in os.listdir(annotated_directory):
        print(projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName, dtype={'bug':np.bool})
        metricData    = pd.read_csv(performance_directory + projectName)


        predictionMatrix = annotatedData.as_matrix(columns = ['bug','Voting_Prediction','AdaBoost_Prediction','RandomForest_Prediction'])
        print(metricData)
        
       # defining constants
        auc_score_constant    = 2     # auc_score is at the 2nd row
        voting_constant       = 'Voting'
        adaBoost_constant     = 'AdaBoost'
        randomForest_constant = 'RandomForest'
        
        ensemble=[]
        
        for i in range(len(predictionMatrix)):
            if   predictionMatrix[i][0] == predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('Voting')
            elif predictionMatrix[i][0] == predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('AdaBoost')
            elif predictionMatrix[i][0] == predictionMatrix[i][3] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2]:
                ensemble.append('RandomForest')
            else:
                p_voting       = metricData.loc[auc_score_constant, voting_constant]
                p_adaBoost     = metricData.loc[auc_score_constant, adaBoost_constant]
                p_randomForest = metricData.loc[auc_score_constant, randomForest_constant]

                if p_voting > p_adaBoost and p_voting > p_randomForest:
                    ensemble.append('Voting')

                elif p_adaBoost>p_randomForest and p_adaBoost > p_voting:
                    ensemble.append('AdaBoost')

                else:
                    ensemble.append('RandomForest')
                
        annotatedData['selectedEnsemble'] = ensemble
        annotatedData.to_csv(annotated_directory + projectName, index = False)


In [0]:
bestEnsembleSelector()

ant-1.7.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.636364      0.613861  0.629921
1     Recall  0.421687      0.373494  0.481928
2  Auc_Score  0.810288      0.789432  0.825957
3   Accuracy  0.817450      0.808054  0.821477
4  F_Measure  0.507246      0.464419  0.546075
5      GMean  0.518021      0.478825  0.550978
arc.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.312500      0.400000  0.235294
1     Recall  0.185185      0.296296  0.148148
2  Auc_Score  0.735910      0.699678  0.780283
3   Accuracy  0.858974      0.867521  0.846154
4  F_Measure  0.232558      0.340426  0.181818
5      GMean  0.240563      0.344265  0.186704
camel-1.6.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.581818      0.413333  0.425743
1     Recall  0.170213      0.164894  0.228723
2  Auc_Score  0.703545      0.673293  0.707508
3   Accuracy  0.814508      0.791710  0.789637
4  F_Measure  0.263374      0.235741  0.297578
5      GMean  0.314695    

  # This is added back by InteractiveShellApp.init_path()


e-learning.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.500000      0.333333  0.500000
1     Recall  0.600000      0.200000  0.400000
2  Auc_Score  0.854237      0.940678  0.916949
3   Accuracy  0.921875      0.906250  0.921875
4  F_Measure  0.545455      0.250000  0.444444
5      GMean  0.547723      0.258199  0.447214
intercafe.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  1.000000      1.000000  1.000000
1     Recall  0.750000      0.750000  0.750000
2  Auc_Score  0.902174      0.831522  0.750000
3   Accuracy  0.962963      0.962963  0.962963
4  F_Measure  0.857143      0.857143  0.857143
5      GMean  0.866025      0.866025  0.866025
ivy-2.0.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.461538      0.529412  0.458333
1     Recall  0.300000      0.225000  0.275000
2  Auc_Score  0.776442      0.745032  0.817388
3   Accuracy  0.880682      0.889205  0.880682
4  F_Measure  0.363636      0.315789  0.343750
5      GMean  0.372

xerces-1.4.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.920705      0.939052  0.936652
1     Recall  0.956522      0.951945  0.947368
2  Auc_Score  0.908285      0.925084  0.909164
3   Accuracy  0.906463      0.918367  0.913265
4  F_Measure  0.938272      0.945455  0.941980
5      GMean  0.938442      0.945477  0.941995
zuzel.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.909091      0.687500  0.916667
1     Recall  0.769231      0.846154  0.846154
2  Auc_Score  0.899038      0.846154  0.870192
3   Accuracy  0.862069      0.758621  0.896552
4  F_Measure  0.833333      0.758621  0.880000
5      GMean  0.836242      0.762713  0.880705


In [0]:
def knn_param_selection(X, y, nfolds=10):
    k_range = range(1, 31)
    param_grid={'n_neighbors':list(k_range)}
    grid_search = GridSearchCV(KNeighborsClassifier(),param_grid=param_grid,cv=nfolds,scoring='fowlkes_mallows_score')
    grid_search.fit(X, y)
    return grid_search.best_params_

In [0]:
def knntrain():
    directory = 'dataset/dataset/'
    annotated_directory = 'dataset/annotated/annotated_smote/'
    DSE_directory = 'dataset/DSE/DSE_smote_DT/'
    for projectName in os.listdir(directory):
        print(projectName)
        projectData = pd.read_csv(directory + projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName)
        
        #X contains software metrics and Y best ensemble selected
        X = np.array(projectData.iloc[ : , :-1])
        Y = np.array(annotatedData.iloc[ : , -1])
        
        npoints = X.shape[0]
        
        try:
            params = knn_param_selection(X,Y)
        except:
            params={'n_neighbors':3}

        if npoints <= 100:
            kf = KFold(n_splits = npoints)
        else:
            kf = KFold(n_splits = 10)
        
        kf.get_n_splits(X)
        train_X = []
        train_Y = []
        
        predictedEnsemble = []
        predict_prob      = []
        final_prediction  = []  # this stores the prediction(bugginess) of the best ensemble predicted by SVC
        
        prediction_constant = '_Prediction'
        probab_constant = '_Pred_Prob'
        
        
        for train_index, test_index in kf.split(X):
            classifier = KNeighborsClassifier(n_neighbors = params['n_neighbors'])
            #print(classifier)
            for i in train_index:
                train_X.append(X[i])
                train_Y.append(Y[i])
            
            unique_labels = np.unique(train_Y)
            if unique_labels.size == 1:
                for j in test_index:
                    predictedEnsemble.append(unique_labels[0])
                    predict_prob.append(annotatedData.loc[j, unique_labels[0] + probab_constant])
                    final_prediction.append(annotatedData.loc[j, unique_labels[0] + prediction_constant])
           
            else:
                classifier.fit(train_X, train_Y)
                
                for j in test_index:
                    predictedBestEnsemble = classifier.predict([X[j]])[0]
                    predictedEnsemble.append(predictedBestEnsemble)
                    final_prediction.append(annotatedData.loc[j, predictedBestEnsemble + prediction_constant])
                    
            # total probability of available classifiers, i.e the classifiers reported in unique_labels predicting true
                    predict_proba_true = 0
                    
            # probability of classifiers being predicted
                    predict_proba_classifiers = classifier.predict_proba([X[j]])[0]
                    k = 0
            # class probabilities are always reported in a sorted by name fashion, i.e AdaBoost, RandomForest, Voting 
            # np.unique also reports labels in a sorted by name fashion
                    for classifierName in unique_labels:
                        predict_proba_true +=  predict_proba_classifiers[k] * annotatedData.loc[j, classifierName + probab_constant]
                        k += 1
                    predict_prob.append(predict_proba_true)
                    
        annotatedData['PredictedEnsemble'] = predictedEnsemble
        annotatedData['DSE_Prediction'] = final_prediction
        annotatedData['DSE_Pred_Prob'] = predict_prob
        annotatedData.to_csv(DSE_directory + projectName, index = False)    


In [0]:
knntrain()

ant-1.7.csv
arc.csv
camel-1.6.csv
e-learning.csv
intercafe.csv
ivy-2.0.csv
jedit-4.3.csv
kalkulator.csv
log4j-1.2.csv
lucene-2.4.csv
nieruchomosci.csv
pbeans2.csv
pdftranslator.csv
poi-2.5.csv
prop-6.csv
redaktor.csv
serapion.csv
skarbonka.csv
sklebagd.csv
synapse-1.2.csv
systemdata.csv
szybkafucha.csv
termoproject.csv
tomcat.csv
velocity-1.6.csv
workflow.csv
wspomaganiepi.csv
xalan-2.7.csv
xerces-1.4.csv
zuzel.csv


In [0]:
def computePerformanceMeasuresDSE():
    DSEdirectory = 'dataset/DSE/DSE_smote_DT/'
    projectMetrics = []
    index = 0
    projectMetrics = pd.DataFrame(projectMetrics,
                                    columns = ['Project','Precision', 'Recall', 'Auc_Score', 'Accuracy', 'Fmeasure', 'GMean'])
    for projectName in os.listdir(DSEdirectory):
        project = pd.read_csv(DSEdirectory + projectName)
        projectData = project.as_matrix(columns=[
                                         'DSE_Prediction',
                                         'DSE_Pred_Prob',
                                         'bug'])
      
        row = []
        row.append(projectName)
        row.extend(computePerformanceMeasures(project['DSE_Prediction'], 
                                                         project['bug'], 
                                                         project['DSE_Pred_Prob']))
        projectMetrics.loc[index] = row
        index = index + 1
  
    print(projectMetrics)
    projectMetrics.to_csv('dataset/results/' + 'results_smote_dt.csv', index = False)

In [0]:
computePerformanceMeasuresDSE()   

              Project  Precision    Recall  Auc_Score  Accuracy  Fmeasure  \
0         ant-1.7.csv   0.584416  0.542169   0.830170  0.812081  0.562500   
1             arc.csv   0.333333  0.481481   0.755770  0.829060  0.393939   
2       camel-1.6.csv   0.418033  0.271277   0.721501  0.784456  0.329032   
3      e-learning.csv   0.272727  0.600000   0.850847  0.843750  0.375000   
4       intercafe.csv   0.428571  0.750000   0.902174  0.814815  0.545455   
5         ivy-2.0.csv   0.371429  0.325000   0.806330  0.860795  0.346667   
6       jedit-4.3.csv   0.092593  0.454545   0.828104  0.888211  0.153846   
7      kalkulator.csv   0.600000  0.500000   0.849206  0.814815  0.545455   
8       log4j-1.2.csv   0.969925  0.682540   0.739418  0.687805  0.801242   
9      lucene-2.4.csv   0.795000  0.783251   0.809248  0.750000  0.789082   
10  nieruchomosci.csv   0.600000  0.600000   0.888235  0.703704  0.600000   
11        pbeans2.csv   0.666667  0.600000   0.679487  0.857143  0.631579   

In [0]:
for fileName in os.listdir('C:\Users\DELL\Desktop\Dynamic-Selection-of-Ensembles\dataset\dataset'):
    print fileName,pd.read_csv('C:/Users/DELL/Desktop/Dynamic-Selection-of-Ensembles/dataset/dataset/'+fileName).columns 

ant-1.7.csv Index([u'cbo', u'rfc', u'lcom', u'loc', u'cam', u'amc', u'max_cc', u'bug'], dtype='object')
arc.csv Index([u'cbo', u'rfc', u'ce', u'npm', u'cam', u'bug'], dtype='object')
camel-1.6.csv Index([u'dit', u'noc', u'cbo', u'lcom', u'ca', u'npm', u'cam', u'ic', u'cbm',
       u'amc', u'avg_cc', u'bug'],
      dtype='object')
e-learning.csv Index([u'wmc', u'npm', u'loc', u'bug'], dtype='object')
intercafe.csv Index([u'cbo', u'bug'], dtype='object')
ivy-2.0.csv Index([u'wmc', u'cbo', u'rfc', u'ce', u'npm', u'loc', u'moa', u'amc', u'bug'], dtype='object')
jedit-4.3.csv Index([u'rfc', u'moa', u'bug'], dtype='object')
kalkulator.csv Index([u'dit', u'amc', u'bug'], dtype='object')
log4j-1.2.csv Index([u'cbo', u'ca', u'npm', u'bug'], dtype='object')
lucene-2.4.csv Index([u'noc', u'cbo', u'rfc', u'lcom', u'ca', u'ce', u'npm', u'loc', u'dam',
       u'mfa', u'ic', u'cbm', u'max_cc', u'bug'],
      dtype='object')
nieruchomosci.csv Index([u'loc', u'mfa', u'amc', u'avg_cc', u'bug'], dtype='o