In [1]:
import numpy as np
import pandas as pd
import os
import math
import random
from operator import itemgetter

#importing base learners of Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#Importing three component ensembles
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier

#importing SVC for second-step classification
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

In [2]:
#defining ml techniques
base_learner1 = LogisticRegression(random_state=1)
base_learner2 = DecisionTreeClassifier()
base_learner3 = GaussianNB()



In [3]:
#predicts bug and their probabilities for given datafile and classifier pair

def predict_util(datafile, classifiertype):
    ncols = datafile.columns
    #extracting relevant columns, software metrics in X, and labels in Y
    
    ncols = ncols[ :-1]
    X     = datafile.iloc[ : , :-1]
    X1    = datafile.as_matrix(ncols)
    y = datafile['buggy']
    Y = np.array(y)
    
    #performing leave-one out validation for instances less than 100
    #and 10 fold validation for others
    npoints = X.shape[0]
   
    if npoints <= 100:
        kf = KFold(n_splits = npoints)
    else:
        kf = KFold(n_splits = 10)
        
    kf.get_n_splits(X)
    train_X = []
    train_Y  = []
    prediction   = []
    predict_prob = [] 
    
    for train_index, test_index in kf.split(X):
        if classifiertype == 'Voting':
            classifier = VotingClassifier(estimators=[
                                         ('logregression', base_learner1), 
                                         ('dtree', base_learner2), 
                                         ('gnb', base_learner3)], 
                                          voting='soft')      
        elif classifiertype == 'RandomForest':
            classifier = RandomForestClassifier()
        else:
            classifier = AdaBoostClassifier(base_estimator = RandomForestClassifier(), n_estimators = 100, learning_rate = 0.5)
            
        for i in train_index:
                train_X.append(X1[i])
                train_Y.append(Y[i])
        
        classifier.fit(train_X, train_Y)
        
        for j in test_index:
            prediction.append(classifier.predict([X1[j]])[0])
            predict_prob.append(classifier.predict_proba([X1[j]])[0][1])
        
        train_X  = []
        train_Y  = []
    
    return prediction, Y, predict_prob
    



In [4]:
def computePerformanceMeasures(predictions, labels, prediction_probability):
    
    precision = precision_score(y_true = labels, y_pred = predictions)
    recall    = recall_score(y_true = labels, y_pred = predictions)
    roc_score = roc_auc_score(labels, prediction_probability)
    accuracy  = accuracy_score(y_true = labels, y_pred = predictions)
    f_measure = 2*(precision * recall)/float(precision + recall) 
    g_mean = math.sqrt(precision * recall)
    
    metrics = [precision, recall, roc_score, accuracy, f_measure, g_mean]
    
    return metrics  

In [5]:
def predict():
    directory = 'dataset/dataset/'
    
    for projectName in os.listdir(directory):
        performanceMetrics = []
    
        inputData = pd.read_csv(directory + projectName, dtype={'buggy':np.bool})
        projectData = pd.read_csv(directory + projectName, dtype={'buggy':np.bool})
        
        metricsFrame = pd.DataFrame(performanceMetrics, 
                                    index = ['Precision', 'Recall', 'Auc_Score', 'Accuracy', 'F_Measure', 'GMean'])
        
        predictionEnsemble1, YEnsemble1, predict_probEnsemble1 = predict_util(inputData, 'Voting')
        projectData['Voting_Prediction'] = predictionEnsemble1
        projectData['Voting_Pred_Prob']  = predict_probEnsemble1
        VotingMetrics = computePerformanceMeasures(predictionEnsemble1, YEnsemble1, predict_probEnsemble1)
        metricsFrame.insert(loc = 0, column = 'Voting', value = VotingMetrics)
                
            
        predictionEnsemble2, YEnsemble2, predict_probEnsemble2 = predict_util(inputData, 'RandomForest')
        projectData['RandomForest_Prediction'] = predictionEnsemble2
        projectData['RandomForest_Pred_Prob']  = predict_probEnsemble2
        RandomForestMetrics = computePerformanceMeasures(predictionEnsemble2, YEnsemble2, predict_probEnsemble2)
        metricsFrame.insert(loc = 1, column='RandomForest', value = RandomForestMetrics)
        
        predictionEnsemble3, YEnsemble3, predict_probEnsemble3 = predict_util(inputData, 'AdaBoost')
        projectData['AdaBoost_Prediction'] = predictionEnsemble3
        projectData['AdaBoost_Pred_Prob']  = predict_probEnsemble3
        AdaBoostMetrics = computePerformanceMeasures(predictionEnsemble3, YEnsemble3, predict_probEnsemble3)
        metricsFrame.insert(loc = 2, column='AdaBoost', value = AdaBoostMetrics)
        
        metricsFrame.to_csv('dataset/metrics/' + projectName)
        print(projectName)
        print(metricsFrame)
        
        projectData.to_csv('dataset/annotated/' + projectName, index = False)

In [6]:
import warnings
warnings.filterwarnings('ignore')
predict()

synapse-1.2.csv
             Voting  RandomForest  AdaBoost
Precision  0.666667      0.600000  0.450000
Recall     0.347826      0.391304  0.391304
Auc_Score  0.783421      0.674319  0.761276
Accuracy   0.853846      0.846154  0.807692
F_Measure  0.457143      0.473684  0.418605
GMean      0.481543      0.484544  0.419627
zuzel.csv
             Voting  RandomForest  AdaBoost
Precision  0.875000      0.777778  0.777778
Recall     0.777778      0.777778  0.777778
Auc_Score  0.888889      0.916667  0.902778
Accuracy   0.823529      0.764706  0.764706
F_Measure  0.823529      0.777778  0.777778
GMean      0.824958      0.777778  0.777778
ivy-2.0.csv
             Voting  RandomForest  AdaBoost
Precision  0.770270      0.769231  0.791045
Recall     0.686747      0.602410  0.638554
Auc_Score  0.875502      0.901720  0.904604
Accuracy   0.854730      0.837838  0.851351
F_Measure  0.726115      0.675676  0.706667
GMean      0.727311      0.680729  0.710721
jedit-4.3.csv
             Voting  Ran

In [7]:
def bestEnsembleSelector():
    annotated_directory   = 'dataset/annotated/'
    performance_directory = 'dataset/metrics/'

    for projectName in os.listdir(annotated_directory):
        print(projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName, dtype={'buggy':np.bool})
        metricData    = pd.read_csv(performance_directory + projectName)


        predictionMatrix = annotatedData.as_matrix(columns = ['buggy','Voting_Prediction','AdaBoost_Prediction','RandomForest_Prediction'])
        print(metricData)
        
       # defining constants
        auc_score_constant    = 2     # auc_score is at the 2nd row
        voting_constant       = 'Voting'
        adaBoost_constant     = 'AdaBoost'
        randomForest_constant = 'RandomForest'
        
        ensemble=[]
        
        for i in range(len(predictionMatrix)):
            if   predictionMatrix[i][0] == predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('Voting')
            elif predictionMatrix[i][0] == predictionMatrix[i][2] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][3]:
                ensemble.append('AdaBoost')
            elif predictionMatrix[i][0] == predictionMatrix[i][3] and predictionMatrix[i][0] != predictionMatrix[i][1] and predictionMatrix[i][0] != predictionMatrix[i][2]:
                ensemble.append('RandomForest')
            else:
                p_voting       = metricData.loc[auc_score_constant, voting_constant]
                p_adaBoost     = metricData.loc[auc_score_constant, adaBoost_constant]
                p_randomForest = metricData.loc[auc_score_constant, randomForest_constant]

                if p_voting > p_adaBoost and p_voting > p_randomForest:
                    ensemble.append('Voting')

                elif p_adaBoost>p_randomForest and p_adaBoost > p_voting:
                    ensemble.append('AdaBoost')

                else:
                    ensemble.append('RandomForest')
                
        annotatedData['selectedEnsemble'] = ensemble
        annotatedData.to_csv(annotated_directory + projectName, index = False)


In [8]:
bestEnsembleSelector()

camel-1.6.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.635294      0.767857  0.700000
1     Recall  0.290323      0.462366  0.526882
2  Auc_Score  0.748099      0.771009  0.778306
3   Accuracy  0.774238      0.825485  0.819945
4  F_Measure  0.398524      0.577181  0.601227
5      GMean  0.429465      0.595845  0.607303
e-learning.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.783784      0.791667  0.854167
1     Recall  0.591837      0.775510  0.836735
2  Auc_Score  0.869343      0.923580  0.928350
3   Accuracy  0.801418      0.851064  0.893617
4  F_Measure  0.674419      0.783505  0.845361
5      GMean  0.681082      0.783547  0.845406
intercafe.csv
  Unnamed: 0    Voting  RandomForest  AdaBoost
0  Precision  0.700000      0.777778  0.714286
1     Recall  0.777778      0.777778  0.555556
2  Auc_Score  0.746032      0.892857  0.928571
3   Accuracy  0.782609      0.826087  0.739130
4  F_Measure  0.736842      0.777778  0.625000
5      GMean  0.7

In [9]:
def svctrain():
    directory = 'dataset/dataset/'
    annotated_directory = 'dataset/annotated/'
    DSE_directory = 'dataset/DSE/'
    for projectName in os.listdir(directory):
        print(projectName)
        projectData = pd.read_csv(directory + projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName)
        
        #X contains software metrics and Y best ensemble selected
        X = np.array(projectData.iloc[ : , :-1])
        Y = np.array(annotatedData.iloc[ : , -1])
        
        npoints = X.shape[0]
        
        if npoints <= 100:
            kf = KFold(n_splits = npoints)
        else:
            kf = KFold(n_splits = 10)
        
        kf.get_n_splits(X)
        train_X = []
        train_Y = []
        
        predictedEnsemble = []
        predict_prob      = []
        final_prediction  = []  # this stores the prediction(bugginess) of the best ensemble predicted by SVC
        
        prediction_constant = '_Prediction'
        probab_constant = '_Pred_Prob'
        
        
        for train_index, test_index in kf.split(X):
            classifier = SVC(probability = True)
            
            for i in train_index:
                train_X.append(X[i])
                train_Y.append(Y[i])
            
            unique_labels = np.unique(train_Y)
            if unique_labels.size == 1:
                for j in test_index:
                    predictedEnsemble.append(unique_labels[0])
                    predict_prob.append(annotatedData.loc[j, unique_labels[0] + probab_constant])
                    final_prediction.append(annotatedData.loc[j, unique_labels[0] + prediction_constant])
           
            else:
                classifier.fit(train_X, train_Y)
                
                for j in test_index:
                    predictedBestEnsemble = classifier.predict([X[j]])[0]
                    predictedEnsemble.append(predictedBestEnsemble)
                    final_prediction.append(annotatedData.loc[j, predictedBestEnsemble + prediction_constant])
                    
            # total probability of available classifiers, i.e the classifiers reported in unique_labels predicting true
                    predict_proba_true = 0
                    
            # probability of classifiers being predicted
                    predict_proba_classifiers = classifier.predict_proba([X[j]])[0]
                    k = 0
            # class probabilities are always reported in a sorted by name fashion, i.e AdaBoost, RandomForest, Voting 
            # np.unique also reports labels in a sorted by name fashion
                    for classifierName in unique_labels:
                        predict_proba_true +=  predict_proba_classifiers[k] * annotatedData.loc[j, classifierName + probab_constant]
                        k += 1
                    predict_prob.append(predict_proba_true)
                    
        annotatedData['PredictedEnsemble'] = predictedEnsemble
        annotatedData['DSE_Prediction'] = final_prediction
        annotatedData['DSE_Pred_Prob'] = predict_prob
        annotatedData.to_csv(DSE_directory + projectName, index = False)    


In [10]:
svctrain()

camel-1.6.csv
e-learning.csv
intercafe.csv
ivy-2.0.csv
jedit-4.3.csv
kalkulator.csv
log4j-1.2.csv
lucene-2.4.csv
poi-2.5.csv
prop-6.csv
redaktor.csv
serapion.csv
sklebagd.csv
synapse-1.2.csv
systemdata.csv
szybkafucha.csv
termoproject.csv
tomcat.csv
velocity-1.6.csv
workflow.csv
wspomaganiepi.csv
xalan-2.7.csv
xerces-1.4.csv
zuzel.csv


In [11]:
def computePerformanceMeasuresDSE():
    DSEdirectory = 'dataset/DSE/'
    projectMetrics = []
    index = 0
    projectMetrics = pd.DataFrame(projectMetrics,
                                    columns = ['Project','Precision', 'Recall', 'Auc_Score', 'Accuracy', 'Fmeasure', 'GMean'])
    for projectName in os.listdir(DSEdirectory):
        project = pd.read_csv(DSEdirectory + projectName)
        projectData = project.as_matrix(columns=[
                                         'DSE_Prediction',
                                         'DSE_Pred_Prob',
                                         'buggy'])
      
        row = []
        row.append(projectName)
        row.extend(computePerformanceMeasures(project['DSE_Prediction'], 
                                                         project['buggy'], 
                                                         project['DSE_Pred_Prob']))
        projectMetrics.loc[index] = row
        index = index + 1
  
    print(projectMetrics)
    projectMetrics.to_csv('dataset/' + 'results.csv', index = False)

In [12]:
computePerformanceMeasuresDSE()   

              Project  Precision    Recall  Auc_Score  Accuracy  Fmeasure  \
0       camel-1.6.csv   0.700000  0.526882   0.784410  0.819945  0.601227   
1      e-learning.csv   0.897959  0.897959   0.945874  0.929078  0.897959   
2       intercafe.csv   0.833333  0.555556   0.928571  0.782609  0.666667   
3         ivy-2.0.csv   0.894737  0.819277   0.939024  0.922297  0.855346   
4       jedit-4.3.csv   0.926667  0.952055   0.971410  0.960526  0.939189   
5      kalkulator.csv   0.666667  0.800000   0.885714  0.750000  0.727273   
6       log4j-1.2.csv   0.974359  0.957983   0.989521  0.960784  0.966102   
7      lucene-2.4.csv   0.699029  0.679245   0.735538  0.670051  0.688995   
8         poi-2.5.csv   0.893258  0.903409   0.922172  0.865169  0.898305   
9          prop-6.csv   0.803571  0.789474   0.916510  0.888530  0.796460   
10       redaktor.csv   0.902439  0.725490   0.931373  0.867647  0.804348   
11       serapion.csv   0.571429  0.571429   0.825714  0.812500  0.571429   

In [2]:
mutationRate = 0.001
crossOverRate = 0.001
iterations = 10
poolSize = 50

In [55]:
def roulette(fitness):
    index = 0
    cumalativeFitness = 0.0
    r = random.random()
    
    for i in range(len(fitness)):
        cumalativeFitness += fitness[i]
        if cumalativeFitness > r:
            return i


def selectFittest(fitness, rankedPool):
    while True:
        idx1 = roulette(fitness)
        idx2 = roulette(fitness)
        
        if idx1 is None or idx2 is None:
            continue
        elif idx1==idx2:
            continue
        else:
            break
    
    return rankedPool[idx1], rankedPool[idx2]

def crossover(chrome1, chrome2):
    randomSplitPoint = random.randint(1, len(chrome1))
    return chrome1[:randomSplitPoint]+chrome2[randomSplitPoint:], chrome2[:randomSplitPoint]+chrome1[randomSplitPoint:]


def mutate(chromosome):
#     print chromosome
    mutatedChrom = []
    for ch in chromosome:
        if random.random()<mutationRate:
            if ch==1:
                mutatedChrom.append(0)
            else:
                mutatedChrom.append(1)
        else:
            mutatedChrom.append(ch)
    return mutatedChrom
    
def breed(chrome1, chrome2):
    if random.random()<crossOverRate:
        newChrome1, newChrome2 = crossover(chrome1, chrome2)
    else:
        newChrome1 = chrome1
        newChrome2 = chrome2
        
    newChrome1 = mutate(newChrome1)
    newChrome2 = mutate(newChrome2)
    
    return newChrome1, newChrome2

In [56]:
def rankPop(pool, X, y, classifier):
    scores = []
    
    for chromosome in pool:
#         print chromosome
        chosen_idx = [idx for gene, idx in zip(chromosome, range(X.shape[1])) if gene==1]
        if len(chosen_idx)==0:
            continue
        chosenX = X.iloc[:, chosen_idx]
        
        classifier.fit(chosenX, y)
        scores.append(accuracy_score(y, classifier.predict(chosenX)))
        
    fitness = [x/sum(scores) for x in scores]
    pairedPop = zip(pool, fitness)
    rankedPop = sorted(pairedPop, key=itemgetter(-1), reverse = True)
    
    return rankedPop

In [57]:
def iteratePop(rankedPop):
    fitness = [item[-1] for item in rankedPop]
    rankedPool = [item[0] for item in rankedPop]
   
    new_pool = []
    new_pool.extend(rankedPool[:poolSize/15])
    
    while(len(new_pool)<poolSize):
        ch1, ch2 = selectFittest(fitness, rankedPool)
        ch1, ch2 = breed(ch1, ch2)
        
        new_pool.append(ch1)
        new_pool.append(ch2)
    
    return new_pool[:poolSize]

In [62]:
def geneticAlgoFit(datafile):
    datafile = pd.read_csv(datafile, dtype={'buggy':np.bool})
   
    X     = datafile.iloc[ : , :-1]
    y = datafile['buggy']
    
    classifier = RandomForestClassifier()
  
    pool = np.random.randint(0, 2, (poolSize, X.shape[1]))  
    
    for iteration in range(iterations):
        rankedPop = rankPop(pool, X, y, classifier)
        pool = []
        pool = iteratePop(rankedPop)
        
    best_chromosome = rankPop(pool, X, y, classifier)[0][0]
    return best_chromosome

In [63]:
datafile = 'dataset/dataset/camel-1.6.csv'
print geneticAlgoFit(datafile)

[([0, 1, 0, 0], 0.022429332240884888), ([0, 0, 1, 1], 0.022429332240884888), ([1, 1, 1, 1], 0.022395193226819616), ([0, 1, 0, 0], 0.021780690973644688), ([0, 1, 0, 0], 0.021746551959579412), ([1, 1, 0, 1], 0.021678273931448867), ([0, 1, 1, 1], 0.021678273931448867), ([0, 1, 1, 1], 0.021644134917383592), ([1, 1, 0, 1], 0.021609995903318316), ([1, 1, 0, 1], 0.021609995903318316), ([0, 1, 0, 0], 0.021575856889253044), ([0, 1, 1, 1], 0.021507578861122496), ([0, 1, 0, 0], 0.021473439847057223), ([0, 1, 0, 0], 0.021473439847057223), ([0, 1, 0, 0], 0.021405161818926675), ([0, 1, 0, 0], 0.021405161818926675), ([0, 1, 1, 1], 0.021234466748600307), ([0, 1, 0, 1], 0.020995493650143388), ([0, 0, 1, 1], 0.020415130411033735), ([0, 1, 0, 0], 0.020346852382903187), ([0, 0, 0, 0], 0.020346852382903187), ([1, 1, 0, 1], 0.020346852382903187), ([1, 1, 0, 1], 0.020346852382903187), ([0, 1, 0, 0], 0.020346852382903187), ([1, 1, 1, 1], 0.020346852382903187), ([0, 1, 0, 0], 0.020346852382903187), ([1, 1, 0, 

In [None]:
"""
1. IN pred_util function instead of directly training the data train on selected featured by geneticAlgoFit
2. Set the parameters for GA.
3. Remaining process remains same having 3 ensembles and 1 fitness function
4. To be used without WEKA
"""