In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os
import math
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from operator import itemgetter

from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#Importing three component ensembles
from sklearn.ensemble import VotingClassifier

#importing SVC for second-step classification
from sklearn.svm import SVC

In [None]:
# Per-gene probability that mutate() flips a bit of a chromosome.
mutationRate = 0.001
# Probability that breed() recombines a selected pair through crossover().
crossOverRate = 0.06
# Number of generations geneticAlgoFit() runs before its final ranking.
iterations = 10
# Number of chromosomes in the pool built by geneticAlgoFit() and iteratePop().
poolSize = 50

In [None]:
#Defining functions for genetic algorithms

def roulette(fitnessArray):
    """Pick an index by roulette-wheel (fitness-proportionate) selection.

    A single uniform draw in [0, 1) is compared against the running sum of
    ``fitnessArray``; the first index whose running sum exceeds the draw wins.

    Returns:
        The selected index, or ``None`` when the running sum never exceeds
        the draw (empty input, or weights that sum to no more than the draw).
    """
    threshold = random.random()
    runningTotal = 0.0

    for position, weight in enumerate(fitnessArray):
        runningTotal += weight
        if runningTotal > threshold:
            return position

    # The wheel never passed the threshold; callers treat None as "retry".
    return None


def selectFittest(fitness, rankedPool):
    """Draw two distinct parents from ``rankedPool`` via roulette().

    Both indices are redrawn together until neither draw is ``None`` and the
    two indices differ.

    Returns:
        A pair ``(rankedPool[i], rankedPool[j])`` with ``i != j``.
    """
    first = None
    second = None
    # Keep spinning the wheel twice per round until the pair is usable.
    while first is None or second is None or first == second:
        first = roulette(fitness)
        second = roulette(fitness)

    return rankedPool[first], rankedPool[second]

def crossover(chromosome1, chromosome2):
    """Single-point crossover of two equal-length chromosomes.

    The cut point is drawn from the interior positions ``1 .. len - 1`` so
    that each child always receives genes from both parents.  (Drawing up to
    ``len`` inclusive, as ``random.randint`` does, would sometimes return
    unchanged copies of the parents.)

    Args:
        chromosome1: First parent (sequence or array of genes).
        chromosome2: Second parent, same length as ``chromosome1``.

    Returns:
        A pair of numpy arrays ``(child1, child2)``.  Chromosomes shorter
        than two genes have no interior cut point and are returned as copies.
    """
    length = len(chromosome1)
    if length < 2:
        # Nothing to exchange: there is no position strictly inside.
        return np.array(chromosome1), np.array(chromosome2)

    # randint is inclusive on both ends, hence the ``length - 1`` upper bound.
    splitPoint = random.randint(1, length - 1)
    child1 = np.concatenate((chromosome1[:splitPoint], chromosome2[splitPoint:]))
    child2 = np.concatenate((chromosome2[:splitPoint], chromosome1[splitPoint:]))
    return child1, child2


def mutate(chromosome, rate=None):
    """Return a mutated copy of ``chromosome`` as a list.

    Each gene is flipped independently with probability ``rate``: a gene
    equal to 1 becomes 0, any other value becomes 1.

    Args:
        chromosome: Iterable of genes (0/1).
        rate: Per-gene flip probability.  Defaults to the module-level
            ``mutationRate`` when omitted, which is the original behaviour.

    Returns:
        A new list of the same length as ``chromosome``.
    """
    if rate is None:
        # Looked up at call time so later changes to the global are honoured.
        rate = mutationRate

    mutatedChrom = []
    for gene in chromosome:
        flip = random.random() < rate
        if not flip:
            mutatedChrom.append(gene)
        else:
            mutatedChrom.append(0 if gene == 1 else 1)
    return mutatedChrom
    
def breed(chrome1, chrome2):
    """Produce two offspring from two parent chromosomes.

    With probability ``crossOverRate`` the parents are recombined with
    crossover(); otherwise they are passed through unchanged.  Both results
    are then run through mutate().

    Returns:
        A pair ``(child1, child2)`` as returned by mutate().
    """
    recombine = random.random() < crossOverRate
    if recombine:
        offspring = crossover(chrome1, chrome2)
    else:
        offspring = (chrome1, chrome2)

    firstChild = mutate(offspring[0])
    secondChild = mutate(offspring[1])

    return firstChild, secondChild

In [None]:
def rankPop(pool, X, y, classifier,fitnessFunction):
    """Score every chromosome in ``pool`` and return them ranked by fitness.

    Each chromosome is a 0/1 mask over the columns of ``X``.  The selected
    columns are evaluated with a fresh RandomForestClassifier per fold, using
    leave-one-out cross-validation when ``X`` has at most 100 rows and
    10-fold cross-validation otherwise.  Raw scores are then normalised so
    that they sum to 1.

    Args:
        pool: Iterable of chromosomes (0/1 sequences, one gene per column).
        X: pandas DataFrame of features.
        y: Labels aligned with the rows of ``X``.
        classifier: Ignored; kept only so existing callers keep working.  A
            new RandomForestClassifier is always created for each fold.
        fitnessFunction: One of ``'f-measure'``, ``'g-mean'`` (computed here
            as sqrt(precision * recall)) or ``'accuracy'``.

    Returns:
        A list of ``(chromosome, fitness)`` pairs, best first, with exactly
        one entry per chromosome in ``pool``.

    Raises:
        ValueError: If ``fitnessFunction`` is not one of the supported names.
    """
    supported = ('f-measure', 'g-mean', 'accuracy')
    if fitnessFunction not in supported:
        # Previously an unknown name silently produced no scores at all.
        raise ValueError("Unknown fitnessFunction %r; expected one of: %s"
                         % (fitnessFunction, ', '.join(supported)))

    labels = np.array(y)
    npoints = X.shape[0]
    nfeatures = X.shape[1]

    scores = []
    for chromosome in pool:
        chosen_idx = [idx for gene, idx in zip(chromosome, range(nfeatures)) if gene == 1]
        if len(chosen_idx) == 0:
            # A chromosome that selects nothing cannot be evaluated.  It still
            # needs a score so that scores stay aligned with pool below.
            scores.append(0.0)
            continue

        chosenX = np.array(X.iloc[:, chosen_idx])

        # Leave-one-out for small data sets, 10-fold otherwise.
        if npoints <= 100:
            kf = KFold(n_splits = npoints)
        else:
            kf = KFold(n_splits = 10)

        # Filled by position so predictions line up with labels regardless of
        # the order in which folds are produced.
        prediction = np.empty(npoints, dtype=labels.dtype)
        for train_index, test_index in kf.split(chosenX):
            foldClassifier = RandomForestClassifier()
            foldClassifier.fit(chosenX[train_index], labels[train_index])
            prediction[test_index] = foldClassifier.predict(chosenX[test_index])

        if fitnessFunction == 'f-measure':
            scores.append(f1_score(y_true=labels, y_pred=prediction))
        elif fitnessFunction == 'g-mean':
            gScore = math.sqrt(precision_score(y_true=labels, y_pred=prediction)
                               * recall_score(y_true=labels, y_pred=prediction))
            scores.append(gScore)
        else:
            scores.append(accuracy_score(y_true=labels, y_pred=prediction))

    total = sum(scores)
    if total > 0:
        fitness = [score / float(total) for score in scores]
    elif scores:
        # Every chromosome scored zero: fall back to a uniform distribution
        # instead of dividing by zero.
        fitness = [1.0 / len(scores)] * len(scores)
    else:
        fitness = []

    pairedPop = zip(pool, fitness)
    rankedPop = sorted(pairedPop, key=itemgetter(-1), reverse = True)

    return rankedPop

In [None]:
def iteratePop(rankedPop):
    """Build the next generation from a ranked population.

    The top ``int(poolSize/15)`` chromosomes are carried over unchanged; the
    rest of the pool is filled with offspring of parents chosen by
    selectFittest() and combined by breed().

    Args:
        rankedPop: List of ``(chromosome, fitness)`` pairs, best first.

    Returns:
        A list of exactly ``poolSize`` chromosomes.
    """
    rankedPool = []
    fitness = []
    for entry in rankedPop:
        rankedPool.append(entry[0])
        fitness.append(entry[-1])

    # Elitism: keep the best few chromosomes as they are.
    eliteCount = int(poolSize/15)
    new_pool = list(rankedPool[:eliteCount])

    while len(new_pool) < poolSize:
        parents = selectFittest(fitness, rankedPool)
        children = breed(parents[0], parents[1])
        new_pool.extend(children)

    # Offspring arrive in pairs, so the pool may overshoot by one.
    return new_pool[:poolSize]

In [None]:
def geneticAlgoFit(datafile,measure):
    """Run the genetic feature-selection search on one CSV file.

    Every column except the last is treated as a feature, and the ``buggy``
    column is read as the boolean label.  A random pool of ``poolSize`` bit
    masks is evolved for ``iterations`` generations using rankPop() and
    iteratePop().

    Args:
        datafile: Path of the CSV file to load.
        measure: Fitness function name passed through to rankPop().

    Returns:
        The best-ranked chromosome of the final pool.
    """
    # Plain ``bool`` rather than ``np.bool``: the alias was deprecated in
    # NumPy 1.20 and later removed.
    frame = pd.read_csv(datafile, dtype={'buggy': bool})
    X = frame.iloc[ : , :-1]
    y = frame['buggy']

    # Created up front so the final ranking works even when iterations == 0.
    classifier = RandomForestClassifier()

    pool = np.random.randint(0, 2, (poolSize, X.shape[1]))
    for iteration in range(iterations):
        rankedPop = rankPop(pool, X, y, classifier, measure)
        pool = iteratePop(rankedPop)

    best_chromosome = rankPop(pool, X, y, classifier, measure)[0][0]
    return best_chromosome

In [None]:
# For every project file, print its name followed by the best chromosome
# found under each of the three fitness measures.
directory = 'dataset/dataset/'
for fileName in os.listdir(directory):
    print(fileName)
    for measure in ('accuracy', 'f-measure', 'g-mean'):
        print(geneticAlgoFit(directory+fileName, measure))

In [None]:
# Feature masks per project, indexed as [project][measure], where the
# annotation cell reads measure 0 as accuracy, 1 as f-measure and 2 as g-mean.
# The tenth project's masks have 19 genes while all others have 20, so the
# literal is ragged and must be stored with dtype=object; recent NumPy
# versions refuse to build a ragged array implicitly.
selectedFeaturesInEachSoftware = np.array([
    [[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1],
    [1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0],
    [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1]],
    
    [[1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
    [1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1],
    [1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]],
    
    [[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
    [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0],
    [1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0]],
    
    [[0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
    [1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0],
    [1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1]],
    
    [[1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1],
    [0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0],
    [1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]],
    
    [[0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
    [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1],
    [0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0]],
    
    [[0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
    [1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1],
    [1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0]],
    
    [[1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
    [1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0]],
    
    [[1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1],
    [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
    
    [[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
    [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0],
    [0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0]],
    
    [[1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
    [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]],
    
    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1],
    [1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0],
    [1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]],
    
    [[0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0],
    [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0],
    [1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0]],
    
    [[1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0],
    [0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1],
    [0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1]],
    
    [[0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1],
    [0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0],
    [0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0]],
    
    [[0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0]],
    
    [[0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0],
    [1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1]],
    
    [[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0],
    [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1],
    [1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]],
    
    [[1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1],
    [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0],
    [1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]],
    
    [[1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1],
    [0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
    [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1]],
    
    [[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1],
    [0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
    [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]],
    
    [[1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0],
    [0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0],
    [1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1]],
    
    [[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
    [1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1]],
    
    [[0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
    [1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0],
    [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0]]
], dtype=object)

In [None]:
# Three candidate ensembles: a soft-voting ensemble over three base learners,
# a random forest, and AdaBoost over random forests.
base_learner1 = LogisticRegression(random_state=1)
base_learner2 = DecisionTreeClassifier()
base_learner3 = GaussianNB()
classifier1 = VotingClassifier(estimators=[
                                         ('logregression', base_learner1), 
                                         ('dtree', base_learner2), 
                                         ('gnb', base_learner3)], 
                                          voting='soft')  
classifier2 = RandomForestClassifier()
# The base estimator is passed positionally because its keyword name differs
# between scikit-learn releases (base_estimator vs. estimator).
classifier3 = AdaBoostClassifier(RandomForestClassifier(), n_estimators = 100, learning_rate = 0.5)
directory='dataset/dataset'
for fileName in os.listdir(directory):
    # Function-call form of print works on both Python 2 and Python 3.
    print(fileName)
    

In [None]:
# For every project file, run each ensemble on the feature subset chosen for
# each fitness measure, and store the predictions and probabilities as extra
# columns of a copy written to dataset/annotated/.
#
# NOTE(review): projects are paired with selectedFeaturesInEachSoftware purely
# by their position in os.listdir(), whose order is not guaranteed; confirm it
# matches the order in which the masks were recorded.
directory = 'dataset/dataset/'
for index, fileName in enumerate(os.listdir(directory)):
    print(fileName)
    data = pd.read_csv(directory+fileName)
    X  = data.iloc[ : , :-1]
    Y = data.iloc[:,-1]

    # Column prefix -> feature mask, in the order the columns must be written.
    featureMasks = [('Accuracy_', selectedFeaturesInEachSoftware[index][0]),
                    ('FMeasure_', selectedFeaturesInEachSoftware[index][1]),
                    ('GMean_', selectedFeaturesInEachSoftware[index][2])]

    chosenColumns = []
    for prefix, mask in featureMasks:
        if len(mask) != X.shape[1]:
            # A length mismatch means the mask belongs to another project;
            # zip() would otherwise silently truncate and pick wrong features.
            raise ValueError("Feature mask %s for %s has %d genes but the file has %d feature columns"
                             % (prefix, fileName, len(mask), X.shape[1]))
        chosenColumns.append((prefix, [idx for idx, gene in enumerate(mask) if gene == 1]))

    for strings in ['Voting','RF','Ada']:
        print(strings)
        for prefix, chosen_idx in chosenColumns:
            pred, pred_prob = getPredictions(X.iloc[:, chosen_idx], Y, strings)
            # Prediction first, probability second: later cells rely on this
            # column order.
            data[prefix+strings+'_Pred'] = pred
            data[prefix+strings+'_Pred_Prob'] = pred_prob

    print(data['GMean_Ada_Pred'])
    data.to_csv('dataset/annotated/'+fileName,index=False)
    

In [None]:
def getPredictions(X,Y,strings):
    """Cross-validated predictions of one ensemble on ``X`` / ``Y``.

    Leave-one-out cross-validation is used when there are at most 100 rows,
    10-fold otherwise.  A fresh classifier is trained for every fold and used
    to predict that fold's held-out rows.

    Args:
        X: Feature table (DataFrame or array-like), one row per sample.
        Y: Labels aligned with the rows of ``X``.
        strings: ``'RF'`` for RandomForestClassifier, ``'Ada'`` for
            AdaBoostClassifier; any other value selects the soft-voting
            ensemble of logistic regression, decision tree and Gaussian NB.

    Returns:
        ``(prediction, predict_prob)``: two lists in fold order holding the
        predicted label and the probability in column 1 of ``predict_proba``
        for every row.
    """
    X = np.array(X)
    Y = np.array(Y)
    npoints = X.shape[0]

    if npoints <= 100:
        kf = KFold(n_splits = npoints)
    else:
        kf = KFold(n_splits = 10)

    prediction   = []
    predict_prob = []
    for train_index, test_index in kf.split(X):
        if strings == 'RF':
            classifier = RandomForestClassifier()
        elif strings == 'Ada':
            classifier = AdaBoostClassifier()
        else:
            classifier = VotingClassifier(estimators=[
                                 ('logregression', LogisticRegression(random_state=1)), 
                                 ('dtree', DecisionTreeClassifier()), 
                                 ('gnb', GaussianNB())], 
                                  voting='soft')

        classifier.fit(X[train_index], Y[train_index])

        # One batched call per fold instead of one call per held-out row.
        held_out = X[test_index]
        prediction.extend(classifier.predict(held_out))
        predict_prob.extend(classifier.predict_proba(held_out)[:, 1])

    return prediction,predict_prob

In [None]:
def getSelectedFitnessFunction():
    """Label every row of each annotated file with the technique to use.

    For each CSV in ``dataset/annotated`` the ROC AUC of all nine
    (fitness measure, ensemble) techniques is computed from their
    ``*_Pred_Prob`` columns.  Each row is then labelled as follows:

    * exactly one technique predicted the row correctly: that technique;
    * no technique was correct: the technique with the highest AUC overall;
    * several were correct: the one with the highest AUC among those that
      were correct on this row.

    The labels are stored in a ``'Function to be selected'`` column and the
    file is rewritten in place.

    Column names are derived from the technique names rather than from the
    notebook-level ``tt`` array, so this function does not depend on a cell
    that runs later.  The per-row choice is computed independently for every
    row; no state is carried over from earlier rows.
    """
    annotated_directory = 'dataset/annotated/'
    ans_array  = ['Accuracy_Voting_Pred','FMeasure_Voting_Pred','GMean_Voting_Pred',
                  'Accuracy_RF_Pred','FMeasure_RF_Pred','GMean_RF_Pred','Accuracy_Ada_Pred',
                  'FMeasure_Ada_Pred','GMean_Ada_Pred']

    for fileName in os.listdir('dataset/annotated'):
        data = pd.read_csv(annotated_directory+fileName)
        buggy = np.array(data['buggy'])

        # One AUC per technique, computed from its probability column.
        auc_arr = [roc_auc_score(y_true=data['buggy'], y_score=data[name+'_Prob'])
                   for name in ans_array]
        # argmax returns the first maximum, matching np.where(...)[0][0].
        highest = ans_array[int(np.argmax(auc_arr))]

        # correct[i, k] is True when technique k predicted row i correctly.
        correct = np.column_stack([np.array(data[name]) == buggy for name in ans_array])

        selectedFitnessFunction = []
        for row in correct:
            hits = np.flatnonzero(row)
            if hits.size == 0:
                selectedFitnessFunction.append(highest)
            elif hits.size == 1:
                selectedFitnessFunction.append(ans_array[hits[0]])
            else:
                # max() keeps the first of equal maxima, i.e. the earliest
                # technique in ans_array wins ties.
                best = max(hits, key=lambda k: auc_arr[k])
                selectedFitnessFunction.append(ans_array[best])

        print (len(selectedFitnessFunction),len(data))
        data['Function to be selected'] = selectedFitnessFunction
        data.to_csv(annotated_directory+fileName,index=False)


            

In [None]:
# Adds the 'Function to be selected' column to every CSV in dataset/annotated/
# (the files are rewritten in place).
getSelectedFitnessFunction()

In [None]:
def dtreetrain():
    """Train the second-step selector and write the combined predictions.

    For every project file, a DecisionTreeClassifier is cross-validated
    (leave-one-out for at most 100 rows, 10-fold otherwise) to predict, from
    the software metrics, which technique column of the annotated file should
    be trusted for each row.  The chosen technique, its prediction and its
    probability are written to ``dataset/DSF/`` as the columns
    ``PredictedTechnique``, ``DSF_Prediction`` and ``DSF_Pred_Prob``.

    Each fold is trained only on that fold's own training rows.  Training
    rows are not carried over between folds, since that would expose the
    classifier to the rows it is about to be tested on.
    """
    directory = 'dataset/dataset/'
    annotated_directory = 'dataset/annotated/'
    DSF_directory = 'dataset/DSF/'
    for projectName in os.listdir(directory):
        print(projectName)
        projectData = pd.read_csv(directory + projectName)
        annotatedData = pd.read_csv(annotated_directory + projectName)

        # X contains software metrics and Y best ensemble selected
        X = np.array(projectData.iloc[ : , :-1])
        Y = np.array(annotatedData.iloc[ : , -1])
        npoints = X.shape[0]

        if npoints <= 100:
            kf = KFold(n_splits = npoints)
        else:
            kf = KFold(n_splits = 10)

        predictedEnsemble = []
        predict_prob      = []
        final_prediction  = []

        for train_index, test_index in kf.split(X):
            fold_X = X[train_index]
            fold_Y = Y[train_index]

            unique_labels = np.unique(fold_Y)
            if unique_labels.size == 1:
                # Only one technique occurs in the training rows, so there is
                # nothing to learn: use it for every held-out row.
                chosen = [unique_labels[0]] * len(test_index)
            else:
                classifier = DecisionTreeClassifier()
                classifier.fit(fold_X, fold_Y)
                chosen = classifier.predict(X[test_index])

            for j, technique in zip(test_index, chosen):
                predictedEnsemble.append(technique)
                # Look up what the chosen technique predicted for this row.
                final_prediction.append(annotatedData.loc[j, technique])
                predict_prob.append(annotatedData.loc[j, technique+"_Prob"])

        annotatedData['PredictedTechnique'] = predictedEnsemble
        annotatedData['DSF_Prediction'] = final_prediction
        annotatedData['DSF_Pred_Prob'] = predict_prob
        annotatedData.to_csv(DSF_directory + projectName, index = False)
        

In [None]:
# The second-step trainer defined in this notebook is dtreetrain(); no
# function named svctrain exists here, so calling it raised a NameError.
dtreetrain()

In [None]:
def computePerformanceMeasuresDSF():
    """Compute the performance metrics of the DSF predictions per project.

    Every CSV in ``dataset/finalResults/`` is scored with
    computePerformanceMeasures() using its ``DSF_Prediction``, ``buggy`` and
    ``DSF_Pred_Prob`` columns.  The resulting table (one row per project) is
    printed and written to ``dataset/results.csv``.
    """
    DSFdirectory = 'dataset/finalResults/'
    columnNames = ['Project','Precision', 'Recall', 'Auc_Score', 'Accuracy', 'Fmeasure', 'GMean']

    # Collect plain rows and build the frame once at the end, rather than
    # growing a DataFrame one row at a time.
    rows = []
    for projectName in os.listdir(DSFdirectory):
        print (projectName)
        project = pd.read_csv(DSFdirectory + projectName)

        row = [projectName]
        row.extend(computePerformanceMeasures(project['DSF_Prediction'], 
                                              project['buggy'], 
                                              project['DSF_Pred_Prob']))
        rows.append(row)

    projectMetrics = pd.DataFrame(rows, columns = columnNames)
    print(projectMetrics)
    projectMetrics.to_csv('dataset/' + 'results.csv', index = False)

In [None]:
# Scores the files in dataset/finalResults/ and writes dataset/results.csv.
# NOTE(review): this call needs computePerformanceMeasures(), which is defined
# in the following cell, and the finalResults files, which a later cell
# writes; run those cells first.
computePerformanceMeasuresDSF()

In [None]:
def computePerformanceMeasures(predictions, labels, prediction_probability):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        predictions: Predicted labels.
        labels: True labels.
        prediction_probability: Scores passed to ``roc_auc_score``.

    Returns:
        A list ``[precision, recall, roc_auc, accuracy, f_measure, g_mean]``
        where ``f_measure`` is the harmonic mean of precision and recall
        (0.0 when both are zero) and ``g_mean`` is
        ``sqrt(precision * recall)``.
    """
    precision = precision_score(y_true = labels, y_pred = predictions)
    recall    = recall_score(y_true = labels, y_pred = predictions)
    roc_score = roc_auc_score(labels, prediction_probability)
    accuracy  = accuracy_score(y_true = labels, y_pred = predictions)

    denominator = float(precision + recall)
    if denominator == 0:
        # No positive was predicted or retrieved: define the F-measure as 0
        # instead of dividing by zero.
        f_measure = 0.0
    else:
        f_measure = 2*(precision * recall)/denominator
    g_mean = math.sqrt(precision * recall)

    # Named ``measures`` so the imported sklearn ``metrics`` module is not
    # shadowed inside this function.
    measures = [precision, recall, roc_score, accuracy, f_measure, g_mean]

    return measures

In [None]:
def computePerformanceMeasuresForTechniques():
    """Write the metrics of every technique for every annotated project.

    For each CSV in ``dataset/annotated/`` and each of the nine techniques,
    one line ``project,technique,<metrics...>`` is written to
    ``dataset/performanceMeasures.csv``, with the metrics in the order
    returned by computePerformanceMeasures().
    """
    annotated_directory = 'dataset/annotated/'
    techniqueNames = ['Accuracy_Voting','FMeasure_Voting','GMean_Voting',
                      'Accuracy_RF','FMeasure_RF','GMean_RF',
                      'Accuracy_Ada','FMeasure_Ada','GMean_Ada']

    # The context manager guarantees the file is flushed and closed, even if
    # scoring one of the projects raises.
    with open('dataset/performanceMeasures.csv','w+') as resultsFile:
        for projectName in os.listdir(annotated_directory):
            project = pd.read_csv(annotated_directory+projectName)
            labels = project['buggy']
            for techniques in techniqueNames:
                prediction = project[techniques+'_Pred']
                prediction_probability = project[techniques+'_Pred_Prob']
                measures = (','.join(str(x) for x in computePerformanceMeasures(prediction,labels,prediction_probability))+'\n')
                resultsFile.write(str(projectName)+','+techniques+','+measures)


In [None]:
# Writes per-technique metrics for every annotated project to
# dataset/performanceMeasures.csv.
computePerformanceMeasuresForTechniques()

In [None]:
# Lists the ensemble identifiers used in the annotated column names.
for strings in ['Voting','RF','Ada']:
    # Function-call form of print works on both Python 2 and Python 3.
    print(strings)

In [None]:
# NOTE(review): this looks like leftover scratch code. `arr` is not defined
# anywhere in the visible notebook, and the result of np.where() is discarded
# on every iteration; confirm whether this cell can be removed.
import pandas as pd
for fileName in os.listdir('dataset/annotated'):
        np.where(arr == 15)

In [None]:
# NOTE(review): `xx` is not defined anywhere in the visible notebook; this
# looks like a leftover debugging cell. Confirm whether it can be removed.
print(xx)

In [None]:
# Copy every DSF output file to dataset/finalResults/ without the columns
# named in `tt`.
# NOTE(review): `tt` is assigned in the next cell, so that cell must be run
# before this one.
for fileName in os.listdir('dataset/DSF'):
    fil = pd.read_csv('dataset/DSF/'+fileName)
    fil=fil.drop(columns=tt)
    fil.to_csv('dataset/finalResults/'+fileName,index=False)

In [None]:
# Read one DSF output file only to obtain its column names.
fil = pd.read_csv('dataset/DSF/camel-1.6.csv') 
tt=fil.columns.values
# Keep the 18 names at positions 21..38.
# Presumably these are the per-technique *_Pred / *_Pred_Prob columns, which
# assumes 21 columns precede them in this file (TODO confirm).
tt= (tt[21:39])

In [None]:
# Sanity check: number of column names kept in `tt`.
print (len(tt))