In [90]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os
import math
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from operator import itemgetter

from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier

#importing SVC for second-step classification
from sklearn.svm import SVC

In [3]:
# Genetic-algorithm settings read as globals by the functions defined below.

# Per-gene probability that mutate() flips a bit of a chromosome.
mutationRate = 0.001
# Probability that breed() recombines a selected pair through crossover();
# otherwise both parents are passed on unchanged (apart from mutation).
crossOverRate = 0.06
# Number of generations evolved by geneticAlgoFit().
iterations = 10
# Number of chromosomes (binary feature masks) kept in every generation.
poolSize = 50

In [4]:
#Defining functions for genetic algorithms

def roulette(fitnessArray):
    """Pick one index by fitness-proportionate (roulette-wheel) selection.

    The random draw is scaled by the sum of the weights, so the weights do
    not have to be normalised to 1 and rounding error in the running sum can
    no longer make the wheel "miss" every slot.

    Parameters
    ----------
    fitnessArray : sequence of float
        Non-negative selection weights, one per candidate.

    Returns
    -------
    int or None
        The selected index, or None when no selection is possible (empty
        input, or weights whose sum is zero or NaN).
    """
    total = 0.0
    for weight in fitnessArray:
        total += weight

    # `not total > 0` also rejects NaN, which compares false to everything.
    if not total > 0:
        return None

    threshold = random.random() * total
    cumulativeFitness = 0.0
    for i in range(len(fitnessArray)):
        cumulativeFitness += fitnessArray[i]
        if cumulativeFitness > threshold:
            return i

    # Only reachable if threshold rounds up to the full sum; take the last slot.
    return len(fitnessArray) - 1


def selectFittest(fitness, rankedPool):
    """Select two distinct parents from ``rankedPool`` by roulette-wheel draws.

    Parameters
    ----------
    fitness : sequence of float
        Selection weight of each chromosome, aligned with ``rankedPool``.
    rankedPool : sequence
        Candidate chromosomes.

    Returns
    -------
    tuple
        ``(parent1, parent2)`` taken from two different positions of
        ``rankedPool``.

    Raises
    ------
    ValueError
        If ``rankedPool`` holds fewer than two chromosomes.
    """
    if len(rankedPool) < 2:
        raise ValueError('selectFittest needs at least two chromosomes to choose from')

    # Positions the roulette wheel can actually land on (NaN fails `> 0`).
    usable = min(len(fitness), len(rankedPool))
    viable = [i for i in range(usable) if fitness[i] > 0]

    if len(viable) < 2:
        # The wheel can never return two different indices here, so the
        # retry loop below would spin forever. Keep the single viable parent
        # (or a random one if there is none) and pair it with a random other.
        idx1 = viable[0] if viable else random.randrange(len(rankedPool))
        idx2 = random.choice([i for i in range(len(rankedPool)) if i != idx1])
        return rankedPool[idx1], rankedPool[idx2]

    while True:
        idx1 = roulette(fitness)
        idx2 = roulette(fitness)
        # Retry on a missed draw (None) or when the same slot came up twice.
        if idx1 is not None and idx2 is not None and idx1 != idx2:
            return rankedPool[idx1], rankedPool[idx2]

def crossover(chromosome1, chromosome2):
    """Single-point crossover of two equally long chromosomes.

    A split point is drawn so that each child takes at least one gene from
    each parent: child 1 is the head of ``chromosome1`` followed by the tail
    of ``chromosome2``, child 2 is the mirror image.

    Parameters
    ----------
    chromosome1, chromosome2 : sequence
        Parent chromosomes (lists or 1-D arrays); both are sliced at the same
        position.

    Returns
    -------
    tuple of numpy.ndarray
        ``(child1, child2)``. Chromosomes shorter than two genes cannot be
        split and are returned as array copies of the parents.
    """
    length = len(chromosome1)
    if length < 2:
        splitPoint = length
    else:
        # randint is inclusive on both ends; stopping at length - 1 avoids the
        # degenerate split at `length`, which would just copy the parents.
        splitPoint = random.randint(1, length - 1)

    child1 = np.concatenate((chromosome1[:splitPoint], chromosome2[splitPoint:]))
    child2 = np.concatenate((chromosome2[:splitPoint], chromosome1[splitPoint:]))
    return child1, child2


def mutate(chromosome):
    """Return a mutated copy of ``chromosome`` as a plain list.

    Every gene is independently replaced with probability ``mutationRate``
    (module-level setting): a gene equal to 1 becomes 0, any other value
    becomes 1. Genes that are not mutated are copied over unchanged.
    """
    def flipped(gene):
        return 0 if gene == 1 else 1

    return [
        flipped(gene) if random.random() < mutationRate else gene
        for gene in chromosome
    ]
    
def breed(chrome1, chrome2):
    """Produce two offspring from a pair of parent chromosomes.

    With probability ``crossOverRate`` (module-level setting) the parents are
    recombined through ``crossover``; otherwise they are carried over as they
    are. Both resulting chromosomes are then passed through ``mutate``.

    Returns the pair ``(offspring1, offspring2)``.
    """
    offspring1, offspring2 = chrome1, chrome2
    if random.random() < crossOverRate:
        offspring1, offspring2 = crossover(chrome1, chrome2)

    return mutate(offspring1), mutate(offspring2)

In [5]:
def _crossValPredict(features, labels):
    """Return out-of-fold RandomForest predictions for every row of ``features``.

    Uses leave-one-out when there are at most 100 rows and 10-fold
    cross-validation otherwise. KFold is used without shuffling, so the test
    folds are visited in row order and the returned list lines up with
    ``labels``.
    """
    npoints = features.shape[0]
    kf = KFold(n_splits=npoints if npoints <= 100 else 10)

    prediction = []
    for train_index, test_index in kf.split(features):
        classifier = RandomForestClassifier()
        classifier.fit(features[train_index], labels[train_index])
        prediction.extend(classifier.predict(features[test_index]))
    return prediction


def rankPop(pool, X, y, classifier,fitnessFunction):
    """Score every chromosome in ``pool`` and return them ranked by fitness.

    Each chromosome is a 0/1 mask over the columns of ``X``. The selected
    columns are evaluated with cross-validated random forests and scored with
    the requested measure; scores are then normalised so they sum to 1.

    Parameters
    ----------
    pool : sequence of chromosomes
        Binary feature masks, one entry per column of ``X``.
    X : pandas.DataFrame
        Feature table (columns are selected by position).
    y : array-like
        Class labels aligned with the rows of ``X``.
    classifier : object
        Not used: a fresh RandomForestClassifier is trained for every fold.
        The parameter is kept so existing callers keep working.
    fitnessFunction : str
        One of 'f-measure', 'g-mean' (computed here as
        sqrt(precision * recall)) or 'accuracy'.

    Returns
    -------
    list of (chromosome, fitness)
        One pair per chromosome in ``pool``, sorted by decreasing fitness.
        A chromosome that selects no feature gets fitness 0. If every score
        is 0 the fitness is spread uniformly so that selection still works.

    Raises
    ------
    ValueError
        If ``fitnessFunction`` is not one of the supported names.
    """
    supported = ('f-measure', 'g-mean', 'accuracy')
    if fitnessFunction not in supported:
        raise ValueError('unknown fitnessFunction %r, expected one of %r'
                         % (fitnessFunction, supported))

    Y = np.array(y)
    scores = []
    for chromosome in pool:
        chosen_idx = [idx for gene, idx in zip(chromosome, range(X.shape[1])) if gene==1]
        if len(chosen_idx)==0:
            # Keep scores aligned with pool: skipping the entry would pair
            # every later fitness value with the wrong chromosome.
            scores.append(0.0)
            continue

        prediction = _crossValPredict(np.array(X.iloc[:, chosen_idx]), Y)

        if fitnessFunction == 'f-measure':
            scores.append(f1_score(y_true=y, y_pred=prediction))
        elif fitnessFunction == 'g-mean':
            precision = precision_score(y_true=y, y_pred=prediction)
            recall = recall_score(y_true=y, y_pred=prediction)
            scores.append(math.sqrt(precision * recall))
        else:
            scores.append(accuracy_score(y_true=y, y_pred=prediction))

    total = sum(scores)
    if total > 0:
        fitness = [score / total for score in scores]
    else:
        # All scores are zero: avoid 0/0 and give every chromosome an equal share.
        fitness = [1.0 / len(scores) for _ in scores]

    # Sort on the fitness only, so chromosomes (arrays) are never compared.
    return sorted(zip(pool, fitness), key=itemgetter(-1), reverse=True)

In [6]:
def iteratePop(rankedPop):
    """Build the next generation from a ranked population.

    ``rankedPop`` is a sequence of ``(chromosome, fitness)`` pairs, best
    first. The leading ``int(poolSize/15)`` chromosomes are copied over
    unchanged; the remainder of the pool is filled two at a time with the
    offspring of parents drawn by ``selectFittest`` and combined by
    ``breed``. The result is cut to exactly ``poolSize`` entries.
    """
    rankedPool = [entry[0] for entry in rankedPop]
    fitness = [entry[-1] for entry in rankedPop]

    eliteCount = int(poolSize/15)
    nextGeneration = list(rankedPool[:eliteCount])

    while len(nextGeneration) < poolSize:
        parent1, parent2 = selectFittest(fitness, rankedPool)
        child1, child2 = breed(parent1, parent2)
        nextGeneration.extend((child1, child2))

    return nextGeneration[:poolSize]

In [7]:
def geneticAlgoFit(datafile,measure):
    """Run the genetic feature search on one CSV file.

    Parameters
    ----------
    datafile : str
        Path of a CSV file. The 'buggy' column is read as the boolean label;
        every column except the last one is used as a feature.
    measure : str
        Fitness measure forwarded to ``rankPop`` ('accuracy', 'f-measure' or
        'g-mean').

    Returns
    -------
    The best-ranked chromosome (0/1 feature mask) after ``iterations``
    generations of ``poolSize`` chromosomes each.
    """
    # Plain `bool`: the `np.bool` alias is gone from current NumPy releases.
    data = pd.read_csv(datafile, dtype={'buggy': bool})
    X = data.iloc[:, :-1]
    y = data['buggy']

    # Created before the loop so it is bound for the final ranking even when
    # `iterations` is 0.
    classifier = RandomForestClassifier()

    # Random initial population: one 0/1 gene per feature column.
    pool = np.random.randint(0, 2, (poolSize, X.shape[1]))
    for iteration in range(iterations):
        rankedPop = rankPop(pool, X, y, classifier, measure)
        pool = iteratePop(rankedPop)

    # The last generation has not been scored yet; rank it once more.
    best_chromosome = rankPop(pool, X, y, classifier, measure)[0][0]
    return best_chromosome

In [12]:
# Run the genetic feature search on every file of the raw dataset directory,
# once per fitness measure, and print the best feature mask of each run.
# Each call repeats the whole search (cross-validated forests for every
# chromosome of every generation), so this cell is expensive to re-run.
directory = 'dataset/dataset/'
for fileName in os.listdir(directory):
    print(fileName)
    print(geneticAlgoFit(directory+fileName,'accuracy'))
    print(geneticAlgoFit(directory+fileName,'f-measure'))
    print(geneticAlgoFit(directory+fileName,'g-mean'))

In [20]:
# Hard-coded 0/1 feature masks, one group of three rows per dataset file.
# The annotation loop that indexes this array uses row 0 of a group as the
# 'accuracy' mask, row 1 as the 'f-measure' mask and row 2 as the 'g-mean' mask.
# Presumably pasted from the printed output of the geneticAlgoFit runs — verify.
# NOTE(review): the three rows of the 10th group hold 19 values while every
# other row holds 20, so the literal is ragged and cannot form a regular 3-D
# integer array; depending on the NumPy version this likely yields an object
# array or raises an error — confirm and fix the data if a value was lost.
selectedFeaturesInEachSoftware = np.array([
    [[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1],
    [1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0],
    [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1]],
    
    [[1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
    [1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1],
    [1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]],
    
    [[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
    [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0],
    [1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0]],
    
    [[0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
    [1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0],
    [1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1]],
    
    [[1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1],
    [0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0],
    [1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]],
    
    [[0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
    [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1],
    [0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0]],
    
    [[0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
    [1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1],
    [1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0]],
    
    [[1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
    [1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0]],
    
    [[1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1],
    [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
    
    [[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
    [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0],
    [0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0]],
    
    [[1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
    [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]],
    
    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1],
    [1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0],
    [1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]],
    
    [[0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0],
    [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0],
    [1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0]],
    
    [[1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0],
    [0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1],
    [0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1]],
    
    [[0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1],
    [0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0],
    [0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0]],
    
    [[0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0]],
    
    [[0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0],
    [1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1]],
    
    [[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0],
    [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1],
    [1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]],
    
    [[1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1],
    [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0],
    [1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]],
    
    [[1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1],
    [0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
    [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1]],
    
    [[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1],
    [0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
    [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]],
    
    [[1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0],
    [0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0],
    [1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1]],
    
    [[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
    [1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1]],
    
    [[0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
    [1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0],
    [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0]]
])      

(3,)


In [44]:
# Annotate every raw dataset file with cross-validated predictions from three
# random forests, one per hard-coded feature mask (accuracy / f-measure /
# g-mean), and save the result under dataset/annotated/.
# NOTE(review): getPredictions is defined in a later cell, so on a fresh
# kernel this cell fails unless that cell has been run first.
# NOTE(review): masks are matched to files only through the os.listdir order —
# presumably the order in which the masks were produced; verify.
index = 0
for fileName in os.listdir(directory):
    data = pd.read_csv(directory+fileName)
    # All columns except the last are features; the last column is the label.
    X  = data.iloc[ : , :-1]
    Y = data.iloc[:,-1]
    # The three masks of the current file, in the order accuracy, f-measure, g-mean.
    accuracyFeature = selectedFeaturesInEachSoftware[index][0]
    fMeasureFeature = selectedFeaturesInEachSoftware[index][1]
    gMeanFeature = selectedFeaturesInEachSoftware[index][2]
    # Not referenced again in this cell; getPredictions creates its own forests.
    classifier = RandomForestClassifier()
    
    # Column positions whose gene is 1 in the respective mask.
    chosen_idx_accuracy = [idx for gene, idx in zip(accuracyFeature, range(X.shape[1])) if gene==1]
    chosen_idx_fMeasure = [idx for gene, idx in zip(fMeasureFeature, range(X.shape[1])) if gene==1]
    chosen_idx_gMean = [idx for gene, idx in zip(gMeanFeature, range(X.shape[1])) if gene==1]
    
    # Each call returns (predicted labels, predicted probabilities), one value per row.
    data['Accuracy Prediction'],data['Accuracy Prob'] = getPredictions(X.iloc[:,chosen_idx_accuracy],Y)
    data['fMeasure Prediction'],data['fMeasure Prob'] = getPredictions(X.iloc[:,chosen_idx_fMeasure],Y)
    data['gMean Prediction'],data['gMean Prob'] = getPredictions(X.iloc[:,chosen_idx_gMean],Y)
    data.to_csv('dataset/annotated/'+fileName,index=False)    
    index+=1
    

In [None]:
def getPredictions(X,Y):
    """Cross-validated RandomForest predictions for every row of ``X``.

    Leave-one-out is used when there are at most 100 rows, 10-fold
    cross-validation otherwise. A fresh forest is trained for each fold and
    used to predict only the rows held out of that fold. KFold is used
    without shuffling, so folds are visited in row order and the returned
    lists line up with the rows of ``X``.

    Parameters
    ----------
    X : array-like of shape (rows, features)
        Feature values.
    Y : array-like of shape (rows,)
        Class labels.

    Returns
    -------
    (prediction, predict_prob) : tuple of list
        ``prediction`` holds the predicted label of every row;
        ``predict_prob`` holds the predicted probability of the second label
        in sorted order (``True`` for boolean labels). The probability is 0.0
        for rows whose training fold never contained that label.
    """
    X = np.array(X)
    Y = np.array(Y)
    npoints = X.shape[0]
    kf = KFold(n_splits=npoints if npoints <= 100 else 10)

    # The label whose probability is reported. Looking it up per fold instead
    # of hard-coding column 1 keeps a single-class training fold from raising
    # an IndexError.
    labels = np.unique(Y)
    positiveLabel = labels[1] if len(labels) > 1 else None

    prediction = []
    predict_prob = []
    for train_index, test_index in kf.split(X):
        classifier = RandomForestClassifier()
        classifier.fit(X[train_index], Y[train_index])

        prediction.extend(classifier.predict(X[test_index]))

        column = []
        if positiveLabel is not None:
            column = np.flatnonzero(classifier.classes_ == positiveLabel)
        if len(column):
            probabilities = classifier.predict_proba(X[test_index])
            predict_prob.extend(probabilities[:, column[0]])
        else:
            predict_prob.extend([0.0] * len(test_index))

    return prediction,predict_prob

In [88]:
def getSelectedFitnessFunction():
    """Label every row of each annotated file with the model to trust for it.

    For each CSV file in ``dataset/annotated`` the F1 score of the three
    prediction columns ('Accuracy Prediction', 'fMeasure Prediction',
    'gMean Prediction') against 'buggy' decides the file-wide default model
    (ties fall back to 'GMean'). Every row then gets:

    * 'Accuracy' if only the accuracy model predicted the row correctly,
    * 'FMeasure' if the accuracy model was wrong and the f-measure model right,
    * 'GMean'    if only the g-mean model predicted the row correctly,
    * the file-wide default in every other case.

    The labels are stored in the 'Function to be selected' column, the head
    of the frame is printed, and the file is rewritten in place.
    """
    annotatedDirectory = 'dataset/annotated/'
    for fileName in os.listdir(annotatedDirectory):
        data = pd.read_csv(annotatedDirectory + fileName)

        f1_acc = f1_score(y_pred=data['Accuracy Prediction'], y_true=data['buggy'])
        f1_fMeas = f1_score(y_pred=data['fMeasure Prediction'], y_true=data['buggy'])
        f1_gMean = f1_score(y_pred=data['gMean Prediction'], y_true=data['buggy'])
        if f1_acc > f1_fMeas and f1_acc > f1_gMean:
            highest = 'Accuracy'
        elif f1_fMeas > f1_gMean and f1_fMeas > f1_acc:
            highest = 'FMeasure'
        else:
            highest = 'GMean'

        buggy = np.array(data['buggy'])
        accuracyRight = np.array(data['Accuracy Prediction']) == buggy
        fMeasureRight = np.array(data['fMeasure Prediction']) == buggy
        gMeanRight = np.array(data['gMean Prediction']) == buggy

        selectedFitnessFunction = []
        for accOk, fOk, gOk in zip(accuracyRight, fMeasureRight, gMeanRight):
            if accOk and not fOk and not gOk:
                selectedFitnessFunction.append('Accuracy')
            elif not accOk and fOk:
                # Covers "only f-measure right" as well as "f-measure and
                # g-mean right". The former used to fall through to the
                # file-wide default, which could name a model that got this
                # row wrong.
                selectedFitnessFunction.append('FMeasure')
            elif not accOk and not fOk and gOk:
                selectedFitnessFunction.append('GMean')
            else:
                selectedFitnessFunction.append(highest)

        data['Function to be selected'] = selectedFitnessFunction
        # Function-call form of print works on Python 2 and Python 3 alike.
        print(data.head())
        data.to_csv(annotatedDirectory + fileName, index=False)

            

In [89]:
getSelectedFitnessFunction()

   wmc  dit  noc  cbo  rfc  lcom  ca  ce  npm     lcom3  \
0    5    3    0    7   10     0   1   7    4  0.250000   
1    4    1    0    3    5     4   1   2    3  0.666667   
2   20    4    0   26   95   144   2  26   13  0.842105   
3    3    2    0    8   22     3   2   6    2  2.000000   
4    8    1    0   25   20    22  22   3    6  0.571429   

            ...             max_cc  avg_cc  buggy  Accuracy Prediction  \
0           ...                  1  0.6000  False                False   
1           ...                  1  0.5000  False                False   
2           ...                  3  1.0000  False                False   
3           ...                 15  5.3333   True                False   
4           ...                  1  0.7500   True                 True   

   Accuracy Prob  fMeasure Prediction  fMeasure Prob  gMean Prediction  \
0       0.100000                False            0.2             False   
1       0.066667                False            0.0

   wmc  dit  noc  cbo  rfc  lcom  ca  ce  npm     lcom3  \
0   13    1    0   18   47     0  17   1   10  0.541667   
1    6    2    0   22   50     0   1  21    2  1.066667   
2    0    1    0    0    0     0   0   0    0  2.000000   
3    9    1    0   15   32     0   1  14    6  0.562500   
4   13    1    0   13   52     0   1  12    6  0.541667   

            ...             max_cc  avg_cc  buggy  Accuracy Prediction  \
0           ...                  5  2.6154  False                False   
1           ...                  3  1.6667   True                 True   
2           ...                  0  0.0000  False                False   
3           ...                  3  1.3333  False                False   
4           ...                  4  1.4615  False                False   

   Accuracy Prob  fMeasure Prediction  fMeasure Prob  gMean Prediction  \
0            0.0                False            0.2             False   
1            0.7                 True            1.0

   wmc  dit  noc   cbo   rfc  lcom    ca   ce  npm  lcom3  \
0  3.0  1.0    0   6.0  26.0   3.0   1.0  5.0  3.0    2.0   
1  2.0  1.0    0  10.0   3.0   0.0   8.0  2.0  1.0    0.8   
2  5.0  1.0    0  11.0   9.0   0.0  10.0  3.0  4.0    0.5   
3  6.0  1.0    0  10.0  13.0   0.0   6.0  4.0  6.0    0.0   
4  3.0  2.0    0   7.0   7.0   0.0   1.0  6.0  3.0    0.0   

            ...             max_cc  avg_cc  buggy  Accuracy Prediction  \
0           ...                2.0  1.3333  False                False   
1           ...                1.0  0.5000  False                False   
2           ...                3.0  1.0000   True                False   
3           ...                1.0  0.6667  False                False   
4           ...                2.0  1.0000   True                 True   

   Accuracy Prob  fMeasure Prediction  fMeasure Prob  gMean Prediction  \
0            0.5                False            0.2             False   
1            0.1                False   

In [None]:
def svctrain(outputDirectory=None):
    """Second-stage model: learn which first-stage model to trust per row.

    For every annotated project file an SVC is trained, with cross-validation
    (leave-one-out for at most 100 rows, 10-fold otherwise), to predict the
    'Function to be selected' label from the software metrics. For each
    held-out row the function records

    * 'Predicted Function' - the first-stage model chosen by the SVC,
    * 'DSE_Prediction'     - the bugginess predicted by that chosen model,
    * 'DSE_Pred_Prob'      - the first-stage probabilities averaged with the
      SVC's class probabilities as weights,

    and writes the extended table to ``outputDirectory``.

    Parameters
    ----------
    outputDirectory : str, optional
        Destination directory. Defaults to the module-level ``DSE_directory``
        when such a name exists, otherwise to 'dataset/DSE/'. The directory
        is created when missing.

    Notes
    -----
    The input files are expected to end with these 8 columns: 'buggy', the
    six prediction/probability columns and 'Function to be selected'.
    Everything before them is used as a metric.
    """
    directory = 'dataset/annotated/'
    if outputDirectory is None:
        # NOTE(review): DSE_directory is not defined in the visible cells;
        # honour it if it exists elsewhere, else use a sibling of the inputs.
        outputDirectory = globals().get('DSE_directory', 'dataset/DSE/')
    if not os.path.isdir(outputDirectory):
        os.makedirs(outputDirectory)

    # Labels written by getSelectedFitnessFunction -> prefix of the matching
    # prediction/probability columns in the annotated files.
    columnPrefix = {'Accuracy': 'Accuracy', 'FMeasure': 'fMeasure', 'GMean': 'gMean'}
    prediction_constant = ' Prediction'
    probab_constant = ' Prob'

    for projectName in os.listdir(directory):
        print(projectName)
        projectData = pd.read_csv(directory + projectName)

        # X contains the software metrics, Y the first-stage model to select.
        X = np.array(projectData.iloc[ : , :-8])
        Y = np.array(projectData.iloc[ : , -1])

        npoints = X.shape[0]
        kf = KFold(n_splits=npoints if npoints <= 100 else 10)

        predictedFunction = []
        predict_prob      = []
        final_prediction  = []  # bugginess predicted by the model the SVC selected

        for train_index, test_index in kf.split(X):
            # Fresh training set per fold: accumulating rows across folds
            # would leak held-out rows into later training sets.
            train_X = X[train_index]
            train_Y = Y[train_index]

            unique_labels = np.unique(train_Y)
            if unique_labels.size == 1:
                # An SVC cannot be fitted on a single class; that class is
                # the only possible choice for the held-out rows.
                onlyLabel = unique_labels[0]
                for j in test_index:
                    predictedFunction.append(onlyLabel)
                    predict_prob.append(projectData.loc[j, columnPrefix[onlyLabel] + probab_constant])
                    final_prediction.append(projectData.loc[j, columnPrefix[onlyLabel] + prediction_constant])
                continue

            classifier = SVC(probability = True)
            classifier.fit(train_X, train_Y)

            chosenFunctions = classifier.predict(X[test_index])
            # Columns of predict_proba follow classifier.classes_.
            selectionProba = classifier.predict_proba(X[test_index])

            for row, j in enumerate(test_index):
                chosen = chosenFunctions[row]
                predictedFunction.append(chosen)
                final_prediction.append(projectData.loc[j, columnPrefix[chosen] + prediction_constant])

                # P(buggy) = sum over first-stage models of
                # P(model is selected) * P(buggy according to that model).
                predict_proba_true = 0
                for k, functionName in enumerate(classifier.classes_):
                    predict_proba_true += selectionProba[row][k] * projectData.loc[j, columnPrefix[functionName] + probab_constant]
                predict_prob.append(predict_proba_true)

        projectData['Predicted Function'] = predictedFunction
        projectData['DSE_Prediction'] = final_prediction
        projectData['DSE_Pred_Prob'] = predict_prob
        projectData.to_csv(os.path.join(outputDirectory, projectName), index = False)
