## Using project elimination to reduce the number of projects required for bellwether discovery

In [1]:
import os, random, time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
random.seed(10)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from numpy import median
import math
import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


In [2]:
# Locate the collated Jureczko defect datasets relative to the directory the
# notebook was started from.  os.path.join keeps the path portable across
# operating systems; the trailing "" keeps the trailing separator, because
# file names are concatenated directly onto data_path further down.
data_path = os.path.join(
    os.getcwd(), "defects", "src", "data", "Jureczko", "collated_data", ""
)
# NOTE: this changes the working directory, so re-running this cell in the
# same session resolves data_path relative to the data directory itself.
os.chdir(data_path)
all_files = os.listdir(data_path)
# A project is any file named "<project>_merged.csv" (the name the training
# code reads back).  Filtering on the suffix ignores unrelated files, and
# sorting makes the project order independent of os.listdir's ordering.
merged_suffix = "_merged.csv"
projs = sorted(
    name[:-len(merged_suffix)]
    for name in all_files
    if name.endswith(merged_suffix)
)
projs

['ant',
 'camel',
 'ivy',
 'jedit',
 'log4j',
 'lucene',
 'poi',
 'velocity',
 'xalan',
 'xerces']

In [3]:
def projectElimination(x):
    """Score every project as a bellwether candidate, stopping early on weak ones.

    For each project in the module-level ``projs`` list, a random forest is
    trained on all rows of that project and then evaluated on the other
    projects one at a time.  Each evaluation yields a g-score, the harmonic
    mean of recall and ``1 - pf`` (pf = false-positive rate).  Once at least
    three test projects have been evaluated and the running *mean* g-score is
    below 0.42, the remaining test projects are skipped.

    Args:
        x: Integer seed for this repetition.  It seeds the stdlib RNG and is
            also passed as ``random_state`` to the row shuffle and to the
            classifier, so a given ``x`` always produces the same scores.

    Returns:
        A list with one ``[project_name, score]`` pair per entry of ``projs``,
        in the same order.  ``score`` is the median g-score over the evaluated
        test projects, or 0 when at least three were evaluated and that median
        is below 0.42.
    """
    # pandas and scikit-learn draw from NumPy's RNG, not the stdlib one, so
    # seeding ``random`` alone does not make a run reproducible; the seed is
    # therefore also handed to them explicitly via random_state below.
    random.seed(x)

    target = '$<bug'
    g_threshold = 0.42  # g-score below which a project is eliminated
    min_tests = 3       # evaluations required before elimination may happen

    # Read every project's CSV once instead of once per train/test pair.
    frames = {}
    for name in projs:
        frames[name] = pd.read_csv(os.path.join(data_path, name + "_merged.csv"))

    clf = RandomForestClassifier(n_estimators=1000, n_jobs=1, random_state=x)
    results = []

    for i, train_name in enumerate(projs):
        # Shuffle the training rows (seeded) before fitting on the whole project.
        train = frames[train_name].sample(frac=1, random_state=x)
        train = train.reset_index(drop=True)
        X_train = train.loc[:, train.columns != target]
        y_train = train[target]  # 1-D labels, the shape scikit-learn expects
        clf.fit(X_train, y_train)

        g = []
        for j, test_name in enumerate(projs):
            if j == i:
                continue
            test = frames[test_name]
            y_pred = clf.predict(test.loc[:, test.columns != target])

            # Rows of the confusion matrix are true labels, columns are
            # predictions.  NOTE(review): indexing rows/columns 0 and 1
            # assumes the first two sorted labels are "clean" and "buggy" and
            # that both occur in the test data -- confirm against the CSVs.
            cm = confusion_matrix(test[target], y_pred)
            recall = cm[1][1] / (cm[1][1] + cm[1][0])
            pf = cm[0][1] / (cm[0][1] + cm[0][0])

            g.append(2 / ((1 / recall) + (1 / (1 - pf))))

            # Early elimination: stop testing once the running mean is
            # clearly below the threshold.
            # NOTE(review): the stop rule uses the mean while the final
            # verdict below uses the median -- confirm this is intended.
            if len(g) >= min_tests and sum(g) / len(g) < g_threshold:
                break

        score = median(g)
        if len(g) >= min_tests and score < g_threshold:
            score = 0
        results.append([train_name, score])

    return results


In [None]:
# Repeat the elimination experiment once per seed (0-29), timing each run.
results = []
for seed in range(30):
    print(seed)
    started = time.time()
    run_rows = projectElimination(seed)
    elapsed = time.time() - started
    # The timing is stored as one more two-element entry, [seed, seconds],
    # after the per-project [name, score] pairs of this run.
    results.append(run_rows + [[seed, elapsed]])
print(results)

In [5]:
# One row per run; every cell still holds that run's two-element entry
# ([project, score], or [seed, seconds] in the last column).
df = pd.DataFrame(results)
# Build the header as a NEW list.  Binding cols to projs and appending to it
# would also add 'runtime' to projs itself, so a later projectElimination
# call would try to load "runtime_merged.csv", and re-running this cell would
# keep growing the list.
cols = list(projs) + ['runtime']
df.columns = cols

In [6]:
# Sanity check: expect one row per run and one column per project plus 'runtime'.
df.shape

(30, 11)

In [7]:
def getValue(x):
    """Return the element at index 1 of *x*.

    Used to reduce a two-element ``[label, value]`` entry to its value.
    """
    value = x[1]
    return value

In [29]:
# NOTE(review): this cell carries execution count 29 while its neighbours are
# 5-10, so it was run out of order -- presumably a leftover experiment; confirm
# whether it is still needed.
# It uses chained indexing (df.loc[1] and then [1]), a form for which pandas
# does not guarantee that the assignment reaches df.  If the write does take
# effect before the applymap(getValue) cell runs, that cell would hold a bare
# value instead of a two-element entry and getValue's x[1] would fail on it.
df.loc[1][1] = df.loc[1][1][1]

In [8]:
# Replace every two-element cell with just its second element (the score or
# the runtime), applying getValue element-wise.
df = df.applymap(func=getValue)

In [9]:
# Prepend the run number as the first column.  The range is sized from the
# frame itself so this cell stays correct if the number of runs changes.
df.insert(0, 'iteration', range(len(df)))

df

Unnamed: 0,iteration,ant,camel,ivy,jedit,log4j,lucene,poi,velocity,xalan,xerces,runtime
0,0,0,0,0,0,0,0.533795,0.621248,0.489207,0.568558,0.0,34.4364
1,1,0,0,0,0,0,0.535355,0.618182,0.496164,0.565802,0.0,35.0124
2,2,0,0,0,0,0,0.538684,0.616309,0.48609,0.571627,0.0,34.2364
3,3,0,0,0,0,0,0.537278,0.61527,0.489773,0.567695,0.0,34.3748
4,4,0,0,0,0,0,0.532396,0.617082,0.490579,0.566488,0.0,34.3366
5,5,0,0,0,0,0,0.538858,0.617907,0.485301,0.563686,0.0,34.2548
6,6,0,0,0,0,0,0.535549,0.608676,0.485278,0.56184,0.0,34.442
7,7,0,0,0,0,0,0.532231,0.612124,0.487443,0.568558,0.0,34.3086
8,8,0,0,0,0,0,0.532231,0.613543,0.487186,0.568558,0.420076,34.241
9,9,0,0,0,0,0,0.526036,0.616143,0.484929,0.571986,0.0,34.9982


In [10]:
# Write the table without the index column.  The path is relative to the
# current working directory at the time this cell runs.
output_csv = "..//projectEliminationResults.csv"
df.to_csv(output_csv, index=False)