### Using Hoeffding sampling to reduce the data required for training the classifier

In [1]:
import os, random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

random.seed(10)


  from numpy.core.umath_tests import inner1d


In [2]:
os.getcwd()
data_path = "/defects/src/data/Jureczko/collated_data/"
os.chdir(os.getcwd() + data_path)

In [3]:
projList = os.listdir()
print("Files:",projList)
projs = [p.split('_')[0] for p in projList]
print("List of projects:",projs)

Files: ['ant_merged.csv', 'camel_merged.csv', 'ivy_merged.csv', 'jedit_merged.csv', 'log4j_merged.csv', 'lucene_merged.csv', 'poi_merged.csv', 'velocity_merged.csv', 'xalan_merged.csv', 'xerces_merged.csv']
List of projects: ['ant', 'camel', 'ivy', 'jedit', 'log4j', 'lucene', 'poi', 'velocity', 'xalan', 'xerces']


### Reading the sampling percentages obtained from Hoeffding bounds

In [4]:
df_n = pd.read_csv("..//samplingResults_n.csv")
fractions = []
for i in range(10):
    for j in range(10):
        fractions.append(df_n.iloc[i][j+1]/100)
len(fractions)
df_n

Unnamed: 0,projects,ant,camel,ivy,jedit,log4j,lucene,poi,velocity,xalan,xerces
0,ant,0.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0
1,camel,5.0,0.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0
2,ivy,5.0,6.0,0.0,7.0,8.0,9.0,10.0,11.5,12.0,12.5
3,jedit,5.0,6.0,7.0,0.0,8.0,9.0,10.0,8.0,12.5,12.0
4,log4j,5.0,6.0,7.0,8.0,0.0,9.0,10.0,11.0,12.0,13.0
5,lucene,5.0,6.0,7.0,8.0,9.0,0.0,10.0,11.0,12.0,13.0
6,poi,5.0,6.0,7.0,8.0,9.0,10.0,0.0,11.0,12.0,13.0
7,velocity,5.0,6.0,7.0,9.0,7.0,10.0,11.0,0.0,12.0,13.0
8,xalan,5.0,6.0,7.0,8.0,8.5,10.0,11.0,12.0,0.0,13.0
9,xerces,5.0,7.5,7.0,8.0,8.0,9.5,11.0,12.0,13.0,0.0


### Bellwether discovery using Hoeffding bounds and calculating the runtime

In [5]:
import time

def getRuntimes(randomSeed):
    random.seed(randomSeed)
    #tbl = [[0]*len(projs) for p in projs]
    count=-1
    startTime = time.time()
    for i in range(len(projs)):
        #print("\nIteration ",i)
        trainData = pd.read_csv(projList[i])

        for j in range(len(projs)):
            count+=1
            if (i != j):
                #print("Here: ",float(df_n.iloc[i][j+1]/100))
                trainData.sample(frac=fractions[count])
                X_train = trainData.loc[:,trainData.columns!='$<bug']
                y_train = trainData.loc[:,trainData.columns=='$<bug']

                testData = pd.read_csv(projList[j])
                X_test = testData.loc[:, testData.columns!='$<bug']
                y_test = testData.loc[:, testData.columns=='$<bug']
                clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
                clf.fit(X_train, y_train.values.ravel())
                y_pred = clf.predict(X_test)
                cm = confusion_matrix(y_test, y_pred)
                recall = cm[1][1]/(cm[1][1] + cm[1][0])
                pf = cm[0][1]/(cm[0][1] + cm[0][0])

                g = 2/((1/recall) + (1/(1-pf)))

                #print("Model trained on",projs[i],", testing on",projs[j])
                #acc = metrics.accuracy_score(y_test, y_pred)
                #print("Project: {}, Accuracy: {}, Precision: {}".format(projs[j],acc,
                #                                                       metrics.precision_score(y_test, y_pred)))
                tbl[i][j]=g
                #print("Test Project: {}, G-Score {}".format(projs[j], g))
    print(tbl)
    return (time.time() - startTime)

In [None]:
# repeating the process for 30 times
runtimes = []
for i in range(30):
    print(i)
    runtimes.append(getRuntimes(i))

In [7]:
runtimes

[215.2888000011444,
 213.28339982032776,
 213.3722002506256,
 212.66468214988708,
 213.51543951034546,
 225.45748281478882,
 213.79726576805115,
 212.6216790676117,
 277.16276264190674,
 217.06530475616455,
 212.72070050239563,
 212.80808901786804,
 216.19798278808594,
 212.26198053359985,
 212.39993262290955,
 212.60441613197327,
 213.21540451049805,
 212.7717170715332,
 213.1981496810913,
 212.75493574142456,
 212.8162760734558,
 218.50480365753174,
 218.9806010723114,
 212.95004272460938,
 213.10359978675842,
 212.50562191009521,
 212.88203763961792,
 212.26364159584045,
 212.54587292671204,
 212.3671998977661]

In [40]:
df = pd.DataFrame(tbl)
df.columns=projs
df.insert(0,'projects',projs)
print(df)
#print(os.getcwd())
df.to_csv("..//post_sampling_gscore.csv",index=False)

   projects       ant     camel       ivy     jedit     log4j    lucene  \
0       ant  0.000000  0.101226  0.385596  0.582279  0.109059  0.182377   
1     camel  0.398118  0.000000  0.375421  0.505455  0.242442  0.228678   
2       ivy  0.209302  0.034955  0.000000  0.276655  0.037732  0.123990   
3     jedit  0.157744  0.038393  0.111026  0.000000  0.007663  0.009091   
4     log4j  0.257380  0.338042  0.227812  0.236897  0.000000  0.355090   
5    lucene  0.462258  0.516806  0.533795  0.483632  0.637914  0.000000   
6       poi  0.645508  0.570165  0.649033  0.617694  0.621242  0.624541   
7  velocity  0.550314  0.502988  0.458872  0.447936  0.512429  0.454270   
8     xalan  0.602899  0.526973  0.587643  0.466730  0.583352  0.578249   
9    xerces  0.414248  0.349195  0.429603  0.392750  0.526491  0.525201   

        poi  velocity     xalan    xerces  
0  0.139191  0.093391  0.254520  0.217013  
1  0.202408  0.177863  0.259373  0.246078  
2  0.070911  0.047851  0.147351  0.162521 