In [2]:
import os, random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
random.seed(10)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import math
import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


In [3]:
# Locate the collated Jureczko data directory relative to the directory the
# notebook was started from. os.path.join keeps the path valid on non-Windows
# systems; the trailing "" keeps the trailing separator, which later cells
# need because they build file names as `data_path + name`.
_data_subdir = os.path.join("defects", "src", "data", "Jureczko", "collated_data")
_start_dir = os.getcwd()
if os.path.normpath(_start_dir).endswith(_data_subdir):
    # The cell was re-run after the chdir below: already inside the data folder.
    data_path = os.path.join(_start_dir, "")
else:
    data_path = os.path.join(_start_dir, _data_subdir, "")
# Later cells use paths relative to the data directory (e.g. "../baseline_gscore.csv").
os.chdir(data_path)

In [4]:
# Keep only the per-project "<name>_merged.csv" files and sort them:
# os.listdir returns entries in arbitrary order and may include unrelated
# files, while later cells index projects by position.
all_files = sorted(name for name in os.listdir(data_path) if name.endswith("_merged.csv"))

In [5]:
# Show the file names found in the data directory.
all_files

['ant_merged.csv',
 'camel_merged.csv',
 'ivy_merged.csv',
 'jedit_merged.csv',
 'log4j_merged.csv',
 'lucene_merged.csv',
 'poi_merged.csv',
 'velocity_merged.csv',
 'xalan_merged.csv',
 'xerces_merged.csv']

In [6]:
# Project name = file name without the "_merged.csv" suffix. Stripping the
# suffix (instead of splitting on the first "_") keeps names that themselves
# contain an underscore intact, so `name + "_merged.csv"` maps back to the
# file it came from. Files without the suffix are ignored.
_merged_suffix = "_merged.csv"
projs = [name[:-len(_merged_suffix)] for name in all_files if name.endswith(_merged_suffix)]
projs

['ant',
 'camel',
 'ivy',
 'jedit',
 'log4j',
 'lucene',
 'poi',
 'velocity',
 'xalan',
 'xerces']

In [7]:
# Load the baseline table that sits one level above the data directory.
# The path is resolved from data_path so the cell no longer depends on the
# current working directory having been changed by an earlier cell.
baseGScore = pd.read_csv(os.path.join(data_path, os.pardir, 'baseline_gscore.csv'))
baseGScore

Unnamed: 0,projects,ant,camel,ivy,jedit,log4j,lucene,poi,velocity,xalan,xerces
0,ant,0.0,0.101224,0.395724,0.581984,0.109059,0.178559,0.131901,0.083459,0.257046,0.221758
1,camel,0.401611,0.0,0.342505,0.517597,0.254153,0.218113,0.220043,0.177863,0.244938,0.222907
2,ivy,0.22282,0.041797,0.0,0.28129,0.030303,0.091404,0.068281,0.063302,0.14351,0.144343
3,jedit,0.143112,0.051989,0.111026,0.0,0.0,0.004556,0.02514,0.016216,0.065318,0.024168
4,log4j,0.250786,0.338887,0.211888,0.23476,0.0,0.351325,0.30883,0.354055,0.382202,0.494279
5,lucene,0.468854,0.520821,0.537119,0.483182,0.64,0.0,0.565226,0.550163,0.467843,0.554659
6,poi,0.647817,0.561975,0.648688,0.610694,0.621242,0.634279,0.0,0.513945,0.544997,0.544308
7,velocity,0.55092,0.509103,0.451266,0.461221,0.518141,0.481423,0.422589,0.0,0.489754,0.537467
8,xalan,0.613948,0.534689,0.584074,0.470141,0.557994,0.579936,0.558144,0.53221,0.0,0.567847
9,xerces,0.42027,0.350387,0.421277,0.393106,0.540284,0.549475,0.47557,0.471927,0.396941,0.0


In [7]:
# Positional row 1 of the baseline table as a plain list, without its
# leading entry.
x = baseGScore.iloc[1].tolist()[1:]
x

[0.4016107966913365,
 0.0,
 0.3425047078599143,
 0.5175968716672591,
 0.2541528843008251,
 0.2181126729055591,
 0.22004324749361115,
 0.17786255901568984,
 0.2449380904187013,
 0.22290739616919175]

In [8]:
def hoeffdingRaces(x):
    """Run one racing pass over every (training project, test project) pair.

    For each training project the rows are shuffled with seed ``x`` and a
    random forest is refit on a growing prefix of the rows (5% .. 99%).
    After each refit the test projects that are not finished yet are scored
    with the G-score, i.e. the harmonic mean of recall and (1 - false-positive
    rate), and a running estimate is updated.  A test project is recorded and
    retired as soon as one of the stopping conditions in the inner loop holds.

    Parameters
    ----------
    x : int
        Seed passed as ``random_state`` to the shuffle and to the forest
        (anything pandas/scikit-learn accept as ``random_state`` should work).

    Returns
    -------
    list of [train, test, frac, g, g_estimate]
        One entry per retired pair; ``frac`` is the training percentage at
        which the pair was retired.

    Notes
    -----
    Reads the notebook globals ``projs`` and ``data_path``.
    """
    random.seed(7)

    results = []

    # random_state makes the forest reproducible: scikit-learn draws from
    # NumPy's RNG, which the random.seed() call above does not touch.
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=1, random_state=x)

    # NOTE(review): this running estimate is shared by all training projects
    # and never reset between them -- confirm the carry-over is intended.
    gScore_estimate = [0]*len(projs)

    label_col = '$<bug'

    # Read every project's CSV once; previously the test CSV was re-read for
    # every (fraction, test project) combination.
    project_data = {name: pd.read_csv(data_path + name + "_merged.csv") for name in projs}
    test_sets = {
        name: (frame.loc[:, frame.columns != label_col], frame[label_col])
        for name, frame in project_data.items()
    }

    # NOTE(review): the baseline G-scores (global baseGScore) were looked up
    # here on every iteration but never used; the dead lookup was dropped.
    # Confirm whether the stopping rule was meant to compare against them.

    n_targets = len(projs) - 1

    for i in range(len(projs)):
        # Shuffle the training project's rows.
        currData = project_data[projs[i]].sample(frac=1, random_state=x).reset_index(drop=True)
        N = len(currData)

        X = currData.loc[:, currData.columns != label_col]
        y = currData[label_col]

        itr = 0
        completed = []
        # Iteratively increase the training set.
        for frac in range(5, 100):
            if len(completed) >= n_targets:
                # Every other project is retired; further refits change nothing.
                break
            itr += 1

            # .loc slicing is label-based and inclusive: rows 0..cut are used.
            cut = int(frac*N/100)
            clf.fit(X.loc[:cut], y.loc[:cut])

            # Hoeffding-style bound with delta = 0.05, sized by the number of
            # rows not used for training.  With no such rows the bound is
            # vacuous (infinite) instead of raising ZeroDivisionError.
            held_out = int((1-(frac/100))*N)
            if held_out > 0:
                eps = math.sqrt(math.log(2/0.05)/(2*held_out))
            else:
                eps = float('inf')

            for j in range(len(projs)):
                if i == j or j in completed:
                    continue

                X_test, y_test = test_sets[projs[j]]
                y_pred = clf.predict(X_test)
                cm = confusion_matrix(y_test, y_pred)
                recall = cm[1][1]/(cm[1][1] + cm[1][0])
                pf = cm[0][1]/(cm[0][1] + cm[0][0])

                g = 2/((1/recall) + (1/(1-pf)))

                # NOTE(review): the mean divides by itr even when project j
                # was not scored in every earlier iteration (see the break
                # below) -- confirm this is the intended estimate.
                gScore_estimate[j] = (gScore_estimate[j]*(itr-1) + g)/(itr)

                # Retire j when the running estimate has reached the current
                # score, or when the current score exceeds it by more than eps.
                if gScore_estimate[j] >= g or (g - gScore_estimate[j]) > eps:
                    completed.append(j)
                    results.append([projs[i], projs[j], frac, g, gScore_estimate[j]])
                    # NOTE(review): at most one project retires per fraction.
                    break

    return (results)


In [20]:
# `results` only exists inside hoeffdingRaces, so on a fresh kernel this cell
# raised NameError.  Run a single pass explicitly before saving it.
# NOTE(review): seed 0 is an assumption (it equals the first seed of the
# 30-run loop in a later cell) -- confirm which run this file should hold.
results = hoeffdingRaces(0)
# Passing the column names to the constructor also works when no rows came back.
df = pd.DataFrame(results, columns=['train','test','n','g','g_est'])
df.to_csv("..//currentImplementationResults.csv",index=False)

In [None]:
# Repeat the race for seeds 0..29 and keep the rows of each run separately.
final_results = []
for seed in range(30):
    print("Iteration", seed)
    run_rows = hoeffdingRaces(seed)
    final_results.append(run_rows)
    print("Length =", len(final_results))

In [28]:
# Preview the rows of the first run.  final_results is indexed directly:
# wrapping it in pd.DataFrame first pads shorter runs with None, and those
# None entries then end up in the list handed to the second DataFrame call.
tempList = final_results[0]
df2 = pd.DataFrame(tempList)
df2

In [55]:
# Stack the rows of every run into one frame.
# Fixes: the previous loop read df1.loc[0] on every pass, so run 0 was
# repeated once per run and all other runs were lost; DataFrame.append
# (removed in pandas 2.0) is replaced by a single pd.concat.
result_columns = ['train','test','n','g','g_est']
run_frames = [pd.DataFrame(run_rows, columns=result_columns) for run_rows in final_results]
if run_frames:
    # As with the append-based loop, each run keeps its own 0..k-1 row index.
    results_df = pd.concat(run_frames)
else:
    results_df = pd.DataFrame(columns=result_columns)

In [56]:
# Show the combined per-run results.
results_df

Unnamed: 0,train,test,n,g,g_est
0,ant,camel,5,0.041768,0.041768
1,ant,ivy,6,0.332856,0.166428
2,ant,jedit,7,0.460787,0.153596
3,ant,log4j,8,0.081181,0.020295
4,ant,lucene,9,0.124152,0.024830
5,ant,poi,10,0.101903,0.016984
6,ant,velocity,11,0.103198,0.014743
7,ant,xalan,12,0.251892,0.031486
8,ant,xerces,13,0.160005,0.017778
9,camel,ant,5,0.221487,0.221487


In [57]:
# Add the rows stored in two extra CSV files (presumably results of earlier
# runs -- confirm) to results_df.
# The folder can be overridden through the EXTRA_RESULTS_DIR environment
# variable; the default is the original hard-coded location.
extra_results_dir = os.environ.get("EXTRA_RESULTS_DIR", "C://Users//USAKNAL//Desktop")
result_columns = ['train','test','n','g','g_est']
data1 = pd.read_csv(extra_results_dir + "/result.csv")
data1.columns = result_columns
data2 = pd.read_csv(extra_results_dir + "/789_new.csv")
data2.columns = result_columns
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows to results_df again.
results_df = pd.concat([results_df, data1, data2])

In [61]:
# Save the combined results one level above the current directory.
# os.path.join picks the platform's separator; the hard-coded "..\\" form
# only resolved to the parent directory on Windows.
results_df.to_csv(os.path.join(os.pardir, "samplingResults.csv"), index=False)