In [2]:
import os, random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
random.seed(10)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import math
import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


In [3]:
# os.getcwd()
data_path = os.getcwd()+"\\defects\\src\\data\\Jureczko\\collated_data\\"
os.chdir(data_path)

In [4]:
all_files = os.listdir(data_path)

In [5]:
all_files

['ant_merged.csv',
 'camel_merged.csv',
 'ivy_merged.csv',
 'jedit_merged.csv',
 'log4j_merged.csv',
 'lucene_merged.csv',
 'poi_merged.csv',
 'velocity_merged.csv',
 'xalan_merged.csv',
 'xerces_merged.csv']

In [6]:
projs = [x.split('_')[0] for x in all_files]
projs

['ant',
 'camel',
 'ivy',
 'jedit',
 'log4j',
 'lucene',
 'poi',
 'velocity',
 'xalan',
 'xerces']

In [7]:
baseGScore = pd.read_csv('../baseline_gscore.csv')
baseGScore

Unnamed: 0,projects,ant,camel,ivy,jedit,log4j,lucene,poi,velocity,xalan,xerces
0,ant,0.0,0.101224,0.395724,0.581984,0.109059,0.178559,0.131901,0.083459,0.257046,0.221758
1,camel,0.401611,0.0,0.342505,0.517597,0.254153,0.218113,0.220043,0.177863,0.244938,0.222907
2,ivy,0.22282,0.041797,0.0,0.28129,0.030303,0.091404,0.068281,0.063302,0.14351,0.144343
3,jedit,0.143112,0.051989,0.111026,0.0,0.0,0.004556,0.02514,0.016216,0.065318,0.024168
4,log4j,0.250786,0.338887,0.211888,0.23476,0.0,0.351325,0.30883,0.354055,0.382202,0.494279
5,lucene,0.468854,0.520821,0.537119,0.483182,0.64,0.0,0.565226,0.550163,0.467843,0.554659
6,poi,0.647817,0.561975,0.648688,0.610694,0.621242,0.634279,0.0,0.513945,0.544997,0.544308
7,velocity,0.55092,0.509103,0.451266,0.461221,0.518141,0.481423,0.422589,0.0,0.489754,0.537467
8,xalan,0.613948,0.534689,0.584074,0.470141,0.557994,0.579936,0.558144,0.53221,0.0,0.567847
9,xerces,0.42027,0.350387,0.421277,0.393106,0.540284,0.549475,0.47557,0.471927,0.396941,0.0


In [7]:
x = list(baseGScore.iloc[1])
del x[0]
x

[0.4016107966913365,
 0.0,
 0.3425047078599143,
 0.5175968716672591,
 0.2541528843008251,
 0.2181126729055591,
 0.22004324749361115,
 0.17786255901568984,
 0.2449380904187013,
 0.22290739616919175]

In [8]:
def hoeffdingRaces(x): 
    #training on each project
    random.seed(7)

    results = []

    clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
    gScore_estimate = [0]*len(projs)

    for i in range(len(projs)):
        #print("\nTraining on project",projs[i])
        currData = pd.read_csv(data_path + projs[i]+"_merged.csv")
        #shuffling the dataset
        currData = currData.sample(frac=1,random_state=x).reset_index(drop=True)
        N = len(currData)

        X = currData.loc[:,currData.columns!='$<bug']
        y = currData.loc[:,currData.columns == '$<bug'] 

        itr = 0
        completed = []
        #iteratively increasing the training set
        for frac in range(5,100):
            itr += 1
            X_train = X.loc[:int(frac*N/100)]
            y_train = y.loc[:int(frac*N/100)]

            clf.fit(X_train, y_train)

            base_gScore = list(baseGScore.iloc[i])
            del base_gScore[0]

            for j in range(len(projs)):
                if (i!=j and j not in completed):
                    #print(projs[i],projs[j],curr_acc[j])
                    testData = pd.read_csv(data_path + projs[j]+"_merged.csv")
                    #testData = testData.sample(frac=1)
                    X_test = testData.loc[:,testData.columns!='$<bug']
                    y_test = testData.loc[:,testData.columns == '$<bug'] 

                    y_pred = clf.predict(X_test)
                    cm = confusion_matrix(y_test, y_pred)
                    recall = cm[1][1]/(cm[1][1] + cm[1][0])
                    pf = cm[0][1]/(cm[0][1] + cm[0][0])

                    g = 2/((1/recall) + (1/(1-pf)))                

                    gScore_estimate[j] = (gScore_estimate[j]*(itr-1) + g)/(itr)

                    eps = math.sqrt(math.log(2/0.05)/(2*int((1-(frac/100))*N)))

                    if (gScore_estimate[j] >= g):
                        #print ("Surpassed accuracy at n =",int(frac*N/100),"for",projs[j],"at",itr)
                        #print ("G-Score",g)
                        completed.append(j)
                        results.append([projs[i], projs[j], frac, g, gScore_estimate[j]])
                        break

                    elif ((g - gScore_estimate[j]) > eps):
                        #print ("Error within hoeffding bounds at n =",int(frac*N/100),"for",projs[j],"at",itr)
                        completed.append(j)
                        #print ("G-Score",g)
                        results.append([projs[i], projs[j], frac, g, gScore_estimate[j]])
                        break
                else:
                    pass

    return (results)


In [20]:
df = pd.DataFrame(results)
df.columns=['train','test','n','g','g_est']
#print(df)
#print(os.getcwd())
df.to_csv("..//currentImplementationResults.csv",index=False)

In [26]:
final_results = []
for i in range(4):
    print ("Iteration",i)
    final_results.append(hoeffdingRaces(i))
    print("Length =",len(final_results))

Iteration 0
Length = 1
Iteration 1
Length = 2
Iteration 2
Length = 3
Iteration 3
Length = 4


[[['ant', 'camel', 5, 0.04176810411759287, 0.04176810411759287],
  ['ant', 'ivy', 6, 0.33285609352240336, 0.16642804676120168],
  ['ant', 'jedit', 7, 0.4607870406426275, 0.15359568021420916],
  ['ant', 'log4j', 8, 0.08118081180811808, 0.02029520295202952],
  ['ant', 'lucene', 9, 0.12415191648098542, 0.024830383296197085],
  ['ant', 'poi', 10, 0.10190266471019212, 0.016983777451698687],
  ['ant', 'velocity', 11, 0.1031975607849269, 0.014742508683560985],
  ['ant', 'xalan', 12, 0.25189176934396945, 0.03148647116799618],
  ['ant', 'xerces', 13, 0.1600048920473655, 0.017778321338596168],
  ['camel', 'ant', 5, 0.22148722995756764, 0.22148722995756764],
  ['camel', 'ivy', 6, 0.24742268041237112, 0.2069253635867864],
  ['camel', 'log4j', 7, 0.24651082648900122, 0.09570041079768675],
  ['camel', 'jedit', 8, 0.06379407864845568, 0.13877047664364653],
  ['camel', 'lucene', 9, 0.09946045814199456, 0.03975639826535658],
  ['camel', 'poi', 10, 0.07873341182946336, 0.027275383181326135],
  ['camel',

In [None]:
## Use this code to collate the results into a proper dataframe

In [28]:
df1 = pd.DataFrame(final_results)
tempList = list(df1.loc[0])
tempList
df2 = pd.DataFrame(tempList)
df2

In [55]:
df1 = pd.DataFrame(final_results)
results_df = pd.DataFrame()
for i in range(df1.shape[0]):
    tempList = list(df1.loc[0])
    tempList
    temp_df = pd.DataFrame(tempList)
    results_df = results_df.append(temp_df)

results_df.columns=['train','test','n','g','g_est']

In [56]:
results_df

Unnamed: 0,train,test,n,g,g_est
0,ant,camel,5,0.041768,0.041768
1,ant,ivy,6,0.332856,0.166428
2,ant,jedit,7,0.460787,0.153596
3,ant,log4j,8,0.081181,0.020295
4,ant,lucene,9,0.124152,0.024830
5,ant,poi,10,0.101903,0.016984
6,ant,velocity,11,0.103198,0.014743
7,ant,xalan,12,0.251892,0.031486
8,ant,xerces,13,0.160005,0.017778
9,camel,ant,5,0.221487,0.221487


In [57]:
data1 = pd.read_csv("C://Users//USAKNAL//Desktop/result.csv")
data1.columns=['train','test','n','g','g_est']
data2 = pd.read_csv("C://Users//USAKNAL//Desktop/789_new.csv")
data2.columns=['train','test','n','g','g_est']
results_df = results_df.append(data1)
results_df = results_df.append(data2)

In [61]:
results_df
results_df.to_csv("..\\samplingResults.csv", index=False)

In [None]:
res

In [60]:
os.getcwd()

'C:\\Users\\USAKNAL\\Desktop\\Bellwether-master\\defects\\src\\data\\Jureczko\\collated_data'

In [9]:
results = pd.read_csv("../samplingResults.csv")

In [10]:
results

Unnamed: 0,train,test,n,g,g_est
0,ant,camel,5,0.041768,0.041768
1,ant,ivy,6,0.332856,0.166428
2,ant,jedit,7,0.460787,0.153596
3,ant,log4j,8,0.081181,0.020295
4,ant,lucene,9,0.124152,0.024830
5,ant,poi,10,0.101903,0.016984
6,ant,velocity,11,0.103198,0.014743
7,ant,xalan,12,0.251892,0.031486
8,ant,xerces,13,0.160005,0.017778
9,camel,ant,5,0.221487,0.221487


In [11]:
medianResults = results.groupby(by=['train','test'])['g','n'].median()
medianResults

Unnamed: 0_level_0,Unnamed: 1_level_0,g,n
train,test,Unnamed: 2_level_1,Unnamed: 3_level_1
ant,camel,0.041768,5.0
ant,ivy,0.332856,6.0
ant,jedit,0.460787,7.0
ant,log4j,0.081181,8.0
ant,lucene,0.124152,9.0
ant,poi,0.119274,10.0
ant,velocity,0.105588,11.0
ant,xalan,0.240769,12.0
ant,xerces,0.160005,13.0
camel,ant,0.221487,5.0


In [12]:
medianResults.to_csv("..\\trial.csv",index=False)

In [13]:
medianResults['n']

train     test    
ant       camel        5.0
          ivy          6.0
          jedit        7.0
          log4j        8.0
          lucene       9.0
          poi         10.0
          velocity    11.0
          xalan       12.0
          xerces      13.0
camel     ant          5.0
          ivy          6.0
          jedit        7.0
          log4j        8.0
          lucene       9.0
          poi         10.0
          velocity    11.0
          xalan       12.0
          xerces      13.0
ivy       ant          5.0
          camel        6.0
          jedit        7.0
          log4j        8.0
          lucene       9.0
          poi         10.0
          velocity    11.5
          xalan       12.0
          xerces      12.5
jedit     ant          5.0
          camel        6.0
          ivy          7.0
                      ... 
poi       velocity    11.0
          xalan       12.0
          xerces      13.0
velocity  ant          5.0
          camel        6.0
         

In [26]:
list_n = list(medianResults['n'].values)
list_n

[5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.5,
 12.0,
 12.5,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 8.0,
 12.5,
 12.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 5.0,
 6.0,
 7.0,
 9.0,
 7.0,
 10.0,
 11.0,
 12.0,
 13.0,
 5.0,
 6.0,
 7.0,
 8.0,
 8.5,
 10.0,
 11.0,
 12.0,
 13.0,
 5.0,
 7.5,
 7.0,
 8.0,
 8.0,
 9.5,
 11.0,
 12.0,
 13.0]

In [29]:
nums = [[0 for x in range(10)] for y in range(10)] 
nums

k = 0
for i in range(10):
    for j in range(10):
        if (i!=j):
            nums[i][j] = list_n[k]
            k+=1
nums

[[0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0],
 [5.0, 0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0],
 [5.0, 6.0, 0, 7.0, 8.0, 9.0, 10.0, 11.5, 12.0, 12.5],
 [5.0, 6.0, 7.0, 0, 8.0, 9.0, 10.0, 8.0, 12.5, 12.0],
 [5.0, 6.0, 7.0, 8.0, 0, 9.0, 10.0, 11.0, 12.0, 13.0],
 [5.0, 6.0, 7.0, 8.0, 9.0, 0, 10.0, 11.0, 12.0, 13.0],
 [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 0, 11.0, 12.0, 13.0],
 [5.0, 6.0, 7.0, 9.0, 7.0, 10.0, 11.0, 0, 12.0, 13.0],
 [5.0, 6.0, 7.0, 8.0, 8.5, 10.0, 11.0, 12.0, 0, 13.0],
 [5.0, 7.5, 7.0, 8.0, 8.0, 9.5, 11.0, 12.0, 13.0, 0]]

In [34]:
df_nums = pd.DataFrame(nums,columns=projs)
df_nums.insert(0,'projects',projs)
df_nums.to_csv("..//samplingResults_n.csv",index=False)

In [15]:
ant_merged = pd.read_csv(data_path + "ant_merged.csv")

In [6]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)

In [7]:
acc_ant = [0.79,0.80,0.81,0.45,0.49,0.51,0.43,.50,0.62]

In [87]:
currProj = "ant"
all_files = os.listdir(data_path)
ant_merged = pd.read_csv(data_path + "ant_merged.csv")
acc_ant = [0.79,0.80,0.81,0.45,0.49,0.51,0.43,0.50,0.62]
avg_acc = [0]*len(all_files)
# ant_merged = ant_merged.sample(frac=1).reset_index(drop=True)

for i in range(int(0.05*len(ant_merged)),len(ant_merged)):

    print(i)
    if(len(all_files)<=1):
        break
    
    X = ant_merged.loc[0:i,ant_merged.columns!='$<bug']
    y = ant_merged.loc[0:i,ant_merged.columns == '$<bug'] 
    clf.fit(X,y)
    # Testing
    for j in range(1,len(all_files)):
        test = pd.read_csv(data_path+all_files[j])
        #test = pd.read_csv(data_path + all_files[3])
        X_test = test.loc[:,test.columns!='$<bug']
        y_test = test.loc[:,test.columns=='$<bug']
        y_pred = clf.predict(X_test)
        #print("Total 1s =" ,sum(y_pred))
        cm = confusion_matrix(y_test,y_pred)
        #print (cm)
        accuracy = accuracy_score(y_test, y_pred)
        error = 1 - accuracy
        
        avg_acc[j] = (avg_acc[j]*(i-1)+accuracy)/i
        if(i%100==0):
            print ("RAHKL: ",i,j)
        #print(error)
        #print(math.sqrt((math.log(2/0.05,10))/(2*(i+1))))
        #print(accuracy)
#         print("Acc1=",accuracy)
#         print("Acc2=",acc_ant[j-1])
#         print("Val1=",abs(acc_ant[j-1]-accuracy))
#         print("Val2=",math.sqrt(math.log(2/0.05,10))/(2*(len(ant_merged))))

        
        if (avg_acc[j] > acc_ant[j-1]): 
            print ("here1: \t i=",i)
            del all_files[j]
            del acc_ant[j-1]
            #print(all_files)
            break    
        elif (abs(acc_ant[j-1]-avg_acc[j]) <= math.sqrt((math.log(2/0.05,10)/(2*(len(ant_merged)))))):
            print ("here2: \t i =",i)
            del all_files[j]
            del acc_ant[j-1]
            #print(all_files)
            break
#         else:
#             i = i+5
        


84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
RAHKL:  100 1
RAHKL:  100 2
RAHKL:  100 3
RAHKL:  100 4
RAHKL:  100 5
RAHKL:  100 6
RAHKL:  100 7
RAHKL:  100 8
RAHKL:  100 9
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
RAHKL:  200 1
RAHKL:  200 2
RAHKL:  200 3
RAHKL:  200 4
RAHKL:  200 5
RAHKL:  200 6
RAHKL:  200 7
RAHKL:  200 8
RAHKL:  200 9
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274


1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
RAHKL:  1600 1
RAHKL:  1600 2
RAHKL:  1600 3
RAHKL:  1600 4
RAHKL:  1600 5
RAHKL:  1600 6
RAHKL:  1600 7
RAHKL:  1600 8
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691


In [89]:
currProj = "xerces"
all_files = os.listdir(data_path)
ant_merged = pd.read_csv(data_path + "xerces_merged.csv")
acc_ant = [0.79,0.80,0.81,0.45,0.49,0.51,0.43,0.50,0.62]
#new values
acc_ant = [0.40,0.35,0.38,0.36,0.60,0.63,0.55,0.58,0.51]
all_files = ['xerces_merged.csv','ant_merged.csv','camel_merged.csv','ivy_merged.csv','jedit_merged.csv','log4j_merged.csv','lucene_merged.csv',
             'poi_merged.csv','velocity_merged.csv','xalan_merged.csv']
avg_acc = [0]*len(all_files)
# ant_merged = ant_merged.sample(frac=1).reset_index(drop=True)

for i in range(int(0.05*len(ant_merged)),len(ant_merged)):

    print(i)
    if(len(all_files)<=1):
        break
    
    X = ant_merged.loc[0:i,ant_merged.columns!='$<bug']
    y = ant_merged.loc[0:i,ant_merged.columns == '$<bug'] 
    clf.fit(X,y)
    # Testing
    for j in range(1,len(all_files)):
        test = pd.read_csv(data_path+all_files[j])
        #test = pd.read_csv(data_path + all_files[3])
        X_test = test.loc[:,test.columns!='$<bug']
        y_test = test.loc[:,test.columns=='$<bug']
        y_pred = clf.predict(X_test)
        #print("Total 1s =" ,sum(y_pred))
        cm = confusion_matrix(y_test,y_pred)
        #print (cm)
        accuracy = accuracy_score(y_test, y_pred)
        error = 1 - accuracy
        
        avg_acc[j] = (avg_acc[j]*(i-1)+accuracy)/i
        if(i%100==0):
            print ("Check: ",i,j)
        #print(error)
        #print(math.sqrt((math.log(2/0.05,10))/(2*(i+1))))
        #print(accuracy)
#         print("Acc1=",accuracy)
#         print("Acc2=",acc_ant[j-1])
#         print("Val1=",abs(acc_ant[j-1]-accuracy))
#         print("Val2=",math.sqrt(math.log(2/0.05,10))/(2*(len(ant_merged))))

        
        if (avg_acc[j] > acc_ant[j-1]): 
            print ("here1: \t i=",i)
            del all_files[j]
            del acc_ant[j-1]
            #print(all_files)
            break    
        elif (abs(acc_ant[j-1]-avg_acc[j]) <= math.sqrt((math.log(2/0.05,10)/(2*(len(ant_merged)))))):
            print ("here2: \t i =",i)
            del all_files[j]
            del acc_ant[j-1]
            #print(all_files)
            break
#         else:
#             i = i+5
        


82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
Check:  100 1
Check:  100 2
Check:  100 3
Check:  100 4
Check:  100 5
Check:  100 6
Check:  100 7
Check:  100 8
Check:  100 9
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
here2: 	 i = 151
152
153
154
155
156
157
158
159
160
161
162
163
here2: 	 i = 163
164
165
166
167
168
169
170
171
172
here2: 	 i = 172
173
here1: 	 i= 173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
Check:  200 1
Check:  200 2
Check:  200 3
Check:  200 4
Check:  200 5
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270

In [91]:
#for xerces
all_files

['xerces_merged.csv',
 'log4j_merged.csv',
 'lucene_merged.csv',
 'poi_merged.csv',
 'velocity_merged.csv']

In [73]:
ant_merged = pd.read_csv(data_path + "ant_merged.csv")
ant_merged = ant_merged.sample(frac=1).reset_index(drop=True)

In [76]:
X = ant_merged.loc[0:87,ant_merged.columns!='$<bug']
X.shape

(88, 20)