/
helpers.py
146 lines (110 loc) · 4.39 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""Helper functions."""
import math
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import auc
# Reverse indices into the [tp, fp, tn, fn, pre, rec, spec, fpr, npv, acc, f1]
# list built in get_score: e.g. all_metrics[-ACC] selects accuracy,
# all_metrics[-PRE] selects precision, counting from the end of the list.
PRE, REC, SPEC, FPR, NPV, ACC, F1 = 7, 6, 5, 4, 3, 2, 1
def save_obj(obj, path):
    """Serialize *obj* to *path* with pickle's highest protocol."""
    with open(path, 'wb') as sink:
        pickle.dump(obj, sink, pickle.HIGHEST_PROTOCOL)
def load_obj(path):
    """Deserialize and return the pickled object stored at *path*."""
    with open(path, 'rb') as source:
        return pickle.load(source)
def get_abcd(predict, truth):
    """Count confusion-matrix cells for *predict* against *truth*.

    A value > 0 counts as the positive label, == 0 as negative; any
    negative value in either sequence is left out of every cell.
    Returns the tuple (tp, fp, tn, fn).
    """
    tp = fp = tn = fn = 0
    for idx in range(len(predict)):
        pred, actual = predict[idx], truth[idx]
        if pred > 0:
            if actual > 0:
                tp += 1
            elif actual == 0:
                fp += 1
        elif pred == 0:
            if actual == 0:
                tn += 1
            elif actual > 0:
                fn += 1
    return tp, fp, tn, fn
def get_performance(metrics):
    """Derive the standard rates from a (tp, fp, tn, fn) tuple.

    Returns [precision, recall, specificity, fpr, npv, accuracy, f1],
    each rounded to 3 decimals.  Any ratio whose denominator is zero
    falls back to 0 (which makes fpr = 1 when specificity degenerates).
    """
    tp, fp, tn, fn = metrics

    def ratio(num, den):
        # Guarded division: return 0 instead of raising on an empty denominator.
        return 1.0 * num / den if den else 0

    pre = ratio(tp, tp + fp)
    rec = ratio(tp, tp + fn)
    spec = ratio(tn, tn + fp)
    fpr = 1 - spec
    npv = ratio(tn, tn + fn)
    acc = ratio(tp + tn, tp + tn + fp + fn)
    f1 = ratio(2.0 * tp, 2.0 * tp + fp + fn)
    return [round(v, 3) for v in (pre, rec, spec, fpr, npv, acc, f1)]
def get_score(criteria, metrics):
    """Score a (tp, fp, tn, fn) split under *criteria*; smaller is better.

    criteria: "Accuracy", "Dist2Heaven", "Gini", or anything else for
    Information Gain.  Every score is oriented (negated where needed) so
    that a lower value means a better split; rounded to 3 decimals.
    """
    tp, fp, tn, fn = metrics
    pre, rec, spec, fpr, npv, acc, f1 = get_performance([tp, fp, tn, fn])
    if criteria == "Accuracy":
        score = -acc  # negate: higher accuracy -> lower (better) score
    elif criteria == "Dist2Heaven":
        # Normalized Euclidean distance from the ideal corner (fpr=0, rec=1).
        score = math.sqrt(fpr ** 2 + (1 - rec) ** 2) / math.sqrt(2)
    elif criteria == "Gini":
        frac_pos_split = pre       # positives among predicted-positive rows
        frac_neg_split = 1 - npv   # positives among predicted-negative rows
        score = 1 - frac_neg_split ** 2 - frac_pos_split ** 2
    else:  # Information Gain
        positives, negatives = tp + fn, fp + tn
        total = positives + negatives
        p = 1.0 * positives / total if total > 0 else 0  # prior before the split

        def _entropy(q):
            # One term of the binary entropy; 0 * log2(0) is taken as 0.
            return -q * np.log2(q) if q != 0 else 0

        before = _entropy(p)
        after = p * _entropy(pre) + (1 - p) * _entropy(1 - npv)
        score = -(before - after)  # negate: larger gain -> lower (better) score
    return round(score, 3)
def subtotal(x):
    """Return the running cumulative sums of *x* as a list."""
    running = 0
    totals = []
    for value in x:
        running += value
        totals.append(running)
    return totals
def get_recall(true):
    """Cumulative recall curve over *true* (1 = positive), in given order.

    Returns a list where element i is the fraction of all positives that
    appear in true[:i+1]; all zeros when *true* has no positives.
    """
    # Fix: the original used `xrange`, which is Python 2 only and raises
    # NameError on Python 3; iterating the sequence directly works on both.
    total_true = float(sum(1 for label in true if label == 1))
    hit = 0.0
    recall = []
    for label in true:
        if label == 1:
            hit += 1
        recall.append(hit / total_true if total_true else 0.0)
    return recall
def cut_position(pos, neg, percentage=0):
    """Row counts marking the first *percentage* percent of *pos* and *neg*.

    Counts are based on the non-null entries of each frame's "bug" column.
    """
    pos_cut = int(pos["bug"].count() * percentage / 100)
    neg_cut = int(neg["bug"].count() * percentage / 100)
    return pos_cut, neg_cut
def divide_train_test(pos, neg, cut_pos, cut_neg):
    """Split *pos*/*neg* at the given row cuts into (train, test) frames.

    The head of each frame (up to its cut) goes to training, the tail to
    test; both results get a fresh 0..n-1 index.
    """
    train_parts = [pos.iloc[:cut_pos, :], neg.iloc[:cut_neg, :]]
    test_parts = [pos.iloc[cut_pos:, :], neg.iloc[cut_neg:, :]]
    data_train = pd.concat(train_parts, ignore_index=True)
    data_test = pd.concat(test_parts, ignore_index=True)
    return data_train, data_test
def split_two(corpus):
    """Partition *corpus* rows on the "bug" column.

    Returns {'pos': rows with bug == 1, 'neg': all other rows}.
    """
    is_pos = corpus['bug'] == 1
    return {'pos': corpus[is_pos], 'neg': corpus[~is_pos]}
def get_auc(data):
    """Negative AUC of the recall-vs-normalized-LOC curve; smaller is better.

    *data* is a DataFrame with "loc" and "bug" columns, assumed already
    ordered by the inspection policy under evaluation.  Returns 0 for a
    single-row input or when the AUC cannot be computed.
    """
    if len(data) == 1:
        return 0
    x_sum = float(sum(data['loc']))
    x = data['loc'].apply(lambda t: t / x_sum)
    xx = subtotal(x)
    yy = get_recall(data['bug'].values)
    try:
        return -round(auc(xx, yy), 3)
    except ValueError:
        # Fixes: `print "?"` is Python 2 syntax (a SyntaxError on Python 3),
        # and the bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        # sklearn's auc raises ValueError (e.g. non-monotonic x or too few
        # points); keep the original best-effort fallback for that case only.
        print("?")
        return 0
# NOTE(review): byte-for-byte duplicate of the `cut_position` defined earlier
# in this file; this later definition shadows the earlier one at import time.
# Candidate for removal.
def cut_position(pos, neg, percentage=0):
    # Row counts covering the first `percentage` percent of each frame,
    # based on the non-null count of the "bug" column.
    return int(pos["bug"].count() * percentage / 100), int(neg["bug"].count() * percentage / 100)
# NOTE(review): byte-for-byte duplicate of the `divide_train_test` defined
# earlier in this file; this later definition shadows the earlier one.
# Candidate for removal.
def divide_train_test(pos, neg, cut_pos, cut_neg):
    # Head of each frame (up to its cut) becomes training data, the tail test data.
    data_train = pd.concat([pos.iloc[:cut_pos,:], neg.iloc[:cut_neg,:]],ignore_index=True)
    data_test = pd.concat([pos.iloc[cut_pos:,:], neg.iloc[cut_neg:,:]], ignore_index=True)
    return data_train, data_test
# NOTE(review): byte-for-byte duplicate of the `split_two` defined earlier in
# this file; this later definition shadows the earlier one. Candidate for removal.
def split_two(corpus):
    # Rows with bug == 1 are the positives; every other row is negative.
    pos = corpus[corpus['bug']==1]
    neg = corpus[corpus['bug'] != 1]
    return {'pos': pos, 'neg': neg}