######### This script is used by the gif-helper script to generate a series
######### of single-image plots, which can be assembled into a gif showing the
######### training evolution of our BDT
# Standard imports
import numpy as np
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import ROOT
import time
# sklearn
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.datasets import make_gaussian_quantiles, make_classification, make_blobs
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
# TTXPheno
from TTXPheno.Tools.user import plot_directory, tmp_directory
#Kullback-Leibler divergence
from criterion import KullbackLeibler
#start the timer
start = time.time()
#Arguments
import argparse
argParser = argparse.ArgumentParser(description = "Argument Parser")
argParser.add_argument('--logLevel', action='store', default='INFO', nargs='?', choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'TRACE', 'NOTSET'], help="Log level for logging")
argParser.add_argument('--small', action='store_true', help='Use the small dataset')
argParser.add_argument('--data', action='store', default='data.h5', help='Name of the input data file')
argParser.add_argument('--data_version', action='store',default='v2',help='Version of the data to be used')
argParser.add_argument('--log_plot', action='store_true',help='Use a logarithmic plot')
argParser.add_argument('--save', action='store_true', help='Write the trained BDT to a file')
argParser.add_argument('--criterion', action='store', default='kule', nargs='?', choices=['gini', 'kule', 'entropy'] , help="Select the Criterion to be used")
argParser.add_argument('--export', action='store_true', help="Export the trained tree as graphviz dot")
argParser.add_argument('--no_plot', action='store_true', help="Don't generate a plot")
argParser.add_argument('--max_depth', action='store', default=2, type=int,nargs='?', help="The max depth argument, which is given to the DecisionTreeClassifier")
argParser.add_argument('--n_est', action='store', default=100, type=int,nargs='?', help="The n_estimators argument, which is given to the AdaBoostClassifier")
argParser.add_argument('--boost_algorithm', action='store', default='SAMME', nargs='?', choices=['SAMME', 'SAMME.R'], help="Choose a boosting algorithm for the AdaBoostClassifier")
argParser.add_argument('--swap_hypothesis', action='store_true', help="Swap the target labels of the SM and BSM hypotheses")
argParser.add_argument('--random_state', action='store', default=0, type=int,nargs='?', help="The random state, which is given to the train_test_split method")
argParser.add_argument('--ptz_only', action='store_true', help='Only use the pTZ feature for training')
args = argParser.parse_args()
#Set the version of the script
vversion = 'v1'
#Logger
import RootTools.core.logger as Logger
logger = Logger.get_logger(args.logLevel, logFile = None)
#Kule algorithm
from kullback_leibler_divergence_criterion import KullbackLeiblerCriterion
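# the (1, [2]) constructor arguments follow sklearn's internal ClassificationCriterion
# interface: one output and two classes (SM vs. BSM)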
kldc = KullbackLeiblerCriterion(1, np.array([2], dtype='int64'))
#setting up the file save name
version = vversion
version += '_' + args.criterion
if args.small:
    args.data_version += '_small'
    version += '_small'
if args.log_plot:
    version += '_log'
if args.swap_hypothesis:
    version += '_swap'
if args.ptz_only:
    version += '_ptzonly'
version += '_maxDepth' + str(args.max_depth) + '_n_est' + str(args.n_est) + '_BoostAlg' + str(args.boost_algorithm) + '_RandState' + str(args.random_state)
#find directory
input_directory = os.path.join(tmp_directory, args.data_version)
logger.debug('Import data from %s', input_directory)
#read data from file
df = pd.read_hdf(os.path.join(input_directory, args.data))
if args.ptz_only:
    X1 = np.array(df['genZ_pt/F'])
else:
    X1 = np.array(df[['genZ_pt/F','genZ_cosThetaStar/F']])
X = np.concatenate((X1,X1))
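#every event enters twice, once under the SM and once under the BSM hypothesis;
#the two copies differ only in their target label and event weight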
#generate targets
y0 = np.zeros(len(X1))
y1 = np.ones(len(X1))
y = np.concatenate((y0,y1))
#read the weights and normalize them (earlier versions scaled by the mean instead)
w0 = np.array(df['sm_weight'])
sm_weight_sum = np.sum(w0)
w0 /= sm_weight_sum
w1 = np.array(df['bsm_weight'])
bsm_weight_sum = np.sum(w1)
w1 /= bsm_weight_sum
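#after normalization each hypothesis carries unit total weight, so SM and BSM contribute equally to the training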
if args.swap_hypothesis:
    w = np.concatenate((w1,w0))
else:
    w = np.concatenate((w0,w1))
logger.info('Sum of sm_weights: %f, Sum of bsm_weights: %f',sm_weight_sum, bsm_weight_sum )
#split the data into training and validation sets
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.5, random_state=args.random_state)
#calculate the per-hypothesis sums of weights via boolean masks on the labels
w_train_sum_sm = np.sum(w_train[y_train == 0])
w_train_sum_bsm = np.sum(w_train[y_train == 1])
w_test_sum_sm = np.sum(w_test[y_test == 0])
w_test_sum_bsm = np.sum(w_test[y_test == 1])
logger.info('Yields, Training SM: %f, Training BSM: %f, Testing SM: %f, Testing BSM %f', w_train_sum_sm, w_train_sum_bsm, w_test_sum_sm, w_test_sum_bsm)
#Create the tree
if args.criterion == 'kule':
    dt = DecisionTreeClassifier(max_depth=args.max_depth, criterion=kldc)
elif args.criterion == 'gini':
    dt = DecisionTreeClassifier(max_depth=args.max_depth, criterion='gini')
elif args.criterion == 'entropy':
    dt = DecisionTreeClassifier(max_depth=args.max_depth, criterion='entropy')
else:
    assert False, "Unknown criterion: %s" % args.criterion
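# note: passing a Criterion instance (as for 'kule') works with the sklearn versions this
# script targets, although it is not part of the documented API; 'gini' and 'entropy'
# select the built-in criteria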
#reshape the data when only the pTZ feature is used (sklearn expects a 2D feature array)
if args.ptz_only:
    X_train = np.reshape(X_train, (-1,1))
    X_test = np.reshape(X_test, (-1,1))
#train the tree
bdt = AdaBoostClassifier(dt, algorithm = args.boost_algorithm, n_estimators=args.n_est)
bdt.fit(X_train, y_train, w_train)
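#w_train is passed positionally as the sample_weight argument of AdaBoostClassifier.fit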
ende_training = time.time()
logger.info('Time to train the tree ' + '{:5.3f}s'.format(ende_training-start))
#get the directory for data
output_dir = os.path.join(tmp_directory, args.data_version)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
#save the trained BDT to file
if args.save:
    logger.info("Save the trained BDT to %s, it uses the %s criterion", output_dir, args.criterion)
    joblib.dump(bdt, os.path.join(output_dir, "bdt-trainingdata-" + version))
    logger.info("Dumped the tree to %s", os.path.join(output_dir, "bdt-trainingdata-" + version))
#export the first boosted tree as graphviz dot; export_graphviz works on a single
#DecisionTreeClassifier, not on the AdaBoost ensemble (the cause of the original TODO)
if args.export:
    export_graphviz(bdt.estimators_[0], out_file=os.path.join(output_dir, version + "-tree.dot"))
    logger.info("Exported the tree as .dot to %s", os.path.join(output_dir, version + "-tree.dot"))
#exit early if the no_plot argument was chosen
if args.no_plot:
    raise SystemExit
#get the output directory for plots
output_directory = os.path.join(plot_directory,'Kullback-Leibler-Plots', argParser.prog.split('.')[0], vversion)
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
logger.info('Save to %s directory', output_directory)
#pyplot settings
class_names = ["SM Test", "BSM Test" , "SM Train", "BSM Train"]
plot_colors = ["#000cff","#ff0000", "#9ba0ff" , "#ff8d8d"]
plt.figure(figsize=(18,8)).suptitle("Decision boundaries for test (top) and training (bottom) datasets \n n: " + str(args.n_est), fontsize=18)
plot_step = 0.075
#evaluate the decision functions; they are reused below for the discriminator histograms
test_decision_function = bdt.decision_function(X_test)
train_decision_function = bdt.decision_function(X_train)
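#for the default SAMME algorithm with two classes the decision function lies in [-1, 1],
#matching the range on which the discriminator histograms are booked below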
#show the decision shape for the test data
plt.subplot(2,1,1)
if args.ptz_only:
    #generate the 1D grid on which the cut values are evaluated
    x_min, x_max = X_test.min() - 1, X_test.max() + 1
    xx = np.arange(x_min, x_max, plot_step)
    xx = np.reshape(xx, (-1,1))
    #map the decision function
    Z = bdt.decision_function(xx)
    Z = np.reshape(Z, (-1,1))
    #get the limits and plot the function
    y_min, y_max = Z.min() - 1, Z.max() + 1
    plt.plot(xx, Z)
else:
    #generate the 2D grid on which the decision function is evaluated
    x_min, x_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
    y_min, y_max = X_test[:, 1].min() - 0.1, X_test[:, 1].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step))
    #map the decision function
    Z = bdt.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    #fill the plot with color and draw the contours
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.coolwarm)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm)
#further plot settings
plt.axis("tight")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
if args.ptz_only:
    plt.xlabel("p_T(Z) (GeV)")
    plt.ylabel("Decision Function")
else:
    plt.xlabel('p_T(Z) (GeV)')
    plt.ylabel('Cos(Theta*)')
#plot the decision boundaries and generate the decision data for the training data
plt.subplot(2,1,2)
if args.ptz_only:
    #generate the 1D grid on which the cut values are evaluated
    x_min, x_max = X_train.min() - 1, X_train.max() + 1
    xx = np.arange(x_min, x_max, plot_step)
    xx = np.reshape(xx, (-1,1))
    #map the decision function
    Z = bdt.decision_function(xx)
    Z = np.reshape(Z, (-1,1))
    #get the limits and plot the function
    y_min, y_max = Z.min() - 1, Z.max() + 1
    plt.plot(xx, Z)
else:
    #generate the 2D grid on which the decision function is evaluated
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 0.1, X_train[:, 1].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step))
    #map the decision function (bdt.predict would give hard class labels instead)
    Z = bdt.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    #fill the plot with color and draw the contours
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.coolwarm)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm)
#further plot settings
plt.axis("tight")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
if args.ptz_only:
    plt.xlabel("p_T(Z) (GeV)")
    plt.ylabel("Decision Function")
else:
    plt.xlabel('p_T(Z) (GeV)')
    plt.ylabel('Cos(Theta*)')
#build a zero-padded file name from the number of estimators and save the matplotlib plot
number = '%03d' % args.n_est
plot_subdir = os.path.join(output_directory, args.criterion)
if not os.path.exists(plot_subdir):
    os.makedirs(plot_subdir)
plt.savefig(os.path.join(plot_subdir, number + '.png'))
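#one frame is written per n_estimators value; the gif-helper script assembles the zero-padded PNGs into the training-evolution gif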
#enable per-bin sum-of-weights-squared error tracking; this must be set before the histograms are booked
ROOT.TH1.SetDefaultSumw2()
#set up the histograms
h_dis_train_SM = ROOT.TH1D("dis_train_sm", "Discriminator", 12, -1, 1)
h_dis_train_BSM = ROOT.TH1D("dis_train_bsm", "Discriminator", 12, -1, 1)
h_dis_test_SM = ROOT.TH1D("dis_test_sm", "Discriminator", 12, -1, 1)
h_dis_test_BSM = ROOT.TH1D("dis_test_bsm", "Discriminator", 12, -1, 1)
#set colors
h_dis_train_SM.SetLineColor(ROOT.kBlue+2)
h_dis_train_BSM.SetLineColor(ROOT.kRed+2)
h_dis_test_SM.SetLineColor(ROOT.kBlue-5)
h_dis_test_BSM.SetLineColor(ROOT.kRed-4)
#set line width
h_dis_train_SM.SetLineWidth(3)
h_dis_train_BSM.SetLineWidth(3)
h_dis_test_SM.SetLineWidth(3)
h_dis_test_BSM.SetLineWidth(3)
#set line style
h_dis_train_SM.SetLineStyle(7)
h_dis_train_BSM.SetLineStyle(7)
h_dis_test_SM.SetLineStyle(1)
h_dis_test_BSM.SetLineStyle(1)
#set colors
h_dis_train_SM.SetFillColor(ROOT.kBlue+3)
h_dis_train_BSM.SetFillColor(ROOT.kRed+3)
h_dis_test_SM.SetFillColor(ROOT.kBlue-8)
h_dis_test_BSM.SetFillColor(ROOT.kRed-9)
#set fill styles
h_dis_train_SM.SetFillStyle(0)
h_dis_train_BSM.SetFillStyle(0)
h_dis_test_SM.SetFillStyle(0)
h_dis_test_BSM.SetFillStyle(0)
#hide statbox
ROOT.gStyle.SetOptStat(0)
logger.info('Time elapsed before the fill loop: ' + '{:5.3f}s'.format(time.time()-start))
#loop over the feature vector to fill the histograms (test data)
for i in xrange(len(X_test)):
    if y_test[i] == 0:
        h_dis_test_SM.Fill(test_decision_function[i], w_test[i])
    else:
        h_dis_test_BSM.Fill(test_decision_function[i], w_test[i])
#fill with training data
for i in xrange(len(X_train)):
    if y_train[i] == 0:
        h_dis_train_SM.Fill(train_decision_function[i], w_train[i])
    else:
        h_dis_train_BSM.Fill(train_decision_function[i], w_train[i])
logger.info('Time elapsed after the fill loop: ' + '{:5.3f}s'.format(time.time()-start))
#keep the pre-fill weight sums for comparison (only for DEBUGGING)
w_delta_test_sum_sm = w_test_sum_sm
w_delta_test_sum_bsm = w_test_sum_bsm
w_delta_train_sum_sm = w_train_sum_sm
w_delta_train_sum_bsm = w_train_sum_bsm
#calculate the yields after fitting
w_train_sum_sm = h_dis_train_SM.Integral()
w_train_sum_bsm = h_dis_train_BSM.Integral()
w_test_sum_sm = h_dis_test_SM.Integral()
w_test_sum_bsm = h_dis_test_BSM.Integral()
#calculate the delta weights
w_delta_test_sum_sm -= w_test_sum_sm
w_delta_test_sum_bsm -= w_test_sum_bsm
w_delta_train_sum_sm -= w_train_sum_sm
w_delta_train_sum_bsm -= w_train_sum_bsm
logger.info("Yields after fitting: Training SM %f, Train BSM %f, Testing SM %f, Testing BSM %f",w_train_sum_sm, w_train_sum_bsm, w_test_sum_sm, w_test_sum_bsm )
logger.info("Difference of the weights before and after fitting: Test SM %f, Test BSM %f, Train SM %f, Train BSM %f", w_delta_test_sum_sm, w_delta_test_sum_bsm, w_delta_train_sum_sm, w_delta_train_sum_bsm)
#normalize the histograms
h_dis_train_SM.Scale(1/w_train_sum_sm)
h_dis_train_BSM.Scale(1/w_train_sum_bsm)
h_dis_test_SM.Scale(1/w_test_sum_sm)
h_dis_test_BSM.Scale(1/w_test_sum_bsm)
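#each histogram now integrates to unity, as required for interpreting it as a probability distribution in the KL divergence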
#compute the Kullback-Leibler divergence
kl = KullbackLeibler(logger)
kule_test, error_test = kl.kule_div(h_dis_test_SM, h_dis_test_BSM)
kule_train, error_train = kl.kule_div(h_dis_train_SM, h_dis_train_BSM)
logger.info('Kullback-Leibler divergence:\nTraining: %f and error: %f \nTesting: %f and error: %f', kule_train, error_train, kule_test, error_test)
#plot the histograms
c = ROOT.TCanvas("Discriminator", "", 2880, 1620)
#get the maximum bin content and add 10% headroom before drawing
max_value_hist = max(h_dis_train_SM.GetMaximum(), h_dis_train_BSM.GetMaximum(), h_dis_test_SM.GetMaximum(), h_dis_test_BSM.GetMaximum())
max_value_margin = max_value_hist / 10.0
h_dis_train_SM.SetMaximum(max_value_hist + max_value_margin)
#draw all four histograms with error bars
h_dis_train_SM.Draw("h e")
h_dis_train_BSM.Draw("h same e")
h_dis_test_SM.Draw("h same e")
h_dis_test_BSM.Draw("h same e")
#add a legend
leg = ROOT.TLegend(0.65, 0.65, 0.9, 0.9)
leg.AddEntry(h_dis_train_SM,"SM Training")
leg.AddEntry(h_dis_train_BSM,"BSM Training")
leg.AddEntry(h_dis_test_SM,"SM Testing")
leg.AddEntry(h_dis_test_BSM,"BSM Testing")
leg.AddEntry(0, "Kul.div. test: " + str(kule_test) + " train: " + str(kule_train), "")
leg.AddEntry(0, "Kul.div. error test: " + str(error_test) + " train: " + str(error_train), "")
leg.Draw()
#label the axes
h_dis_train_SM.GetXaxis().SetTitle("Discriminator")
h_dis_train_SM.GetYaxis().SetTitle("Normalized number of events")
#Save to file
c.Print(os.path.join( output_directory, 'ttz-kule' + version + '.png'))
#stop the timer
ende = time.time()
logger.info('End of run, total runtime: ' + '{:5.3f}s'.format(ende-start))
#compute the KL divergence of the training histograms and print it
kule = KullbackLeibler(logger)
kd = kule.kule_div(h_dis_train_SM, h_dis_train_BSM)
print "Kule %f, deviation %f" % (kd[0], kd[1])