In [29]:
import ROOT
ROOT.EnableImplicitMT(220)
import os
import import_ipynb
import setPath
from Input.OpenDataPandaFramework13TeV import *
#%jsroot on

In [2]:
indir = "/storage/shared/data/"
infile = '2lep_df_forML_bkg_signal_fromRDF.hdf5'
df = pd.read_hdf(indir+infile) 

In [3]:
df.category.unique()

array(['ZPrimett', 'Zjets', 'Diboson', 'Zjetsincl', 'dmV_Zll',
       'SUSYSlepSlep', 'ttbar', 'Wjets', 'SUSYC1C1', 'Higgs', 'topX',
       'SUSYC1N2', 'ZPrimemumu', 'GG_ttn1', 'RS_G_ZZ', 'Gmumu',
       'Wjetsincl', 'ZPrimeee', 'Gee', 'singleTop', 'TT_directTT'],
      dtype=object)

Select which BSM model we want to train our model on. See all possibilities above. 

In [4]:
susysig = df.loc[df['category'] == 'SUSYSlepSlep']

Define the SM background

In [5]:
bkgs = getBkgCategories()

###############################
#### Background categories ####
###############################
Category             N(samples)
-------------------------------
Diboson                      10
Higgs                        20
Wjets                        42
Wjetsincl                     6
Zjets                        42
Zjetsincl                     3
singleTop                     6
topX                          3
ttbar                         1


We choose to use the slices Zjets and Wjets samples so need to remove the inclusive ones

In [6]:
bkgs.remove('Wjetsincl')
bkgs.remove('Zjetsincl')

Check the backgrounds which we want to include in the training...

In [7]:
bkgs

['Diboson', 'Higgs', 'Wjets', 'Zjets', 'singleTop', 'topX', 'ttbar']

... and select them from the dataframe

In [8]:
i = 0
for b in bkgs:
    if i == 0:
        dfbkg = df.loc[df['category'] == b]
    else:
        dfbkg = pd.concat([dfbkg,df.loc[df['category'] == b]])
    i += 1

Some informatio about number of events and number of features

In [9]:
dfbkg.shape

(63642138, 38)

In [10]:
susysig.shape

(86383, 38)

In [11]:
susysig.insert(loc=1,column='isSignal',value=np.ones(susysig.shape[0]))

In [12]:
dfbkg.insert(loc=1,column='isSignal',value=np.zeros(dfbkg.shape[0]))

In [13]:
X = pd.concat([dfbkg,susysig])
y = X['isSignal'].to_numpy()

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

In [15]:
X_train.columns

Index(['njet20', 'isSignal', 'njet60', 'nbjet60', 'nbjet70', 'nbjet77',
       'nbjet85', 'isOS', 'isSF', 'mll', 'mt2', 'met_et', 'met_phi',
       'lep1_flav', 'lep1_pt', 'lep1_eta', 'lep1_phi', 'lep1_E',
       'lep1_ptcone30', 'lep1_etcone20', 'lep1_trackd0pvunbiased',
       'lep1_tracksigd0pvunbiased', 'lep1_isTightID', 'lep1_z0', 'lep2_flav',
       'lep2_pt', 'lep2_eta', 'lep2_phi', 'lep2_E', 'lep2_ptcone30',
       'lep2_etcone20', 'lep2_trackd0pvunbiased', 'lep2_tracksigd0pvunbiased',
       'lep2_isTightID', 'lep2_z0', 'channelNumber', 'costhstar', 'category',
       'physdescr'],
      dtype='object')

In [16]:
todrop = ['physdescr','category','channelNumber','isSignal','lep1_ptcone30','lep1_etcone20',
         'lep1_trackd0pvunbiased','lep1_tracksigd0pvunbiased','lep1_isTightID','lep2_etcone20',
         'lep2_trackd0pvunbiased','lep2_tracksigd0pvunbiased','lep2_isTightID']
X_train = X_train.drop(todrop,axis = 1)


In [22]:
X_test = X_test.drop(todrop,axis = 1)

In [17]:
import xgboost as xgb
xgbclassifier = xgb.XGBClassifier(
    max_depth=3, 
    n_estimators=120,
    learning_rate=0.1,
    n_jobs=4,
    use_label_encoder=False,
    #scale_pos_weight=sum_wbkg/sum_wsig,
    objective='binary:logistic')
    #missing=-999.0) 
xgbclassifier.fit(X_train, y_train) 



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=120, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='approx', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [20]:
xgbclassifier.save_model("mymodel.json")

In [27]:
import sys
!{sys.executable} -m pip install pyqt5

Defaulting to user installation because normal site-packages is not writeable
Collecting pyqt5
  Downloading PyQt5-5.15.9-cp37-abi3-manylinux_2_17_x86_64.whl (8.4 MB)
     |████████████████████████████████| 8.4 MB 5.6 MB/s            �█████▌                        | 2.0 MB 5.6 MB/s eta 0:00:02 eta 0:00:01
[?25hCollecting PyQt5-Qt5>=5.15.2
  Downloading PyQt5_Qt5-5.15.2-py3-none-manylinux2014_x86_64.whl (59.9 MB)
     |████████████████████████████████| 59.9 MB 71.2 MB/s            �▎                           | 7.9 MB 71.2 MB/s eta 0:00:01▊                          | 10.6 MB 71.2 MB/s eta 0:00:01��█████▋                     | 19.9 MB 71.2 MB/s eta 0:00:01��███████████▉                   | 24.1 MB 71.2 MB/s eta 0:00:01██████████████               | 31.8 MB 71.2 MB/s eta 0:00:01��██████████████████████████ | 57.9 MB 71.2 MB/s eta 0:00:01██▉| 59.6 MB 71.2 MB/s eta 0:00:01
[?25hCollecting PyQt5-sip<13,>=12.11
  Downloading PyQt5_sip-12.12.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_6

In [31]:
# Plot variable importance
import matplotlib.pyplot as plt
fig_size = plt.rcParams["figure.figsize"] 
ax = xgb.plot_importance(xgbclassifier)
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(30)
fig_size[0] = 20
fig_size[1] = 15
plt.rcParams["figure.figsize"] = fig_size
plt.show()
y_pred = xgbclassifier.predict(X_test)
y_pred_prob = xgbclassifier.predict_proba(X_test)
plt.savefig("myplot.png")

  # Remove the CWD from sys.path while we load stuff.


In [44]:
#  histogram of the ML outputs
n_bkg, bins_bkg, patches_bkg = plt.hist(y_pred_prob[:,1][y_test==0], 100,  facecolor='blue', alpha=0.2,label="Background")
n_sig, bins_sig, patches_sig = plt.hist(y_pred_prob[:,1][y_test==1], 100,  facecolor='red', alpha=0.2, label="Signal")
plt.xlabel('ML output')
plt.ylabel('Events')
plt.yscale('log')
plt.title('ML output, OpenData dataset, validation data')
plt.grid(True)
plt.legend()
#plt.show()
plt.savefig("mydist.png")

In [51]:
for i in range(len(n_bkg)):
    tot_bkg = n_bkg[i:].sum()
    tot_sig = n_sig[i:].sum()
    print("%.2f = %.2f, %.2f, %.2f"%(bins_bkg[i],tot_bkg,tot_sig,tot_sig/np.sqrt(tot_bkg)))

0.00 = 31821245.00, 43016.00, 7.63
0.01 = 264242.00, 37129.00, 72.23
0.02 = 143029.00, 35142.00, 92.92
0.03 = 99602.00, 33844.00, 107.24
0.04 = 76268.00, 32874.00, 119.04
0.05 = 62045.00, 32067.00, 128.74
0.06 = 52277.00, 31400.00, 137.33
0.07 = 44785.00, 30803.00, 145.55
0.08 = 39332.00, 30316.00, 152.86
0.09 = 35011.00, 29841.00, 159.48
0.10 = 31416.00, 29421.00, 165.99
0.11 = 28341.00, 29017.00, 172.36
0.12 = 25919.00, 28640.00, 177.90
0.13 = 23783.00, 28303.00, 183.53
0.14 = 21986.00, 28000.00, 188.84
0.15 = 20397.00, 27722.00, 194.11
0.16 = 18837.00, 27444.00, 199.96
0.17 = 17643.00, 27135.00, 204.29
0.18 = 16580.00, 26878.00, 208.74
0.19 = 15658.00, 26602.00, 212.59
0.20 = 14712.00, 26342.00, 217.18
0.21 = 13931.00, 26127.00, 221.36
0.22 = 13196.00, 25896.00, 225.43
0.23 = 12521.00, 25670.00, 229.41
0.24 = 11963.00, 25470.00, 232.87
0.25 = 11433.00, 25284.00, 236.46
0.26 = 10880.00, 25055.00, 240.20
0.27 = 10432.00, 24868.00, 243.48
0.28 = 9932.00, 24675.00, 247.59
0.29 = 9546.00