# Training for All Regions

In [1]:
# Region labels available for training. '5j' has no input file of its own:
# the training loop builds it by concatenating the 5j3b and 5jge4b tables.
regions=['5j','INC_ge6j3b','INC_ge6jge4b']
# Fold orientation flag: False -> "train2mod0", True -> "train2mod1" (train and
# test samples swapped). The training loop rebinds this name while iterating.
invertTestTrain=False
# H+ signal mass points (presumably in GeV -- TODO confirm units).
hpmasses=[200,225,250,275,300,350,400,500,600,700,800,900,1000,1200,1400,1600,1800,2000]

In [2]:
# Optional one-time setup, intentionally left commented out: remove previous
# inputs and download the per-region HDF5 tables read by the training loop.
#! rm -rf HplusML pandas_INC_*.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_ge6j3b.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_ge6jge4b.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_5j3b.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_5jge4b.h5

## Train BDTs and NNs

In [2]:
import HpTrainingFrame
import HpAlgorithms
import HpHyperParameterOptimisation
from joblib import dump
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
from HpMLWeightTransformer import MultiSBWeightsScaler
from HpMLFeatureNormalisation import WeightedStandardScaler
from HpMLPipeline import PipelineWithWeights
import HpKerasUtils
import numpy as np

def getCallbacks(model):
    """Return the default Keras callbacks used when fitting ``model``.

    The list contains, in order:
      1. an EarlyStopping on ``val_loss`` (patience 5, best weights restored);
      2. a ModelCheckpoint on ``val_loss`` that keeps only the best model.

    The checkpoint file name is derived from ``model.configuration``,
    ``model.dropout`` and ``model.l2threshold`` only, so two models sharing
    those three settings write to the same file.
    """
    stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Assemble the checkpoint path piece by piece from the model settings.
    name_parts = [
        'model_nn_', str(model.configuration),
        "_dropout", str(model.dropout),
        "_l2threshold", str(model.l2threshold),
        ".hdf5",
    ]
    saver = ModelCheckpoint(filepath="".join(name_parts),
                            monitor='val_loss',
                            save_best_only=True)

    return [stopper, saver]

def trainBDTandNN(region, hpmass, invertTestTrain, df_mc):
    """Train the single-mass-point NN for one region and persist its artefacts.

    The BDT training is currently disabled (kept below as comments); only the
    feed-forward NN is trained.

    Parameters
    ----------
    region : str
        Region label. Only used to build output file names; it is not
        forwarded to ``htf.prepare`` (which is called with ``region=None``).
    hpmass : int
        H+ mass point handed to ``htf.prepare`` and embedded in file names.
    invertTestTrain : bool
        If True, the train and test samples returned by ``htf.prepare`` are
        swapped and outputs are labelled "train2mod1" instead of "train2mod0".
    df_mc : pandas.DataFrame
        Input table wrapped by ``HpTrainingFrame``.

    Side effects
    ------------
    Writes under ``models/`` (created if missing):
      * ``standardNN_<mass>_<region>_<label>_pipe.joblib``       fitted pipeline
      * ``standardNN_<mass>_<region>_<label>_architecture.h5``   model JSON
      * ``standardNN_<mass>_<region>_<label>_weights.h5``        model weights
    plus the checkpoint file written by the callbacks from ``getCallbacks``.
    """
    import os  # local import so this cell does not depend on a separate import cell

    htf=HpTrainingFrame.HpTrainingFrame(df_mc)
    # The evaluation split returned by prepare() is not used here.
    X_train, X_test, _, y_train, y_test, _, w_train, w_test, _ = htf.prepare(hpmass=hpmass, region=None)

    datalabel="train2mod0"
    if invertTestTrain:
        # Swap the roles of the two folds.
        datalabel="train2mod1"
        X_train, X_test = X_test, X_train
        y_train, y_test = y_test, y_train
        w_train, w_test = w_test, w_train

    # All outputs go below models/; make sure it exists before the first dump.
    os.makedirs('models', exist_ok=True)
    prefix='models/standardNN_'+str(hpmass)+'_'+region+'_'+datalabel

    # BDT (disabled)
    # clf=HpAlgorithms.getGradientBDTClassifier()
    # opt=HpHyperParameterOptimisation.HpOptimise('StandardBDT_'+region+'_Hp'+str(hpmass)+'_'+datalabel,clf,X_train,y_train,w_train,X_test,y_test,w_test)
    # test,train=opt.trainAndTest(silent=True)
    # dump(clf, 'models/standardBDT_'+str(hpmass)+'_'+region+'_'+datalabel+'.joblib')

    # NN: weight rescaling followed by weighted standardisation, fitted on the
    # training fold only and then applied to both folds.
    msb=MultiSBWeightsScaler(backgroundclass=0)
    wss=WeightedStandardScaler()
    pipe=PipelineWithWeights([("msb",msb),("wss",wss)])
    pipe.fit(X_train.values,y_train.values, sample_weight=w_train.values)
    dump(pipe, prefix+'_pipe.joblib')
    X_train,y_train,w_train=pipe.transform(X_train.values,y_train.values, sample_weight=w_train.values)
    X_test,y_test,w_test=pipe.transform(X_test.values,y_test.values, sample_weight=w_test.values)
    #nonzerovariance=np.where(wss.scale_!=0)
    #X_train=X_train[:,nonzerovariance[0]]
    #print(X_train.shape, X_test.shape)

    model=HpKerasUtils.HpFeedForwardModel(configuration=[64,64],dropout=0.1, verbose=True, input_dim=X_train.shape[1])
    model.train((X_train, y_train, w_train),(X_test, y_test, w_test), patience=5,callbacks=getCallbacks(model))

    # The architecture is stored as JSON text (file keeps its historical .h5
    # suffix because readers look it up under that name). The context manager
    # guarantees the handle is closed even if serialisation fails.
    with open(prefix+'_architecture.h5','w') as arch_file:
        arch_file.write(model.model.to_json())
    model.model.save_weights(prefix+'_weights.h5')
    

Using TensorFlow backend.


In [5]:
from HpMLMTL import HpMTLBackgroundAugmenter
from HpMLUtils import FeatureDivider
from joblib import load
from keras.models import model_from_json

def trainMassParameterisedBDTandNN(region, invertTestTrain, df_mc):
    """Prepare the mass-parameterised ("multi" mass) split for one region.

    NOTE(review): in its current state the function returns right after
    ``htf.prepare(...)``; every statement below the ``return`` is unreachable.
    The early return looks like a debugging aid for comparing splits --
    confirm before removing it, since callers unpack the returned tuple.

    Parameters
    ----------
    region : str
        Region label; only referenced by the unreachable code (file names and
        the log line). ``htf.prepare`` is called with ``region=None``.
    invertTestTrain : bool
        Only referenced by the unreachable code, where True swaps the train
        and test samples and switches the label to "train2mod1".
    df_mc : pandas.DataFrame
        Input table wrapped by ``HpTrainingFrame``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` exactly as returned by ``htf.prepare`` with
        ``hpmass="multi"`` and ``addMass=True``.
    """

    htf=HpTrainingFrame.HpTrainingFrame(df_mc)
    X_train, X_test, X_eval, y_train, y_test,y_eval, w_train, w_test, w_eval=htf.prepare(hpmass="multi",region=None,addMass=True)

    # Early exit: hands the raw split back to the caller. Nothing below runs.
    return (X_train,X_test)
    datalabel="train2mod0"
    if invertTestTrain:
      # Swap the two folds (train <-> test).
      datalabel="train2mod1"
      X_tmp=X_train
      y_tmp=y_train
      w_tmp=w_train
      X_train=X_test
      y_train=y_test
      w_train=w_test
      X_test=X_tmp
      y_test=y_tmp
      w_test=w_tmp

    # The targets are replaced by the hpmass column of the feature frames.
    y_train=X_train.hpmass.copy()
    y_test=X_test.hpmass.copy()
    print(X_train.hpmass.unique())

    # The two triple-quoted strings below are disabled code (BDT and NN1),
    # kept as bare string expressions; they have no runtime effect.
    #BDT
    """print("BDT1")
    msb=MultiSBWeightsScaler(backgroundclass=-1)
    aug=HpMTLBackgroundAugmenter(backgroundclass=-1)
    fd=FeatureDivider("hpmass")
    steps=[("msb",msb),("aug",aug),("fd",fd)]
    pipe=PipelineWithWeights(steps)
    pipe.fit(X_train,X_train.hpmass, sample_weight=w_train)
    dump(pipe, 'models/massparameterisedBDT_'+region+'_'+datalabel+'_pipe.joblib')
    X_tr,y_tr,w_tr=pipe.transform(X_train,y_train, sample_weight=w_train)
    X_ts,y_ts,w_ts=pipe.transform(X_test,y_test, sample_weight=w_test)
    y_tr=(y_tr>0)
    y_ts=(y_ts>0)
    print("BDT1a")
    clf=HpAlgorithms.getGradientBDTClassifier(options = {'n_estimators': 200, 'learning_rate': 0.1}) #let's get away from the default trees to get a better performance
    clf.fit(X_tr, y_tr, sample_weight=w_tr)
    print("BDT1b")
    dump(clf, 'models/massparameterisedBDT_'+region+'_'+datalabel+'_bdt.joblib') """
    
    #NN 1
    """print("NN1")
    msb=MultiSBWeightsScaler(backgroundclass=-1)
    aug=HpMTLBackgroundAugmenter(backgroundclass=-1)
    fd=FeatureDivider("hpmass")
    wss=WeightedStandardScaler()
    steps=[("msb",msb),("aug",aug),("fd",fd),("wss",wss)]
    pipe=PipelineWithWeights(steps)
    pipe.fit(X_train.values,X_train.hpmass.values, sample_weight=w_train.values)
    dump(pipe, 'models/massparameterisedNN1_'+region+'_'+datalabel+'_pipe.joblib')
    X_tr,y_tr,w_tr=pipe.transform(X_train.values,y_train.values, sample_weight=w_train.values)
    X_ts,y_ts,w_ts=pipe.transform(X_test.values,y_test.values, sample_weight=w_test.values)
    y_tr=(y_tr>0)
    y_ts=(y_ts>0)
    modelNN1=HpKerasUtils.HpFeedForwardModel(configuration=[64,64],dropout=0.1, verbose=True, input_dim=X_train.shape[1])
    print("NN1a")
    resultNN1=modelNN1.train((X_tr, y_tr, w_tr),(X_ts, y_ts, w_ts), patience=5,callbacks=getCallbacks(modelNN1))
    print("NN1b")
    arch_file=open('models/massparameterisedNN1_'+region+'_'+datalabel+'_architecture.h5','w')
    arch_file.write(modelNN1.model.to_json())
    arch_file.close()
    modelNN1.model.save_weights('models/massparameterisedNN1_'+region+'_'+datalabel+'_weights.h5')"""

    #NN 3
    print("NN3")
    from HpMLWeightTransformer import WeightsMultiplier
    # Per-mass-point weight multipliers, keyed by H+ mass and passed to
    # WeightsMultiplier (how these values were chosen is not shown here).
    scales={
      200:16.,
      225:8.,
      250:8.,
      275:8.,
      300:16./3.,
      350:4.,
      400:8./3.,
      500:2.,
      600:2.,
      700:2.,
      800:2.,
      900:2.,
      1000:4./3.,
      1200:1.,
      1400:1.,
      1600:1.2,
      1800:1.5,
      2000:4.,
    }
    # Pipeline steps, applied in this order: msb -> scl -> aug -> fd -> wss.
    msb=MultiSBWeightsScaler(backgroundclass=-1)
    scl=WeightsMultiplier(scales=scales,backgroundclass=-1)
    aug=HpMTLBackgroundAugmenter(backgroundclass=-1)
    fd=FeatureDivider("hpmass")
    wss=WeightedStandardScaler()
    steps=[("msb",msb),("scl",scl),("aug",aug),("fd",fd),("wss",wss)]
    pipe=PipelineWithWeights(steps)
    # Fitted on the training fold only, then applied to both folds.
    pipe.fit(X_train,X_train.hpmass, sample_weight=w_train)
    dump(pipe, 'models/massparameterisedNN3_'+region+'_'+datalabel+'_pipe.joblib')
    X_tr,y_tr,w_tr=pipe.transform(X_train,y_train, sample_weight=w_train)
    X_ts,y_ts,w_ts=pipe.transform(X_test,y_test, sample_weight=w_test)
    # Collapse the mass labels into a boolean target: True where label > 0
    # (presumably background carries -1, matching backgroundclass=-1 -- confirm).
    y_tr=(y_tr>0)
    y_ts=(y_ts>0)
    print(y_tr.unique())
    modelNN3=HpKerasUtils.HpFeedForwardModel(configuration=[64,64],dropout=0.1, verbose=True, input_dim=X_tr.shape[1])
    print(X_tr.shape,X_tr.mean(axis=0),X_tr.var(axis=0))
    print(X_tr.hpmass.unique())
    
    #resultNN3=modelNN3.train((X_tr.values, y_tr.values, w_tr.values),(X_ts.values, y_ts.values, w_ts.values), patience=5,callbacks=getCallbacks(modelNN3))
    #arch_file=open('models/massparameterisedNN3_'+region+'_'+datalabel+'_architecture.h5','w')
    #arch_file.write(modelNN3.model.to_json())
    #arch_file.close()
    #modelNN3.model.save_weights('models/massparameterisedNN3_'+region+'_'+datalabel+'_weights.h5')
    
    # Instead of training, a previously saved architecture (JSON text stored
    # under an .h5 name) and its weights are loaded from an absolute,
    # machine-specific directory.
    json_file = open('/data/JulianGlatzer/HplusML/trainall/models/massparameterisedNN3_'+region+'_'+datalabel+'_architecture.h5')
    loaded_model_json=json_file.read()
    json_file.close()
    modelNN3.model = model_from_json(loaded_model_json)
    modelNN3.model.load_weights('/data/JulianGlatzer/HplusML/trainall/models/massparameterisedNN3_'+region+'_'+datalabel+'_weights.h5')
    
    # Weighted ROC AUC on the test and training folds.
    roc_test = roc_auc_score(y_ts, modelNN3.model.predict(X_ts).ravel(), sample_weight=w_ts)
    roc_train= roc_auc_score(y_tr, modelNN3.model.predict(X_tr).ravel(), sample_weight=w_tr)
    
    y_test_pred = modelNN3.model.predict(X_ts).ravel()
    import matplotlib.pyplot as plt
    plt.figure()
    # NOTE(review): y_ts was turned into a boolean target above, so comparing
    # it with 400 and -1 appears to select no entries -- the masks were
    # probably meant to use the original mass labels. Confirm.
    plt.hist(y_test_pred[y_ts==400].ravel(),alpha=0.5,color='r',bins=50,density=True)
    plt.hist(y_test_pred[y_ts==-1].ravel(),alpha=0.5,color='b',bins=50,density=True)
    plt.show()
    
    
    
    # Append the scores to output.txt. The written string has no trailing
    # newline, so successive records end up on the same line.
    with open("output.txt","a+") as f:
        f.write(region+" "+datalabel+" test/train: "+str(roc_test)+" / "+str(roc_train))
    print(roc_test,roc_train)


In [59]:
import pandas as pd

# Load the INC_ge6jge4b table for the split cross-checks below.
# NOTE(review): this is an absolute, machine-specific path, whereas the training
# loop reads 'pandas_<region>.h5' relative to the working directory.
df_test = pd.read_hdf('/data/JulianGlatzer/HplusML/trainall/pandas_INC_ge6jge4b.h5','INC_ge6jge4b')
df_test.head()

Unnamed: 0,index,Mbb_MindR_70,eventNumber,Muu_MindR_70,nBTags_70,Centrality_all,pT_jet5,dRlepbb_MindR_70,H1_all,Mjjj_MaxPt,...,dRbb_avg_70,nJets,Mbb_MaxPt_70,HT_jets,weight,process,group,region,pT_jet1,hpmass
1853266,0,78006.71875,26311789,93472.070312,4,0.64753,48389.015625,2.942365,0.428227,165577.28125,...,1.836845,6,78006.71875,410279.375,0.769165,ttlight,t#bar{t} + light,INC_ge6jge4b,94875.25,-1
1853267,1,144572.25,8057383,86068.453125,4,0.636615,38330.675781,2.980346,0.077838,184632.640625,...,2.357118,6,144572.25,470684.46875,0.867114,ttlight,t#bar{t} + light,INC_ge6jge4b,158391.453125,-1
1853268,2,45798.390625,21899308,283880.40625,4,0.649095,72458.78125,1.807999,0.004904,153976.984375,...,1.43068,6,81811.507812,577440.875,0.660478,ttlight,t#bar{t} + light,INC_ge6jge4b,182397.34375,-1
1853269,3,68210.140625,18972816,156986.078125,4,0.663991,33645.386719,3.255158,0.019155,251311.53125,...,2.663929,7,152639.4375,424901.6875,0.605552,ttlight,t#bar{t} + light,INC_ge6jge4b,154738.65625,-1
1853270,4,67664.296875,11299806,43554.640625,4,0.743293,45294.984375,3.130159,0.119582,156184.890625,...,1.951383,6,98741.585938,371469.625,0.350783,ttlight,t#bar{t} + light,INC_ge6jge4b,94807.296875,-1


In [8]:
# Capture the "multi"-mass train/test feature frames; this relies on the
# function returning (X_train, X_test) right after the split is prepared.
Atrain, Atest = trainMassParameterisedBDTandNN("INC_ge6jge4b", False, df_test)

In [85]:
# Same input table, but prepared for the single mass point 225 and with the
# region passed explicitly, to compare against the "multi" split (Atrain/Atest).
htf=HpTrainingFrame.HpTrainingFrame(df_test)
Btrain, Btest, X_eval, y_train, y_test,y_eval, w_train, w_test, w_eval=htf.prepare(hpmass=225,region="INC_ge6jge4b",addMass=True)

In [87]:
# Print the first train and test row of each split: A = "multi" mass,
# B = mass point 225 only.
print("Multi",Atrain.iloc[0])
print(Atest.iloc[0])
print("only225",Btrain.iloc[0])
print(Btest.iloc[0])

Multi nJets                    8.000000
nBTags_70                4.000000
pT_jet1             279404.968750
Mbb_MindR_70         37746.167969
pT_jet5              32479.382812
H1_all                   0.035688
dRbb_avg_70              2.899867
dRlepbb_MindR_70         4.325050
Muu_MindR_70         34004.242188
HT_jets             650862.312500
Mbb_MaxPt_70        275263.500000
Mbb_MaxM_70         275263.500000
Mjjj_MaxPt          193024.156250
Centrality_all           0.589056
hpmass                  -1.000000
Name: 341420, dtype: float64
nJets                    8.000000
nBTags_70                4.000000
pT_jet1              57568.562500
Mbb_MindR_70         39790.410156
pT_jet5              37419.878906
H1_all                   0.034701
dRbb_avg_70              1.775290
dRlepbb_MindR_70         1.682945
Muu_MindR_70         58718.191406
HT_jets             310518.125000
Mbb_MaxPt_70         54199.289062
Mbb_MaxM_70         140217.625000
Mjjj_MaxPt           93796.710938
Centrality_al

In [94]:
# Scratch look-ups: locate individual split rows in the full input table by
# matching one feature value exactly (float equality). Only the last query is
# active; the earlier ones are kept commented out with their findings.
#Atrain.iloc[0] 
#df_test[df_test.pT_jet1==279404.968750]

#Atest.iloc[0]
#df_test[df_test.pT_jet1==57568.562500]
#Btest[Btest.pT_jet1==57568.562500] found in the other sample!

#Btrain.iloc[0]
#df_test[df_test.Mbb_MindR_70==122081.218750]
#Btest.iloc[0]
df_test[df_test.pT_jet1==103241.984375]

Unnamed: 0,index,Mbb_MindR_70,eventNumber,Muu_MindR_70,nBTags_70,Centrality_all,pT_jet5,dRlepbb_MindR_70,H1_all,Mjjj_MaxPt,...,dRbb_avg_70,nJets,Mbb_MaxPt_70,HT_jets,weight,process,group,region,pT_jet1,hpmass
2882547,26738,76116.273438,8805121,70910.03125,4,0.615122,32460.152344,1.638928,0.053024,311236.96875,...,2.057365,6,285449.0,349346.71875,0.078197,ttb,t#bar{t} + #geq1b,INC_ge6jge4b,103241.984375,-1


In [11]:
# First rows of the "multi"-mass test frame, including the hpmass column.
Atest.head()

Unnamed: 0,nJets,nBTags_70,pT_jet1,Mbb_MindR_70,pT_jet5,H1_all,dRbb_avg_70,dRlepbb_MindR_70,Muu_MindR_70,HT_jets,Mbb_MaxPt_70,Mbb_MaxM_70,Mjjj_MaxPt,Centrality_all,hpmass
56325,8,4,57568.5625,39790.410156,37419.878906,0.034701,1.77529,1.682945,58718.191406,310518.1,54199.289062,140217.625,93796.710938,0.674556,225
255787,8,4,490074.21875,88260.671875,107423.710938,0.443266,2.848803,2.87499,37364.4375,1236182.0,470821.53125,470821.53125,551293.625,0.492596,1400
74343,7,4,89510.882812,60857.914062,30959.849609,0.340852,2.343738,2.942727,99635.265625,374636.3,60857.914062,151224.09375,186332.640625,0.462964,275
321467,8,4,416670.65625,199328.96875,102743.671875,0.602293,2.182712,1.818642,222023.828125,1416343.0,245616.109375,519289.71875,692134.125,0.609738,2000
3559,7,4,80325.28125,31465.021484,37764.40625,0.043899,2.064574,3.655863,24805.294922,321060.7,31465.021484,119803.640625,102765.1875,0.797069,-1


In [4]:
import pandas as pd

# Main driver: load the input table per region and run the mass-parameterised
# training for both fold orientations.
# NOTE(review): regions[-1:] restricts the loop to the last entry of `regions`,
# so the other regions (and the "5j" branch below) are skipped -- confirm this
# is intended and not a leftover from debugging.
for region in regions[-1:]:
    if region=="5j":
        # "5j" has no file of its own: stack the 5j3b and 5jge4b tables.
        # ignore_index=False keeps each table's original row index.
        df_5j3b=pd.read_hdf('pandas_INC_5j3b.h5', 'INC_5j3b')
        df_5j4b=pd.read_hdf('pandas_INC_5jge4b.h5', 'INC_5jge4b')
        df_mc=pd.concat([df_5j3b,df_5j4b], ignore_index=False)
    else:
        # File name and HDF5 key are both derived from the region label.
        df_mc=pd.read_hdf('pandas_'+region+'.h5', region)
    # The loop variable rebinds the notebook-level invertTestTrain.
    for invertTestTrain in [False,True]: #False=2mod0, True=2mod1
        # Per-mass-point training is currently disabled:
        #for hpmass in hpmasses:
        #    print("Region=",region,"invertTestTrain=", invertTestTrain,"H+ mass=",hpmass)
        #    trainBDTandNN(region, hpmass, invertTestTrain, df_mc)
        print("Region=",region,"invertTestTrain=", invertTestTrain,"H+ mass=all")
        trainMassParameterisedBDTandNN(region, invertTestTrain, df_mc)

Region= INC_ge6jge4b invertTestTrain= False H+ mass=all
        nJets  nBTags_70        pT_jet1   Mbb_MindR_70       pT_jet5  \
341420      8          4  279404.968750   37746.167969  32479.382812   
342063      9          4  123118.976562   69792.085938  44130.472656   
10942       6          4  107503.234375   38321.851562  43654.425781   
372226      6          4  372385.437500   67083.578125  52405.285156   
424086      6          4  385840.937500  129459.632812  47722.515625   

          H1_all  dRbb_avg_70  dRlepbb_MindR_70   Muu_MindR_70       HT_jets  \
341420  0.035688     2.899867          4.325050   34004.242188  6.508623e+05   
342063  0.206301     2.136989          3.747357   23325.519531  5.232717e+05   
10942   0.218321     1.553878          2.587819  142112.812500  3.711712e+05   
372226  0.291607     1.892945          1.934670  257937.781250  1.015704e+06   
424086  0.354913     2.581128          3.537677   69449.445312  7.486200e+05   

         Mbb_MaxPt_70    Mbb_M

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-7e81ea1c7bce>", line 15, in <module>
    trainMassParameterisedBDTandNN(region, invertTestTrain, df_mc)
  File "<ipython-input-3-2aa71b512fe3>", line 104, in trainMassParameterisedBDTandNN
    X_ts,y_ts,w_ts=pipe.transform(X_test,y_test, sample_weight=w_test)
  File "/workspace/HplusML/trainall/HpMLPipeline.py", line 41, in transform
    X,y,sample_weight=step.transform(X,y,sample_weight=sample_weight)
  File "/workspace/HplusML/trainall/HpMLWeightTransformer.py", line 103, in transform
    sample_weight[y==classlabel]*=self.scale_[classlabel]
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/series.py", line 1039, in __setitem__
    setitem(key, value)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/series.py", line 1030, in setitem
    self._where(~

KeyboardInterrupt: 

In [None]:
# Names of the columns at positions 1 and 2, in that order.
# NOTE(review): df_5j3b is only assigned in the training loop's "5j" branch, so
# this cell fails with a NameError unless that branch has been executed.
df_5j3b.iloc[:,[1,2]].columns

In [None]:
# Same two columns selected in reverse order (positions 2, then 1).
# NOTE(review): df_5j3b is only assigned in the training loop's "5j" branch, so
# this cell fails with a NameError unless that branch has been executed.
df_5j3b.iloc[:,[2,1]].columns