# Training for All Regions

In [1]:
# Regions to train on. '5j' is not a single input file: the training loop
# builds it by concatenating the INC_5j3b and INC_5jge4b samples.
regions = [
    '5j',
    'INC_ge6j3b',
    'INC_ge6jge4b',
]

# Fold selector convention used by the training functions:
# False -> data label "train2mod0", True -> data label "train2mod1".
invertTestTrain = False

# H+ mass hypotheses available for the per-mass training.
hpmasses = [
    200, 225, 250, 275, 300, 350,
    400, 500, 600, 700, 800, 900,
    1000, 1200, 1400, 1600, 1800, 2000,
]

In [2]:
#! rm -rf HplusML pandas_INC_*.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_ge6j3b.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_ge6jge4b.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_5j3b.h5
#! wget https://jglatzer.web.cern.ch/jglatzer/hpml/pandas_INC_5jge4b.h5

## Train BDTs and NNs

In [3]:
import HpTrainingFrame
import HpAlgorithms
import HpHyperParameterOptimisation
from joblib import dump
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
from HpMLWeightTransformer import MultiSBWeightsScaler
from HpMLFeatureNormalisation import WeightedStandardScaler
from HpMLPipeline import PipelineWithWeights
import HpKerasUtils
import numpy as np

def getCallbacks(model):
    """Build the default list of Keras callbacks for ``model``.

    The list holds an ``EarlyStopping`` and a ``ModelCheckpoint`` callback,
    both monitoring ``val_loss``. The checkpoint file name is assembled from
    the ``configuration``, ``dropout`` and ``l2threshold`` attributes of
    ``model``, so ``model`` must expose all three.
    """
    checkpoint_path = "".join([
        'model_nn_', str(model.configuration),
        "_dropout", str(model.dropout),
        "_l2threshold", str(model.l2threshold),
        ".hdf5",
    ])
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=5,
                                   restore_best_weights=True)
    checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                                 monitor='val_loss',
                                 save_best_only=True)
    return [early_stopping, checkpoint]

def trainBDTandNN(region, hpmass, invertTestTrain, df_mc):
    """Train the single-mass feed-forward NN for one region and one H+ mass.

    Parameters
    ----------
    region : str
        Region label. Only used to build output file names here;
        ``HpTrainingFrame.prepare`` is called with ``region=None``.
        NOTE(review): presumably ``df_mc`` is already restricted to this
        region by the caller -- confirm.
    hpmass : int
        H+ mass hypothesis handed to ``HpTrainingFrame.prepare``.
    invertTestTrain : bool
        If true, the train and test samples are swapped and outputs are
        labelled ``train2mod1`` instead of ``train2mod0``.
    df_mc : pandas.DataFrame
        Input events wrapped by ``HpTrainingFrame``.

    Side effects
    ------------
    Creates ``models/`` if needed and writes, with the common prefix
    ``models/standardNN_<hpmass>_<region>_<datalabel>``:
    ``_pipe.joblib`` (fitted preprocessing pipeline),
    ``_architecture.h5`` (model architecture as JSON text) and
    ``_weights.h5`` (trained weights). The callbacks returned by
    ``getCallbacks`` are also passed to the training call.
    """
    import os  # function-scope import, same pattern the notebook already uses elsewhere

    htf = HpTrainingFrame.HpTrainingFrame(df_mc)
    # The evaluation split (X_eval, y_eval, w_eval) is returned but not used here.
    (X_train, X_test, X_eval,
     y_train, y_test, y_eval,
     w_train, w_test, w_eval) = htf.prepare(hpmass=hpmass, region=None)

    datalabel = "train2mod0"
    if invertTestTrain:
        datalabel = "train2mod1"
        # Swap the roles of the two folds.
        X_train, X_test = X_test, X_train
        y_train, y_test = y_test, y_train
        w_train, w_test = w_test, w_train

    # Saving would otherwise fail on a fresh checkout without this directory.
    os.makedirs('models', exist_ok=True)
    prefix = 'models/standardNN_'+str(hpmass)+'_'+region+'_'+datalabel

    # BDT (disabled; kept for reference)
    # clf=HpAlgorithms.getGradientBDTClassifier()
    # opt=HpHyperParameterOptimisation.HpOptimise('StandardBDT_'+region+'_Hp'+str(hpmass)+'_'+datalabel,clf,X_train,y_train,w_train,X_test,y_test,w_test)
    # test,train=opt.trainAndTest(silent=True)
    # dump(clf, 'models/standardBDT_'+str(hpmass)+'_'+region+'_'+datalabel+'.joblib')

    # NN: fit the preprocessing on the training fold only, then apply the
    # same fitted pipeline to both folds.
    msb = MultiSBWeightsScaler(backgroundclass=0)
    wss = WeightedStandardScaler()
    pipe = PipelineWithWeights([("msb", msb), ("wss", wss)])
    pipe.fit(X_train.values, y_train.values, sample_weight=w_train.values)
    dump(pipe, prefix+'_pipe.joblib')
    X_train, y_train, w_train = pipe.transform(X_train.values, y_train.values, sample_weight=w_train.values)
    X_test, y_test, w_test = pipe.transform(X_test.values, y_test.values, sample_weight=w_test.values)
    #nonzerovariance=np.where(wss.scale_!=0)
    #X_train=X_train[:,nonzerovariance[0]]
    #print(X_train.shape, X_test.shape)

    model = HpKerasUtils.HpFeedForwardModel(configuration=[64,64], dropout=0.1, verbose=True, input_dim=X_train.shape[1])
    model.train((X_train, y_train, w_train), (X_test, y_test, w_test), patience=5, callbacks=getCallbacks(model))

    # The file carries an .h5 suffix but holds the JSON architecture as text;
    # the name is kept unchanged so existing readers still find it.
    with open(prefix+'_architecture.h5', 'w') as arch_file:
        arch_file.write(model.model.to_json())
    model.model.save_weights(prefix+'_weights.h5')
    

Using TensorFlow backend.


In [4]:
from HpMLMTL import HpMTLBackgroundAugmenter
from HpMLUtils import FeatureDivider

def trainMassParameterisedBDTandNN(region, invertTestTrain, df_mc):
    """Train the mass-parameterised NN ("NN3") for one region.

    Parameters
    ----------
    region : str
        Region label. Only used to build output file names here;
        ``HpTrainingFrame.prepare`` is called with ``region=None``.
        NOTE(review): presumably ``df_mc`` is already restricted to this
        region by the caller -- confirm.
    invertTestTrain : bool
        If true, the train and test samples are swapped and outputs are
        labelled ``train2mod1`` instead of ``train2mod0``.
    df_mc : pandas.DataFrame
        Input events wrapped by ``HpTrainingFrame``.

    Side effects
    ------------
    Prints ``NN3``, creates ``models/`` if needed and writes, with the common
    prefix ``models/massparameterisedNN3_<region>_<datalabel>``:
    ``_pipe.joblib`` (fitted preprocessing pipeline),
    ``_architecture.h5`` (model architecture as JSON text) and
    ``_weights.h5`` (trained weights). The callbacks returned by
    ``getCallbacks`` are also passed to the training call.
    """
    import os
    from HpMLWeightTransformer import WeightsMultiplier

    htf = HpTrainingFrame.HpTrainingFrame(df_mc)
    # The evaluation split (X_eval, y_eval, w_eval) is returned but not used here.
    (X_train, X_test, X_eval,
     y_train, y_test, y_eval,
     w_train, w_test, w_eval) = htf.prepare(hpmass="multi", region=None, addMass=True)

    datalabel = "train2mod0"
    if invertTestTrain:
        datalabel = "train2mod1"
        # Swap the roles of the two folds.
        X_train, X_test = X_test, X_train
        y_train, y_test = y_test, y_train
        w_train, w_test = w_test, w_train

    # From here on the "label" is the hpmass column itself; it is turned into
    # a boolean target (y > 0) only after the pipeline has run.
    y_train = X_train.hpmass.copy()
    y_test = X_test.hpmass.copy()

    # Saving would otherwise fail on a fresh checkout without this directory.
    os.makedirs('models', exist_ok=True)

    # BDT (disabled; kept for reference)
    # print("BDT1")
    # msb=MultiSBWeightsScaler(backgroundclass=-1)
    # aug=HpMTLBackgroundAugmenter(backgroundclass=-1)
    # fd=FeatureDivider("hpmass")
    # steps=[("msb",msb),("aug",aug),("fd",fd)]
    # pipe=PipelineWithWeights(steps)
    # pipe.fit(X_train,X_train.hpmass, sample_weight=w_train)
    # dump(pipe, 'models/massparameterisedBDT_'+region+'_'+datalabel+'_pipe.joblib')
    # X_tr,y_tr,w_tr=pipe.transform(X_train,y_train, sample_weight=w_train)
    # X_ts,y_ts,w_ts=pipe.transform(X_test,y_test, sample_weight=w_test)
    # y_tr=(y_tr>0)
    # y_ts=(y_ts>0)
    # print("BDT1a")
    # clf=HpAlgorithms.getGradientBDTClassifier(options = {'n_estimators': 200, 'learning_rate': 0.1}) #let's get away from the default trees to get a better performance
    # clf.fit(X_tr, y_tr, sample_weight=w_tr)
    # print("BDT1b")
    # dump(clf, 'models/massparameterisedBDT_'+region+'_'+datalabel+'_bdt.joblib')

    # NN 1 (disabled; kept for reference)
    # NOTE(review): this variant sizes the network from X_train.shape[1],
    # whereas NN 3 below uses the transformed X_tr.shape[1] -- check before re-enabling.
    # print("NN1")
    # msb=MultiSBWeightsScaler(backgroundclass=-1)
    # aug=HpMTLBackgroundAugmenter(backgroundclass=-1)
    # fd=FeatureDivider("hpmass")
    # wss=WeightedStandardScaler()
    # steps=[("msb",msb),("aug",aug),("fd",fd),("wss",wss)]
    # pipe=PipelineWithWeights(steps)
    # pipe.fit(X_train.values,X_train.hpmass.values, sample_weight=w_train.values)
    # dump(pipe, 'models/massparameterisedNN1_'+region+'_'+datalabel+'_pipe.joblib')
    # X_tr,y_tr,w_tr=pipe.transform(X_train.values,y_train.values, sample_weight=w_train.values)
    # X_ts,y_ts,w_ts=pipe.transform(X_test.values,y_test.values, sample_weight=w_test.values)
    # y_tr=(y_tr>0)
    # y_ts=(y_ts>0)
    # modelNN1=HpKerasUtils.HpFeedForwardModel(configuration=[64,64],dropout=0.1, verbose=True, input_dim=X_train.shape[1])
    # print("NN1a")
    # resultNN1=modelNN1.train((X_tr, y_tr, w_tr),(X_ts, y_ts, w_ts), patience=5,callbacks=getCallbacks(modelNN1))
    # print("NN1b")
    # arch_file=open('models/massparameterisedNN1_'+region+'_'+datalabel+'_architecture.h5','w')
    # arch_file.write(modelNN1.model.to_json())
    # arch_file.close()
    # modelNN1.model.save_weights('models/massparameterisedNN1_'+region+'_'+datalabel+'_weights.h5')

    # NN 3
    print("NN3")
    prefix = 'models/massparameterisedNN3_'+region+'_'+datalabel

    # Per-mass weight multipliers handed to WeightsMultiplier, keyed by H+ mass.
    # NOTE(review): the origin of these values is not visible in this notebook.
    scales = {
        200: 16.,
        225: 8.,
        250: 8.,
        275: 8.,
        300: 16./3.,
        350: 4.,
        400: 8./3.,
        500: 2.,
        600: 2.,
        700: 2.,
        800: 2.,
        900: 2.,
        1000: 4./3.,
        1200: 1.,
        1400: 1.,
        1600: 1.2,
        1800: 1.5,
        2000: 4.,
    }
    msb = MultiSBWeightsScaler(backgroundclass=-1)
    scl = WeightsMultiplier(scales=scales, backgroundclass=-1)
    aug = HpMTLBackgroundAugmenter(backgroundclass=-1)
    fd = FeatureDivider("hpmass")
    wss = WeightedStandardScaler()
    pipe = PipelineWithWeights([("msb", msb), ("scl", scl), ("aug", aug), ("fd", fd), ("wss", wss)])

    # Fit the preprocessing on the training fold only, then apply the same
    # fitted pipeline to both folds.
    pipe.fit(X_train, X_train.hpmass, sample_weight=w_train)
    dump(pipe, prefix+'_pipe.joblib')
    X_tr, y_tr, w_tr = pipe.transform(X_train, y_train, sample_weight=w_train)
    X_ts, y_ts, w_ts = pipe.transform(X_test, y_test, sample_weight=w_test)

    # Binary target: positive mass label vs. the rest.
    # NOTE(review): presumably background rows carry a non-positive label
    # (the steps are configured with backgroundclass=-1) -- confirm.
    y_tr = (y_tr > 0)
    y_ts = (y_ts > 0)

    modelNN3 = HpKerasUtils.HpFeedForwardModel(configuration=[64,64], dropout=0.1, verbose=True, input_dim=X_tr.shape[1])
    modelNN3.train((X_tr.values, y_tr.values, w_tr.values),
                   (X_ts.values, y_ts.values, w_ts.values),
                   patience=5, callbacks=getCallbacks(modelNN3))

    # The file carries an .h5 suffix but holds the JSON architecture as text;
    # the name is kept unchanged so existing readers still find it.
    with open(prefix+'_architecture.h5', 'w') as arch_file:
        arch_file.write(modelNN3.model.to_json())
    modelNN3.model.save_weights(prefix+'_weights.h5')

In [None]:
import pandas as pd

# Train the mass-parameterised NN for every region and both fold orientations.
for region in regions:
    if region=="5j":
        # '5j' has no input file of its own: it is the concatenation of the
        # INC_5j3b and INC_5jge4b samples (original row labels are kept).
        df_5j3b=pd.read_hdf('pandas_INC_5j3b.h5', 'INC_5j3b')
        df_5j4b=pd.read_hdf('pandas_INC_5jge4b.h5', 'INC_5jge4b')
        df_mc=pd.concat([df_5j3b,df_5j4b], ignore_index=False)
    else:
        df_mc=pd.read_hdf('pandas_'+region+'.h5', region)
    # A dedicated loop variable is used so the notebook-level
    # `invertTestTrain` setting is not silently overwritten by this cell.
    for invert in (False, True): #False=2mod0, True=2mod1
        # Per-mass training (disabled; kept for reference)
        #for hpmass in hpmasses:
        #    print("Region=",region,"invertTestTrain=", invert,"H+ mass=",hpmass)
        #    trainBDTandNN(region, hpmass, invert, df_mc)
        print("Region=",region,"invertTestTrain=", invert,"H+ mass=all")
        trainMassParameterisedBDTandNN(region, invert, df_mc)

Region= 5j invertTestTrain= False H+ mass=all
NN3
(3401795, 27) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]
(3401795, 27) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]
(3413977, 27) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                1792      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_______

In [None]:
# Column labels at positions 1 and 2 (in that order), looked up on the
# column index directly instead of slicing the frame first.
df_5j3b.columns[[1, 2]]

In [None]:
# Column labels at positions 2 and 1 (in that order), looked up on the
# column index directly instead of slicing the frame first.
df_5j3b.columns[[2, 1]]