In [3]:
from collections import OrderedDict
from copy import deepcopy

import numpy as np
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn import preprocessing
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.models import Input, Model, load_model
from keras.optimizers import Adam
from keras.activations import relu, sigmoid
from keras.callbacks import ReduceLROnPlateau
from keras.losses import binary_crossentropy
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

Using TensorFlow backend.


In [4]:
file_name = '/home/aga/Fizyka/licencjat/htt_features_test.pkl'

In [5]:
def preprocess(file_name, enable_data=True, enable_dicriminators=True, mask_columns=[]):
    legs, jets, global_params, properties = pd.read_pickle(file_name)
    properties = OrderedDict(sorted(properties.items(), key=lambda t: t[0]))

    sampleType = np.array(global_params["sampleType"])
    sampleType = np.reshape(sampleType, (-1, 1))
    features = np.array(list(properties.values()))
    features = np.transpose(features)
    feature_names = list(properties.keys())

    # Redefine DPF output to be 1 for signal
    discName = "leg_2_DPFTau_2016_v1tauVSall"
    DPF_index = feature_names.index(discName)
    features[:, DPF_index] *= -1
    features[:, DPF_index] += 1
    indexes = features[:, DPF_index] > 1
    features[indexes, DPF_index] = 0.0
    # Filter features to be usedfor training
    column_mask = np.full(features.shape[1], enable_data)
    oldMVA_discriminators = [
        "leg_2_byIsolationMVArun2v1DBnewDMwLTraw2017v2",
        "leg_2_DPFTau_2016_v1tauVSall",
        "leg_2_deepTau2017v1tauVSall",
        "leg_2_deepTau2017v1tauVSjet",
    ]
    for discName in oldMVA_discriminators:
        index = feature_names.index(discName)
        column_mask[index] = enable_dicriminators
    
    for col_name in mask_columns:
        index = feature_names.index(col_name)
        column_mask[index] = False
        
    features = features[:, column_mask]
    not_shuffled_features = deepcopy(features)
    not_shuffled_labels = deepcopy(sampleType)

    features = np.hstack((sampleType, features))
    np.random.shuffle(features)

    labels = features[:, 0]
    features = features[:, 1:]

    print("Input data shape:", features.shape)
    print("Number of positive examples:", (labels > 0.5).sum())
    print("Number of negative examples:", (labels < 0.5).sum())

    assert features.shape[0] == labels.shape[0]

    tmp = np.array(feature_names)
    tmp = tmp[column_mask]
    feature_names = list(tmp)
    return features, not_shuffled_features, labels, not_shuffled_labels, feature_names

In [6]:
def train_model(features, labels, layers, scheduler=[], lr=0.001, epochs=2, batch_size=128):
    X_train, X_test, y_train, y_test = train_test_split(features, labels)
    
    input_dense = Input((n_features,))
    output = input_dense
    for l in layers:
        output = l(output)
    model = Model(inputs=[input_dense], outputs=output)
    model.compile(optimizer=Adam(lr=lr), loss=binary_crossentropy)
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=scheduler)
    
    pred = model.predict(X_test)
    print('Accuracy on test:', accuracy_score(y_test, pred > 0.5))
    print('ROC AUC on test:', roc_auc_score(y_test, pred))
    
    return model

In [7]:
def scorer(estimator, X, y):
    pred = estimator.predict(X)
    return roc_auc_score(y, pred)

# Only discriminators

In [62]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(file_name, False, True)
n_features = features.shape[1]

layers = [Dense(1, activation=sigmoid)]
model = train_model(features, labels, layers, epochs=4)

pred_whole = model.predict(not_shuffled_features)
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

model.save_weights("model_only_disc.h5")
model.save('cpp_model_only_disc.h5', include_optimizer=False)
np.save('pred_only_disc', pred_whole)

Input data shape: (345390, 4)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy on test: 0.984099226386251
ROC AUC on test: 0.9957906307645559
Accuracy on whole: 0.9841570398679753
ROC AUC on whole: 0.9959549764952281


In [7]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [8]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.0836  ± 0.0006,leg_2_byIsolationMVArun2v1DBnewDMwLTraw2017v2
0.0041  ± 0.0001,leg_2_DPFTau_2016_v1tauVSall
0.0022  ± 0.0001,leg_2_deepTau2017v1tauVSjet
0.0013  ± 0.0001,leg_2_deepTau2017v1tauVSall


# Without discriminators

In [63]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(file_name, True, False)
n_features = features.shape[1]

layers = [Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(1, activation=sigmoid)]

model = train_model(features, labels, layers)

pred_whole = model.predict(not_shuffled_features)
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

model.save_weights("model_without_disc.h5")
model.save('cpp_model_without_disc.h5', include_optimizer=False)
np.save('pred_without_disc', pred_whole)

Input data shape: (345390, 19)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/2
Epoch 2/2
Accuracy on test: 0.9783781905776625
ROC AUC on test: 0.9943221333559349
Accuracy on whole: 0.9780769564839746
ROC AUC on whole: 0.994412711608926


In [10]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [11]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.3139  ± 0.0026,leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits
0.1676  ± 0.0016,leg_2_chargedIsoPtSum
0.0705  ± 0.0003,leg_2_neutralIsoPtSum
0.0155  ± 0.0003,leg_2_gjAngleDiff
0.0069  ± 0.0005,leg_2_photonPtSumOutsideSignalCone
0.0067  ± 0.0001,leg_2_nPhoton
0.0032  ± 0.0001,leg_2_flightLengthSig
0.0017  ± 0.0000,leg_2_puCorrPtSum
0.0005  ± 0.0000,leg_2_decayMode
0.0002  ± 0.0000,leg_2_dxy_Sig


# Whole data

In [64]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(file_name, True, True)
n_features = features.shape[1]

layers = [Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(1, activation=sigmoid)]

model = train_model(features, labels, layers)

pred_whole = model.predict(not_shuffled_features)
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

model.save_weights("model_whole_data.h5")
model.save('cpp_model_whole_data.h5', include_optimizer=False)
np.save('pred_whole_data', pred_whole)

Input data shape: (345390, 23)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/2
Epoch 2/2
Accuracy on test: 0.9834043637374346
ROC AUC on test: 0.9971052677030932
Accuracy on whole: 0.9835113929181505
ROC AUC on whole: 0.9971924178814443


In [14]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [15]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.3784  ± 0.0008,leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits
0.0798  ± 0.0004,leg_2_neutralIsoPtSum
0.0420  ± 0.0009,leg_2_chargedIsoPtSum
0.0031  ± 0.0000,leg_2_photonPtSumOutsideSignalCone
0.0028  ± 0.0001,leg_2_gjAngleDiff
0.0020  ± 0.0000,leg_2_DPFTau_2016_v1tauVSall
0.0014  ± 0.0000,leg_2_byIsolationMVArun2v1DBnewDMwLTraw2017v2
0.0013  ± 0.0001,leg_2_puCorrPtSum
0.0010  ± 0.0000,leg_2_nPhoton
0.0006  ± 0.0000,leg_2_flightLengthSig


# Without discriminators and 'byCombinedIsolation...'

In [67]:
# features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(
#     file_name, True, False, mask_columns=['leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits'],
# )

# n_features = features.shape[1]

# layers = [Dense(32, activation=relu), 
#           Dense(32, activation=relu), 
#           Dense(32, activation=relu), 
#           Dense(32, activation=relu), 
#           Dense(32, activation=relu), 
#           Dense(32, activation=relu), 
#           Dense(1, activation=sigmoid)]

# model = train_model(features, labels, layers)

# pred_whole = model.predict(not_shuffled_features)
# print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
# print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

# model.save_weights("model_without_disc_and_byCombined.h5")
# np.save('pred_without_disc_and_byCombined', pred_whole)

Input data shape: (345390, 18)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/2
Epoch 2/2
Accuracy on test: 0.9680826423310326
ROC AUC on test: 0.9927469507239571
Accuracy on whole: 0.968548597237905
ROC AUC on whole: 0.9929334331810683


In [22]:
# perm = PermutationImportance(
#     model, random_state=1, scoring=scorer, n_iter=3,
# ).fit(not_shuffled_features, not_shuffled_labels)

In [23]:
# eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.3402  ± 0.0033,leg_2_chargedIsoPtSum
0.2345  ± 0.0032,leg_2_neutralIsoPtSum
0.0083  ± 0.0002,leg_2_photonPtSumOutsideSignalCone
0.0078  ± 0.0001,leg_2_gjAngleDiff
0.0062  ± 0.0001,leg_2_nPhoton
0.0017  ± 0.0001,leg_2_flightLengthSig
0.0014  ± 0.0000,leg_2_puCorrPtSum
0.0006  ± 0.0000,leg_2_decayDistMag
0.0003  ± 0.0000,leg_2_decayMode
0.0002  ± 0.0000,leg_2_dxy_Sig


# Whole data without 'byCombinedIsolation...'

In [65]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(
    file_name, True, True, mask_columns=['leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits'],
)

n_features = features.shape[1]

layers = [Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(32, activation=relu), 
          Dense(1, activation=sigmoid)]

model = train_model(features, labels, layers)

pred_whole = model.predict(not_shuffled_features)
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

model.save_weights("model_without_byCombined.h5")
model.save('cpp_model_without_byCombined.h5', include_optimizer=False)
np.save('pred_without_byCombined', pred_whole)

Input data shape: (345390, 22)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/2
Epoch 2/2
Accuracy on test: 0.9841918747394265
ROC AUC on test: 0.9940992834841411
Accuracy on whole: 0.9839138365326153
ROC AUC on whole: 0.9936996435304544


In [27]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [28]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.3268  ± 0.0029,leg_2_chargedIsoPtSum
0.1223  ± 0.0018,leg_2_neutralIsoPtSum
0.0040  ± 0.0001,leg_2_photonPtSumOutsideSignalCone
0.0018  ± 0.0000,leg_2_DPFTau_2016_v1tauVSall
0.0016  ± 0.0000,leg_2_byIsolationMVArun2v1DBnewDMwLTraw2017v2
0.0015  ± 0.0001,leg_2_puCorrPtSum
0.0013  ± 0.0001,leg_2_gjAngleDiff
0.0009  ± 0.0000,leg_2_nPhoton
0.0005  ± 0.0000,leg_2_flightLengthSig
0.0002  ± 0.0000,leg_2_deepTau2017v1tauVSall


# XGBoost on whole data

In [8]:
def train_xgb(features, labels):
    X_train, X_test, y_train, y_test = train_test_split(features, labels)

    model = XGBClassifier(objective="multi:softprob", num_class=2)
    model.fit(X_train, y_train)
    
    pred = model.predict_proba(X_test)[:, 1]
    print('Accuracy on test:', accuracy_score(y_test, pred > 0.5))
    print('ROC AUC on test:', roc_auc_score(y_test, pred))
    
    return model

In [19]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(file_name, True, True)

model = train_xgb(features, labels)

pred_whole = model.predict_proba(not_shuffled_features)[:, 1]
# pred_whole = model.predict(not_shuffled_features)[:, 1]
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

bst = model.get_booster()
bst.dump_model("cpp_xgb_whole_data")
model.save_model("model_whole_data_xgb")
np.save('pred_whole_data_xgb', pred_whole)

Input data shape: (345390, 23)
Number of positive examples: 35260
Number of negative examples: 310130
Accuracy on test: 0.9891948858109048
ROC AUC on test: 0.9985360228492558
Accuracy on whole: 0.9898057268594922
ROC AUC on whole: 0.9986516818308989


In [71]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [72]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.2218  ± 0.0018,leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits
0.0594  ± 0.0004,leg_2_deepTau2017v1tauVSjet
0.0567  ± 0.0007,leg_2_DPFTau_2016_v1tauVSall
0.0311  ± 0.0001,leg_2_byIsolationMVArun2v1DBnewDMwLTraw2017v2
0.0112  ± 0.0002,leg_2_chargedIsoPtSum
0.0098  ± 0.0004,leg_2_deepTau2017v1tauVSall
0.0019  ± 0.0000,leg_2_gjAngleDiff
0.0017  ± 0.0001,leg_2_ptWeightedDetaStrip
0.0016  ± 0.0001,leg_2_eRatio
0.0010  ± 0.0002,leg_2_nPhoton


# XGBoost on data without discriminators

In [28]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(file_name, True, False)

model = train_xgb(features, labels)

pred_whole = model.predict_proba(not_shuffled_features)[:, 1]
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

bst = model.get_booster()
bst.dump_model("cpp_xgb_without_disc")
model.save_model("model_without_disc_xgb")
np.save('pred_without_disc_xgb', pred_whole)

Input data shape: (345390, 19)
Number of positive examples: 35260
Number of negative examples: 310130
Accuracy on test: 0.9803469680826423
ROC AUC on test: 0.9956927691299164
Accuracy on whole: 0.9813659920669389
ROC AUC on whole: 0.9959852515236186


In [74]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [75]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.3542  ± 0.0023,leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits
0.1116  ± 0.0013,leg_2_chargedIsoPtSum
0.0442  ± 0.0005,leg_2_gjAngleDiff
0.0385  ± 0.0004,leg_2_nPhoton
0.0080  ± 0.0004,leg_2_ptWeightedDetaStrip
0.0046  ± 0.0004,leg_2_dxy_Sig
0.0035  ± 0.0003,leg_2_photonPtSumOutsideSignalCone
0.0021  ± 0.0001,leg_2_decayMode
0.0014  ± 0.0002,leg_2_flightLengthSig
0.0014  ± 0.0003,leg_2_ip3d


# XGBoost on whole data without 'byCombined...'

In [29]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(
    file_name, True, True,  mask_columns=['leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits'],
)

model = train_xgb(features, labels)

pred_whole = model.predict_proba(not_shuffled_features)[:, 1]
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

bst = model.get_booster()
bst.dump_model("cpp_xgb_without_byCombined")
model.save_model("model_without_byCombined_xgb")
np.save('pred_without_byCombined_xgb', pred_whole)

Input data shape: (345390, 22)
Number of positive examples: 35260
Number of negative examples: 310130
Accuracy on test: 0.9898086811506925
ROC AUC on test: 0.9986278294512737
Accuracy on whole: 0.9897854599148789
ROC AUC on whole: 0.9986448768241096


In [21]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [22]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.1744  ± 0.0015,leg_2_chargedIsoPtSum
0.0661  ± 0.0007,leg_2_deepTau2017v1tauVSjet
0.0563  ± 0.0009,leg_2_DPFTau_2016_v1tauVSall
0.0367  ± 0.0011,leg_2_neutralIsoPtSum
0.0238  ± 0.0003,leg_2_byIsolationMVArun2v1DBnewDMwLTraw2017v2
0.0104  ± 0.0002,leg_2_deepTau2017v1tauVSall
0.0023  ± 0.0003,leg_2_eRatio
0.0018  ± 0.0004,leg_2_gjAngleDiff
0.0018  ± 0.0001,leg_2_nPhoton
0.0016  ± 0.0002,leg_2_ptWeightedDetaStrip


# New neural network on whole data

best_params = {
    'width': 256,
    'scheduler': [reduce_lr],
    'lr': 0.0005,
    'epochs': 6,
    'deep': 2,
    'batch_size': 256,
    'batch_norm': True,
    'activation': sigmoid,
    'dropout': 0.0,
}

In [68]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(file_name, True, True)
n_features = features.shape[1]

layers = [
    Dense(256, activation=sigmoid),
    BatchNormalization(),
    Dense(256, activation=sigmoid),
    Dense(1, activation=sigmoid),
]
scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=0.00001)

model = train_model(features, labels, layers, 
                    scheduler=[scheduler], lr=0.0005, epochs=6, batch_size=256)

pred_whole = model.predict(not_shuffled_features)
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

model.save_weights("new_whole_data.h5")
model.save('cpp_new_whole_data.h5', include_optimizer=False)
np.save('pred_new_whole_data', pred_whole)

Input data shape: (345390, 23)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/6
Epoch 2/6
  8960/259042 [>.............................] - ETA: 4s - loss: 0.0456



Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Accuracy on test: 0.9876661879835086
ROC AUC on test: 0.9981560172321211
Accuracy on whole: 0.987767451286951
ROC AUC on whole: 0.9981564765742666


In [36]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [37]:
eli5.show_weights(perm, feature_names = feature_names)

Weight,Feature
0.3995  ± 0.0007,leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits
0.0598  ± 0.0009,leg_2_chargedIsoPtSum
0.0476  ± 0.0005,leg_2_neutralIsoPtSum
0.0071  ± 0.0002,leg_2_DPFTau_2016_v1tauVSall
0.0020  ± 0.0001,leg_2_gjAngleDiff
0.0012  ± 0.0000,leg_2_photonPtSumOutsideSignalCone
0.0009  ± 0.0000,leg_2_byIsolationMVArun2v1DBnewDMwLTraw2017v2
0.0007  ± 0.0000,leg_2_deepTau2017v1tauVSjet
0.0005  ± 0.0000,leg_2_nPhoton
0.0005  ± 0.0000,leg_2_puCorrPtSum


# New neural network on data without disc

In [66]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(file_name, True, False)
n_features = features.shape[1]

layers = [
    Dense(256, activation=sigmoid),
    BatchNormalization(),
    Dense(256, activation=sigmoid),
    Dense(1, activation=sigmoid),
]
scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=0.00001)

model = train_model(features, labels, layers, 
                    scheduler=[scheduler], lr=0.0005, epochs=6, batch_size=256)

pred_whole = model.predict(not_shuffled_features)
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

model.save_weights("new_data_without_disc.h5")
model.save('cpp_new_data_without_disc.h5', include_optimizer=False)
np.save('pred_new_data_without_dics', pred_whole)

Input data shape: (345390, 19)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/6
Epoch 2/6
  9984/259042 [>.............................] - ETA: 4s - loss: 0.0554



Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Accuracy on test: 0.9766757770880623
ROC AUC on test: 0.9945886236584468
Accuracy on whole: 0.9770462375864964
ROC AUC on whole: 0.9946723565816974


In [None]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [None]:
eli5.show_weights(perm, feature_names = feature_names)

# New neural network on data without byCombined

In [67]:
features, not_shuffled_features, labels, not_shuffled_labels, feature_names = preprocess(
    file_name, True, True, mask_columns=['leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits'],
)
n_features = features.shape[1]

layers = [
    Dense(256, activation=sigmoid),
    BatchNormalization(),
    Dense(256, activation=sigmoid),
    Dense(1, activation=sigmoid),
]
scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=0.00001)

model = train_model(features, labels, layers, 
                    scheduler=[scheduler], lr=0.0005, epochs=6, batch_size=256)

pred_whole = model.predict(not_shuffled_features)
print('Accuracy on whole:', accuracy_score(not_shuffled_labels, pred_whole > 0.5))
print('ROC AUC on whole:', roc_auc_score(not_shuffled_labels, pred_whole))

model.save_weights("new_data_without_byCombined.h5")
model.save('cpp_new_data_without_byCombined.h5', include_optimizer=False)
np.save('pred_new_data_without_byCombined', pred_whole)

Input data shape: (345390, 22)
Number of positive examples: 35260
Number of negative examples: 310130
Epoch 1/6
Epoch 2/6
  9984/259042 [>.............................] - ETA: 4s - loss: 0.0482



Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Accuracy on test: 0.980404873303377
ROC AUC on test: 0.9971738968169591
Accuracy on whole: 0.9806219056718493
ROC AUC on whole: 0.9972362447625251


In [None]:
perm = PermutationImportance(
    model, random_state=1, scoring=scorer, n_iter=3,
).fit(not_shuffled_features, not_shuffled_labels)

In [None]:
eli5.show_weights(perm, feature_names = feature_names)