In [1]:
import torch
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from train_utils import (
    VAE_EFT,
    find_optimal_latent_dim,
    compute_errors,
    ROC_curve,
    train_with_early_stopping,
)

M_Z = 91.1876

### Processing training data

In [2]:
dataset = pd.read_csv("./data/SM_100k.csv")
# Apply cuts
df = dataset[
    (abs(dataset["m_ll"] - M_Z) < 15)
    & (dataset["m_jj"] > 300)
    & (abs(dataset["delta_eta_jj"]) > 2.5)
]

df.count()

m_ll            69489
m_jj            69489
pt_l1           69489
pt_l2           69489
pt_j1           69489
pt_j2           69489
pt_ll           69489
eta_l1          69489
eta_l2          69489
eta_j1          69489
eta_j2          69489
delta_eta_jj    69489
delta_phi_jj    69489
dtype: int64

In [3]:
# select variables
selection = [
    "m_ll",
    "m_jj",
    "pt_l1",
    "pt_l2",
    "pt_j1",
    "pt_j2",
    "pt_ll",
    "eta_l1",
    "eta_l2",
    "eta_j1",
    "eta_j2",
    "delta_eta_jj",
    "delta_phi_jj",
]
df = df[selection]
for vars in ["m_ll", "m_jj", "pt_l1", "pt_l2", "pt_j1", "pt_j2", "pt_ll"]:
    df[vars] = df[vars].apply(np.log10)

In [4]:
X_train, X_test = train_test_split(df, test_size=0.2)
X_train, X_valid = train_test_split(df, test_size=0.2)

In [5]:
# MinMax scaling
t = MinMaxScaler()
t.fit(X_train)
X_train = t.transform(X_train)
X_test = t.transform(X_test)
X_valid = t.transform(X_valid)

In [6]:
# create train and test dataloaders
batch_size = 50
train_loader = DataLoader(dataset=X_train, batch_size=batch_size)
test_loader = DataLoader(dataset=X_test, batch_size=batch_size)
valid_loader = DataLoader(dataset=X_valid, batch_size=batch_size)

device = torch.device("cpu")

In [9]:
model = VAE_EFT(latent_dim=8).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
train_with_early_stopping(model, train_loader, valid_loader, optimizer, device)

=== EPOCH 0 ===
Training loss: 8.19e+00, 7.04e-05
Validation loss: 8.07e+00, 9.25e-05
=== EPOCH 1 ===
Training loss: 8.04e+00, 8.52e-05
Validation loss: 7.95e+00, 9.54e-05
=== EPOCH 2 ===
Training loss: 7.94e+00, 8.84e-05
Validation loss: 7.93e+00, 7.86e-05
=== EPOCH 3 ===
Training loss: 7.90e+00, 8.09e-05
Validation loss: 7.86e+00, 8.28e-05
=== EPOCH 4 ===
Training loss: 7.85e+00, 8.12e-05
Validation loss: 7.85e+00, 8.13e-05
=== EPOCH 5 ===
Training loss: 7.84e+00, 7.74e-05
Validation loss: 7.84e+00, 7.23e-05
=== EPOCH 6 ===
Training loss: 7.83e+00, 7.57e-05
Validation loss: 7.81e+00, 7.97e-05
=== EPOCH 7 ===
Training loss: 7.80e+00, 7.56e-05
Validation loss: 7.80e+00, 7.17e-05
=== EPOCH 8 ===
Training loss: 7.79e+00, 6.87e-05
Validation loss: 7.78e+00, 7.24e-05
=== EPOCH 9 ===
Training loss: 7.78e+00, 7.22e-05
Validation loss: 7.77e+00, 6.75e-05
=== EPOCH 10 ===
Training loss: 7.77e+00, 6.80e-05
Validation loss: 7.79e+00, 7.20e-05
=== EPOCH 11 ===
Training loss: 7.77e+00, 6.77e-05
Va

(17,
 7.693613755685191,
 [8.074785459652816,
  7.954852456544578,
  7.926547638696628,
  7.863202182169654,
  7.851895946776343,
  7.839263036304742,
  7.809954918826993,
  7.803728455519532,
  7.779602137829755,
  7.770687249631796,
  7.785743110517463,
  7.769460087836708,
  7.740938469582452,
  7.741780721296167,
  7.733222281273443,
  7.709460833571355,
  7.7004190145237095,
  7.693613755685191,
  7.703255670650654,
  7.6970931572405155,
  7.700410294914331,
  7.695030946569582,
  7.6939261614351615])

### Processing test data (sm vs bsm)

In [None]:
dataset = pd.read_csv("./data/SM_10k.csv")

# Apply cuts
df = dataset[
    (abs(dataset["m_ll"] - M_Z) < 15)
    & (dataset["m_jj"] > 300)
    & (abs(dataset["delta_eta_jj"]) > 2.5)
]

df = df[selection]
for vars in ["m_ll", "m_jj", "pt_l1", "pt_l2", "pt_j1", "pt_j2", "pt_ll"]:
    df[vars] = df[vars].apply(np.log10)

X_sm = t.transform(df)

model.eval()
T_sm = torch.from_numpy(X_sm).float().to(device)
X_sm_hat, mean, log_var = model.forward(T_sm)
X_sm_hat = X_sm_hat.detach().cpu()
print(T_sm.size(), X_sm_hat.size())

In [None]:
sm = pd.read_csv("./data/SM_10k.csv")
o1 = pd.read_csv("./data/cW_1_10k.csv")
o2 = pd.read_csv("./data/cW_2_10k.csv")

bsm = pd.concat([o1, o2], axis=0)

# Apply cuts
sm = sm[
    (abs(sm["m_ll"] - M_Z) < 15) & (sm["m_jj"] > 300) & (abs(sm["delta_eta_jj"]) > 2.5)
]
bsm = bsm[
    (abs(bsm["m_ll"] - M_Z) < 15)
    & (bsm["m_jj"] > 300)
    & (abs(bsm["delta_eta_jj"]) > 2.5)
]

sm = sm[selection]
for vars in ["m_ll", "m_jj", "pt_l1", "pt_l2", "pt_j1", "pt_j2", "pt_ll"]:
    sm[vars] = sm[vars].apply(np.log10)

bsm = bsm[selection]
for vars in ["m_ll", "m_jj", "pt_l1", "pt_l2", "pt_j1", "pt_j2", "pt_ll"]:
    bsm[vars] = bsm[vars].apply(np.log10)

X_sm = t.transform(sm)
T_sm = torch.from_numpy(X_bsm).float().to(device)

X_bsm = t.transform(bsm)
T_bsm = torch.from_numpy(X_bsm).float().to(device)

In [None]:
find_optimal_latent_dim(VAE_EFT, train_loader, valid_loader, device, 12)