In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Load the dataset from a parquet file using the pyarrow engine.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run on another machine until it is changed.
df = pd.read_parquet('/Users/ye/Desktop/df.parquet', engine='pyarrow')

In [3]:
# Collapse `proc` into a binary label: signal = 1 (any non-zero value), background = 0.
# A vectorised comparison replaces the per-row Python lambda; the result is an
# int64 column and re-running the cell leaves the labels unchanged.
df['proc'] = (df['proc'] != 0).astype('int64')

In [4]:
# Preview the labelled frame; the notebook display abbreviates the rows and columns.
df

Unnamed: 0,leadPhotonEn,leadPhotonMass,leadPhotonPt,leadPhotonEta,leadPhotonPhi,leadPhotonIDMVA,leadPhotonSigmaE,leadPhotonHoE,leadPhotonPfRelIsoAll,leadPhotonPfRelIsoChg,...,leadJetDiphoDEta,subleadJetDiphoDPhi,subleadJetDiphoDEta,nSoftJets,metPt,metPhi,metSumET,metSignificance,weight,proc
1111356,77.388908,0.000001,57.133667,0.818970,-1.590332,0.914551,0.937500,0.000000,0.027066,0.027066,...,-1.645295,0.412073,-0.237702,6.0,22.016317,-0.238159,1211.0,0.869141,1.312993e-06,0
708637,129.196991,-0.000003,129.048645,0.047943,1.322998,0.972168,1.296875,0.000000,0.000000,0.000000,...,0.605590,2.341702,0.726195,6.0,77.595505,0.022762,775.5,21.265625,9.591657e-11,1
846317,129.661087,0.000002,124.301880,-0.292603,-2.690918,0.744141,1.562500,0.015930,0.021015,0.021015,...,-1.327674,0.620501,0.540246,6.0,17.334482,1.477539,1763.0,0.518555,1.260631e-06,0
2577373,89.011086,0.000001,89.006615,-0.010021,-2.511719,0.708984,0.890625,0.000000,0.000000,0.000000,...,-999.000000,-999.000000,-999.000000,6.0,20.119852,0.106323,1245.0,0.977051,7.116355e-06,0
213346,155.278717,0.000000,147.924973,-0.314026,2.483398,0.963379,1.531250,0.000000,0.008058,0.008058,...,-2.512229,-1.419967,3.073708,6.0,108.228378,-1.519043,2620.0,8.085938,6.632386e-11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667971,73.866203,0.000000,73.475899,-0.103027,-1.945801,0.977539,0.757812,0.000000,0.000000,0.000000,...,1.846709,-2.697640,2.885161,6.0,33.445145,-2.115723,2037.0,1.334961,9.410649e-11,1
316710,107.925034,0.000000,72.095215,-0.959717,-2.927734,0.838379,1.453125,0.011169,0.017436,0.001089,...,-999.000000,-999.000000,-999.000000,6.0,19.165186,-2.899414,1018.0,1.063477,9.897219e-08,1
298067,149.264282,0.000002,89.225029,-1.103271,1.434570,0.954590,3.406250,0.000000,0.000000,0.000000,...,-999.000000,-999.000000,-999.000000,6.0,39.433796,-2.859375,753.5,4.476562,1.483928e-08,1
2053454,112.139229,-0.000002,111.939842,0.059677,-3.135254,0.981445,1.375000,0.009338,0.007979,0.000000,...,0.912704,-0.989709,-2.197648,6.0,50.148216,2.472168,2754.0,2.589844,1.436982e-06,0


In [5]:
# Separate the binary label column (`proc`) from the remaining columns.
# NOTE(review): X keeps every non-label column of df, including `weight` and
# `Unnamed: 0` seen in the preview above -- confirm both are meant to be model inputs.
X, Y = df.drop(columns=['proc']), df['proc']

In [7]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(60000, 142)

In [9]:
# Convert features and labels to float32 tensors. The labels gain a trailing
# axis, so `y` has one column per row of `x`.
x = torch.tensor(X.values, dtype=torch.float32)
y = torch.tensor(Y.values, dtype=torch.float32)[:, None]

In [None]:
# Save the feature and label tensors to files in the current working directory.
torch.save(x, 'tensor_x.pt')
torch.save(y, 'tensor_y.pt')
# To reload the saved tensors instead of rebuilding them, uncomment:
# x = torch.load('tensor_x.pt')
# y = torch.load('tensor_y.pt')

In [7]:
# Train/test split: hold out 20% of the rows, with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Create TensorDatasets pairing each feature row with its label.
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)
# Create DataLoaders (only the training loader is shuffled).
# NOTE(review): train_model() builds its own loaders from the datasets with the
# batch size it is given, so it does not use these two loaders.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
def plot_loss(train_loss:list, test_loss:list):
    """
    Plot the training and testing loss curves on the current matplotlib axes.

    Args:
    - train_loss (list): Training loss values, one per epoch.
    - test_loss (list): Testing loss values, one per epoch.
    """
    # Imported here because the notebook's import cell does not bring in the
    # plotting libraries; without these the `plt` / `sns` calls below raise
    # NameError on a fresh kernel.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # One column per curve; the row index (0, 1, ...) becomes the epoch axis.
    loss_df = pd.DataFrame({"train_loss":train_loss, "test_loss":test_loss})
    plt.grid(True)
    sns.lineplot(data=loss_df)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Testing Loss VS. Epoch ')

In [None]:
def train_model(model, batch_size, learning_rate, train_dataset, test_dataset, epochs, criterion=None):
    """
    Train a neural network model using the specified parameters.

    Each epoch runs one optimisation pass over the training set and one
    gradient-free pass over the test set, prints both mean batch losses, and
    records them. After the last epoch the curves are drawn with plot_loss().

    Args:
    - model (torch.nn.Module): The neural network model to be trained.
    - batch_size (int): The size of mini-batches used for training.
    - learning_rate (float): The learning rate for the optimizer.
    - train_dataset (torch.utils.data.Dataset): The training dataset.
    - test_dataset (torch.utils.data.Dataset): The testing dataset.
    - epochs (int): The number of training epochs.
    - criterion (callable, optional): Loss called as criterion(outputs, targets).
      Defaults to nn.KLDivLoss(reduction='batchmean').

    Returns:
    - tuple: (train_loss_list, test_loss_list), the per-epoch mean batch losses.
    """

    # Ensure the model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if criterion is None:
        # KL divergence loss. 'batchmean' is the reduction the PyTorch docs
        # recommend; the default 'mean' does not return the true KL value.
        # NOTE(review): KLDivLoss expects log-probabilities as input and
        # probabilities as target -- confirm the model outputs and the labels
        # satisfy that, or pass a different `criterion`.
        criterion = nn.KLDivLoss(reduction='batchmean')

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Lists to store loss values
    train_loss_list = []
    test_loss_list = []

    for epoch in range(epochs):
        # --- optimisation pass over the training set ---
        model.train()
        train_loss = 0.0

        for data, targets in train_loader:
            data, targets = data.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Mean of the per-batch losses.
        train_loss /= len(train_loader)

        # --- evaluation pass over the test set (no gradients) ---
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for data, targets in test_loader:
                data, targets = data.to(device), targets.to(device)
                outputs = model(data)
                loss = criterion(outputs, targets)
                test_loss += loss.item()

        test_loss /= len(test_loader)
        train_loss_list.append(train_loss)
        test_loss_list.append(test_loss)

        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

    plot_loss(train_loss_list, test_loss_list)

    # Hand the histories back so callers can inspect or re-plot them.
    return train_loss_list, test_loss_list


In [None]:
class SimpleFeedforward(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, output_dim, hidden_dim):
        """Build the stack mapping `input_dim` features to `output_dim` via one hidden layer of width `hidden_dim`."""
        super().__init__()
        stages = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the layer stack to `x` and return its output."""
        stack = self.network
        return stack(x)

class CouplingLayer(nn.Module):
    """
    Affine coupling layer.

    The input features are split along dim 1 into a leading part `xa` and a
    trailing part `xb`. `xb` passes through unchanged and parameterises an
    element-wise scale `s` and shift `t` applied to `xa`:

        forward:  ya = s * xa + t
        reverse:  xa = (ya - t) / s

    Args:
    - input_dim (int): Number of input features (must be at least 2).
    - hidden_layer (int): Hidden width of the scale and shift networks.

    Raises:
    - ValueError: If input_dim is smaller than 2 (nothing left to condition on).
    """

    def __init__(self, input_dim=142, hidden_layer=8):
        super(CouplingLayer, self).__init__()

        if input_dim < 2:
            raise ValueError(f"input_dim must be at least 2, got {input_dim}")

        # Same partition that x.chunk(2, dim=1) produces: for an odd width the
        # leading part receives the extra feature.
        self.dim_b = input_dim // 2
        self.dim_a = input_dim - self.dim_b

        # Scale and shift networks: read xb, emit one value per xa feature.
        # For even input_dim these sizes equal input_dim // 2 on both sides.
        self.s_net = SimpleFeedforward(self.dim_b, self.dim_a, hidden_layer)
        self.t_net = SimpleFeedforward(self.dim_b, self.dim_a, hidden_layer)

    def forward(self, x, reverse=False):
        """
        Apply the coupling (or its inverse when `reverse` is True) to `x`.

        Returns a tensor of the same width as `x`: the transformed leading
        part concatenated with the untouched trailing part.
        """
        xa, xb = torch.split(x, [self.dim_a, self.dim_b], dim=1)

        # Both directions use the same s and t because xb is never modified.
        # The +2 offset puts the scale near sigmoid(2) ~= 0.88 when s_net
        # outputs values close to zero.
        s = torch.sigmoid(self.s_net(xb) + 2)
        t = self.t_net(xb)

        if not reverse:
            ya = s * xa + t
        else:
            ya = (xa - t) / s

        return torch.cat([ya, xb], dim=1)

class INN(nn.Module):
    """
    Invertible network built from a stack of coupling layers.

    A coupling layer only transforms the leading part of its input and passes
    the trailing part through unchanged. To let every feature be transformed,
    the feature order is reversed between consecutive layers, so the part that
    was passed through by one layer is the part transformed by the next.

    Args:
    - input_dim (int): Number of input features.
    - num_coupling_layers (int): Number of coupling layers in the stack.
    - hidden_layer (int): Hidden width used inside each coupling layer.
    """

    def __init__(self, input_dim=142, num_coupling_layers=15, hidden_layer=8):
        super(INN, self).__init__()

        self.coupling_layers = nn.ModuleList([
            CouplingLayer(input_dim=input_dim, hidden_layer=hidden_layer)
            for _ in range(num_coupling_layers)
        ])

    def forward(self, x):
        """Map `x` through all coupling layers, reversing the feature order between layers."""
        for index, coupling_layer in enumerate(self.coupling_layers):
            if index > 0:
                # Reversing dim 1 swaps which features are transformed and
                # which are conditioned on; the operation is its own inverse.
                x = x.flip(1)
            x = coupling_layer(x)
        return x

    def inverse(self, y):
        """Undo forward(): apply the layers' reverse passes and the flips in the opposite order."""
        for index in reversed(range(len(self.coupling_layers))):
            y = self.coupling_layers[index](y, reverse=True)
            if index > 0:
                y = y.flip(1)
        return y


NameError: name 'nn' is not defined

In [None]:
# Instantiate the network with its default constructor arguments; the default
# input_dim (142) matches the column count reported by X.shape above.
model1 = INN()

In [None]:
# Training hyperparameters handed to the training call below:
# optimizer learning rate, number of epochs, and mini-batch size.
lr = 0.001
EPOCH =  8
batch_size = 32

In [None]:
# Run training. The function defined in this notebook is `train_model`;
# `train_model1` is not defined anywhere and raised NameError.
# Binding the result keeps any return value from being echoed as cell output.
loss_history = train_model(model1, batch_size, lr, train_dataset, test_dataset, EPOCH)