In [1]:
import torch

print(torch.__version__)
print(torch.version.cuda)

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import time
import networkx as nx
from itertools import combinations, permutations

from torch_geometric.data import Data, Dataset

sys.path.append("..")
from gnn_tools.graphs import customDataset, CreateTorchGraphsTensor
from gnn_tools.data import apply_reweighting, add_sample_weights, add_HT_ratio, add_sum_rcjets_d12_d23

%matplotlib inline

1.12.1
None


In [11]:
# Load in the samples
def loadSamples(path_to_samples, channel):
    """Load the ttbar background and signal samples of one channel into a single frame.

    Three pickles are read from ``path_to_samples``: the nominal ttbar sample,
    its companion reweighting table and the signal sample. The reweighting
    table is merged onto the ttbar sample, both samples are labelled, and the
    two are concatenated.

    Args:
        path_to_samples: Prefix prepended verbatim to each file name, so a
            directory must be given with its trailing separator.
        channel: ``"1L"`` or ``"2L"``; selects the file-name prefix and the
            number used in the ``ge{j}jge3b`` part of the file names.

    Returns:
        ``pd.concat`` of the ttbar frame (``IsSig == 0``, ``mH_label == 0``)
        followed by the signal frame (``IsSig == 1``, ``mH_label`` taken from
        the ``mcChannelNumber`` -> value map below, ``None`` for unmapped
        channel numbers). The original index labels of both frames are kept.

    Raises:
        ValueError: If ``channel`` is not one of the supported channels.
    """
    # Number inserted into the "ge{j}jge3b" part of the file names.
    jet_tag_by_channel = {"1L": 9, "2L": 7}
    if channel not in jet_tag_by_channel:
        raise ValueError(
            f"Unsupported channel {channel!r}; expected one of {sorted(jet_tag_by_channel)}"
        )
    j = jet_tag_by_channel[channel]

    def _read_pickle(sample_tag):
        # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
        file_name = f"{channel}_{sample_tag}_ge{j}jge3b.pkl"
        with open(path_to_samples + file_name, "rb") as fh:
            return pickle.load(fh)

    df_ttbar = _read_pickle("ttbar_nominal")
    df_NNRew = _read_pickle("nominal_newRew")
    df_signal = _read_pickle("signal")

    # combine reweighting: default inner merge on the event-identifying columns
    df_ttbar = df_ttbar.merge(
        df_NNRew,
        on=["eventNumber", "runNumber", "mcChannelNumber", "nJets", "nBTags_DL1r_70"],
    )

    df_ttbar["IsSig"] = 0
    df_ttbar["mH_label"] = 0

    df_signal["IsSig"] = 1
    # Start from None so rows whose mcChannelNumber is not in the map stay unlabelled.
    df_signal["mH_label"] = None
    dsids_map = {312440: 400, 312441: 500, 312442: 600, 312443: 700, 312444: 800, 312445: 900, 312446: 1000}
    for dsid, mass in dsids_map.items():
        df_signal.loc[df_signal["mcChannelNumber"] == dsid, "mH_label"] = mass

    return pd.concat([df_ttbar, df_signal])


def preprocessSamples(df):
    """Run the gnn_tools preprocessing helpers on ``df`` and return the result.

    The helpers are applied one after another, each receiving the previous
    helper's return value: reweighting, sample weights, HT ratio, and the
    summed rcjets d12/d23 columns.
    """
    pipeline = (
        apply_reweighting,
        add_sample_weights,
        add_HT_ratio,
        add_sum_rcjets_d12_d23,
    )
    for step in pipeline:
        df = step(df)
    return df


def createDataset(df, path_to_save, dir_name, channel):
    """Build graph datasets for even and odd events and save each to disk.

    Events are split by the parity of ``eventNumber``. Each half is converted
    with ``CreateTorchGraphsTensor``, wrapped in a ``customDataset`` and saved
    to ``path_to_save + dir_name + "_Even"`` and ``... + "_Odd"`` respectively
    (even first, then odd).

    Args:
        df: DataFrame with an ``eventNumber`` column castable to ``int`` plus
            the columns consumed by ``CreateTorchGraphsTensor``. The caller's
            frame is left unmodified.
        path_to_save: Prefix prepended verbatim to the output directory name,
            so a directory must be given with its trailing separator.
        dir_name: Base name of the two output directories.
        channel: ``"1L"`` or ``"2L"``; selects the channel-specific global
            feature.

    Raises:
        ValueError: If ``channel`` is not one of the supported channels.
    """
    # The two channels share every global feature except the second one.
    channel_feature = {"1L": "mtw", "2L": "mll"}
    if channel not in channel_feature:
        raise ValueError(
            f"Unsupported channel {channel!r}; expected one of {sorted(channel_feature)}"
        )

    # NOTE(review): node_scale has six entries while a saved graph in this
    # notebook reports five node features (x=[10, 5]) - confirm how
    # CreateTorchGraphsTensor consumes it.
    node_scale = np.asarray([200000, np.pi, np.pi, 200000, 5, 1])
    edge_scale = np.asarray([np.pi, np.pi, np.pi])

    # Order matters: global_scale below is positional, one entry per feature.
    global_features = [
        "HT_all",
        channel_feature[channel],
        "nJets",
        "nRCJetsM100",
        "Mbbb_Avg_DL1r_70",
        "dRbb_MindR_DL1r_70",
        "dRbl_MindR_DL1r_70",
        "Centrality_all",
        "dRjj_Avg",
        "Sum__jet_pcb_DL1r_btag_ordered_T__Iteration__6__",
        "Sum_rcjet_d12",
        "Sum_rcjet_d23",
        "HT_ratio",
        "Mjjj_AvgdRs3",
    ]
    global_scale = np.asarray([5000000, 1000000, 20, 5, 4000000, 3, 4, 1, 3, 30, 1000000, 400000, 20, 1000000])

    # Cast once into a separate Series so the caller's DataFrame is not mutated.
    event_numbers = df["eventNumber"].astype(int)

    for parity, suffix in ((0, "_Even"), (1, "_Odd")):
        selected = event_numbers % 2 == parity
        # Boolean indexing yields a new frame; the integer event numbers are
        # written onto that copy (as a plain array, so no index alignment).
        df_split = df[selected].assign(eventNumber=event_numbers[selected].to_numpy())
        graphs, booking = CreateTorchGraphsTensor(
            df_split, global_features, global_scale, node_scale, edge_scale
        )
        customDataset(graphs, booking).save_to(path_to_save + dir_name + suffix)

In [12]:
# Read the pickled samples of both channels from the shared sample directory.
path_to_samples = "../../BSM4t-212169-1LOS/"
df_1L = loadSamples(path_to_samples=path_to_samples, channel="1L")
df_2L = loadSamples(path_to_samples=path_to_samples, channel="2L")

In [13]:
# Prepare both channels: generate sample weights and fill in the missing variables.
# NOTE(review): each frame is rebound to its own preprocessed version, so running
# this cell twice preprocesses twice - re-run the load cell first if in doubt.
df_1L = preprocessSamples(df=df_1L)
df_2L = preprocessSamples(df=df_2L)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["weight_rw"] = df["weight_rw"] * df["nnRewFactor"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["jet_truthflav5_sum"] = [
  df = df_bkg.append(df_sig, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["weight_rw"] = df["weight_rw"] * df["nnRewFactor"]
A value is trying to

In [5]:
# Build and save the even/odd graph datasets for the 1L channel in the working directory.
createDataset(df=df_1L, path_to_save="./", dir_name="1L", channel="1L")

446675/446675 complete. Time elapsed: 1287.2s,   Estimated time remaining: 0.0sss
Done
Creating graph data...
447056/447056 complete. Time elapsed: 1254.4s,   Estimated time remaining: 0.0sss
Done
Creating graph data...
446675/446675 complete. Time elapsed: 1019.3s,   Estimated time remaining: 0.0ss
Done
Creating graph data...
447056/447056 complete. Time elapsed: 1027.0s,   Estimated time remaining: 0.0ss
Done


In [15]:
# Build and save the even/odd graph datasets for the 2L channel in the working directory.
createDataset(df=df_2L, path_to_save="./", dir_name="2L", channel="2L")

Creating graph data...
109740/109740 complete. Time elapsed: 219.1s,   Estimated time remaining: 0.0ss
Done
Creating graph data...
110436/110436 complete. Time elapsed: 219.1s,   Estimated time remaining: 0.0ss
Done
109740/109740 complete. Time elapsed: 220.4s,   Estimated time remaining: 0.0ss
Done
Creating graph data...
110436/110436 complete. Time elapsed: 266.4s,   Estimated time remaining: 0.0ss
Done


In [16]:
# test load: read back the 2L datasets that createDataset saved in the working directory
from gnn_tools.graphs import customDataset
from torch_geometric.data import Data, Dataset

dataset_train = customDataset()
dataset_test = customDataset()

path_to_save = "./"
# Must match the dir_name passed to createDataset for the 2L channel; the
# previous value "2L_nominal" named directories this notebook never writes.
dir_name = "2L"

# Same path construction as createDataset: path_to_save + dir_name + suffix.
dataset_train.download_from(f"{path_to_save}{dir_name}_Even")
dataset_test.download_from(f"{path_to_save}{dir_name}_Odd")

Downloading file 1/10...
Downloading file 2/10...
Downloading file 3/10...
Downloading file 4/10...
Downloading file 5/10...
Downloading file 6/10...
Downloading file 7/10...
Downloading file 8/10...
Downloading file 9/10...
Downloading file 10/10...
Done
Downloading file 1/10...
Downloading file 2/10...
Downloading file 3/10...
Downloading file 4/10...
Downloading file 5/10...
Downloading file 6/10...
Downloading file 7/10...
Downloading file 8/10...
Downloading file 9/10...
Downloading file 10/10...
Done
Downloading file 1/10...
Downloading file 2/10...
Downloading file 3/10...
Downloading file 4/10...
Downloading file 5/10...
Downloading file 6/10...
Downloading file 7/10...
Downloading file 8/10...
Downloading file 9/10...
Downloading file 10/10...
Done
Downloading file 1/10...
Downloading file 2/10...
Downloading file 3/10...
Downloading file 4/10...
Downloading file 5/10...
Downloading file 6/10...
Downloading file 7/10...
Downloading file 8/10...
Downloading file 9/10...
Downloa

In [17]:
# Display the bookkeeping table stored on the reloaded even-event dataset.
dataset_train.booking

Unnamed: 0,eventNumber,runNumber,mcChannelNumber,mH_label,pseudo_mH,nBTags_DL1r_70,nJets
0,9609540,284500,407344,0,400,3,7
1,11990012,284500,407344,0,700,3,7
2,15624166,284500,407344,0,900,4,7
3,18057350,284500,407344,0,500,3,7
4,12860832,284500,407344,0,900,3,7
...,...,...,...,...,...,...,...
109735,323722,310000,312445,900,900,3,8
109736,322396,310000,312445,900,900,4,8
109737,323036,310000,312445,900,900,3,9
109738,322310,310000,312445,900,900,4,11


Unnamed: 0,eventNumber,runNumber,mcChannelNumber,mH_label,pseudo_mH,nBTags_DL1r_70,nJets
0,9609540,284500,407344,0,400,3,7
1,11990012,284500,407344,0,700,3,7
2,15624166,284500,407344,0,900,4,7
3,18057350,284500,407344,0,500,3,7
4,12860832,284500,407344,0,900,3,7
...,...,...,...,...,...,...,...
109735,323722,310000,312445,900,900,3,8
109736,322396,310000,312445,900,900,4,8
109737,323036,310000,312445,900,900,3,9
109738,322310,310000,312445,900,900,4,11


In [18]:
# Show the first graph of the reloaded even-event dataset to check which fields it carries.
dataset_train[0]

Data(x=[10, 5], edge_index=[2, 90], edge_attr=[90, 3], y=0.0, w=80.60401153564453, u=[1, 14], pseudo_mH=0.4000000059604645)

Data(x=[10, 5], edge_index=[2, 90], edge_attr=[90, 3], y=0.0, w=80.60401153564453, u=[1, 14], pseudo_mH=0.4000000059604645)